实现的功能:
爬取最好大学网http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html软科中国最好大学2018排名,获取各高校的排名、名称
、省市、得分等字段,并将数据存入数据库
所用的库:
bs4、requests、pymysql 可以自行安装
完整代码附于文末
1.getHTMLText(url):
传入要爬取网站的网址作为参数,通过requests模块与网站交互,调用requests.get(url, timeout=30)方法获取网站源代码。
2.fillUnivList (uList, html):
传入一个空列表和网站源代码作为参数来填充该空列表,通过BeautifulSoup调用BeautifulSoup(html, "html.parser")方法来解析源代码,得到所需的标签值存入列表,函数返回填充后的列表。
这里可以看出我们所需要的字段都在一个tr标签下,获取该tr标签下的所有td标签,并分别取出前四个字段添加到list中,以备后用
def fillUnivList(uList, html):
    """Parse the ranking page and append [rank, name, province, score] rows to uList.

    Args:
        uList: list to be filled in place (also returned for convenience).
        html: HTML source of the ranking page (may be "" on fetch failure).

    Returns:
        uList, with one 4-element list per university row.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # Empty/malformed page (e.g. getHTMLText returned "") — nothing to parse.
        return uList
    for tr in tbody.children:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString whitespace between rows
            tds = tr('td')
            # First four cells: rank, school name, province, total score.
            uList.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
    return uList
3.printUnivList(list, num):
传入fillUnivList()函数获得的列表和要显示的数据条数作为参数,将列表中的数据按照设定的模板输出前num条数据到控制台。
4.printSQL(list):
传入fillUnivList()函数获得的列表作为参数,连接mysql数据库后通过sql语句建立新的数据表,然后遍历存放数据的列表并执行sql语句将爬取的数据都存入数据库,最后关闭连接。
完整代码:
import bs4
import requests
from bs4 import BeautifulSoup
import pymysql
def getHTMLText(url):
    """Fetch url and return the decoded page text, or "" on any request failure.

    Uses a 30-second timeout and re-decodes with the apparent (sniffed) encoding
    because the site does not declare charset reliably.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # treat HTTP 4xx/5xx as failure
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: network/HTTP errors only — a bare except would also
        # swallow KeyboardInterrupt and programming errors.
        return ""
def fillUnivList(uList, html):
    """Parse the ranking page and append [rank, name, province, score] rows to uList.

    Args:
        uList: list to be filled in place (also returned for convenience).
        html: HTML source of the ranking page (may be "" on fetch failure).

    Returns:
        uList, with one 4-element list per university row.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # Empty/malformed page (e.g. getHTMLText returned "") — nothing to parse.
        return uList
    for tr in tbody.children:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString whitespace between rows
            tds = tr('td')
            # First four cells: rank, school name, province, total score.
            uList.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
    return uList
def printUnivList(uList, num):
    """Print the first num rows of uList as an aligned table on stdout.

    Args:
        uList: list of [rank, name, province, score] rows from fillUnivList.
        num: maximum number of rows to print (clamped to len(uList), so a
             short or empty list no longer raises IndexError).
    """
    tplt = "{0:<6}\t{1:>12}\t{2:^26}\t{3:>2}"
    tplt1 = "{0:<10}\t{1:>12}\t{2:^26}\t{3:<4}"
    print(tplt.format("排名", "学校名称", "省市", "总分"))
    for i in range(min(num, len(uList))):
        u = uList[i]
        print(tplt1.format(u[0], u[1], u[2], u[3]))
def print1(uList):
    """Create the `university12` table (if possible) and insert every row of uList.

    Args:
        uList: list of [rank, name, province, score] rows from fillUnivList.

    Connects to a local MySQL database `pachong`; each row is committed
    individually so one bad row does not abort the rest. The cursor and
    connection are always closed, even if an insert raises.
    """
    conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                           db='pachong', charset='utf8')
    cur = conn.cursor()
    try:
        sqla = '''
        CREATE TABLE `pachong`.`university12` (
        `id` INT NOT NULL AUTO_INCREMENT,
        `ranking` VARCHAR(45) NULL,
        `name` VARCHAR(45) NULL,
        `province` VARCHAR(45) NULL,
        `grade` VARCHAR(45) NULL,
        PRIMARY KEY (`id`))
        ENGINE = InnoDB
        DEFAULT CHARACTER SET = utf8
        COLLATE = utf8_bin;
        '''
        try:
            cur.execute(sqla)
            conn.commit()
        except Exception as e:
            # Table probably exists already; report and keep going with inserts.
            print("Reason:", e)
            conn.rollback()
        # Parameterized query: lets the driver escape values, so names
        # containing quotes cannot break the statement (or inject SQL).
        sqlb = '''
        INSERT INTO `pachong`.`university12` ( `ranking`, `name`, `province`, `grade`)
        VALUES (%s, %s, %s, %s);
        '''
        for row in uList:
            try:
                cur.execute(sqlb, (row[0], row[1], row[2], row[3]))
                conn.commit()
            except Exception as e:
                print("Reason:", e)
                conn.rollback()
    finally:
        # Always release DB resources, even when an insert raised.
        cur.close()
        conn.close()
def main():
    """Fetch the 2018 ranking page, parse it, and print the top 20 rows."""
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html"
    html = getHTMLText(url)
    uinfo = fillUnivList([], html)
    printUnivList(uinfo, 20)
    # print1(uinfo)  # uncomment to persist the scraped rows into MySQL


if __name__ == "__main__":
    # Guard so importing this module for its functions does not trigger a crawl.
    main()