Python 3 定向爬取网页内容

import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they hit the fallback.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the headers, which is often wrong for Chinese pages.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only swallow request-related failures; KeyboardInterrupt and
        # genuine programming errors must still propagate.
        return ''

def fillUnivList(ulist, html):
    """Parse ``html`` and append one record per table row to ``ulist``.

    Each direct child tag of the first ``<tbody>`` element is treated as a
    row; the ``.string`` values of its first four ``<td>`` cells are appended
    to ``ulist`` as a four-element list.

    Args:
        ulist: List that receives the extracted rows (mutated in place).
        html: HTML document text to parse.

    Returns:
        None. If the document has no ``<tbody>`` (for example, ``html`` is an
        empty string), ``ulist`` is left unchanged. Rows with fewer than four
        ``<td>`` cells are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # find() returns None when nothing matches; nothing to extract.
        return
    for tr in tbody.children:
        # .children also yields whitespace/text nodes between the rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        if len(tds) < 4:
            # Header or malformed rows do not carry the four expected cells.
            continue
        ulist.append([td.string for td in tds[:4]])
	

def printUnivList(ulist, num):
    """Print a header line followed by up to ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; each row is indexable with at least four
            entries, printed in order under the four header columns.
        num: Maximum number of rows to print. If ``ulist`` holds fewer rows,
            only the available ones are printed; a value <= 0 prints only
            the header.

    Cells that are ``None`` are printed as empty strings.
    """
    # chr(12288) is the full-width (ideographic) space U+3000; it is used as
    # the fill character of the second column so CJK text pads evenly.
    fill = chr(12288)
    tplt = '{0:^10}\t{1:{3}^10}\t{2:^10}\t{4:^8}'  # output format
    print(tplt.format('排名', '学校名称', '地址', fill, '总分'))
    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing off the tail.
    for u in ulist[:max(num, 0)]:
        # None does not support alignment format specs, so map it to ''.
        c0, c1, c2, c3 = ('' if u[i] is None else u[i] for i in range(4))
        print(tplt.format(c0, c1, c2, fill, c3))

def main():
    """Fetch the ranking page, extract its table rows and print the first 20."""
    # Page to crawl.
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    rows = []
    page = getHTMLText(url)
    fillUnivList(rows, page)
    printUnivList(rows, 20)

# Run the crawl only when executed as a script, so importing this module
# does not trigger a network request.
if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/qq_27668313/article/details/80567618