Page to crawl: http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html
Program structure:
1. Fetch the content of the university rankings web page: getHTMLText()
2. Extract the information from the page content into a suitable data structure: fillUnivList()
3. Use the data structure to display and output the results: printUnivList()
The complete code (Python 3.7, PyCharm):
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the encoding guessed from its content
    (``apparent_encoding``) instead of the header-declared encoding.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn HTTP error statuses into an exception handled below.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are swallowed here; a bare ``except`` would
        # also hide KeyboardInterrupt and genuine programming errors.
        return ""
def fillUnivList(ulist, html):
    """Parse the table body in *html* and append one entry per table row.

    Each appended entry is a three-item list holding the ``.string`` of the
    first three ``<td>`` cells of a row.

    Args:
        ulist: List that receives the extracted rows (modified in place).
        html: Page source to parse.

    Returns:
        None. When *html* contains no ``<tbody>`` (for example an empty
        string), *ulist* is left unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # Nothing to extract; avoids AttributeError on ``None.children``.
        return
    for tr in tbody.children:
        # Children may also be plain text nodes; only real tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr("td")  # equivalent to tr.find_all("td")
        if len(tds) < 3:
            # Skip rows without the three expected cells instead of raising
            # IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Every line consists of three centred, tab-separated columns.

    Args:
        ulist: List of rows; items 0, 1 and 2 of each row are printed.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row; values <= 0 print
            only the header.
    """
    row_fmt = "{:^10}\t{:^6}\t{:^10}"
    print(row_fmt.format("排名", "学校", "地区"))
    # Iterate over a bounded slice rather than indexing range(num), so a list
    # shorter than num no longer raises IndexError.
    for u in ulist[:max(num, 0)]:
        print(row_fmt.format(u[0], u[1], u[2]))
def main():
    """Download the 2019 ranking page and print its first 20 rows."""
    ranking_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 20)


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Optimizing the alignment of Chinese text:
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Every line consists of three centred, tab-separated columns padded with
    the default fill character.

    Args:
        ulist: List of rows; items 0, 1 and 2 of each row are printed.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row; values <= 0 print
            only the header.
    """
    row_fmt = "{:^10}\t{:^6}\t{:^10}"
    print(row_fmt.format("排名", "学校", "地区"))
    # Iterate over a bounded slice rather than indexing range(num), so a list
    # shorter than num no longer raises IndexError.
    for u in ulist[:max(num, 0)]:
        print(row_fmt.format(u[0], u[1], u[2]))
Change it to:
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Every line consists of three centred, tab-separated columns. The middle
    column is padded with ``chr(12288)`` (U+3000, the ideographic space)
    instead of the default fill character.

    Args:
        ulist: List of rows; items 0, 1 and 2 of each row are printed.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row; values <= 0 print
            only the header.
    """
    fill = chr(12288)
    # ``{3}`` inside the second field's spec is the fill character, so every
    # format() call must pass ``fill`` as its fourth argument.
    row_fmt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(row_fmt.format("排名", "学校", "地区", fill))
    # Iterate over a bounded slice rather than indexing range(num), so a list
    # shorter than num no longer raises IndexError.
    for u in ulist[:max(num, 0)]:
        print(row_fmt.format(u[0], u[1], u[2], fill))
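While making this change you may run into "IndexError: tuple index out of range".
The format string now refers to argument {3} as the fill character, so chr(12288) (the full-width Chinese space) must also be passed to format(); with it in place, the columns line up vertically.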