# Page Structure
# Source code
import requests
import bs4
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure.

    A browser-like User-Agent is sent because some sites reject the default
    requests UA. Encoding is corrected from the response body heuristics
    (apparent_encoding) since the declared charset is often wrong.
    """
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/79.0.3945.88 Safari/537.36"
            )
        }
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()  # raise on HTTP 4xx/5xx
        r.encoding = r.apparent_encoding  # fix mis-declared charset
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP errors are
        # best-effort; anything else (e.g. KeyboardInterrupt) propagates.
        return ""


def fillList(ulist, html):
    """Parse *html* and append [title, heat] pairs to *ulist*.

    The page lays out each news item as a <tr> inside <tbody>; within each
    row, cell index 1 is the title and index 2 the popularity ("heat").
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # Empty/unexpected page (e.g. getHTMLText returned "") — nothing to add.
        return
    for tr in tbody.children:
        # tbody.children also yields NavigableString whitespace nodes;
        # keep only real tag rows.
        if isinstance(tr, bs4.element.Tag):
            tds = tr("td")  # all <td> cells of this row
            ulist.append([tds[1].string, tds[2].string])


def printList(ulist, num):
    """Print the first *num* [title, heat] entries of *ulist* as a table."""
    print("{:^6}\t{:^10}".format("title", "heat"))
    for i in range(num):
        u = ulist[i]
        print("{:^6}\t{:^10}".format(u[0], u[1]))


def main():
    """Scrape the Baidu news hot list and print the top 10 entries."""
    uInfo = []  # collected [title, heat] pairs
    url = "https://tophub.today/n/Jb0vmloB1G"
    html = getHTMLText(url)
    fillList(uInfo, html)
    printList(uInfo, 10)  # show top 10 news items


if __name__ == '__main__':
    main()
# Obtain data capture