Scraping the titles and popularity ("heat") values of the top ten trending items from a website (e.g., Weibo, Zhihu, WeChat, Baidu, etc.)

# Page Structure

 

 # Source code

import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url: str) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL, or HTTP error status).
    """
    # A browser-like User-Agent is sent with the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()              # Raise HTTPError on a 4xx/5xx status
        r.encoding = r.apparent_encoding  # Decode using the encoding detected from the content
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers receive "" instead of an exception.
        return ""
    
# Extract the key data from the HTML and append it to the list
def fillList(ulist, html):
    """Parse ``html`` and append one ``[cell1, cell2]`` pair per table row.

    The first ``<tbody>`` in the document is located; for each ``<tr>``
    child, the ``.string`` of its second and third ``<td>`` cells is
    appended to ``ulist`` as a two-element list.

    Args:
        ulist: List that is extended in place with the extracted pairs.
        html: HTML text of the page (may be empty).

    Returns:
        None. ``ulist`` is modified in place; it is left untouched when the
        document contains no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    # All entries live in a table; its rows are wrapped in a <tbody> tag.
    tbody = soup.find("tbody")
    if tbody is None:
        # Nothing to extract (e.g. the download failed and html is "").
        return
    # Each <tr> inside <tbody> is one entry; its fields are the <td> cells.
    for tr in tbody.children:
        # Skip non-tag children such as whitespace text nodes.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr("td")  # Calling a tag is shorthand for find_all()
        if len(tds) < 3:
            # Row does not have the cells we index below; skip it.
            continue
        # NOTE(review): cells 1 and 2 are presumably title and heat (that is
        # how printList labels them) -- confirm against the page layout.
        # .string is None when a cell has more than one child node.
        ulist.append([tds[1].string, tds[2].string])
            
def printList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Each row is printed as two centered columns separated by a tab: the
    first padded to width 6, the second to width 10.

    Args:
        ulist: Sequence of entries; items 0 and 1 of each entry are printed.
            A value of ``None`` is printed as an empty cell.
        num: Maximum number of rows to print. If ``ulist`` is shorter, only
            the available rows are printed; a non-positive value prints the
            header alone.
    """
    row_format = "{:^6}\t{:^10}"
    print(row_format.format("title", "heat"))
    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing from the end.
    for entry in ulist[:max(num, 0)]:
        first = "" if entry[0] is None else entry[0]
        second = "" if entry[1] is None else entry[1]
        print(row_format.format(first, second))
        
def main():
    """Fetch the trending page, extract its rows, and print the first ten."""
    # Collected entries are accumulated in this list.
    uInfo = []
    # Baidu news URL (per the original author's comment).
    url = "https://tophub.today/n/Jb0vmloB1G"
    # Download the page HTML for the URL ("" on failure).
    html = getHTMLText(url)
    fillList(uInfo, html)
    printList(uInfo, 10)     # Print 10 news entries
    
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

# Screenshot of the retrieved data

 

 

 

 

 

Guess you like

Origin www.cnblogs.com/hhb123/p/12530688.html