Crawling the Zhihu Top 10 Hot List

1. Open the Web page https://tophub.today/

2. Press Ctrl + U to view the page source

3. Locate the data you want to crawl (one way to find the right index offset is sketched after the script below)

4. Write the following script to crawl and save the data:

import requests
import pandas as pd
from bs4 import BeautifulSoup

lst = []  # empty list to hold the crawled rows
url = 'https://tophub.today/'  # URL of the page to crawl

def get(url):
    try:
        # Disguise the crawler as a regular browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
        r = requests.get(url, timeout=30, headers=headers)  # send the request
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return 'abnormal'

# Parse the page and collect the data
def create(lst, html, num):
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('span', class_='t')  # entry titles
    b = soup.find_all('span', class_='e')  # heat values
    print('{:^10}\t{:^30}\t{:^10}'.format('Rank', 'Title', 'Heat'))
    for i in range(num):
        # the +50 offset skips the entries of the boards listed before Zhihu
        print('{:^10}\t{:^30}\t{:^10}'.format(i + 1, a[i + 50].string, b[i + 50].string))  # print the crawled content
        lst.append([i + 1, a[i + 50].string, b[i + 50].string])  # store the crawled row in the list

html = get(url)
create(lst, html, 10)
df = pd.DataFrame(lst, columns=['Rank', 'Title', 'Heat'])
ZHHot = r'E:\New Folder\zhihu_top10_hot.xlsx'  # output file path
df.to_excel(ZHHot)
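The hard-coded [i + 50] offset in create() assumes that the Zhihu board's entries begin at index 50 in the flat list of <span class="t"> matches, which depends entirely on how many boards tophub.today currently shows before Zhihu. A quick exploratory sketch (not part of the original script; the class name is taken from the code above and may drift as the site changes) to confirm where the Zhihu titles actually start:

# Exploratory sketch: print every <span class="t"> match with its index
# so you can see which slice of the list belongs to the Zhihu board.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://tophub.today/', timeout=30, headers=headers).text
soup = BeautifulSoup(html, 'html.parser')
for i, span in enumerate(soup.find_all('span', class_='t')):
    print(i, span.string)

If the printed indices show the Zhihu titles starting somewhere other than 50, adjust the offset in create() accordingly.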
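Two caveats about the export step: to_excel() needs an Excel writer engine such as openpyxl installed, and by default pandas also writes the row index as an extra first column. A minimal variant of the last line (continuing from the df defined above; the file names here are illustrative, not from the original post):

# Export variants (sketch). index=False stops pandas from writing the
# row index as an extra column.
df.to_excel(r'E:\New Folder\zhihu_top10_hot.xlsx', index=False)
# CSV fallback that needs no Excel engine; utf-8-sig keeps the titles
# readable when the file is opened in Excel on Windows.
df.to_csv(r'E:\New Folder\zhihu_top10_hot.csv', index=False, encoding='utf-8-sig')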