Python: crawling game ranking information

The script below crawls the game ranking list (the top 100 games) and prints the first 50 entries:

import re

import bs4
import requests
from bs4 import BeautifulSoup
 
def main():
    """Fetch the Baidu game ranking page and print its first 50 entries.

    Downloads the page with getHTMLText, lets collectlist fill a list with
    one [rank, name, search index] row per game, and prints the rows as a
    table with printList.
    """
    ranking = []  # filled in place by collectlist
    url = "http://top.baidu.com/buzz?b=62"
    html = getHTMLText(url)
    collectlist(ranking, html)
    printList(ranking, 50)
 
 
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the string "error" when the request
        fails (connection problem, timeout, or non-2xx status).
    """
    # Identify as a desktop browser instead of the default requests client.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Keep the original contract: callers receive a sentinel string
        # rather than an exception when the download fails.
        return "error"
 
def collectlist(plist, html):
    """Parse the ranking table in *html* and append one row per game to *plist*.

    Each appended row is a list ``[rank, name, index]`` where ``rank`` is the
    digit string found in the row's ``<td class="first">`` cell, ``name`` is
    the ``.string`` of the ``<a class="list-title">`` link inside
    ``<td class="keyword">``, and ``index`` is the ``.string`` of the
    ``<span>`` inside ``<td class="last">``.

    Args:
        plist: List that is extended in place.
        html: Page source to parse.

    NOTE(review): the tag and class names were restored to lowercase from a
    garbled copy of this script; confirm them against the live page markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        # No ranking table at all, e.g. html is the "error" sentinel that
        # getHTMLText returns on failure; leave plist untouched.
        return
    # find_all returns a list of every <tr> under the table.
    for row in table.find_all('tr'):
        rank_cell = row.find('td', class_='first')
        if rank_cell is None:
            # Rows without a rank cell (such as a header row) are skipped.
            continue
        # The cell text may contain more than the number, so extract the
        # digits; re.search returns a match object (or None).
        rank = re.search(r'\d+', rank_cell.text)
        if rank is None:
            continue
        name = row.find('td', class_='keyword').find('a', class_='list-title').string
        index = row.find('td', class_='last').find('span').string
        plist.append([rank.group(), name, index])
    
    
    
            
    
def printList(plist, num):
    """Print a header line followed by at most *num* rows of *plist*.

    Args:
        plist: Sequence of ``[rank, name, index]`` rows (string values).
        num: Maximum number of rows to print; if *plist* is shorter, only
            the available rows are printed.

    Columns are tab-separated and centered. chr(12288) is the full-width
    (ideographic) space; it is used as the fill character for non-Latin
    names.
    """
    fill = chr(12288)
    wide_row = "{0:^10}\t{1:{3}^10}\t{2:^20}"        # name does not start with a Latin letter
    latin_row = "{0:^10}\t{1:^20}\t{2:^20}"          # name starts with a Latin letter
    header = "{0:^10}\t{1:{3}^10}\t{2:{3}^10}"       # column titles
    # NOTE(review): the pattern was garbled in the source copy ("[AZ]");
    # matching any leading ASCII letter is assumed here. Confirm intent.
    starts_with_letter = re.compile(r'[A-Za-z]')

    print(header.format("rank", "name of the game", "search index", fill))
    # Slicing instead of indexing range(num) avoids an IndexError when fewer
    # than num rows were collected.
    for p in plist[:num]:
        if starts_with_letter.match(p[1]) is None:
            print(wide_row.format(p[0], p[1], p[2], fill))
        else:
            print(latin_row.format(p[0], p[1], p[2], fill))
      
 
# Script entry point: runs the crawler immediately when this file is executed
# (there is no __main__ guard, so it also runs when the file is imported).
main()

 

 

 

Summary of a few small points:

1. The difference between find() and find_all()

find() returns the first matching element object.

find_all() returns a list of all matching elements.

2. .text versus .string: .text returns the combined text of a tag and all of its nested child tags, whereas .string returns a value only when the tag directly contains a single piece of text (otherwise it returns None). In the script, the value read with .text is passed through re.search, which returns a match object rather than a plain string, so the matched text has to be extracted from it with [0] (the whole match). When the text sits directly inside the tag, .string already returns a string and can be appended to the list as is.

3. Press F12 in the browser to inspect the elements of a web page.

Guess you like

Origin www.cnblogs.com/yezishen/p/11878092.html