A simple Python crawler for Baidu Tieba (Post Bar) data

First, the libraries we need: the request and parse packages under urllib, plus the time and random packages.
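For reference, the imports sit at the top of the script like this:

# Import modules: request fetches pages, parse URL-encodes the forum name,
# time and random pace the requests
from urllib import request, parse
import time
import random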

Then we define a class named BaiduSpider to handle the crawling.

 

Attributes: url, the URL to crawl, and headers, the request headers.

class BaiduSpider(object):
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        self.headers = {'User-Agent':'Win7:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
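To see how the url template gets filled in, here is a minimal standalone sketch (the forum name '海贼王' is just an example, not from the original post):

from urllib import parse

name = '海贼王'  # example forum name
kw = parse.quote(name)  # URL-encode the Chinese characters
pn = (3 - 1) * 50  # page 3 maps to offset 100
url = 'http://tieba.baidu.com/f?kw={}&pn={}'.format(kw, pn)
print(url)
# http://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&pn=100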

 

Next we define three methods; none of them involve cleaning the data.

Get the page

# Get the page
    def get_page(self, url):
        # Define the request object
        req = request.Request(url=url, headers=self.headers)
        # Initiate the request
        res = request.urlopen(req)
        # Get the response object
        html = res.read().decode('utf-8', 'ignore')
        return html
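Note that urlopen raises an exception if the network fails or the server stalls. A slightly sturdier sketch of the same method (my variation, assuming `from urllib import error` is added to the imports) adds a timeout and catches the error:

    # Sturdier variant of get_page (a sketch, not in the original post):
    # adds a timeout and returns an empty string on network errors
    def get_page_safe(self, url):
        req = request.Request(url=url, headers=self.headers)
        try:
            res = request.urlopen(req, timeout=10)
            return res.read().decode('utf-8', 'ignore')
        except error.URLError as e:  # HTTPError is a subclass of URLError
            print('Request failed:', e.reason)
            return ''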

Save the data

# Save the data
    def write_page(self, filename, html):
        # Save the data locally
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
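Text mode with encoding='utf-8' matches get_page, which already decoded the response to a string. If you preferred to skip decoding and save the raw bytes instead (an alternative sketch, not what the post does), binary mode works:

    # Alternative: write raw response bytes, skipping the decode/encode round trip
    # (get_page would then return res.read() without calling .decode())
    def write_page_bytes(self, filename, content):
        with open(filename, 'wb') as f:  # binary mode takes no encoding argument
            f.write(content)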

The main function

# Main function
    def main(self):
        name = input('Enter the forum name: >>>> ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        for page in range(start, end + 1):
            # Build the URL address 'http://tieba.baidu.com/f?kw={}&pn={}'
            # URL-encode the Chinese characters in the forum name
            kw = parse.quote(name)
            # Compute the offset for the current page
            pn = (page - 1) * 50
            # Splice together the URL address
            url = self.url.format(kw, pn)
            # Get the response
            html = self.get_page(url)
            filename = '{}-page{}.html'.format(name, page)
            self.write_page(filename, html)
            # Progress message
            print('Page {} crawled successfully'.format(page))
            # Control the crawl speed
            time.sleep(random.randint(1, 3))
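The pn parameter is the post offset, which advances by 50 per page, so page 1 starts at offset 0. A quick check of the formula:

for page in (1, 2, 3):
    print('page', page, '-> pn =', (page - 1) * 50)
# page 1 -> pn = 0
# page 2 -> pn = 50
# page 3 -> pn = 100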

 

Finally, the complete code is shown below:

# Import modules
from urllib import request, parse
import time
import random

class BaiduSpider(object):
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        self.headers = {'User-Agent': 'Win7:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

    # Get the page
    def get_page(self, url):
        # Define the request object
        req = request.Request(url=url, headers=self.headers)
        # Initiate the request
        res = request.urlopen(req)
        # Get the response object
        html = res.read().decode('utf-8', 'ignore')
        return html

    # Parse the data (placeholder, unused here)
    def parse_page(self):
        pass

    # Save the data
    def write_page(self, filename, html):
        # Save the data locally
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    # Main function
    def main(self):
        name = input('Enter the forum name: >>>> ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        for page in range(start, end + 1):
            # Build the URL address 'http://tieba.baidu.com/f?kw={}&pn={}'
            # URL-encode the Chinese characters in the forum name
            kw = parse.quote(name)
            # Compute the offset for the current page
            pn = (page - 1) * 50
            # Splice together the URL address
            url = self.url.format(kw, pn)
            # Get the response
            html = self.get_page(url)
            filename = '{}-page{}.html'.format(name, page)
            self.write_page(filename, html)
            # Progress message
            print('Page {} crawled successfully'.format(page))
            # Control the crawl speed
            time.sleep(random.randint(1, 3))

if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()

And with that, a very, very simple crawler is complete. Let's look at the run results:

[screenshots of the run results]

The saved HTML file, when opened, looks much the same as the page opened normally in a browser.

 

Origin: www.cnblogs.com/gongdada/p/11620613.html