First, import the libraries we need: the `time` and `random` modules, plus the `request` and `parse` packages from `urllib`.
Then we define a class named `BaiduSpider` to handle the crawling.
Attributes — `url`: the URL template to crawl; `headers`: the HTTP request headers.
class BaiduSpider(object):
    """Crawl Baidu Tieba result pages for a given forum.

    Attributes:
        url: URL template; ``kw`` is the quoted forum name, ``pn`` the
            zero-based post offset of the page.
        headers: HTTP request headers sent with every request.
    """

    def __init__(self):
        # The two {} placeholders are filled with the percent-encoded
        # forum name and the page offset.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        # Spoof a desktop Chrome browser so the site serves normal HTML.
        self.headers = {'User-Agent': 'Win7:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
Next we define three methods; none of them involve cleaning the data yet.
Get the page
# Get the page
def get_page(self, url):
    """Fetch *url* and return the response body as text.

    Args:
        url: fully formatted page URL to request.

    Returns:
        The HTML decoded as UTF-8; undecodable byte sequences are ignored.
    """
    # Build the request object carrying our spoofed browser headers.
    req = request.Request(url=url, headers=self.headers)
    # Send the request.
    res = request.urlopen(req)
    # Read the response and decode it; 'ignore' drops invalid bytes
    # instead of raising UnicodeDecodeError.
    html = res.read().decode('utf-8', 'ignore')
    return html
save data
# Save data
def write_page(self, filename, html):
    """Save the crawled page to a local file.

    Args:
        filename: path of the file to write.
        html: page content to store.
    """
    # 'with' guarantees the file handle is closed even if the write fails.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
The main function
# Main function
def main(self):
    """Prompt for a forum name and a page range, then crawl and save each page.

    Reads the forum name and the start/end page numbers from stdin, fetches
    every page in the range with :meth:`get_page`, and stores each one
    locally with :meth:`write_page`.
    """
    name = input('Enter the forum name: >>>> ')
    start = int(input('Enter the start page: '))
    end = int(input('Enter the end page: '))
    for page in range(start, end + 1):
        # URL template: 'http://tieba.baidu.com/f?kw={}&pn={}'
        # Percent-encode the (possibly non-ASCII) forum name for the URL.
        kw = parse.quote(name)
        # Tieba shows 50 posts per page, so pn is the zero-based offset.
        # (Fixed: the snippet had (1 - page) * 50, which is inverted.)
        pn = (page - 1) * 50
        # Splice the full URL for this page.
        url = self.url.format(kw, pn)
        # Fetch the page.
        html = self.get_page(url)
        filename = '{}-page{}.html'.format(name, page)
        self.write_page(filename, html)
        # Progress message.
        print('Page {} crawled successfully'.format(page))
        # Random pause to throttle the crawl and avoid being blocked.
        time.sleep(random.randint(1, 3))
Finally, the complete code is shown below:
# Import modules
from urllib import request, parse
import random
import time


class BaiduSpider(object):
    """Crawl Baidu Tieba result pages for a forum and save them locally."""

    def __init__(self):
        # kw is filled with the quoted forum name, pn with the post offset.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        # Spoof a desktop Chrome browser so the site serves normal HTML.
        self.headers = {'User-Agent': 'Win7:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

    # Get the page
    def get_page(self, url):
        """Fetch *url* and return the HTML decoded as UTF-8."""
        # Define the request object with our headers.
        req = request.Request(url=url, headers=self.headers)
        # Send the request.
        res = request.urlopen(req)
        # 'ignore' drops invalid byte sequences instead of raising.
        html = res.read().decode('utf-8', 'ignore')
        return html

    # Parse the data
    def parse_page(self):
        """Placeholder for data cleaning/parsing (not implemented yet)."""
        pass

    # Save data
    def write_page(self, filename, html):
        """Save *html* to the local file *filename*."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    # Main function
    def main(self):
        """Prompt for a forum name and page range, crawl and save each page."""
        name = input('Enter the forum name: >>>> ')
        start = int(input('Enter the start page: '))
        end = int(input('Enter the end page: '))
        for page in range(start, end + 1):
            # Percent-encode the forum name for use in the URL.
            kw = parse.quote(name)
            # 50 posts per page; pn is the zero-based offset.
            pn = (page - 1) * 50
            # Splice the full URL for this page.
            url = self.url.format(kw, pn)
            # Fetch and store the page.
            html = self.get_page(url)
            filename = '{}-page{}.html'.format(name, page)
            self.write_page(filename, html)
            # Progress message.
            print('Page {} crawled successfully'.format(page))
            # Random pause to throttle the crawl.
            # (Fixed: the listing had randint(l, 3) — letter l, a NameError.)
            time.sleep(random.randint(1, 3))


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
A very, very simple crawler is now complete — let's look at a screenshot of it running:
Opening the saved HTML file shows a page not much different from the one we see when browsing normally.