Now we use XPath to build a simple crawler: we try to crawl all the posts in a given Tieba forum and download the pictures posted on each floor (reply) of every post to the local disk.
# -*- coding: utf-8 -*-
# tieba_xpath.py

"""
Effect: this example uses XPath to build a simple crawler. It walks the
listing pages of a Tieba forum, follows the thread links found on each page
and downloads the images posted in those threads.

Written for Python 2 (it relies on ``urllib2`` and ``raw_input``).
"""

import os
import urllib
import urllib2

from lxml import etree


class Spider:
    """Interactive crawler that saves the images of a Tieba forum locally."""

    def __init__(self):
        """Ask the user for the forum name and page range, then set defaults.

        Raises:
            ValueError: if the start or end page entered is not an integer.
        """
        self.tiebaName = raw_input(" Please enter the need to access posted it: ")
        self.beginPage = int(raw_input(" Please enter start: "))
        self.endPage = int(raw_input(" Please enter an end page: "))

        self.url = "http://tieba.baidu.com/f"
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}

        # Image counter; it is also used as the file name of each saved image.
        self.userName = 1

    def tiebaSpider(self):
        """Visit every listing page from ``beginPage`` to ``endPage`` inclusive."""
        for page in range(self.beginPage, self.endPage + 1):
            # The `pn` query parameter advances by 50 for each listing page
            # (presumably the number of threads shown per page).
            pn = (page - 1) * 50
            query = urllib.urlencode({'pn': pn, 'kw': self.tiebaName})
            # Example: http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&pn=50
            page_url = self.url + "?" + query

            # loadPage extracts the thread links of this page and processes
            # each of them; it returns nothing.
            self.loadPage(page_url)

    def loadPage(self, url):
        """Fetch one listing page and process every thread linked from it.

        Args:
            url: full URL of the listing page.
        """
        request = urllib2.Request(url, headers=self.ua_header)
        html = urllib2.urlopen(request).read()

        # Parse the HTML document into an element tree that supports XPath.
        selector = etree.HTML(html)

        # Grab the trailing part of each thread URL on the current page, e.g.
        # the "/p/4884069807" part of http://tieba.baidu.com/p/4884069807.
        links = selector.xpath(
            '//div[@class="threadlist_lz clearfix"]/div/a[@rel="noreferrer"]/@href'
        )

        # `links` is a list of string results; turn each one into a full
        # thread address and hand it to the image handler.
        for link in links:
            self.loadImage("http://tieba.baidu.com" + link)

    def loadImage(self, link):
        """Fetch one thread page and download every image found in it.

        Args:
            link: full URL of the thread page.
        """
        request = urllib2.Request(link, headers=self.ua_header)
        html = urllib2.urlopen(request).read()
        selector = etree.HTML(html)

        # Collect the src path of every image marked with class "BDE_Image".
        imageLinks = selector.xpath('//img[@class="BDE_Image"]/@src')

        # Download and save the images one by one.
        for imageLink in imageLinks:
            self.writeImages(imageLink)

    def writeImages(self, imageLink):
        """Download one image and store it as ``./images/<userName>.png``.

        Args:
            imageLink: URL of the image to download.
        """
        print(imageLink)
        print("being stored files %d ..." % self.userName)

        # Download first, so that a failed request does not leave an empty
        # file behind.
        image_data = urllib2.urlopen(imageLink).read()

        # Create the target directory on first use; open() does not do it.
        image_dir = './images/'
        if not os.path.isdir(image_dir):
            os.makedirs(image_dir)

        # Binary mode, and `with` closes the file even if the write fails.
        with open(image_dir + str(self.userName) + '.png', 'wb') as image_file:
            image_file.write(image_data)

        # Increment the counter so that the next image gets a new file name.
        self.userName += 1


# Script entry point.
if __name__ == '__main__':
    # First create a crawler object...
    mySpider = Spider()
    # ...then call its entry method to start working.
    mySpider.tiebaSpider()