Python Web Crawler (XI) - Case Study: A Crawler Using XPath

Now let's use XPath to build a simple crawler: we will crawl every post in a given Baidu Tieba forum and download the images published on each floor (reply) of those posts to the local disk.

# -*- coding: UTF-8 -*-
# tieba_xpath.py

"""
    Purpose: this example uses XPath to build a simple crawler that tries to
    crawl all the posts of a Tieba forum.
"""

import os
import urllib
import urllib2

from lxml import etree

class Spider:
    """Crawl a Baidu Tieba forum and save the images found in its posts.

    The forum name and the page range are read interactively when the object
    is created.  ``tiebaSpider`` walks the list pages, ``loadPage`` extracts
    the post links of one list page, ``loadImage`` extracts the image URLs of
    one post and ``writeImages`` downloads a single image to ``./images/``.

    This is Python 2 code (``raw_input``, ``urllib2``, ``urllib.urlencode``).
    """

    def __init__(self):
        """Prompt for the forum name and page range and set the crawl defaults.

        Raises:
            ValueError: if a page number entered by the user is not an integer.
        """
        self.tiebaName = raw_input("Please enter the name of the Tieba forum to crawl: ")
        self.beginPage = int(raw_input("Please enter the start page: "))
        self.endPage = int(raw_input("Please enter the end page: "))

        self.url = "http://tieba.baidu.com/f"
        self.ua_header = {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}

        # Image sequence number; used as the file name of the next saved image.
        self.userName = 1

    def tiebaSpider(self):
        """Visit every list page from ``beginPage`` to ``endPage`` (inclusive)."""
        for page in range(self.beginPage, self.endPage + 1):
            # The "pn" query parameter advances by 50 per page:
            # page 1 -> 0, page 2 -> 50, ...
            pn = (page - 1) * 50
            word = {'pn': pn, 'kw': self.tiebaName}

            # Convert the parameters into a URL-encoded query string.
            word = urllib.urlencode(word)
            myurl = self.url + "?" + word
            # Example: http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&pn=50
            # Load the list page and process every post linked from it.
            self.loadPage(myurl)

    # Get the page content
    def loadPage(self, url):
        """Fetch one forum list page and process each post it links to.

        Args:
            url: full URL of the list page, including the query string.
        """
        req = urllib2.Request(url, headers=self.ua_header)
        html = urllib2.urlopen(req).read()

        # Parse the HTML document into an HTML DOM.
        selector = etree.HTML(html)

        # Grab the second half of the URL of every post on the current page,
        # i.e. the post number: the "/p/4884069807" part of
        # http://tieba.baidu.com/p/4884069807
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a[@rel="noreferrer"]/@href')

        # Each entry is a relative link; build the absolute post address and
        # hand it to the image-processing method.
        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImage(link)

    # Get pictures
    def loadImage(self, link):
        """Fetch one post page and download every image embedded in it.

        Args:
            link: absolute URL of the post.
        """
        req = urllib2.Request(link, headers=self.ua_header)
        html = urllib2.urlopen(req).read()

        selector = etree.HTML(html)

        # Collect the src path of every image in this post.
        imageLinks = selector.xpath('//img[@class="BDE_Image"]/@src')

        # Take the image paths one by one, download and save them.
        for imageLink in imageLinks:
            self.writeImages(imageLink)

    # Save the page content
    def writeImages(self, imageLink):
        """Download one image and store it as ``./images/<userName>.png``.

        Args:
            imageLink: URL of the image to download.

        The ``userName`` counter is incremented only after the file has been
        written successfully.
        """
        print(imageLink)
        print("Storing file %d ..." % self.userName)

        # Make sure the target directory exists; open() would fail otherwise.
        imageDir = './images/'
        if not os.path.isdir(imageDir):
            os.makedirs(imageDir)

        # Fetch the image content first so that a failed download does not
        # leave an empty file behind.
        images = urllib2.urlopen(imageLink).read()

        # Write the binary content; the with-block closes the file even if
        # the write raises.
        with open(imageDir + str(self.userName) + '.png', 'wb') as imageFile:
            imageFile.write(images)

        # Increment the counter by 1.
        self.userName += 1


# Emulate a main function: run the crawler only when executed as a script.
if __name__ == '__main__':
    # First create the crawler object.
    mySpider = Spider()
    # Call the crawler object's method to start working.
    mySpider.tiebaSpider()

 

 

 





Guess you like

Origin www.cnblogs.com/moying-wq/p/11569992.html