Using XPath crawling techniques to scrape the jokes site (duanziwang.com)

from lxml Import etree
 Import Time
 Import JSON
 Import the urllib.request 
item_list = []   # Create a list storing the acquired information 


# configuration request object 
DEF handler_request (URL, Page): 
    headers = {
         " User-Agent " : " the Mozilla / 5.0 ( NT 10.0 the Windows; the WOW64) the Apple \ 
                              the WebKit / 537.36 (KHTML, like the Gecko) the Chrome / 75.0.3770.100 Safari / 537.36 " 
    } 
    GET_URL = URL + STR (Page) 
    RequestUrllib.request.Request = (URL = GET_URL, headers = headers)
     return Request 


# parsed html files acquired 
DEF parse_content (Content):
     # generating object 
    Tree = etree.HTML (Content) 
    article_list = tree.xpath ( ' // main [@ class = "COL. 8-MD-main-Content"] / article ' )
     # traversal list article 
    for article in article_list:
         # Get title 
        title = article.xpath ( ' .//div[@class="post-head "] / h1 of / A / text () ' ) [0]
         # acquires the content 
        text = article.xpath (' .// div [@ class = "POST-Content"] / P / text () ' ) 
        text = ' \ n- ' .join (text)   # contents spliced 
        Item = {
             ' Title ' : title,
             ' content ' : text, 
        } 
        item_list.append (Item) 


DEF main (): 
    START_PAGE = int (iNPUT ( " Please enter the start page: " )) 
    end_page = int (iNPUT ( " query page end: " )) 
    URL = "http://duanziwang.com/page/"
    for page in range(start_page, end_page+1):
        request = handler_request(url, page)
        try:
            content = urllib.request.urlopen(request).read().decode()
            parse_content(content)
        except:
            print("第%d页面爬取失败" % page)
    string = json.dumps(item_list, ensure_ascii=False)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        f.write(string)


if __name__ == '__main__':
    main()

 

Related posts you may like

Origin www.cnblogs.com/nxrs/p/11365422.html