"""Scrape joke posts from duanziwang.com page-by-page and dump them to a JSON file.

Flow: prompt for a start/end page range, fetch each listing page with a
browser-like User-Agent, parse every <article> for its title and body text,
and finally write all collected items to 'duanzi.txt' as one JSON array.
"""
import json
import urllib.request

from lxml import etree

# Accumulates one {'title': ..., 'content': ...} dict per scraped article.
item_list = []


def handler_request(url, page):
    """Build a urllib Request for listing page *page* under base *url*.

    Args:
        url: Base listing URL ending with a path the page number appends to,
             e.g. "http://duanziwang.com/page/".
        page: Page number (int) appended to *url*.

    Returns:
        urllib.request.Request with a desktop-browser User-Agent, since the
        site may reject the default Python-urllib UA.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        )
    }
    get_url = url + str(page)
    request = urllib.request.Request(url=get_url, headers=headers)
    return request


def parse_content(content):
    """Parse one listing page's HTML and append its articles to item_list.

    Args:
        content: Decoded HTML text of a listing page.

    Side effects:
        Appends one dict per <article> to the module-level item_list.
    """
    # Build an element tree from the raw HTML.
    tree = etree.HTML(content)
    # NOTE(review): class names below were reconstructed from a garbled
    # source ("COL. 8-MD-main-Content" / "post-head" / "POST-Content");
    # verify against the live page markup.
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    for article in article_list:
        # First matching <a> text under the post header is the title.
        title = article.xpath('.//div[@class="post-head"]/h1/a/text()')[0]
        # Paragraph texts of the post body, joined into one string.
        text = article.xpath('.//div[@class="post-content"]/p/text()')
        text = '\n'.join(text)
        item = {
            'title': title,
            'content': text,
        }
        item_list.append(item)


def main():
    """Prompt for a page range, scrape each page, and write results to disk."""
    start_page = int(input("Please enter the start page: "))
    end_page = int(input("query page end: "))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            content = urllib.request.urlopen(request).read().decode()
            parse_content(content)
        except Exception:
            # Best-effort scrape: report the failed page and keep going.
            # (Narrowed from a bare `except:` so Ctrl-C still interrupts.)
            print("第%d页面爬取失败" % page)
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    string = json.dumps(item_list, ensure_ascii=False)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        f.write(string)


if __name__ == '__main__':
    main()