Crawl all the popular posts on Qiushibaike (糗事百科): author name, title, content link, funny count, and number of comments.
# coding=utf-8
import json

import requests
from lxml import etree


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
        }

    def get_url_list(self):
        # Build the URLs for the 13 hot-list pages from the template.
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        return url_list

    def parse_url(self, url):
        print("Crawling:", url)
        response = requests.get(url, headers=self.header)
        return response.content.decode()

    def get_content_list(self, html_str):
        html = etree.HTML(html_str)
        # 1. Group: one <li> node per recommended post.
        div_list = html.xpath("//div[@class='recommend-article']//li")
        content_list = []
        for div in div_list:
            item = {}
            # Author name.
            name_list = div.xpath(".//span[@class='recmd-name']/text()")
            item["作者名"] = name_list[0] if len(name_list) > 0 else None
            # Title.
            title_list = div.xpath(".//a[@class='recmd-content']/text()")
            item["标题"] = title_list[0] if len(title_list) > 0 else None
            # Content link: the href is relative, so prepend the site root.
            link_list = div.xpath(".//a[@class='recmd-content']/@href")
            item["内容链接"] = "https://www.qiushibaike.com" + link_list[0] if len(link_list) > 0 else None
            # Funny count and comment count come from the same span list.
            num_list = div.xpath(".//div[@class='recmd-num']/span/text()")
            item["好笑数"] = num_list[0] if len(num_list) > 0 else None
            item["评论"] = num_list[-2] if len(num_list) >= 2 else None
            content_list.append(item)
        return content_list

    # Save: append one JSON object per line to qiubai.txt.
    def save_content(self, content_list):
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("Saved successfully")

    def run(self):
        # 1. Build the URL list from the URL pattern.
        url_list = self.get_url_list()
        # 2. Send a request for each URL and get the response.
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. Extract the data.
            content_list = self.get_content_list(html_str)
            # 4. Save it.
            self.save_content(content_list)


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
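
Because save_content appends one JSON object per line, qiubai.txt is effectively a JSON Lines file. Below is a minimal sketch of reading it back for later processing; the file name comes from the spider above, while the variable names are only illustrative:

import json

# Load the JSON-lines file written by save_content back into a list of dicts.
items = []
with open("qiubai.txt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            items.append(json.loads(line))

print("Loaded", len(items), "items")

Note that the file is opened in append mode ("a"), so repeated runs keep adding records to the same file; delete qiubai.txt before a fresh crawl if you want to avoid duplicates.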