Qiushibaike scraping practice (爬取糗事百科练习)

import requests
from lxml import etree
import json


class Qiushibaike():
    def __init__(self):
        self.url_temp = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}

    def get_url_list(self):
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        return url_list

    def parse_url(self, url):
        print('Requesting page: %s' % url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        html = etree.HTML(html_str)
        # Split the page into one div per post
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            # Fields: avatar, author name, content, vote count, comment count, detail link
            item = {}
            # item['pic'] = div.xpath('')
            item['name'] = div.xpath('.//a/h2/text()')[0].strip().replace("\n", '') if len(div.xpath('.//a/h2/text()')) > 0 else None
            item['content'] = div.xpath(".//div[@class='content']/span/text()")[0].strip().replace("\n", '') if len(div.xpath(".//div[@class='content']/span/text()")) > 0 else None
            item['haoxiao'] = div.xpath(".//div//span[@class='stats-vote']//i[@class='number']/text()")[0].strip().replace("\n", '') if len(div.xpath(".//div//span[@class='stats-vote']//i[@class='number']/text()")) > 0 else None
            item['pinglun'] = div.xpath(".//div//span[@class='stats-comments']//i[@class='number']/text()")[0].strip().replace("\n", '') if len(div.xpath(".//div//span[@class='stats-comments']//i[@class='number']/text()")) > 0 else None
            item['url'] = div.xpath(".//a[@class='contentHerf']/@href")[0].strip().replace("\n", '') if len(div.xpath(".//a[@class='contentHerf']/@href")) > 0 else None
            item['img'] = div.xpath(".//div[@class='thumb']//img/@src")
            item['img'] = "https:" + item['img'][0] if len(item['img']) > 0 else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        with open("qiubai.json", "a", encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("Scraping succeeded")

    def run(self):
        # 1. Build the list of page URLs from the URL pattern
        url_list = self.get_url_list()
        # 2. Send a request for each URL and get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. Extract the data
            content_list = self.get_content_list(html_str)
            # 4. Save the data
            self.save_content_list(content_list)


if __name__ == '__main__':
    qs = Qiushibaike()
    qs.run()
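Every field in get_content_list repeats the same "first XPath match or None" pattern and evaluates each XPath expression twice. A small helper can factor that out; this is only a sketch of one possible cleanup, and the name first_or_none is mine rather than the original author's:

def first_or_none(node, xpath_expr):
    # Return the first XPath match with whitespace/newlines stripped, or None if nothing matched.
    matches = node.xpath(xpath_expr)
    return matches[0].strip().replace("\n", "") if matches else None

# e.g. inside the loop over posts:
#     item['name'] = first_or_none(div, './/a/h2/text()')
#     item['haoxiao'] = first_or_none(div, ".//div//span[@class='stats-vote']//i[@class='number']/text()")

Because save_content_list opens qiubai.json in append mode and writes one JSON object per line, the output is in JSON Lines form rather than a single JSON array, and repeated runs keep appending to the same file.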
Reposted from www.cnblogs.com/zqrios/p/9114800.html