Scraping Qiushibaike: a practice exercise

This practice script crawls the first 13 list pages of qiushibaike.com with requests and lxml, extracts each post's author, text, vote and comment counts, detail link, and image, and appends one JSON object per post to qiubai.json.

import requests
from lxml import etree
import json


class Qiushibaike:

    def __init__(self):

        # list-page URL template; {} takes the page number
        self.url_temp = 'https://www.qiushibaike.com/8hr/page/{}/'

        # send a desktop Chrome User-Agent so the request looks like a normal browser
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}


    def get_url_list(self):

        # pages 1 through 13
        url_list = [self.url_temp.format(i) for i in range(1, 14)]

        return url_list


    def parse_url(self, url):
        print('Requesting page: %s' % url)
        response = requests.get(url, headers=self.headers)

        return response.content.decode()
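
    # Hedged variant (my addition, not in the original post): the request
    # above has no timeout or status check, so a dead connection can hang
    # forever. requests.get() accepts a timeout argument, and
    # raise_for_status() raises on 4xx/5xx responses.
    def parse_url_safe(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.content.decode()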


    def get_content_list(self, html_str):

        html = etree.HTML(html_str)

        # group the page into one div per post
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            # avatar, author name, content, funny votes, comment count, detail link
            item = {}
            # item['pic'] = div.xpath('')  # avatar extraction left unfinished in the original
            name = div.xpath('.//a/h2/text()')
            item['name'] = name[0].strip().replace("\n", '') if name else None
            content = div.xpath(".//div[@class='content']/span/text()")
            item['content'] = content[0].strip().replace("\n", '') if content else None
            haoxiao = div.xpath(".//div//span[@class='stats-vote']//i[@class='number']/text()")
            item['haoxiao'] = haoxiao[0].strip().replace("\n", '') if haoxiao else None
            pinglun = div.xpath(".//div//span[@class='stats-comments']//i[@class='number']/text()")
            item['pinglun'] = pinglun[0].strip().replace("\n", '') if pinglun else None
            # 'contentHerf' (sic) matches the class name in the site's own markup
            url = div.xpath(".//a[@class='contentHerf']/@href")
            item['url'] = url[0].strip().replace("\n", '') if url else None
            img = div.xpath(".//div[@class='thumb']//img/@src")
            item['img'] = "https:" + img[0] if img else None
            content_list.append(item)
        return content_list
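
    # Hedged refactor sketch (my addition, not from the original post): every
    # field above repeats the "first match, cleaned, or None" pattern; a small
    # helper like this would centralize it, e.g.
    # item['name'] = self.first_or_none(div.xpath('.//a/h2/text()'))
    @staticmethod
    def first_or_none(nodes):
        # nodes: list returned by xpath(); entries are strings for text()/@attr
        return nodes[0].strip().replace("\n", "") if nodes else None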


    def save_content_list(self, content_list):
        # append one JSON object per post (JSON Lines); ensure_ascii=False
        # keeps the Chinese text readable in the file
        with open("qiubai.json", "a", encoding='utf-8') as f:
            for content in content_list:

                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("Crawl succeeded")




    def run(self):
        # 1. Build the URL list from the page-number pattern
        url_list = self.get_url_list()

        # 2. Send a request for each URL and get the response
        for url in url_list:

            html_str = self.parse_url(url)

            # 3. Extract the data
            content_list = self.get_content_list(html_str)
            # 4. Save the data
            self.save_content_list(content_list)



if __name__ == '__main__':
    qs = Qiushibaike()
    qs.run()
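
Each line of qiubai.json holds one JSON object (the JSON Lines layout the save loop writes), so the results can be read back line by line. A minimal read-back sketch, assuming the file was produced by the run above:

import json

with open("qiubai.json", encoding="utf-8") as f:
    items = [json.loads(line) for line in f]
print("loaded %d posts" % len(items))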


Reposted from www.cnblogs.com/zqrios/p/9114800.html