Scraping 糗事百科 (Qiushibaike)

1. Scrape multiple URLs by building a list of page addresses

import requests

# headers borrowed from the full spider below; without a User-Agent the site may block the request
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

url = "https://www.qiushibaike.com/hot/page/{}/"

# build the 13 page URLs up front
url_list = [url.format(i) for i in range(1, 14)]

for url in url_list:
    response = requests.get(url, headers=headers)
    print(response.content.decode())

2. Import etree from lxml and use the xpath() method

The pieces used so far (see the sketch after this list):

1. // selects descendant nodes at any depth; with a leading dot (.//) the search is relative to the current node.

2. @ selects an attribute.

3. .split(" ")[-1] (Python, not XPath) splits a string on spaces and keeps the last piece.
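
A minimal sketch of how these pieces combine (the div below mirrors the gender tag handled in the spider's code, e.g. class="articleGender womenIcon"):

from lxml import etree

html = etree.HTML('<div class="articleGender womenIcon">25</div>')

# // searches descendants at any depth; @class pulls out the attribute value
class_value = html.xpath("//div/@class")[0]   # 'articleGender womenIcon'

# .split(" ")[-1] keeps the last space-separated token
print(class_value.split(" ")[-1])             # 'womenIcon'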

3. Text extraction (a combined sketch follows this list)

1. Select elements whose class attribute contains abcd:

//div[contains(@class, 'abcd')]

2. Extract the text of elements whose class attribute equals abcd exactly:

//div[@class='abcd']/text()

xpath() returns a list; to concatenate a result with a str, index the element out of the list first.

3. Extract an attribute value with @

For example: //div/@class returns the div's class attribute value.

4. Extract the text inside an element:

/text()
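
A combined sketch of the rules above (the HTML is a made-up fragment; only the content class matches the real page structure used in the spider below):

from lxml import etree

html = etree.HTML("""
<div class="content"><span>first joke</span></div>
<div class="content extra"><span>second joke</span></div>
""")

# contains() matches any element whose class includes 'content'
print(html.xpath("//div[contains(@class, 'content')]/span/text()"))
# ['first joke', 'second joke']

# @class='content' is an exact string match, so only the first div qualifies
print(html.xpath("//div[@class='content']/span/text()"))
# ['first joke']

# xpath() returns a list: index an element out before concatenating with a str
first = html.xpath("//div[@class='content']/span/text()")[0]
print("content: " + first)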

4. Full source code for scraping 糗事百科



import requests
from lxml import etree
import json

class QiubaiSpider:
    def __init__(self):
        self.url = "https://www.qiushibaike.com/text/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

    def get_url_list(self):
        return [self.url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        html = etree.HTML(html_str)
        # each joke sits in its own div under #content-left
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            # author name is the h2 text; strip the surrounding newlines
            item["id"] = div.xpath(".//h2/text()")
            item["id"] = [i.replace("\n", "") for i in item["id"]]
            item["content"] = div.xpath(".//div[@class='content']/span/text()")
            item["content"] = [i.replace("\n", "") for i in item["content"]]
            # avatar URL is protocol-relative, so prepend https:
            item["author_image"] = div.xpath(".//div/a/img/@src")
            item["author_image"] = "https:" + item["author_image"][0] if len(item["author_image"]) > 0 else None
            item["age"] = div.xpath(".//div[contains(@class, 'articleGender')]/text()")
            item["age"] = item["age"][0] if len(item["age"]) > 0 else None
            # gender is encoded in the class name, e.g. "articleGender womenIcon"
            item["author_gender"] = div.xpath(".//div[contains(@class, 'articleGender')]/@class")
            item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"]) > 0 else None
            item["stats_vote"] = div.xpath(".//div/span/i/text()")
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
            # fixed: the original checked stats_vote here instead of user_commons
            item["user_commons"] = div.xpath(".//span[@class='stats-comments']/a/i/text()")
            item["user_commons"] = item["user_commons"][0] if len(item["user_commons"]) > 0 else None
            content_list.append(item)
        return content_list

    # save the raw HTML for one page
    def save_html(self, html_str, page_num):
        file_path = "糗事百科-第{}页.html".format(page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    # print each item and append it to a JSON-lines file
    def save_content_list(self, content_list):
        # open once in append mode; the original reopened in "w" per item,
        # overwriting everything written before
        with open("糗事百科.text", "a", encoding="utf-8") as f:
            for i in content_list:
                print(i)
                # ensure_ascii=False keeps Chinese text readable in the file
                f.write(json.dumps(i, ensure_ascii=False) + "\n")

    # variant: save every page's raw HTML to disk
    def run1(self):
        url_list = self.get_url_list()
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)

    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            html_str = self.parse_url(url)
            # extract the data
            content_list = self.get_content_list(html_str)
            # save it
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiubaiSpider = QiubaiSpider()
    qiubaiSpider.run()
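
Since save_content_list writes one JSON object per line, a previous run's output can be loaded back like this (a minimal sketch, assuming the 糗事百科.text file already exists):

import json

with open("糗事百科.text", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

print(len(items), "items loaded")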







Reposted from blog.csdn.net/Rand_C/article/details/86589781