Crawling 李毅吧 (Li Yi Ba) with Python Scrapy

Scrapy Notes

Using Scrapy

1. Create a Scrapy project

scrapy startproject mySpider
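
The command scaffolds a project skeleton roughly like this (exact files can vary slightly with the Scrapy version):

mySpider/
    scrapy.cfg            # deploy configuration
    mySpider/             # the project's Python package
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules live here
            __init__.py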

2. Write the first Scrapy spider

# useful for debugging XPath or CSS selectors against a live page
scrapy shell https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7
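
Inside the shell a response object for the fetched page is available, so selectors can be tried interactively before being written into a spider (quote the URL if it contains shell metacharacters such as &):

>>> response.status                                        # HTTP status of the fetch
>>> response.xpath('//ul[@id="thread_list"]//a/@href').extract_first()
>>> response.css('#frs_list_pager a::attr(href)').extract()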

Run the spider

scrapy crawl first_spider  # first_spider is the name attribute defined in the spider class
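
Run this from inside the project directory. To check which spider names Scrapy can discover there:

scrapy list   # should print first_spider, among others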

Crawl the post links and titles from the first page of 李毅吧:

import scrapy


class firstSpider(scrapy.Spider):
    # # Constructor (leftover from a plain-class demo, unused by Scrapy)
    # def __init__(self, name, age):
    #     self.name = name
    #     self.age = age

    name = 'first_spider'

    def start_requests(self):  # generates the initial requests for the crawl
        # URLs to fetch
        urls = [
            "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"
        ]

        yield scrapy.Request(urls[0], callback=self.parse)  # self.parse processes the downloaded response

    # the callback
    def parse(self, response):
        # all <a> elements under the matching rule
        xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        link_list = response.xpath(xpath)

        for link in link_list:
            href = link.xpath('@href').extract_first()
            text = link.xpath('text()').extract_first()
            print('text: %s, href: %s' % (text, href))


if __name__ == '__main__':
    pass
    # instantiate the class directly (plain-Python demo, not how Scrapy runs a spider)
    # a = firstSpider('张三', 18)
    # print('name: %s, age: %d' % (a.name, a.age))
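
Side note: newer Scrapy versions also offer .get()/.getall() as more readable aliases for .extract_first()/.extract(); either spelling works in the loop above:

href = link.xpath('@href').get()       # same as extract_first()
texts = link.xpath('text()').getall()  # same as extract()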

Crawl post links and titles from the following pages of 李毅吧 as well, by following the next-page link:

import scrapy

class firstSpider(scrapy.Spider):
    # # Constructor (leftover from a plain-class demo, unused by Scrapy)
    # def __init__(self, name, age):
    #     self.name = name
    #     self.age = age

    name = 'second_spider'

    def start_requests(self):  # generates the initial requests for the crawl
        # URLs to fetch
        urls = [
            "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"
        ]

        yield scrapy.Request(urls[0], callback=self.parse)  # self.parse processes the downloaded response

    # the callback
    def parse(self, response):
        # all <a> elements under the matching rule
        xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        link_list = response.xpath(xpath)

        filename = '李毅吧贴子内容.txt'
        for link in link_list:
            href = link.xpath('@href').extract_first()
            text = link.xpath('text()').extract_first()
            line = 'text: %s, href: %s' % (text, href)
            print(line)

            # 'a' appends each line to the end of the file
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(line)
                f.write('\n')

        # link to the next page, located with a CSS selector
        next_page = response.css('#frs_list_pager a:nth-last-child(2)::attr(href)').extract_first()
        # follow it only if it exists
        if next_page is not None:
            next_page = 'https:' + next_page  # the href is protocol-relative (starts with //)
            yield scrapy.Request(next_page, callback=self.parse)


if __name__ == '__main__':
    pass
    # instantiate the class directly (plain-Python demo, not how Scrapy runs a spider)
    # a = firstSpider('张三', 18)
    # print('name: %s, age: %d' % (a.name, a.age))
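
Two idiomatic refinements, sketched below for the same spider: response.urljoin() resolves the protocol-relative href instead of hand-prepending 'https:', and yielding dicts from parse() lets Scrapy's feed exports do the writing (e.g. scrapy crawl second_spider -o posts.jl) instead of opening a file per link:

def parse(self, response):
    xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
    for link in response.xpath(xpath):
        # yield items; feed exports or an item pipeline persist them
        yield {
            'text': link.xpath('text()').extract_first(),
            'href': link.xpath('@href').extract_first(),
        }

    next_page = response.css('#frs_list_pager a:nth-last-child(2)::attr(href)').extract_first()
    if next_page is not None:
        # urljoin resolves relative and protocol-relative URLs against the current page
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)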

The shorthand form of urls: start_urls

import scrapy


class firstSpider(scrapy.Spider):
    # # Constructor (leftover from a plain-class demo, unused by Scrapy)
    # def __init__(self, name, age):
    #     self.name = name
    #     self.age = age

    name = 'three_spider'

    # shorthand form: Scrapy generates the initial requests from this list
    start_urls = [
        "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7",
        "https://www.baidu.com"
    ]

    # def start_requests(self):  # no longer needed; the inherited default covers it
    #     urls = [
    #         "https://tieba.baidu.com/f?kw=%E6%9D%8E%E6%AF%85%E5%90%A7"
    #     ]
    #
    #     yield scrapy.Request(urls[0], callback=self.parse)

    # the callback
    def parse(self, response):
        # all <a> elements under the matching rule
        xpath = '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]//div[@class="threadlist_lz clearfix"]/div/a'
        link_list = response.xpath(xpath)

        for link in link_list:
            href = link.xpath('@href').extract_first()
            text = link.xpath('text()').extract_first()
            print('text: %s, href: %s' % (text, href))


if __name__ == '__main__':
    pass
    # instantiate the class directly (plain-Python demo, not how Scrapy runs a spider)
    # a = firstSpider('张三', 18)
    # print('name: %s, age: %d' % (a.name, a.age))
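
With start_urls set, the start_requests() inherited from scrapy.Spider builds the initial requests automatically; it behaves roughly like this sketch (parse is the default callback when none is given):

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True)  # response is handed to self.parse

Note that every response, including the one from https://www.baidu.com, goes through the same parse(); the Tieba-specific XPath simply matches nothing there.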


Reposted from www.cnblogs.com/Transkai/p/10534648.html