Crawlers: Recursively Scraping Page Data with Scrapy

# -*- coding: utf-8 -*-
import re

import scrapy

from zhipin.items import ZhipinItem


class BossZhipinSpider(scrapy.Spider):
    name = 'boss_zhipin'
    # NOTE: this holds a full URL instead of a bare domain, which is what
    # triggers the offsite filtering discussed below.
    allowed_domains = ['https://www.zhipin.com']
    url = 'https://www.zhipin.com/c101020100/h_101020100/?query=python&page=%s'
    offset = 1
    start_urls = [url % offset]

    # Page URLs follow this pattern:
    # https://www.zhipin.com/c101020100/h_101020100/?query=python&page=1
    # https://www.zhipin.com/c101020100/h_101020100/?query=python&page=10

    def parse(self, response):
        for response_part in response.css('#main > div > div.job-list > ul').extract():
            company_size = []
            company_info = re.findall(r'<em class="vline"></em>(.*?)</p>', response_part)
            # The company-size values sit at the even positions of the list
            # (odd 0-based indices), so keep only those entries.
            for idx, val in enumerate(company_info):
                if idx % 2 != 0:
                    if '<em class="vline"></em>' in val:
                        # Strip the leftover </em> fragment before appending.
                        company_size.append(val.rsplit('</em>')[-1])
                    else:
                        # No </em> fragment; append the value as-is.
                        company_size.append(val)
            result = zip(re.findall(r'title">(.*?)</div>', response_part),
                         re.findall(r'<span class="red">(.*?)</span>', response_part),
                         re.findall(r'ka="search_list_company_\d+_custompage" target="_blank">(.*?)</a>', response_part),
                         company_size,
                         re.findall(r'发布于(.*?)</p>', response_part))
            for job_item in result:
                # Unpack each tuple into a fresh item and yield it.
                item = ZhipinItem()
                item['job_title'] = job_item[0]
                item['job_salary'] = job_item[1]
                item['job_company'] = job_item[2]
                item['company_size'] = job_item[3]
                item['publish_date'] = job_item[4]
                yield item

        # Pagination: this block must stay at method level, outside the
        # for loop above, so it runs once per page (see the note below).
        if self.offset < 10:
            self.offset += 1
            yield scrapy.Request(self.url % self.offset, callback=self.parse, dont_filter=True)
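
The spider imports ZhipinItem from zhipin/items.py, which the post does not show. A minimal sketch of what it could look like, assuming exactly the five fields assigned in parse above (a hypothetical reconstruction, not taken from the linked repository):

# zhipin/items.py -- hypothetical sketch mirroring the fields used above
import scrapy


class ZhipinItem(scrapy.Item):
    job_title = scrapy.Field()
    job_salary = scrapy.Field()
    job_company = scrapy.Field()
    company_size = scrapy.Field()
    publish_date = scrapy.Field()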

During the recursive crawl I ran into a warning:

2018-07-24 16:38:09 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.zhipin.com':

Cause of the problem:

allowed_domains is supposed to contain bare domain names (e.g. zhipin.com), but here it holds a full URL including the https:// scheme. The offsite middleware therefore cannot match the host of the requested URL (www.zhipin.com) against it, so every follow-up request is filtered out and never fetched.
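
The cleaner fix (a sketch of the alternative; the post's final code keeps the workaround below instead) is to put only the bare domain in allowed_domains, since subdomains such as www.zhipin.com then match automatically:

# Bare domain, no scheme: the offsite middleware matches www.zhipin.com
# as a subdomain of zhipin.com and lets follow-up requests through.
allowed_domains = ['zhipin.com']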

Workaround: scrapy.Request accepts a dont_filter parameter; setting it to True tells Scrapy not to filter that request (the offsite middleware, like the duplicate filter, honors this flag):

yield scrapy.Request(self.url % self.offset, callback=self.parse, dont_filter=True)

Recursion won't stop? Check the stop condition: the if self.offset < 10 block must be indented at method level, outside the for loop in parse; with that fixed the crawl stops after page 10 as intended (see the sketch below).
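
To make the indentation point concrete, here is a minimal self-contained sketch (the spider name and the yielded dict are illustrative, not from the original project):

import scrapy


class PaginationDemoSpider(scrapy.Spider):
    name = 'pagination_demo'
    url = 'https://www.zhipin.com/c101020100/h_101020100/?query=python&page=%s'
    offset = 1
    start_urls = [url % offset]

    def parse(self, response):
        for row in response.css('ul > li'):
            yield {'text': row.get()}
        # Correct placement: aligned with the for statement, so it runs
        # exactly once per page and the crawl stops after page 10.
        # Indented one level deeper (inside the loop), it would run once
        # per <li> and request the next page many times over.
        if self.offset < 10:
            self.offset += 1
            yield scrapy.Request(self.url % self.offset, callback=self.parse,
                                 dont_filter=True)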

GitHub repository:
https://github.com/QuincyC379/zhipin


Reposted from blog.csdn.net/qq_33733970/article/details/81187737