Scrapy 出现 404 时的处理方法

第一种解决策略:

from scrapy.http import Request
from scrapy.spider import BaseSpider


class MySpider(BaseSpider):
    """Spider that intercepts 404/500 responses and retries a fallback URL.

    Listing these codes in ``handle_httpstatus_list`` makes Scrapy deliver
    such responses to ``parse()`` instead of discarding them in the
    HttpError middleware.
    """

    handle_httpstatus_list = [404, 500]
    name = "my_crawler"

    start_urls = ["http://github.com/illegal_username"]

    def parse(self, response):
        """On an error status, schedule an alternative request."""
        if response.status in self.handle_httpstatus_list:
            return Request(url="https://github.com/kennethreitz/",
                           callback=self.after_404)

    def after_404(self, response):
        """Handle the fallback response fetched after a 404/500.

        BUG FIX: in the original paste the body had been dedented out of
        the method (a bare Python-2 ``print response.url`` at column 0),
        which is a SyntaxError; restored as a proper statement.
        """
        print(response.url)

转载自 Stack Overflow

http://stackoverflow.com/questions/16909106/scrapyin-a-request-fails-eg-404-500-how-to-ask-for-another-alternative-reque

第二种解决策略:

from scrapy.spider import BaseSpider
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

class MySpider(BaseSpider):
    """Spider that records every URL answering 404 into the crawler stats.

    404 responses reach ``parse()`` because of ``handle_httpstatus_list``;
    when the spider closes, the collected URLs are written to the stats.
    """

    handle_httpstatus_list = [404]
    name = "myspider"
    allowed_domains = ["example.com"]
    start_urls = [
        'http://www.example.com/thisurlexists.html',
        'http://www.example.com/thisurldoesnotexist.html',
        'http://www.example.com/neitherdoesthisone.html'
    ]

    def __init__(self, category=None):
        # BUG FIX: the original skipped the base-class initializer,
        # which Scrapy spiders rely on for their standard setup.
        super(MySpider, self).__init__()
        self.failed_urls = []

    def parse(self, response):
        """Count and remember any URL that came back as 404."""
        if response.status == 404:
            self.crawler.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)

    def handle_spider_closed(spider, reason):
        """spider_closed handler: persist the failed URLs into the stats.

        Connected below as a plain function, so the first argument is
        the spider instance carried by the signal — there is no ``self``.
        """
        # BUG FIX: the original body referenced ``self``, which does not
        # exist in this (spider, reason) signature and raised NameError
        # when the signal fired.
        spider.crawler.stats.set_value(
            'failed_urls', ','.join(spider.failed_urls))

    def process_exception(self, response, exception, spider):
        """Count downloader exceptions, overall and per exception class."""
        ex_class = "%s.%s" % (exception.__class__.__module__,
                              exception.__class__.__name__)
        self.crawler.stats.inc_value('downloader/exception_count', spider=spider)
        self.crawler.stats.inc_value(
            'downloader/exception_type_count/%s' % ex_class, spider=spider)

    # Wired up at class-creation time, while handle_spider_closed is
    # still a plain function object in the class namespace.
    dispatcher.connect(handle_spider_closed, signals.spider_closed)

原帖地址:http://stackoverflow.com/questions/13724730/how-to-get-the-scrapy-failure-urls

猜你喜欢

转载自blog.csdn.net/u012122743/article/details/52596129
今日推荐