scrapy 框架中遇到的 bug(持续更新)

  1. 域名错误
    
    class WendaSpider(scrapy.Spider):
        """Crawl Q&A threads from wenda.autohome.com.cn.

        If ``allowed_domains`` is set, it must match the target domain exactly
        (here that would be ``'autohome.com.cn'``, not ``'autohome.com'``);
        a wrong value makes the offsite middleware silently drop every
        follow-up request, so either omit it or make sure it is correct.
        """
        name = 'wenda'
        # allowed_domains = ['autohome.com']  # wrong suffix -> follow-up URLs filtered out
        start_urls = ['https://wenda.autohome.com.cn/topic/list-0-0-0-0-0-1']
        # NOTE(review): class-level attributes are shared by the whole spider;
        # a fresh QichezhijiaItem() per parsed page inside the callbacks would
        # be safer than reusing this single instance — confirm intended use.
        item = QichezhijiaItem()
        n = 1

        def parse(self, response):
            """Follow every question link on a list page, then the next list page.

            :param response: list-page response containing ``ul.question-list``.
            :yields: ``scrapy.Request`` objects for detail pages and pagination.
            """
            detail_url = response.xpath('//ul[@class="question-list"]//h4/a/@href').extract()
            for url in detail_url:
                # Hrefs are site-relative; urljoin makes them absolute.
                yield scrapy.Request(response.urljoin(url), callback=self.parse_info)

            # BUG FIX: this pagination lookup used to sit inside the loop above,
            # scheduling the same next-page request once per detail link (and
            # never when the list was empty). Hoisted so it runs once per page.
            next_url = response.xpath('//div[@class="athm-page__info"]/a/@href').extract_first()
            if next_url:
                yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

        def parse_info(self, response):
            """Parse a question detail page and follow links to its answer pages.

            :param response: detail-page response for a single question.
            :yields: ``scrapy.Request`` objects for answer pages
                (handled by ``self.parse_answer``, defined elsewhere).
            """
            # NOTE(review): these fields are extracted but never stored or
            # yielded here — presumably meant to populate the item; verify.
            titles = response.xpath('//h1[@class="card-title"]/text()').extract_first()
            requests = response.xpath('//div[@class="card-content "]//p/text()').extract_first()
            level = '--'.join(response.xpath('//ul[@class="card-tag-list"]/li/text()').extract())
            answer_url = response.xpath('//div[@class="text-wrap"]/a[@class="text"]/@href').extract()
            for url in answer_url:
                yield scrapy.Request(response.urljoin(url), callback=self.parse_answer)
    
    

猜你喜欢

转载自blog.csdn.net/qq_42709587/article/details/81877249