利用scrapy爬取某汽车网站经销商店信息

最近在做一个爬虫项目,爬取某汽车网站的经销商店信息,现附上代码如下。

import scrapy
from scrapy.http import Request
from dealer.items import DealerItem


class XcarSpider(scrapy.Spider):
    """Spider that scrapes car-dealer information from dealer.xcar.com.cn.

    Crawl flow:
        brand start pages (``/d<N>/``)
        -> per-city dealer-list pages (two ``type`` variants each)
        -> paginated dealer lists
        -> per-dealer detail pages, yielding one ``DealerItem`` per car model.
    """

    name = "xcar"
    # FIX: was misspelled `allowed_doma`; Scrapy ignores unknown attributes,
    # so the offsite filter never applied and the spider could wander off-site.
    allowed_domains = ["xcar.com.cn"]
    start_urls = [
        'http://dealer.xcar.com.cn/d1/',
        'http://dealer.xcar.com.cn/d24/',
        'http://dealer.xcar.com.cn/d4/',
        'http://dealer.xcar.com.cn/d33/',
        'http://dealer.xcar.com.cn/d19/',
        'http://dealer.xcar.com.cn/d12/',
        'http://dealer.xcar.com.cn/d31/',
        'http://dealer.xcar.com.cn/d30/',
        'http://dealer.xcar.com.cn/d22/',
        'http://dealer.xcar.com.cn/d8/',
        'http://dealer.xcar.com.cn/d21/',
        'http://dealer.xcar.com.cn/d34/',
        'http://dealer.xcar.com.cn/d5/',
        'http://dealer.xcar.com.cn/d20/',
        'http://dealer.xcar.com.cn/d25/',
        'http://dealer.xcar.com.cn/d32/',
        'http://dealer.xcar.com.cn/d7/',
        'http://dealer.xcar.com.cn/d6/',
        'http://dealer.xcar.com.cn/d13/',
        'http://dealer.xcar.com.cn/d9/',
        'http://dealer.xcar.com.cn/d16/',
        'http://dealer.xcar.com.cn/d23/',
        'http://dealer.xcar.com.cn/d11/',
        'http://dealer.xcar.com.cn/d17/',
        'http://dealer.xcar.com.cn/d10/',
        'http://dealer.xcar.com.cn/d2/',
        'http://dealer.xcar.com.cn/d3/',
        'http://dealer.xcar.com.cn/d14/',
        'http://dealer.xcar.com.cn/d15/',
        'http://dealer.xcar.com.cn/d18/',
        'http://dealer.xcar.com.cn/d26/',
    ]

    def parse(self, response):
        """Parse a brand page: extract city links, forwarding the city name
        via ``meta`` to the dealer-list callback."""
        nodes = response.xpath('//ul[@id="select_city"]/li[@class=""]')
        for node in nodes:
            city_name = node.xpath('a/text()').extract_first()
            # The <li> id attribute holds the city's relative URL path.
            city_url = node.xpath('@id').extract_first()
            city_url = 'http://dealer.xcar.com.cn/' + city_url
            # Request both dealer-list variants: type=1 and type=2
            # (presumably 4S stores vs. comprehensive dealers — TODO confirm).
            for i in range(2):
                url = city_url + '?type=%d' % (i + 1)
                yield Request(url, meta={'city_name': city_name},
                              callback=self.parse_dealer)

    def parse_dealer(self, response):
        """Parse a city dealer-list page: yield requests for each dealer
        detail page, then follow pagination."""
        dealer_type = response.xpath('//ul[@id="dealer_list_tab"]/li')

        # Only pages exposing both list tabs are processed; pages with a
        # single tab are skipped entirely (original behavior, kept as-is).
        if len(dealer_type) > 1:
            if response.url.find('type=1') > 0:
                node = response.xpath('//div[@id="dlists_4s_isfee"]')
            else:
                node = response.xpath('//div[@id="dlists_zh"]')

            meta = response.meta
            # The map link's href carries the dealer URL; strip the '#map'
            # fragment via a lookahead.
            dealer_urls = node.xpath(
                './/a[@title="查看地图"]//@href').re(r'.+(?=#map)')
            for dealer_url in dealer_urls:
                dealer_url = response.urljoin(dealer_url)
                yield Request(url=dealer_url, meta=meta, callback=self.parse_content)

            next_page = response.xpath(
                '//a[@class="page_down"]/@href').extract_first()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield Request(url=next_page, meta=meta, callback=self.parse_dealer)

    def parse_content(self, response):
        """Parse a dealer detail page into ``DealerItem``s, one per car model.

        FIX: removed a leftover ``scrapy.shell.inspect_response`` call that
        dropped the crawl into an interactive debug shell on every page.
        """
        item = DealerItem()

        item['city_name'] = response.meta['city_name']
        item['dealer_source'] = '爱卡'
        item['dealer_url'] = response.url
        # NOTE(review): fixed-offset slice assumes the URL shape
        # 'http://dealer.xcar.com.cn/<id>/index.htm' — brittle; verify.
        item['dealer_id'] = response.url[26:-10]
        item['dealer_name'] = response.xpath(
            '//h2[@class="jxs_bt1"]/a/text()').re_first('.*')

        node = response.xpath('//ul[@class="jxs_l1"]')
        # Last two characters of the bolded text hold the dealer type label.
        item['dealer_type'] = node.xpath('./li/b//text()').re_first('..$')
        item['dealer_tel'] = node.xpath(
            './li[re:match(text(),"电话")]//b//text()').re_first(r'[\d-]+')
        item['dealer_addr'] = node.xpath('./li[@class="dz"]/text()[last()]'
                                         ).extract_first()

        # Coordinates are embedded in an inline JavaScript block; the
        # [11:-2] slice trims the 'lon = "' / 'lat = "' prefix and the
        # trailing '";' — NOTE(review): offset-based, verify against markup.
        baidu_lng = response.xpath(
            '//script[@language="JavaScript"]/text()').re_first('lon =.*')
        if baidu_lng is not None:
            item['baidu_lng'] = baidu_lng[11:-2]
        baidu_lat = response.xpath(
            '//script[@language="JavaScript"]/text()').re_first('lat =.*')
        if baidu_lat is not None:
            item['baidu_lat'] = baidu_lat[11:-2]

        # One item is yielded per car model sold by this dealer; model ids
        # look like '_s<digits>' in the hrefs, drop the '_s' prefix.
        model_ids = response.xpath('//ul[@class="jxs_l3"]//a/@href').re(r'_s\d+')
        for model_id in model_ids:
            item['model_id'] = model_id[2:]
            yield item
以上爬取内容仅作学习交流之用,不得用于商业用途。

猜你喜欢

转载自blog.csdn.net/gz_wiilian/article/details/81053941