最近在做一个爬虫项目,爬取经销商门店信息,随附代码如下。
import scrapy
from scrapy.http import Request

from dealer.items import DealerItem


class XcarSpider(scrapy.Spider):
    """Crawl dealer (car-shop) information from dealer.xcar.com.cn.

    Flow: brand landing page (``start_urls``) -> per-city listing pages
    (``parse``) -> per-dealer listing, paginated (``parse_dealer``) ->
    dealer detail page (``parse_content``), which yields one
    ``DealerItem`` per car model the dealer sells.
    """

    name = "xcar"
    # Fixed: the attribute must be spelled `allowed_domains` for Scrapy's
    # offsite middleware to honor it; the original `allowed_doma` was
    # silently ignored, so no domain filtering happened at all.
    allowed_domains = ["xcar.com.cn"]
    # One entry per car brand (the /dNN/ path segment is the brand id).
    start_urls = [
        'http://dealer.xcar.com.cn/d1/',
        'http://dealer.xcar.com.cn/d24/',
        'http://dealer.xcar.com.cn/d4/',
        'http://dealer.xcar.com.cn/d33/',
        'http://dealer.xcar.com.cn/d19/',
        'http://dealer.xcar.com.cn/d12/',
        'http://dealer.xcar.com.cn/d31/',
        'http://dealer.xcar.com.cn/d30/',
        'http://dealer.xcar.com.cn/d22/',
        'http://dealer.xcar.com.cn/d8/',
        'http://dealer.xcar.com.cn/d21/',
        'http://dealer.xcar.com.cn/d34/',
        'http://dealer.xcar.com.cn/d5/',
        'http://dealer.xcar.com.cn/d20/',
        'http://dealer.xcar.com.cn/d25/',
        'http://dealer.xcar.com.cn/d32/',
        'http://dealer.xcar.com.cn/d7/',
        'http://dealer.xcar.com.cn/d6/',
        'http://dealer.xcar.com.cn/d13/',
        'http://dealer.xcar.com.cn/d9/',
        'http://dealer.xcar.com.cn/d16/',
        'http://dealer.xcar.com.cn/d23/',
        'http://dealer.xcar.com.cn/d11/',
        'http://dealer.xcar.com.cn/d17/',
        'http://dealer.xcar.com.cn/d10/',
        'http://dealer.xcar.com.cn/d2/',
        'http://dealer.xcar.com.cn/d3/',
        'http://dealer.xcar.com.cn/d14/',
        'http://dealer.xcar.com.cn/d15/',
        'http://dealer.xcar.com.cn/d18/',
        'http://dealer.xcar.com.cn/d26/',
    ]

    def parse(self, response):
        """Extract city links from the brand page and request each city's
        dealer listing, forwarding the city name via ``meta``.

        The city URL slug is read from the ``<li>``'s ``id`` attribute;
        two requests are made per city, one per dealer type
        (``?type=1`` and ``?type=2``).
        """
        nodes = response.xpath('//ul[@id="select_city"]/li[@class=""]')
        for node in nodes:
            city_name = node.xpath('a/text()').extract_first()
            city_url = node.xpath('@id').extract_first()
            city_url = 'http://dealer.xcar.com.cn/' + city_url
            for i in range(2):
                url = city_url + '?type=%d' % (i + 1)
                yield Request(url, meta={'city_name': city_name},
                              callback=self.parse_dealer)

    def parse_dealer(self, response):
        """Parse one page of a city's dealer list.

        Follows every dealer's detail URL (derived from the map link by
        stripping the ``#map`` fragment) and the "next page" link,
        propagating ``meta`` (carries ``city_name``) unchanged.
        """
        dealer_type = response.xpath('//ul[@id="dealer_list_tab"]/li')
        # Only pages that actually show both dealer-type tabs are parsed;
        # the tab matching the requested ?type= decides which list div
        # holds the dealers.
        if len(dealer_type) > 1:
            if response.url.find('type=1') > 0:
                node = response.xpath('//div[@id="dlists_4s_isfee"]')
            else:
                node = response.xpath('//div[@id="dlists_zh"]')
            meta = response.meta
            dealer_urls = node.xpath(
                './/a[@title="查看地图"]//@href').re('.+(?=#map)')
            for dealer_url in dealer_urls:
                dealer_url = response.urljoin(dealer_url)
                yield Request(url=dealer_url, meta=meta,
                              callback=self.parse_content)
            next_page = response.xpath(
                '//a[@class="page_down"]/@href').extract_first()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield Request(url=next_page, meta=meta,
                              callback=self.parse_dealer)

    def parse_content(self, response):
        """Scrape the dealer detail page into ``DealerItem`` fields and
        yield one item per model id found on the page.

        NOTE: the same ``item`` instance is mutated and re-yielded inside
        the model loop, so all yielded items share every field except
        ``model_id`` — this mirrors the original behavior.
        """
        # Fixed: removed a leftover debugging call
        # (`inspect_response(response, self)`) that opened an interactive
        # shell on every detail page and stalled the crawl.
        item = DealerItem()
        item['city_name'] = response.meta['city_name']
        item['dealer_source'] = '爱卡'
        item['dealer_url'] = response.url
        # Slice the numeric dealer id out of the URL; the offsets assume
        # the canonical detail-URL shape — TODO confirm against live URLs.
        item['dealer_id'] = response.url[26:-10]
        item['dealer_name'] = response.xpath(
            '//h2[@class="jxs_bt1"]/a/text()').re_first('.*')
        node = response.xpath('//ul[@class="jxs_l1"]')
        item['dealer_type'] = node.xpath('./li/b//text()').re_first('..店$')
        # Raw string fixes an invalid `\d` escape; the matched pattern is
        # byte-identical to the original.
        item['dealer_tel'] = node.xpath(
            './li[re:match(text(),"电话")]//b//text()').re_first(r'[\d-]+')
        item['dealer_addr'] = node.xpath(
            './li[@class="dz"]/text()[last()]').extract_first()
        # Coordinates are embedded in an inline <script>; slice off the
        # "lon = '" / "lat = '" prefix and the trailing "';".
        baidu_lng = response.xpath(
            '//script[@language="JavaScript"]/text()').re_first('lon =.*')
        if baidu_lng is not None:
            item['baidu_lng'] = baidu_lng[11:-2]
        baidu_lat = response.xpath(
            '//script[@language="JavaScript"]/text()').re_first('lat =.*')
        if baidu_lat is not None:
            item['baidu_lat'] = baidu_lat[11:-2]
        model_ids = response.xpath(
            '//ul[@class="jxs_l3"]//a/@href').re(r'_s\d+')
        for model_id in model_ids:
            item['model_id'] = model_id[2:]
            yield item

# The content crawled above is for learning purposes only and must not be
# used commercially.