汽车之家车型的简单爬取
spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2 only: force the process-wide default encoding to UTF-8 so implicit
# str/unicode conversions of the scraped Chinese text do not raise.
# Python 3 has no builtin ``reload`` and no ``sys.setdefaultencoding`` (its
# default is already UTF-8), so the hack is skipped there instead of raising
# NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Spider that walks the per-letter car listing pages of autohome.com.cn.

    One request is issued for each letter A-Z
    (``/grade/carhtml/<LETTER>.html``).  Every page is parsed into
    ``carItem`` objects, one per car model found on the page.
    """

    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Left empty on purpose: the real entry points are built in
    # start_requests() so each request can carry its letter in ``meta``.
    start_urls = [
        # 'https://www.autohome.com.cn/grade/carhtml/A.html',
    ]
    # Per-spider settings: route the items of this spider to CarPipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter is passed through ``meta['word']`` and read back in
        :meth:`parse`.  The class-level ``start_urls`` list is not mutated,
        so every request is paired with its own letter and repeated runs in
        the same process do not accumulate duplicate URLs.
        """
        for word in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            yield Request(url, meta={'word': word})

    @staticmethod
    def _split_price(price, price_base):
        """Split a ``"<min>-<max><price_base>"`` label into ``(min, max)``.

        Any label that does not consist of exactly two ``-`` separated parts
        yields the placeholder pair used by the original code.
        """
        parts = price.split('-')
        if len(parts) != 2:
            return '暂无', '暂无'
        return str(parts[0]), str(parts[1].replace(price_base, ''))

    def parse(self, response):
        """Extract every car on a letter page.

        The page is read as a sequence of ``<dl>`` nodes (one per brand).
        Inside a brand, the ``h3-tit`` links (sub-brand titles) are paired
        in document order with the ``rank-list-ul`` lists, and each
        ``<li id=...>`` of a list is one car.

        Nodes are queried relative to their parent selector rather than by
        rebuilding absolute positional XPaths, and missing nodes are skipped
        instead of raising ``IndexError`` and aborting the whole page.

        Yields:
            carItem: one populated item per car model.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        price_base = '万'

        for brand_node in response.xpath('//dl'):
            brand = brand_node.xpath('./dt/div[1]/a/text()').extract_first()
            if brand is None:
                # A <dl> without a brand link is not a brand entry.
                continue
            print('brand:' + brand)
            brand_logo_url = brand_node.xpath(
                './dt//img[1]/@src').extract_first(default='')

            title_links = brand_node.xpath('./dd//div[@class="h3-tit"]/a')
            car_lists = brand_node.xpath('./dd//ul[@class="rank-list-ul"]')
            # The k-th title belongs to the k-th car list.
            for title_link, car_list in zip(title_links, car_lists):
                brand_item = title_link.xpath('./text()').extract_first(default='')
                brand_item_url = title_link.xpath('./@href').extract_first(default='')
                print('brand_item:' + brand_item)
                print('brand_item_url:' + brand_item_url)

                car_nodes = car_list.xpath('./li[@id]')
                print('cars_count:' + str(len(car_nodes)))
                for car_node in car_nodes:
                    name = car_node.xpath('./h4/a/text()').extract_first()
                    if name is None:
                        continue
                    url = car_node.xpath('./h4/a/@href').extract_first(default='')
                    price = car_node.xpath('./div[1]/a/text()').extract_first(default='')
                    min_price, max_price = self._split_price(price, price_base)
                    print('car:' + name + ' max_price:' + str(max_price) +
                          ' min_price:' + str(min_price) +
                          ' price_base:' + price_base)

                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    yield car_item
item
# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """Scrapy item holding the fields collected for a single car model."""

    # Fields describing the car itself.
    name = scrapy.Field()
    url = scrapy.Field()
    max_price = scrapy.Field()
    min_price = scrapy.Field()

    # Fields describing the brand the car was listed under.
    brand = scrapy.Field()
    brand_logo_url = scrapy.Field()

    # Title of the sub-brand group the car was listed under.
    brand_item = scrapy.Field()

    # Letter of the listing page the car was found on.
    first_word = scrapy.Field()
mongo_car
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar():
    """Store scraped car items in the ``car`` database.

    Three collections are used: ``brand``, ``brand_item`` and ``car``.
    ``insert`` creates the brand and brand_item documents on demand and links
    the car document to both through ``brand_id`` / ``brand_item_id``.

    NOTE(review): ``db.set(db_name, set_name)`` comes from the project's
    ``Mongo`` wrapper and is assumed to return a pymongo-style collection
    (``find_one`` / ``insert_one`` / ``update_one``) -- confirm.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Upsert the brand, brand_item and car described by ``item``.

        Args:
            item: mapping with the keys ``name``, ``url``, ``max_price``,
                ``min_price``, ``brand``, ``brand_logo_url``, ``brand_item``
                and ``first_word``.

        Returns:
            bool: True when the car document is present after the call.
        """
        brand_where = {'name': item['brand']}
        # The logo is stored on first insert as well as on update, so a
        # brand that is only ever seen once still gets its logo_url.
        brand_doc = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')

        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name *within* its brand_item.  The
        # previous filter repeated the 'name' key, which silently dropped the
        # brand_item part and made same-named cars of different groups clash.
        car_where = {
            'brand_item_id': brand_item['_id'],
            'name': item['name'],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')
        return bool(car)

    def _find_or_false(self, db, set_name, where):
        """Return the first document matching ``where`` or False if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` as a ``$set`` to the matching brand document.

        Returns the document found by ``brand_where`` after the update, or
        False when nothing matches.
        """
        db.set(self.db_name, self.brand_set_name).update_one(
            brand_where, {'$set': brand})
        return self._find_or_false(db, self.brand_set_name, brand_where)

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand`` or False."""
        return self._find_or_false(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the brand_item document matching ``brand_item`` or False."""
        return self._find_or_false(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a brand_item document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car`` or False."""
        return self._find_or_false(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert_and_fetch(db, self.car_set_name, car)
pipeline
from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that stores every car item through ``MongoCar``."""

    # Shared store for the lifetime of the pipeline; created in open_spider()
    # (or lazily on first use) instead of once per item, so each item no
    # longer opens a new Mongo connection.
    mongo_car = None

    def open_spider(self, spider):
        """Create the Mongo-backed store when the spider starts."""
        if self.mongo_car is None:
            self.mongo_car = MongoCar()

    def process_item(self, item, spider):
        """Persist ``item`` and hand it on to the next pipeline stage.

        Returns:
            The same ``item``.  Scrapy requires process_item to return the
            item (or raise DropItem); returning None would pass None to any
            later pipeline component.
        """
        if self.mongo_car is None:
            # Fallback for callers that use the pipeline without open_spider().
            self.mongo_car = MongoCar()
        self.mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
settings
# Connection parameters read by MongoCar when it builds its Mongo client.
# The host, user and password values below are placeholders.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)