使用 Scrapy 简单爬取汽车之家车型

汽车之家车型的简单爬取
spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# Python 2 only: make UTF-8 the implicit str/unicode codec so the non-ASCII
# literals below can be concatenated with scraped unicode text. Python 3 has
# no builtin reload() and already defaults to UTF-8, so the hack is skipped.
if sys.version_info[0] < 3:
    reload(sys)  # re-exposes sys.setdefaultencoding, which site.py removes
    sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl the A-Z brand index pages of autohome.com.cn.

    One request is issued per initial letter; ``parse`` walks every
    ``<dl>`` (brand) on the page, then every sub-brand heading and its car
    list, and yields one ``carItem`` per car.
    """

    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Left empty on purpose: the real entry points are built in start_requests.
    start_urls = []
    # Per-spider settings
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    # Initial letters that have an index page.
    index_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def start_requests(self):
        """Yield one request per index letter, tagged with that letter.

        The letter travels in ``meta['word']`` so ``parse`` can store it as
        the item's ``first_word``. The URL and its letter are built together
        so each request carries its own letter, and the class-level
        ``start_urls`` list is not mutated.
        """
        for letter in self.index_letters:
            url = 'https://www.autohome.com.cn/grade/carhtml/' + letter + '.html'
            yield Request(url, meta={'word': letter})

    @staticmethod
    def _split_price(price):
        """Split a ``"<min>-<max>万"`` price label into ``(min, max)``.

        Any label that does not consist of exactly two ``-`` separated parts
        yields ``('暂无', '暂无')``.
        """
        parts = price.split('-')
        if len(parts) != 2:
            return '暂无', '暂无'
        return str(parts[0]), str(parts[1].replace('万', ''))

    def parse(self, response):
        """Yield a ``carItem`` for every car listed on one index page.

        Selectors are evaluated relative to the current brand / car-list /
        car element instead of rebuilding absolute ``//dl[N]...`` paths, so a
        lookup can never leak into another brand's subtree. Entries missing
        their brand name, sub-brand name, car name or car URL are skipped
        rather than aborting the whole page with an IndexError.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        price_base = '万'
        for brand_sel in response.xpath('//dl'):
            brand = brand_sel.xpath('./dt/div[1]/a/text()').extract_first()
            if brand is None:
                continue
            print('brand:'+brand)
            brand_logo_url = brand_sel.xpath('./dt//img[1]/@src').extract_first(default='')
            # Sub-brand headings and their car lists are paired by document
            # order: the n-th heading owns the n-th list.
            heading_links = brand_sel.xpath('./dd//div[@class="h3-tit"]/a')
            car_lists = brand_sel.xpath('./dd//ul[@class="rank-list-ul"]')
            for heading, car_list in zip(heading_links, car_lists):
                brand_item = heading.xpath('./text()').extract_first()
                if brand_item is None:
                    continue
                brand_item_url = heading.xpath('./@href').extract_first(default='')
                print('brand_item:'+brand_item)
                print('brand_item_url:'+brand_item_url)
                # Only <li> elements carrying an id are treated as car entries.
                car_sels = car_list.xpath('./li[@id]')
                print('cars_count:'+str(len(car_sels)))
                for car_sel in car_sels:
                    name = car_sel.xpath('./h4/a/text()').extract_first()
                    url = car_sel.xpath('./h4/a/@href').extract_first()
                    if name is None or url is None:
                        continue
                    price = car_sel.xpath('./div[1]/a/text()').extract_first(default='')
                    min_price, max_price = self._split_price(price)
                    print('car:'+name+' max_price:'+str(max_price)+' min_price:'+str(min_price)+' price_base:'+price_base)
                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    yield car_item

item

# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """Container for one scraped car together with its brand context."""

    # Car itself
    name = scrapy.Field()
    url = scrapy.Field()
    # Price bounds as strings
    max_price = scrapy.Field()
    min_price = scrapy.Field()
    # Owning brand
    brand = scrapy.Field()
    brand_logo_url = scrapy.Field()
    # Sub-brand heading the car is listed under
    brand_item = scrapy.Field()
    # Index letter of the page the car was found on
    first_word = scrapy.Field()

mongo_car

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar(object):
    """Store scraped car items in three collections: brand, brand_item and car.

    All collections live in the ``car`` database. ``self.db`` is the
    project's ``Mongo`` wrapper; the only thing this class relies on is that
    ``db.set(db_name, set_name)`` returns an object offering ``find_one``,
    ``insert_one`` and ``update_one`` (presumably a pymongo collection —
    confirm against ``mininova.mongodb``).
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password']
        )

    def insert(self, item):
        """Upsert the brand, then insert the sub-brand and the car if missing.

        Args:
            item: mapping with the keys ``brand``, ``brand_logo_url``,
                ``brand_item``, ``first_word``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            Always ``True`` (kept for callers that check the result).
        """
        brand_where = {'name': item['brand']}
        # The logo is written on first insert too, so a brand with a single
        # car does not end up without a logo_url.
        brand_doc = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')

        brand_item = self.brand_item_exist(self.db, {'name': item['brand_item']})
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name within its sub-brand. A dict literal
        # with the key 'name' given twice would silently keep only the last
        # value, so the sub-brand is matched through brand_item_id, the field
        # actually stored on car documents.
        car_where = {'name': item['name'], 'brand_item_id': brand_item['_id']}
        if self.car_exist(self.db, car_where) is False:
            self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')

        return True

    def _find_or_false(self, db, set_name, where):
        """Return the first document matching ``where``, or ``False`` if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """``$set`` the fields of ``brand`` on the matching brand.

        Returns the brand document after the update, or ``False`` when no
        document matches ``brand_where``.
        """
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        found = my_set.find_one(brand_where)
        return False if found is None else found

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or ``False``."""
        return self._find_or_false(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand document matching ``brand_item``, or ``False``."""
        return self._find_or_false(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or ``False``."""
        return self._find_or_false(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert_and_fetch(db, self.car_set_name, car)

pipeline

from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Item pipeline that persists every scraped car through ``MongoCar``."""

    # Created on first use and then reused, so one MongoCar (and whatever
    # connection it opens) serves the whole crawl instead of one per item.
    mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and hand it on to the next pipeline stage.

        Args:
            item: the scraped item; must provide the keys ``MongoCar.insert``
                reads, plus ``name`` for the progress output.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``. Scrapy passes the return value to later
            pipeline components, so it must not be ``None``.
        """
        if self.mongo_car is None:
            self.mongo_car = MongoCar()
        self.mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item

setting

# Connection parameters read by MongoCar; the values here are placeholders.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)

猜你喜欢

转载自blog.csdn.net/weixin_42625306/article/details/83451578