Python爬虫实战:爬取汽车之家上的车型价格

版权声明:本文为博主原创文章,未经博主允许不得转载。如有写的不恰当或者不正确的地方,请指正,欢迎与我讨论。如需要查看博客中的源代码,请联系博主QQ:1477517404 https://blog.csdn.net/hfutzhouyonghang/article/details/82155151

相关库

import pymysql
import pymysql.cursors
from bs4 import BeautifulSoup
import requests
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
import codecs
from selenium.common.exceptions import TimeoutException

从数据库中读取车型(车型已经存放在数据库中,这里读取车型的id,之后拼接到url上)

# Rows read from user_car_port; each row is a dict (DictCursor) with the
# selected columns car_id and car_name.
cars = []
# NOTE(review): host and credentials are hard-coded here; consider moving
# them to configuration before sharing or deploying this script.
conn = pymysql.connect(
    host='114.213.252.89',
    user='root',
    passwd='112233',
    db='mysql',
    charset='utf8',
    cursorclass=pymysql.cursors.DictCursor,
)

try:
    cur = conn.cursor()
    # The connection opens on the 'mysql' schema, so switch to the schema
    # that holds the car table before querying.
    cur.execute("USE data_etl")
    cur.execute("select distinct(car_id),car_name from user_car_port")
    rows = cur.fetchall()
    cars.extend(rows)
    count = len(rows)
    print(count)
finally:
    # Always release the connection, even if a query failed.
    conn.close()

由于汽车之家反爬比较复杂,我们直接调用浏览器接口(Selenium)。下面先启动浏览器,并定义处理停售车型价格信息的函数

# Module-level Chrome WebDriver used by getCarPrice() for every page load.
# 'chromedriver.exe' is passed as the first positional argument.
# NOTE(review): this looks like the Selenium 3 calling convention (driver
# path as first argument); newer Selenium 4 releases are believed to reject
# it -- confirm against the installed Selenium version.
driver = webdriver.Chrome('chromedriver.exe')
def getCarPriceOffSale(innerHtml):
    """Extract the price range of a discontinued model from its price markup.

    Looks for ``<span class="price">`` containing ``<strong class="red">``
    and parses its text, which is expected to look like ``"a-b万"`` or a
    single value; the unit characters 万 / 元 are stripped before conversion.

    Args:
        innerHtml: HTML fragment to search (the caller passes the inner HTML
            of the ``car_price`` element).

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper listed
        price. When the text does not split into exactly two parts on "-",
        the first value is used for both. ``(0.0, 0.0)`` is returned when
        no price can be extracted; a diagnostic line is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型已经停售!")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    # find() returns None when nothing matches, so the "missing element"
    # messages below are actually reachable (indexing findAll(...)[0] raised
    # IndexError instead and skipped them).
    spanPrice = bsObj.find("span", {"class": "price"})
    if spanPrice is None:
        print("价格span为空")
        return button, top

    strongPrice = spanPrice.find("strong", {"class": "red"})
    if strongPrice is None:
        print("价格strong为空")
        return button, top

    # .text is never None; an empty string is the real "no price" case.
    text = strongPrice.text.strip()
    if not text:
        print("价格字段为空")
        return button, top

    prices = [part.replace("万", "").replace("元", "") for part in text.split("-")]
    try:
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        # Non-numeric text; report it and return the (0.0, 0.0) default
        # rather than a half-filled pair.
        print("程序出错!停售车型")
        return button, top
    return low, high

处理在售车型的价格信息

def getCarPriceOnSale(innerHtml):
    """Extract the price range of an on-sale model from its price markup.

    Takes the first ``<dd>`` in the fragment, then the ``<a class="emphasis">``
    inside it, and parses that link's text, which is expected to look like
    ``"a-b万"`` or a single value; the unit characters 万 / 元 are stripped
    before conversion.

    Args:
        innerHtml: HTML fragment to search (the caller passes the inner HTML
            of the ``information-price`` element).

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper listed
        price. When the text does not split into exactly two parts on "-",
        the first value is used for both. ``(0.0, 0.0)`` is returned when
        no price can be extracted; a diagnostic line is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型在售")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    # find() returns None instead of raising when there is no <dd>; the
    # message matches what the old IndexError path printed for this case.
    ddprice = bsObj.find("dd")
    if ddprice is None:
        print("程序出错!在售车型")
        return button, top

    a = ddprice.find("a", {"class": "emphasis"})
    if a is None:
        print("此车型暂时无法查询价格")
        return button, top

    prices = [part.replace("万", "").replace("元", "") for part in a.text.split("-")]
    try:
        low = float(prices[0])
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        # Non-numeric text; report it and return the (0.0, 0.0) default
        # rather than a half-filled pair.
        print("程序出错!在售车型")
        return button, top
    return low, high

根据车型id获取价格:先按在售车型页面解析,等待超时后再按停售车型页面解析

def getCarPrice(carId):
    """Load a model's page in the browser and return its price range.

    The page is first treated as an on-sale model: wait up to 5 seconds for
    an ``information-summary`` element, then parse the ``information-price``
    element with ``getCarPriceOnSale``. If that times out, the page is
    treated as a discontinued model: wait up to 5 seconds for a ``car_price``
    element and parse it with ``getCarPriceOffSale``.

    Relies on the module-level ``driver`` and ``url`` globals.
    NOTE(review): ``url`` is not defined in the visible part of this file;
    it must be set to the page URL prefix before this function is called.

    Args:
        carId: Model id; its string form is appended to ``url``.

    Returns:
        The ``(button, top)`` float pair from the matching parser, or
        ``(0.0, 0.0)`` when neither page layout appeared in time.
    """
    button = 0.0
    top = 0.0
    try:
        driver.get(url + str(carId))
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "information-summary"))
        )
        # find_element(By...) is available in both Selenium 3 and 4, whereas
        # the find_element_by_* helpers were removed in Selenium 4.
        ele = driver.find_element(By.CLASS_NAME, "information-price").get_attribute('innerHTML')
        button, top = getCarPriceOnSale(ele)
    except TimeoutException:
        try:
            # until() returns the located element, so use it directly rather
            # than looking the same element up a second time.
            priceElement = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "car_price"))
            )
            ele = priceElement.get_attribute('innerHTML')
            button, top = getCarPriceOffSale(ele)
        except TimeoutException:
            print("此车型有问题:" + str(carId))
    return button, top

遍历数据库所有车型的id

# Fetch the price range for every model and store it on the row dict under
# the "button" (lower) and "top" (upper) keys.
for car in cars:
    car_id = car["car_id"]
    # Random 1-5 second pause before each page load.
    time.sleep(random.randint(1, 5))
    low_price, high_price = getCarPrice(car_id)
    # getCarPrice returns (0.0, 0.0) when no price was obtained; such rows
    # are marked with 9999 instead.
    has_price = low_price != 0.0 or high_price != 0.0
    car["button"] = low_price if has_price else 9999
    car["top"] = high_price if has_price else 9999

猜你喜欢

转载自blog.csdn.net/hfutzhouyonghang/article/details/82155151