Scraping Sohu Car Model Data with Python

Main tools: Python + Selenium + Excel

I needed car model parameter data for work, but none of the tutorials and articles I found online did what I wanted, so I slowly worked out this code myself. Some of it may be clumsy or unnecessary; I only know the basics, so if you have ways to optimize it, suggestions are welcome.

If you need car model parameter data like I did, you can follow my approach. One drawback of Selenium is that it is very slow, and sometimes it fails to grab the data. But if you are patient and have a decent connection, the data can definitely be scraped, and it may even take you less time than it took me: scraping everything took me a week, thanks to my hopeless network speed.

The main data collected is shown in the screenshot below:

The Python part

The Sohu car database does not return the model parameter data when crawled with requests, so Selenium is needed to grab the page after it has been rendered.
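A quick way to confirm this for yourself (a minimal sketch; the trim.html URL below is only a placeholder, substitute any link produced in step 2) is to fetch a trim page with requests and count the th2 cells that the Selenium code in step 3 relies on:

import requests
from bs4 import BeautifulSoup

# Placeholder URL -- replace with any .../trim.html link produced by step 2.
url = 'http://db.auto.sohu.com/some-brand/1234/123456/trim.html'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'lxml')
print(len(soup.find_all(class_='th2')))  # 0 means the table is filled in by JavaScript, hence Selenium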

Step 1:

First open the Sohu car database page, press F12 to enter developer tools, and look at the Doc entries under the Network tab to inspect the source page; each brand's link can be found there.

import requests
import re
from bs4 import BeautifulSoup


def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# def getCCfirm(html):  # get each manufacturer's name and link
#     soup = BeautifulSoup(html, 'lxml')
#     string = soup.find_all(class_='con_tit')  # blocks holding the manufacturer categories
#     pattern = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.S)  # extract each manufacturer's URL and name
#     items = re.findall(pattern, str(string))
#     for item in items:
#         yield {
#             'href': item[0],
#             'name': item[1].split()
#         }


def getCarModel(html):
    soup = BeautifulSoup(html, 'lxml')
    string = soup.find_all(class_='model-a')  # blocks holding the brand links
    pattern = re.compile('<a.*?href="(.*?)".*?</em>(.*?)<span>', re.S)  # extract each brand's URL and name
    items = re.findall(pattern, str(string))
    for item in items:
        yield {
            'href': item[0],
            'name': item[1]
        }



url = 'http://db.auto.sohu.com/home/'
html = getHtml(url)
with open('Model_ID.txt', 'w', encoding='utf-8') as f:
    for i in getCarModel(html):
        f.write(str(i) + '\n')
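Each line of Model_ID.txt is simply the str() of a dict, so it looks roughly like {'href': '//db.auto.sohu.com/<brand>/<id>', 'name': '<brand name>'} (the exact values depend on the page). If you would rather not pick these lines apart with regexes in step 2, ast.literal_eval can turn each line back into a dict, as this small sketch shows:

import ast

with open('Model_ID.txt', 'r', encoding='utf-8') as f:
    for line in f:
        record = ast.literal_eval(line.strip())  # each line was written as a dict literal in step 1
        print(record['href'], record['name'])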

Step 2:

By iterating over the links in Model_ID.txt from step 1, get the link for each car series.

import re
import requests
# get each brand's 4-digit ID from Model_ID.txt
def Uniq():
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            yield {
                'uniq': uniq[0]
            }


def getallyrl(mids, tids):
    # find the Model_ID.txt line whose brand ID appears in mids, then build the trim.html URL
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            pattern3 = re.compile(r"'href': '(.*?)'")
            uniqHref = re.findall(pattern3, i)
            if uniq and uniq[0] in mids:
                urlone = 'http:' + uniqHref[0]
                urltwo = urlone + '/' + tids + '/trim.html'
                return urltwo


def getmid():
    for mid in Uniq():
        url = 'http://db.auto.sohu.com/api/model/select/trims_' + mid['uniq'] + '.json'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        html = requests.get(url, headers=headers).text
        pattern1 = re.compile(r'"tid":(\d{6})')
        pattern2 = re.compile(r'"mid":(\d{4})')
        mids = re.findall(pattern2, html)
        tids = re.findall(pattern1, html)
        for i in tids:
            perfer = getallyrl(mids, i)
            yield {
                'all_url': perfer
            }
with open('allurls.txt', 'w', encoding='utf-8') as f:
    for i in getmid():
        f.write(str(i) + '\n')
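Since trims_XXXX.json is plain JSON, the json module can also be used instead of running regexes over the text. The exact layout of the response isn't reproduced here, so this sketch simply walks whatever structure comes back and collects every value stored under the "mid" and "tid" keys (the same keys the regexes above look for; the mid 1234 is a placeholder):

import json
import requests

def walk_ids(node, key):
    # Recursively collect every value stored under `key` in a nested JSON structure.
    if isinstance(node, dict):
        for k, v in node.items():
            if k == key:
                yield v
            else:
                yield from walk_ids(v, key)
    elif isinstance(node, list):
        for item in node:
            yield from walk_ids(item, key)

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('http://db.auto.sohu.com/api/model/select/trims_1234.json', headers=headers)
data = json.loads(resp.text)
mids = list(walk_ids(data, 'mid'))
tids = list(walk_ids(data, 'tid'))
print(mids, tids)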

Step 3:

Iterate over the links in allurls.txt from step 2 to get the final model parameter data.

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
from random import randint
from selenium.webdriver.chrome.options import Options

start = time.perf_counter()  # time.clock() was removed in Python 3.8
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# browser = webdriver.Chrome()

def getHTML(url):
    browser.get(url)
    time.sleep(randint(2, 5) * 2)  # random 4-10 second pause so the page can finish rendering
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    strtext = soup.find_all(class_="th2")
    return strtext


def rules(strtext):
    count = str(strtext).count('"th2"')  # total number of th2 cells on the page
    return count


def getstep(count):
    # each trim appears to contribute 208 parameter cells, so this gives the number of trims on the page
    stepnum = int(count / 208)
    return stepnum


def getUrl():
    with open('allurls.txt', 'r', encoding='utf-8') as f:  # the file written in step 2
        for i in f:
            pattern = re.compile(r"'all_url': '(.*?)'}")
            urls = re.findall(pattern, i)
            if urls:  # skip lines where no trim URL could be built
                yield {
                    'url': urls[0]
                }


def clearFile():  # empty cartest.txt before scraping; its contents are imported into Excel later
    with open('cartest.txt', 'w', encoding='utf-8') as f:
        pass


def clearUFile():  # empty allurls.txt once scraping is done; used when importing into Excel later
    with open('allurls.txt', 'w', encoding='utf-8') as f:
        pass


def main():
    clearFile()
    for item in getUrl():
        try:
            strs = []
            url = item['url']
            strtext = getHTML(url)
            count = rules(strtext)
            stepnum = getstep(count)
            pattern = re.compile('>(.*?)<')
            with open('cartest.txt', 'a+', encoding='utf-8') as f:
                for idx in range(0, count, stepnum):
                    strmiddata = re.findall(pattern, str(strtext[idx]))
                    strs.append(strmiddata)
                strs.append(url)  # record the trim URL so you can tell how far the crawl has got
                for row in strs:
                    f.write(str(row) + '\n')
        except:
            # skip trims whose page failed to load or parse and carry on
            continue
    clearUFile()

main()
browser.quit()
elapsed = (time.perf_counter() - start) / 60
print("Time used: %d minutes" % elapsed)
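The fixed time.sleep(randint(2, 5) * 2) in getHTML is the main reason this step is slow. One possible alternative is an explicit wait, which returns as soon as the th2 cells appear instead of always sleeping a fixed amount. This is only a sketch (it reuses the browser object and BeautifulSoup import from the script above and assumes a 20-second timeout is acceptable):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def getHTML_wait(url):
    browser.get(url)
    # Block until at least one th2 cell is present, or raise TimeoutException after 20 seconds.
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'th2')))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    return soup.find_all(class_='th2')

Whether this is actually faster depends on how quickly each page renders; keeping a short random pause on top of it is still a good idea so as not to hammer the site.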

Most of the time is actually spent in step 3, because there are more than 30,000 links to work through, plus the pause between each link. Emotionally I didn't want any pause at all, but rationally I had to have one, since I was afraid Sohu would block me; so you can imagine how long it all took... very long...
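The comments in clearFile() and clearUFile() mention importing the results into Excel afterwards. That part is not shown above, but since each record in cartest.txt is a list literal followed by the trim URL, a minimal sketch (the choice of openpyxl and the output filename are my assumptions) could look like this:

import ast
from openpyxl import Workbook

wb = Workbook()
ws = wb.active

with open('cartest.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line.startswith('['):
            ws.append(ast.literal_eval(line))  # a parameter row written as the str() of a list
        elif line:
            ws.append([line])  # the trim.html URL that closes each record

wb.save('car_data.xlsx')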

I also posted this on Jianshu, so you can read it there too, although the content is the same.

Jianshu link: https://www.jianshu.com/p/e2b54c7eefb1 (not updated with the fixes below)

--------2018.8.8-----------

Changes:

1. I didn't check the code when I first pasted it in, so it threw errors when copied into PyCharm and run; the code has now been corrected.

2. Added each trim's link to cartest.txt in step 3, so that if an error occurs you can tell which model you had got to (see the resume sketch below).
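Since every finished record now ends with its trim.html URL, one way to pick up after a crash (a small sketch that is not part of the original script; it assumes cartest.txt is kept between runs) is to collect the URLs already written and skip them on the next pass:

def getDoneUrls():
    # URLs already written to cartest.txt mark the trims that were scraped before the crash.
    done = set()
    try:
        with open('cartest.txt', 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('http'):
                    done.add(line.strip())
    except FileNotFoundError:
        pass
    return done

In main() you would then skip clearFile() and add a check like "if item['url'] in done: continue" at the top of the loop.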




 


Reposted from blog.csdn.net/nodoself/article/details/81328088