爬取思路与搜狐车库是一样的:首先找到每个车型的链接,然后遍历这些链接去爬取所需的数据。不过,爬取网易车型库所花的时间远远少于搜狐汽车,毕竟网易汽车的数据不用渲染就可以爬取下来,而搜狐汽车的数据需要渲染之后才可以爬取。
步骤1:获取品牌的链接
import requests
import re
# Start page of the crawl: gotoFile() downloads it and cutstr() pulls the anchor ids out of it.
url = 'http://product.auto.163.com/'
def getHtml(url):
    """Download ``url`` and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE(review): this placeholder-looking query string (?test=data) is sent
    # with every request; kept unchanged -- confirm whether it is needed.
    data = {'test': 'data'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        # The field names used to be written with embedded spaces
        # ('Accept - Encoding'), which are not the standard header names.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # A timeout keeps one unresponsive request from hanging the whole crawl.
    response = requests.get(url, headers=headers, params=data, timeout=30)
    # Force GBK decoding instead of relying on requests' charset detection.
    response.encoding = 'GBK'
    return response.text
def cutstr(html):
    """Extract the ``id`` attribute of every series anchor in ``html``.

    An anchor counts as a series anchor when the text between its ``id``
    attribute and the closing ``</a>`` contains ``_seriseId=``. ``.`` does
    not match newlines here, so an anchor must sit on a single line.

    Args:
        html: Page source to scan.

    Returns:
        A list of the captured ``id`` values (strings), in document order;
        empty when nothing matches.
    """
    return re.findall(r'<a.*?id="(.*?)".*?_seriseId=.*?</a>', html)
def gotoFile():
    """Write one series-page URL per line to ``wangyicar2.txt``.

    Downloads the page at the module-level ``url``, extracts the series ids
    with ``cutstr`` and turns each id into
    ``http://product.auto.163.com/series/<id>.html#008B00``. The output file
    is overwritten on every run.
    """
    html = getHtml(url)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('wangyicar2.txt', 'w', encoding='utf-8') as f:
        for series_id in cutstr(html):
            # Named ``series_url`` rather than ``str`` to avoid shadowing the builtin.
            series_url = 'http://product.auto.163.com/series/' + series_id + '.html#008B00'
            f.write(series_url + '\n')
# Run step 1: write the series-page URLs to wangyicar2.txt.
gotoFile()
步骤2:获取每个车型的链接
import requests
import re
# Example series-page URL: http://product.auto.163.com/series/16979.html#008B00
def getHtml(url):
    """Download ``url`` and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE(review): this placeholder-looking query string (?test=data) is sent
    # with every request; kept unchanged -- confirm whether it is needed.
    data = {'test': 'data'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        # The field names used to be written with embedded spaces
        # ('Accept - Encoding'), which are not the standard header names.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # A timeout keeps one unresponsive request from hanging the whole crawl.
    response = requests.get(url, headers=headers, params=data, timeout=30)
    # Force GBK decoding instead of relying on requests' charset detection.
    response.encoding = 'GBK'
    return response.text
def cutstr(html):
    """Extract ``(product_id, product_name)`` pairs from ``html``.

    Looks for fragments shaped like
    ``{product_id:<id>,...product_name:<name>}`` and captures the raw text
    of both values; any quotes around a value are part of the captured
    text. ``.`` does not match newlines, so a fragment must sit on a
    single line.

    Args:
        html: Page source to scan.

    Returns:
        A list of ``(product_id, product_name)`` string tuples, in document
        order; empty when nothing matches.
    """
    # Braces are escaped so they are unambiguously literal characters.
    return re.findall(r'\{product_id:(.*?),.*?product_name:(.*?)\}', html)
def gotoFile():
    """Append one record per car model to ``wangyicar6.txt``.

    Reads series-page URLs from ``wangyicar2.txt`` (one per line), downloads
    each page, extracts its ``(product_id, product_name)`` pairs with
    ``cutstr`` and appends the ``str()`` of the list
    ``[config_compare_url, product_name, series_url]`` as one line per model.
    """
    with open('wangyicar2.txt', 'r', encoding='utf-8') as series_file:
        # Open the output once instead of reopening it for every URL; the
        # ``with`` blocks close both files, so no explicit close() is needed.
        with open('wangyicar6.txt', 'a+', encoding='utf-8') as out_file:
            for line in series_file:
                # Lines read from a file keep their trailing newline; strip it
                # so it does not leak into the stored record, and skip blanks.
                url = line.strip()
                if not url:
                    continue
                html = getHtml(url)
                for product_id, product_name in cutstr(html):
                    # SECURITY: this used to be eval(product_id), which would
                    # execute arbitrary text scraped from the remote page. The
                    # captured value is a quoted literal, so stripping the
                    # surrounding quotes yields the same id without executing it.
                    model_id = product_id.strip().strip('\'"')
                    config_url = ('http://product.auto.163.com/config_compare/'
                                  + model_id + '.html#ncx00023')
                    record = [config_url, product_name, url]
                    out_file.write(str(record) + '\n')
# Run step 2: read wangyicar2.txt and append one record per model to wangyicar6.txt.
gotoFile()
步骤3:遍历每个车型的链接,获取想要的数据
import requests
import re
from bs4 import BeautifulSoup
def getHtml(url):
    """Download ``url`` and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # A timeout keeps one unresponsive request from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    # Force GBK decoding instead of relying on requests' charset detection.
    response.encoding = 'GBK'
    return response.text
def Soup(html):
    """Return the markup of every ``car_config_param_list`` element as one string.

    Args:
        html: Page source to parse (parsed with the ``lxml`` parser).

    Returns:
        ``str()`` of the ``find_all`` result for the CSS class
        ``car_config_param_list``.
    """
    parsed = BeautifulSoup(html, 'lxml')
    param_lists = parsed.find_all(class_="car_config_param_list")
    return str(param_lists)
def gotoFile():
    """Append selected parameters of every car model to ``wangyi2.txt``.

    Reads config-page URLs from ``wangyicar5.txt`` (one per line), downloads
    each page, collects the text of all parameter cells and writes the cells
    at the wanted positions, followed by the page URL, one value per line.
    A blank line follows each record.

    Pages that yield too few cells are reported and skipped, so one page
    with a different layout does not abort the whole crawl.
    """
    # Example config page:
    # http://product.auto.163.com/config_compare/000BedBL.html#ncx00023
    # NOTE(review): step 2 writes wangyicar6.txt as list reprs, while this
    # step reads plain URLs from wangyicar5.txt -- presumably that file is
    # prepared separately; confirm.
    # Compiled once, outside the loop.
    cell_pattern = re.compile(r'<div class="cell"><span class="cell_text">(.*?)</span></div>')
    # Positions of the wanted parameters within the list of cells.
    wanted = (0, 3, 4, 9, 30, 38, 39, 41, 42, 73, 74, 81)
    with open('wangyicar5.txt', 'r', encoding='utf-8') as url_file:
        # The ``with`` blocks close both files; no explicit close() is needed.
        with open('wangyi2.txt', 'a+', encoding='utf-8') as out_file:
            for line in url_file:
                # Drop the trailing newline and skip blank lines.
                url = line.strip()
                if not url:
                    continue
                datas = cell_pattern.findall(Soup(getHtml(url)))
                if len(datas) <= max(wanted):
                    # Indexing would raise IndexError here; skip this page.
                    print('skipped %s: only %d parameter cells found' % (url, len(datas)))
                    continue
                record = [datas[index] for index in wanted]
                record.append(url)
                # One value per line, then a blank separator line (the output
                # layout produced when input lines carried their own newline).
                out_file.write('\n'.join(record) + '\n\n')
# Run step 3: read wangyicar5.txt and append the selected parameters to wangyi2.txt.
gotoFile()
因为之前爬取搜狐车型已经让我对这类车型数据的爬取比较熟悉了,所以这里我只爬取了自己想要的一部分参数数据。如果想要全部的参数数据,可以直接遍历datas,然后再写入TXT文件。