版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Martin201609/article/details/53406630
Python 小程序练习:做代码处理,记录如下:
# -*- encoding:utf8 -*-
import re
import sys
import urllib2
import urllib
import os
import cookielib
import json
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding("utf-8")
'''
http://dealer.bitauto.com/rizhao/
http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=jingxiaoshang&pagetype=masterbrand&citycode=rizhao%2F&cityid=2120
从 易车网 获取山东日照经销商的所有地址和信息
'''
def getCarName():
    """Download the dealer brand tree for Rizhao from bitauto and decode it.

    The endpoint does not return plain JSON.  Judging by the sample prefix and
    suffix the original author measured, the payload appears to have the form
    ``JsonpCallBack({char:{...},brand:{...},setcityurl:[...]})`` with unquoted
    keys.  Only the ``brand`` object is cut out, patched into valid JSON and
    decoded.

    Returns:
        The decoded ``brand`` object, as produced by ``json.loads``.

    Raises:
        urllib2.URLError: if the HTTP request fails.
        ValueError: if the extracted text is still not valid JSON.
    """
    url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=jingxiaoshang&pagetype=masterbrand&citycode=rizhao%2F&cityid=2120'

    # Close the connection explicitly instead of leaving it to garbage
    # collection; Python 2 response objects cannot be used in a ``with``.
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()

    # Locate the brand object by its surrounding markers so that a change in
    # the length of the wrapper does not silently corrupt the slice.
    start_marker = 'brand:'
    end_marker = ',setcityurl:'
    start = content.find(start_marker)
    end = content.rfind(end_marker)
    if start != -1 and end > start:
        text = content[start + len(start_marker):end]
    else:
        # Fall back to the fixed offsets the original code relied on:
        # 132 leading and 232 trailing characters of wrapper.
        text = content[132:-232]

    # Single quotes -> double quotes, as JSON requires.
    text = text.replace("'", '"')
    # Quote the bare object keys.  NOTE: this rewrites every ``word:``
    # sequence, so a string value containing one would be altered as well.
    text = re.sub(r'(\w+):', r'"\1":', text)
    return json.loads(text)
def getUrlList(car_name):
    """Collect the ``"url"`` field of every entry in the brand tree.

    Args:
        car_name: Mapping whose values are sequences of dicts that each
            carry a ``"url"`` key.

    Returns:
        A flat list of the URLs, in the mapping's iteration order.
    """
    urls = []
    for entries in car_name.values():
        urls.extend(entry["url"] for entry in entries)
    return urls
# Build a lookup: key = car brand name, value = page URL.
def getUrlDict(car_name):
    """Map every entry's ``'name'`` to its ``'url'``.

    Args:
        car_name: Mapping whose values are sequences of dicts that each
            carry ``'name'`` and ``'url'`` keys.

    Returns:
        A dict from name to URL.  When a name occurs more than once, the
        entry visited last wins.
    """
    name_to_url = {}
    for entries in car_name.values():
        name_to_url.update((entry['name'], entry['url']) for entry in entries)
    return name_to_url
# Fetch the content of the web page behind a site-relative URL.
def getHtmlContentInfo(url):
    """Download a page from dealer.bitauto.com.

    Args:
        url: Site-relative path; it is converted with ``str`` and appended
            to the ``http://dealer.bitauto.com`` prefix.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        urllib2.URLError: if the request fails.
    """
    response = urllib2.urlopen('http://dealer.bitauto.com' + str(url))
    try:
        return response.read()
    finally:
        # Release the socket deterministically rather than relying on
        # garbage collection of the response object.
        response.close()
# Extract dealer information from a dealer listing page and save it to a file.
def getSellerInfo(keyname, html, path='seller.txt', mode='w+'):
    """Write every dealer found in *html* to a text file.

    Each ``div.intro-box`` element is treated as one dealer.  For every
    dealer the brand, company name, company type, address and phone details
    are written, followed by a separator line.  The company type is also
    echoed to standard output.

    Args:
        keyname: Brand name written in front of each dealer record.
        html: Markup of the dealer listing page.
        path: Output file; defaults to the original hard-coded
            ``'seller.txt'``.
        mode: Mode passed to ``open``.  The default ``'w+'`` truncates the
            file on every call, so pass ``'a'`` to accumulate the records of
            several brands in one file.

    Raises:
        TypeError: if a dealer box contains no ``<a>`` element.
        IndexError: if a dealer box has fewer than two ``span.add-sty``
            elements.
    """
    soup = BeautifulSoup(html, "lxml")
    # ``with`` guarantees the file is closed even if a record fails to parse.
    with open(path, mode) as out:
        for box in soup.find_all(name="div", class_='intro-box'):
            link = box.a
            out.write('汽车品牌:' + keyname + '\n')
            out.write('公司名称:' + link['title'] + '\n')  # company name
            out.write('公司类型:' + link.get_text() + '\n')  # company type
            print(link.get_text() + '\n')
            # The address is read from the title of the second 'add-sty' span.
            address_spans = box.find_all(name='span', class_='add-sty')
            out.write('公司地址:' + address_spans[1]['title'] + '\n')
            for phone_span in box.find_all(name='span', class_='phone-sty'):
                lines = phone_span.get_text().strip().split('\n')
                out.write('公司电话:' + lines[0] + '\n')  # phone number
                # The sales range is the second line, which may be absent.
                if len(lines) > 1:
                    out.write('售卖范围:' + lines[1] + '\n')
            out.write('------------------------------------------------------------------------------' + '\n')
def getSellerInfo2(keyname,html):
seller_list = []
index = 0
soup = BeautifulSoup(html, "lxml")
title = soup.find(name="title").get_text().strip()
l = soup.find_all(name="div",class_='intro-box')
for i in l:
print '汽车品牌:', keyname
print '公司名称:', i.a['title'] #公司名称
#print '公司名称:' , i.a['title']
print '公司类型:',i.a.get_text() #公司类型
# print i.a.get_text()
#print i.find_all(name='span',class_ ='add-sty')[0]
print '公司地址:',i.find_all(name='span',class_ ='add-sty')[1]['title'] #公司地址
#print type(i.find_all(name='span',class_ ='phone-sty'))
j = i.find_all(name='span',class_ ='phone-sty')
for k in j :
s = k.get_text().strip()
print '公司电话:',s.split('\n')[0] #电话
print '售卖范围:' ,s.split('\n')[1] #售卖范围
print '------------------------------------------------------------------------------'
# main excution
cname = getCarName()
dic = getUrlDict(cname) #存放所有经销商的url
print 'Started...'
for keyname,urlvalue in dic.items():
#print 'KeyName: %s',keyname
html = getHtmlContentInfo(urlvalue)
getSellerInfo2(keyname,html)
print 'Finished !!'