Scraping dealer information for a given city from Yiche (bitauto.com) with Python

Copyright notice: this is an original post by the author and may not be reposted without permission. https://blog.csdn.net/Martin201609/article/details/53406630

A small Python practice script; the code is recorded here for reference:

# -*- encoding:utf8 -*-
import re
import sys
import urllib2
import urllib
import os
import cookielib
import json
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack so implicit str/unicode conversions use UTF-8


'''
http://dealer.bitauto.com/rizhao/
http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=jingxiaoshang&pagetype=masterbrand&citycode=rizhao%2F&cityid=2120
Fetch the addresses and details of every dealer in Rizhao, Shandong from Yiche (bitauto.com).

'''
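# A minimal sketch (hypothetical helper, not used by the script below) of building the
# query URL shown in the comment block with urllib.urlencode instead of hard-coding the
# query string; the parameter names and the Rizhao values come from that sample URL.
def build_left_tree_url(citycode='rizhao/', cityid='2120'):
    base = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx'
    params = {'tagtype': 'jingxiaoshang',
              'pagetype': 'masterbrand',
              'citycode': citycode,  # urlencode escapes the trailing '/' to %2F
              'cityid': cityid}
    return base + '?' + urllib.urlencode(params)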

def getCarName():
    #url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx'
    url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=jingxiaoshang&pagetype=masterbrand&citycode=rizhao%2F&cityid=2120'
    data = 'tagtype=jingxiaoshang&pagetype=masterbrand&citycode=rizhao%2F&cityid=2120'
    html = urllib2.urlopen(url).read()
    content  = html.encode('utf8')
    #print content
    #print type(content)
    #print len('JsonpCallBack({char:{A:1,B:1,C:1,D:1,E:0,F:1,G:1,H:1,I:0,J:1,K:1,L:1,M:1,N:1,O:1,P:1,Q:1,R:1,S:1,T:1,U:0,V:0,W:1,X:1,Y:1,Z:1},')
    #print len(',setcityurl:[{tagname:"baojia",tagurl:"http://price.bitauto.com/mb@objid@",otherpara:"_c@cityid@",allspell:""},{tagname:"yanghu",tagurl:"http://car.bitauto.com/tree_baoyang/mb_@objid@/",otherpara:"?citycode=@cityid@",allspell:""}]})')

    text = content[132:-232]  # slice off the JsonpCallBack(...) wrapper, keeping only the brand JSON payload
    text = text.replace("'",'"')  # replace single quotes with double quotes to conform to JSON
    text = re.sub(r'(\w+):',r'"\1":',text)  # wrap the bare keys before each colon in double quotes
    car_name = json.loads(text)
    #print type(car_name)
    return car_name
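# Illustrative demo (made-up sample data) of the quote-fixing steps used in getCarName():
# the API returns a JSONP-style payload with unquoted keys and single quotes, which
# json.loads cannot parse until they are normalized.
def _demo_jsonp_to_json():
    sample = "{brand:[{name:'SomeDealer',url:'/rizhao/somebrand/'}]}"  # made-up fragment
    fixed = sample.replace("'", '"')            # single quotes -> double quotes
    fixed = re.sub(r'(\w+):', r'"\1":', fixed)  # quote the bare keys before each colon
    return json.loads(fixed)                    # {u'brand': [{u'name': ..., u'url': ...}]}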

def getUrlList(car_name):
    ulist = []
    for key in car_name:
        for i in car_name[key]:
            #print i["name"], i["url"]
            ulist.append(i["url"])
    return ulist

# build a dict: key = brand name, value = dealer-list page URL
def getUrlDict(car_name):
    urldic = dict()
    #print type(car_name)
    for key in car_name:
        #print '%s:%s' % (key, car_name[key])
        for i in car_name[key]:  # each element of car_name[key] is a dict holding one brand's info
            urldic[i['name']] = i['url']
            #print '%s:%s' % (i['name'], i['url'])
    return urldic

# fetch the HTML content of the page at the given relative URL
def getHtmlContentInfo(url):
    uh = 'http://dealer.bitauto.com'
    url = uh + str(url)
    html = urllib2.urlopen(url).read()
    return html
    #file = open('1.html','w+')
    #file.write(html)
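# Optional, more defensive variant (a sketch, not what the script above uses): some
# servers reject requests without a User-Agent header, so this one sends a header and
# catches network errors instead of letting the whole crawl die on a single bad page.
def getHtmlContentInfoSafe(url):
    full_url = 'http://dealer.bitauto.com' + str(url)
    req = urllib2.Request(full_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urllib2.urlopen(req, timeout=10).read()
    except urllib2.URLError as e:
        print 'failed to fetch %s: %s' % (full_url, e)
        return ''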

# scrape dealer info and write it to seller.txt
def getSellerInfo(keyname,html):
    soup = BeautifulSoup(html, "lxml")
    title = soup.find(name="title").get_text().strip()
    l = soup.find_all(name="div", class_='intro-box')
    f = open('seller.txt','a')  # append, so repeated calls (one per brand) do not overwrite the file
    for i in l:
        f.write('汽车品牌:'+ keyname + '\n')
        f.write('公司名称:' + i.a['title'] + '\n')  # company name
        f.write('公司类型:' + i.a.get_text() + '\n')  # company type
        print i.a.get_text() + '\n'
        #print i.find_all(name='span', class_='add-sty')[0]
        f.write('公司地址:' + i.find_all(name='span', class_='add-sty')[1]['title'] + '\n')  # company address
        j = i.find_all(name='span', class_='phone-sty')
        for k in j:
            s = k.get_text().strip()
            f.write('公司电话:'+ s.split('\n')[0] +'\n')   # phone number
            f.write('售卖范围:' + s.split('\n')[1] + '\n')  # sales coverage
        f.write('------------------------------------------------------------------------------' + '\n')
    f.close()
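# Small illustration (made-up HTML fragment, shaped the way the selectors above expect)
# of the parsing in getSellerInfo/getSellerInfo2: each dealer sits in a div.intro-box,
# the company name is the <a> tag's title attribute, and the second span.add-sty holds
# the address in its title attribute.
def _demo_parse_intro_box():
    sample_html = ('<div class="intro-box">'
                   '<a title="某某汽车销售有限公司">4S店</a>'
                   '<span class="add-sty"></span>'
                   '<span class="add-sty" title="日照市某某路1号"></span>'
                   '</div>')
    box = BeautifulSoup(sample_html, "lxml").find_all(name="div", class_="intro-box")[0]
    print '公司名称:', box.a['title']
    print '公司类型:', box.a.get_text()
    print '公司地址:', box.find_all(name='span', class_='add-sty')[1]['title']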

def getSellerInfo2(keyname,html):
    soup = BeautifulSoup(html, "lxml")
    title = soup.find(name="title").get_text().strip()
    l = soup.find_all(name="div", class_='intro-box')
    for i in l:
        print '汽车品牌:', keyname
        print '公司名称:', i.a['title']  # company name
        print '公司类型:', i.a.get_text()  # company type
        #print i.find_all(name='span', class_='add-sty')[0]
        print '公司地址:', i.find_all(name='span', class_='add-sty')[1]['title']  # company address
        j = i.find_all(name='span', class_='phone-sty')
        for k in j:
            s = k.get_text().strip()
            print '公司电话:', s.split('\n')[0]  # phone number
            print '售卖范围:', s.split('\n')[1]  # sales coverage
        print '------------------------------------------------------------------------------'

# main execution
cname = getCarName()
dic = getUrlDict(cname)  # dict of brand name -> dealer-list page URL
print 'Started...'
for keyname,urlvalue  in dic.items():
    #print 'KeyName: %s',keyname
    html = getHtmlContentInfo(urlvalue)
    getSellerInfo2(keyname,html)
print 'Finished !!'
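# Note: the loop above prints to stdout via getSellerInfo2; calling getSellerInfo(keyname, html)
# in the loop instead would write the same fields to seller.txt.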
