Python 爬取淘宝信息

 
 
import requests
import re
import json
import time
import xlwt
import draw


# Accumulates one flat dict per scraped item; consumed by the charting and
# Excel-export sections further down the script.
DATA = []

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 10


def _parse_item(item):
    """Flatten one raw auction entry into the row format stored in DATA."""
    return {
        'title': item['title'],
        'view_price': item['view_price'],
        'view_sales': item['view_sales'],
        # Stored as "is shipping free?": a non-zero fee means 'no'.
        'view_fee': 'no' if float(item['view_fee']) else 'yes',
        'isTmall': 'yes' if item['shopcard']['isTmall'] else 'no',
        'area': item['item_loc'],
        'name': item['nick'],
        'detail_url': item['detail_url'],
    }


def _load_jsonp(text):
    """Return the JSON object wrapped inside a JSONP response body.

    Raises:
        ValueError: if ``text`` contains no ``{...}`` payload.
    """
    # Greedy match from the first '{' to the last '}'; re.S lets the payload
    # span several lines.
    match = re.search(r'{.*}', text, re.S)
    if match is None:
        raise ValueError('no JSON payload found in response')
    return json.loads(match.group(0))


# --- First page: the item list is embedded in the HTML as `g_page_config`.
url = 'https://s.taobao.com/search?q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180415&ie=utf8'
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
html = response.text
page_config = re.search(r' g_page_config = (.*)g_srp_loadCss', html, re.S)
if page_config is None:
    raise ValueError('g_page_config not found in search page')
# Drop the trailing ';' that terminates the JavaScript assignment.
content = json.loads(page_config.group(1).strip().rstrip(';'))
data_list = content['mods']['itemlist']['data']['auctions']
DATA.extend(_parse_item(item) for item in data_list)

# Reuse the cookies handed out by the first response for the next request.
cookies = response.cookies

# --- First page, asynchronously (dynamically) loaded part, served as JSONP.
url2 = 'https://s.taobao.com/api?_ksTS=1523758090700_266&callback=jsonp267&ajax=true&m=customized&stats_click=search_radio_all:1&q=python&s=36&imgfile=&initiative_id=staobaoz_20180415&bcoffset=-1&js=1&ie=utf8&rn=fa39f53d2fd39257eda01c2471e30967'
response2 = requests.get(url2, cookies=cookies, timeout=REQUEST_TIMEOUT)
response2.raise_for_status()
html2 = response2.text
content = _load_jsonp(html2)
data_list = content['API.CustomizedApi']['itemlist']['auctions']
DATA.extend(_parse_item(item) for item in data_list)

# --- Pagination: request the following pages, 44 items per page.
cookies = response2.cookies
for i in range(1, 10):
    ktsts = time.time()
    # The original sliced str(ktsts)[-3:], which can include the decimal
    # point (e.g. '0.5') and then crashes in int(); derive the three-digit
    # counter arithmetically instead.
    counter = int(ktsts * 1000000) % 1000
    _ksTS = '%s_%s' % (int(ktsts * 1000), counter)
    callback = 'jsonp%s' % (counter + 1)
    data_value = 44 * i
    url = ('https://s.taobao.com/search?data-key=s&data-value={}&ajax=true&_ksTS={}&callback={}&'
           'q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_'
           'id=staobaoz_20180415&ie=utf8&bcoffset=3&ntoffset=0&p4ppushleft=1%2C48').format(
               data_value, _ksTS, callback)
    response3 = requests.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response3.raise_for_status()
    html = response3.text

    data_list = _load_jsonp(html)['mods']['itemlist']['data']['auctions']
    DATA.extend(_parse_item(item) for item in data_list)

# Charting: tally shipping policy, shop type and seller region over all rows.
data1 = {'包邮': 0, '不包邮': 0}
data2 = {'天猫': 0, '淘宝': 0}
data3 = {}
for row in DATA:
    # 'view_fee' == 'no' marks an item whose shipping is NOT free.
    shipping_key = '不包邮' if row['view_fee'] == 'no' else '包邮'
    data1[shipping_key] += 1

    shop_key = '天猫' if row['isTmall'] == 'yes' else '淘宝'
    data2[shop_key] += 1

    # Only the part of the location before the first space is counted.
    region = row['area'].split(' ')[0]
    data3[region] = data3.get(region, 0) + 1
# print(data3)
# draw.pie(data1, '是否包邮')
# draw.pie(data2, '是否天猫')
draw.bar(data3, '地区分布')


# import matplotlib.pyplot as plt
# labels='frogs','hogs','dogs','logs'
# sizes=15,20,45,10
# colors='yellowgreen','gold','lightskyblue','lightcoral'
# explode=0,0.1,0,0
# plt.pie(sizes,explode=explode,labels=labels,colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
# plt.axis('equal')
# plt.show()



# Persist the collected rows to an Excel (.xls) workbook.
f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)

# Each column: (header text written to row 0, key of the DATA row it shows).
columns = (
    ('标题', 'title'),
    ('标价', 'view_price'),
    ('购买人数', 'view_sales'),
    ('是否包邮', 'view_fee'),
    ('是否天猫', 'isTmall'),
    ('地区', 'area'),
    ('店名', 'name'),
    ('url', 'detail_url'),
)

# Header row.
for col_no, (heading, _key) in enumerate(columns):
    sheet01.write(0, col_no, heading)

# One sheet row per scraped item, starting right below the header.
for row_no, record in enumerate(DATA, 1):
    for col_no, (_heading, key) in enumerate(columns):
        sheet01.write(row_no, col_no, record[key])

f.save(u'搜索python结果.xls')


















import numpy as np
import matplotlib.pyplot as plt

# Configure matplotlib for Chinese text: use the SimHei font so Chinese
# labels display correctly, and keep the minus sign rendering properly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Note from the original author: where Chinese text appears, write it as a
# u'...' literal.
def pie(data, img_name):
    """Draw a pie chart of ``data``, save it as ``<img_name>.png`` and show it.

    Args:
        data: mapping of category label to numeric value; one wedge is drawn
            per entry.
        img_name: file name (without extension) for the saved PNG.
    """
    fig = plt.figure(figsize=(8, 8))
    names = list(data.keys())
    values = list(data.values())
    ax1 = fig.add_subplot(111)
    ax1.set_title('饼图')
    labels = ['{}:{}'.format(name, value) for name, value in zip(names, values)]
    # Pull out only the second wedge, exactly as the original two-slice
    # layout [0, 0.1] did, but size the list to the data: matplotlib requires
    # len(explode) == len(values), so a fixed two-element list broke for any
    # other number of categories.
    explode = [0.1 if index == 1 else 0 for index in range(len(values))]
    ax1.pie(values, labels=labels, explode=explode, shadow=True, autopct='%1.2f%%')
    plt.savefig('%s.png' % img_name)
    plt.show()

def bar(data, img_name):
    """Draw a bar chart of ``data``, save it as ``<img_name>.png`` and show it.

    Args:
        data: mapping of category label to numeric value; one bar is drawn
            per entry, in the mapping's iteration order.
        img_name: file name (without extension) for the saved PNG.
    """
    fig = plt.figure(figsize=(12, 8))
    ax1 = fig.add_subplot(111)
    ax1.set_title('柱状图')
    cities = list(data.keys())
    values = list(data.values())
    plt.xlabel('城市')
    plt.ylabel('数量')
    # Pass the colours as an explicit list. The string 'rgb' relied on legacy
    # per-character parsing that newer matplotlib releases reject as an
    # invalid colour; a list is cycled across the bars in every version.
    plt.bar(range(len(cities)), values, color=['r', 'g', 'b'], tick_label=cities)
    plt.savefig('%s.png' % img_name)
    plt.show()


猜你喜欢

转载自blog.csdn.net/qq_38900441/article/details/79981884