# ===== main script: scrape Taobao search results for "python" =====
import requests
import re
import json
import time
import xlwt
import draw

# Accumulates one dict per scraped listing (see _extract_item for the keys).
DATA = []

# Seconds to wait for each HTTP response; without it requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

FIRST_PAGE_URL = 'https://s.taobao.com/search?q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180415&ie=utf8'

# Items of the first page that are loaded asynchronously (JSONP endpoint).
FIRST_PAGE_ASYNC_URL = 'https://s.taobao.com/api?_ksTS=1523758090700_266&callback=jsonp267&ajax=true&m=customized&stats_click=search_radio_all:1&q=python&s=36&imgfile=&initiative_id=staobaoz_20180415&bcoffset=-1&js=1&ie=utf8&rn=fa39f53d2fd39257eda01c2471e30967'

# Placeholders, in order: data-value (item offset), _ksTS, callback.
NEXT_PAGE_URL_TEMPLATE = (
    'https://s.taobao.com/search?data-key=s&data-value={}&ajax=true&_ksTS={}&callback={}&'
    'q=python&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_'
    'id=staobaoz_20180415&ie=utf8&bcoffset=3&ntoffset=0&p4ppushleft=1%2C48'
)

# Offset step used for data-value when paging (page i starts at 44 * i).
ITEMS_PER_PAGE = 44

# (column header, record key) pairs, in spreadsheet column order.
EXCEL_COLUMNS = [
    ('标题', 'title'),
    ('标价', 'view_price'),
    ('购买人数', 'view_sales'),
    ('是否包邮', 'view_fee'),
    ('是否天猫', 'isTmall'),
    ('地区', 'area'),
    ('店名', 'name'),
    ('url', 'detail_url'),
]


def _extract_item(item):
    """Reduce one raw auction dict to the fields this script reports.

    'view_fee' is 'yes' when the listed shipping fee is zero (free shipping)
    and 'no' otherwise; 'isTmall' is 'yes' when the shop card flags a Tmall
    shop.
    """
    return {
        'title': item['title'],
        'view_price': item['view_price'],
        'view_sales': item['view_sales'],
        'view_fee': 'no' if float(item['view_fee']) else 'yes',
        'isTmall': 'yes' if item['shopcard']['isTmall'] else 'no',
        'area': item['item_loc'],
        'name': item['nick'],
        'detail_url': item['detail_url'],
    }


def _parse_page_config(html):
    """Return the JSON object assigned to g_page_config in a search page.

    Raises:
        ValueError: if the page has no g_page_config assignment (for example
            when a login or verification page came back instead).
    """
    match = re.search(r' g_page_config = (.*)g_srp_loadCss', html, re.S)
    if match is None:
        raise ValueError('g_page_config not found in search page')
    # The captured JavaScript statement ends with ';', which is not JSON.
    return json.loads(match.group(1).strip().rstrip(';'))


def _parse_jsonp(text):
    """Return the JSON object wrapped inside a JSONP response body.

    Raises:
        ValueError: if the body contains no '{...}' payload.
    """
    # Greedy match: from the first '{' to the last '}' on the same line.
    match = re.search(r'{.*}', text)
    if match is None:
        raise ValueError('no JSON payload found in JSONP response')
    return json.loads(match.group(0))


def _summarize(records):
    """Count records by shipping, by platform and by province.

    Returns:
        A (shipping, platform, region) tuple of dicts mapping label -> count.
        The region key is the first space-separated token of 'area'.
    """
    shipping = {'包邮': 0, '不包邮': 0}
    platform = {'天猫': 0, '淘宝': 0}
    region = {}
    for record in records:
        shipping['不包邮' if record['view_fee'] == 'no' else '包邮'] += 1
        platform['天猫' if record['isTmall'] == 'yes' else '淘宝'] += 1
        province = record['area'].split(' ')[0]
        region[province] = region.get(province, 0) + 1
    return shipping, platform, region


def _save_excel(records, path):
    """Write the records to an .xls workbook: a header row, then one row each."""
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for col, (header, _) in enumerate(EXCEL_COLUMNS):
        sheet.write(0, col, header)
    for row, record in enumerate(records, start=1):
        for col, (_, key) in enumerate(EXCEL_COLUMNS):
            sheet.write(row, col, record[key])
    workbook.save(path)


def main():
    """Scrape the listings into DATA, plot the region chart, export to Excel."""
    # First page: the listings are embedded in the HTML as g_page_config.
    response = requests.get(FIRST_PAGE_URL, timeout=REQUEST_TIMEOUT)
    auctions = _parse_page_config(response.text)['mods']['itemlist']['data']['auctions']
    DATA.extend(_extract_item(item) for item in auctions)

    # Rest of the first page, loaded asynchronously; reuse the first
    # response's cookies for this request.
    response2 = requests.get(
        FIRST_PAGE_ASYNC_URL, cookies=response.cookies, timeout=REQUEST_TIMEOUT)
    auctions = _parse_jsonp(response2.text)['API.CustomizedApi']['itemlist']['auctions']
    DATA.extend(_extract_item(item) for item in auctions)

    # Following pages, all requested with the cookies of the async response.
    cookies = response2.cookies
    for page in range(1, 10):
        # Build _ksTS as "<millis>_<n>" and callback as "jsonp<n+1>".
        # n comes from the millisecond timestamp rather than from slicing
        # str(time.time()), whose last three characters can contain '.' and
        # then fail int().
        millis = int(time.time() * 1000)
        sequence = millis % 1000
        ks_ts = '%s_%s' % (millis, sequence)
        callback = 'jsonp%s' % (sequence + 1)
        url = NEXT_PAGE_URL_TEMPLATE.format(ITEMS_PER_PAGE * page, ks_ts, callback)
        response3 = requests.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        auctions = _parse_jsonp(response3.text)['mods']['itemlist']['data']['auctions']
        DATA.extend(_extract_item(item) for item in auctions)

    # Charts. Only the region bar chart is drawn; the shipping and platform
    # counts are available for draw.pie(shipping, '是否包邮') and
    # draw.pie(platform, '是否天猫') if those charts are wanted.
    shipping, platform, region = _summarize(DATA)
    draw.bar(region, '地区分布')

    # Persist everything as an Excel sheet.
    _save_excel(DATA, u'搜索python结果.xls')


if __name__ == '__main__':
    main()


# ===== plotting helpers (appears to be the separate module imported above
# as `draw` -- TODO confirm file layout) =====
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly


def pie(data, img_name):
    """Draw a pie chart of a label -> count mapping, save it and show it.

    Args:
        data: mapping of slice label to numeric value.
        img_name: output file name without extension; saved as '<img_name>.png'.
    """
    fig = plt.figure(figsize=(8, 8))
    cities = list(data.keys())
    values = list(data.values())
    ax1 = fig.add_subplot(111)
    ax1.set_title('饼图')
    labels = ['{}:{}'.format(city, value) for city, value in zip(cities, values)]
    # Pull out the second slice only. explode must have one entry per slice,
    # so size it to the data instead of hard-coding two entries.
    explode = [0.1 if index == 1 else 0 for index in range(len(values))]
    ax1.pie(values, labels=labels, explode=explode, shadow=True, autopct='%1.2f%%')
    plt.savefig('%s.png' % img_name)
    plt.show()


def bar(data, img_name):
    """Draw a bar chart of a label -> count mapping, save it and show it.

    Args:
        data: mapping of bar label to numeric value.
        img_name: output file name without extension; saved as '<img_name>.png'.
    """
    fig = plt.figure(figsize=(12, 8))
    ax1 = fig.add_subplot(111)
    ax1.set_title('柱状图')
    cities = list(data.keys())
    values = list(data.values())
    plt.xlabel('城市')
    plt.ylabel('数量')
    # Explicit colour list: the bare string 'rgb' is not accepted as a colour
    # sequence by current matplotlib.
    plt.bar(range(len(cities)), values, color=['r', 'g', 'b'], tick_label=cities)
    plt.savefig('%s.png' % img_name)
    plt.show()
python 爬取淘宝信息
猜你喜欢
转载自blog.csdn.net/qq_38900441/article/details/79981884
周排行