Python 2.7 爬虫实现:将淘宝指定商品信息输出到 Excel 中进行比价

#encoding=utf-8
import requests
import re
from bs4 import BeautifulSoup
import json
import xlwt
import xlrd

# Module-level accumulator: one dict per scraped listing. It is filled by the
# three fetch stages below and read later by the spreadsheet export.
DATA = []

goods = raw_input('请输入您想要进行比价的商品名称(如:果冻包)\n>>>')

_SEARCH_URL = 'https://s.taobao.com/search'
_API_URL = 'https://s.taobao.com/api'
_TIMEOUT = 30   # seconds, applied to every request
_PAGE_SIZE = 44  # step of the `s` offset parameter between search pages


def _search_params(keyword, offset):
    """Return the query parameters for one search-result page.

    A list of pairs (rather than a dict) keeps the parameter order stable.
    Values are passed un-encoded; requests URL-encodes them, so a keyword
    containing '&', '#', spaces or non-ASCII bytes cannot corrupt the query.
    """
    return [
        ('q', keyword),
        ('bcoffset', '6'),
        ('ntoffset', '6'),
        ('p4ppushleft', '1,48'),
        ('s', str(offset)),
    ]


def _fetch(url, params, cookies=None):
    """GET *url* with *params* and optional *cookies*; raise on HTTP errors."""
    response = requests.get(url, params=params, cookies=cookies,
                            timeout=_TIMEOUT)
    response.raise_for_status()
    return response


def _load_page_config(html):
    """Decode the ``g_page_config`` JSON object embedded in a search page.

    Raises RuntimeError when the marker is absent, instead of the bare
    IndexError an unchecked ``findall(...)[0]`` would produce.
    """
    match = re.search(r'g_page_config = (.+?)g_srp_loadCss', html, re.S)
    if match is None:
        raise RuntimeError(
            'g_page_config not found in the search page; the site may have '
            'returned a login or verification page instead of results')
    # Drop the last character of the stripped capture before decoding
    # (presumably the JavaScript statement's trailing ';').
    return json.loads(match.group(1).strip()[:-1])


def _load_jsonp(text):
    """Decode the JSON object wrapped inside a JSONP response body."""
    match = re.search(r'{.+}', text)
    if match is None:
        raise RuntimeError('no JSON object found in the JSONP response')
    return json.loads(match.group(0))


def _to_record(item):
    """Map one raw auction entry to the flat record stored in DATA."""
    return {
        'raw_title': item['raw_title'],
        'view_price': item['view_price'],
        'view_sales': item['view_sales'],
        # Free-shipping flag: a non-zero fee maps to '否', a zero fee to '是'.
        'view_fee': '否' if float(item['view_fee']) else '是',
        'isTmall': '是' if item['shopcard']['isTmall'] else '否',
        'view_loc': item['item_loc'],
        'name': item['nick'],
        'detail_url': item['detail_url']
    }


# Stage 1: first search page; the listings are embedded in the HTML.
r = _fetch(_SEARCH_URL, _search_params(goods, 0))
content = _load_page_config(r.text)
dataList = content['mods']['itemlist']['data']['auctions']
DATA.extend(_to_record(item) for item in dataList)

# Stage 2: follow-up JSONP call, reusing the cookies of the first response.
# Per the original author's note this returns the remaining 12 listings of
# the first page. The _ksTS / callback / rn values are fixed literals,
# presumably captured from a browser session -- TODO confirm they stay valid.
r2 = _fetch(
    _API_URL,
    [
        ('_ksTS', '1531540228441_814'),
        ('callback', 'jsonp815'),
        ('ajax', 'true'),
        ('m', 'customized'),
        ('q', goods),
        ('ntoffset', '9'),
        ('p4ppushleft', '1,48'),
        ('s', '36'),
        ('bcoffset', '-1'),
        ('rn', 'fb9f089092fbceed410248ff5e71d997'),
    ],
    cookies=r.cookies,
)
content = _load_jsonp(r2.text)
dataList = content['API.CustomizedApi']['itemlist']['auctions']
DATA.extend(_to_record(item) for item in dataList)
print(len(DATA))

# Stage 3: subsequent search pages, reusing the cookies of the JSONP response.
# Per the original author's note, later pages carry no asynchronously loaded
# listings, so only the HTML page is fetched for them.
cookies = r2.cookies
for page in range(1, 2):  # currently fetches only the second page
    r3 = _fetch(_SEARCH_URL, _search_params(goods, page * _PAGE_SIZE),
                cookies=cookies)
    content = _load_page_config(r3.text)
    dataList = content['mods']['itemlist']['data']['auctions']
    DATA.extend(_to_record(item) for item in dataList)
    print(len(DATA))

print(len(DATA))

#画图
#未实现

# Persist the scraped listings in DATA to an .xls workbook in the current
# working directory.
f = xlwt.Workbook(encoding='utf-8')

# Cell style that selects the SimSun font; it is passed to every write()
# below (previously it was built but never applied).
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font

# The sheet name (including its spelling) is kept exactly as before.
worksheet = f.add_sheet('my_firt_xlwt', cell_overwrite_ok=False)

# (header label, key in each DATA record), listed in column order.
_COLUMNS = (
    ('标题', 'raw_title'),
    ('标价', 'view_price'),
    ('购买人数', 'view_sales'),
    ('是否包邮', 'view_fee'),
    ('是否天猫', 'isTmall'),
    ('地区', 'view_loc'),
    ('店名', 'name'),
    ('url', 'detail_url'),
)

# Header row.
for col, (header, _key) in enumerate(_COLUMNS):
    worksheet.write(0, col, header, style)

# One row per record, starting directly below the header.
for row, record in enumerate(DATA, 1):
    for col, (_header, key) in enumerate(_COLUMNS):
        worksheet.write(row, col, record[key], style)

# raw_input() returns a byte string on Python 2; formatting it straight into
# a unicode template triggers an implicit ASCII decode that fails for any
# non-ASCII keyword. Decode explicitly first (UTF-8, matching the decode used
# when the file is read back).
_goods_text = goods.decode('utf-8') if isinstance(goods, bytes) else goods
f.save(u'the result of search {}.xls'.format(_goods_text))


# Read the workbook back and print a truncated preview of every data row.

# Rebuild the same relative file name that the save step uses, so the file is
# found in whatever directory the script runs from (a hard-coded absolute
# path only works on one machine). The keyword is decoded to text first so
# the resulting path is a unicode string.
_goods_text = goods.decode('utf-8') if isinstance(goods, bytes) else goods
file_path = u'the result of search {}.xls'.format(_goods_text)

data = xlrd.open_workbook(file_path)
# First (and only) sheet; it could also be fetched with sheet_by_name().
table = data.sheets()[0]
nrows = table.nrows  # total number of rows, header included
ncols = table.ncols  # total number of columns

_MAX_CELL_CHARS = 20  # longer cell texts are cut and suffixed with '...'

# Row 0 holds the headers, so the preview starts at row 1.
for row in range(1, nrows):
    cells = []
    for col in range(ncols):
        # Equivalent lookups: table.row_values(row)[col] and
        # table.col_values(col)[row].
        cell_value = table.cell(row, col).value
        # xlrd yields non-text values (e.g. floats) for non-text cells;
        # convert them so len() and slicing below are always valid.
        if not isinstance(cell_value, type(u'')):
            cell_value = u'{}'.format(cell_value)
        if len(cell_value) > _MAX_CELL_CHARS:
            cell_value = cell_value[:_MAX_CELL_CHARS] + u'...'
        cells.append(cell_value + u'\t\t')
    print((u'%d行的值为: ' % row) + u''.join(cells))

猜你喜欢

转载自blog.csdn.net/qq_32670879/article/details/81903101