python:爬虫获取淘宝/天猫的商品信息

【需求】输入关键字,如书包,可以搜索出对应商品的信息,包括:商品标题、商品链接、价格范围;且最终的商品信息需要符合:包邮、价格差不会超过某数值

#coding=utf-8
"""
The following three settings can be adjusted freely:
search_keyword, page, price_interval_max
"""
# Keyword to search for.
search_keyword = "戒指"
# Number of result pages to crawl; e.g. 10 means the products on the first
# 10 pages of the Taobao search results (Taobao shows 44 products per page
# by default).
page = 10
# Largest acceptable difference between a product's highest and lowest price.
price_interval_max = 1000

import re, os, requests, sys, time, shutil
from selenium import webdriver
from lxml import etree
from xlrd import open_workbook
from xlutils.copy import copy
# Python 2 only: switch the implicit str<->unicode codec to UTF-8.
# Presumably needed because byte-string literals containing Chinese text are
# concatenated with scraped text further down -- confirm before removing.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Start timestamp, used at the end of the script to report the elapsed time.
time1 = time.time()
# PhantomJS binary located in the current working directory. Only referenced
# by the commented-out alternative below; built with os.path.join so the
# path separator is not lost.
phantomjs_path = os.path.join(os.getcwd(), "phantomjs.exe")
driver=webdriver.PhantomJS(executable_path='D:/Python27/Scripts/phantomjs.exe')
# driver=webdriver.PhantomJS(executable_path=phantomjs_path)
search_url = 'https://s.taobao.com/search'
payload = {'q':search_keyword, 's':'1', 'ie':'utf8'}  # URL query parameters passed as a dict
payload1 = {'ie':'utf8'}
# result.xls is the template workbook; tb_result.xls is the file this run fills in.
excel_path_ori = os.path.join(os.getcwd(), "result.xls")
excel_path = os.path.join(os.getcwd(), "tb_result.xls")
# Always start from a fresh copy of the template, discarding any previous result.
if os.path.exists(excel_path):
    os.remove(excel_path)
shutil.copy(excel_path_ori, excel_path)
# Plain-text dump of every product that was processed successfully.
file = open('taobao_test.txt', 'w')

# Worksheet that receives the filtered products, and the column index of each field.
sheetName = "Sheet1"
url_lineindex = 0
title_lineindex = 1
price_lineindex = 2
price_interval_lineindex = 3
interval_lineindex = 4
fee_lineindex = 5

def Write_Excel(rowIndex, lineIndex, content):
    """
    Write ``content`` into a single cell of the result workbook.

    The workbook at ``excel_path`` is read, copied into a writable workbook,
    modified and saved back to the same path on every call.

    - rowIndex: row of the target cell (converted with int())
    - lineIndex: column of the target cell (converted with int())
    - content: value to store in the cell
    """
    rowIndex = int(rowIndex)
    lineIndex = int(lineIndex)
    # open_workbook takes no file-mode argument (its second positional
    # parameter is the log file), so only the path is passed, and the
    # workbook is read once instead of twice.
    rbook = open_workbook(excel_path)
    wb = copy(rbook)
    sheetIndex = rbook.sheet_names().index(sheetName)
    wb.get_sheet(sheetIndex).write(rowIndex, lineIndex, content)
    wb.save(excel_path)

def get_detail_price(url):
    """
    Load a product detail page in the shared browser and extract its price text.

    :param url: detail-page URL; whether it contains "tmall" or "taobao"
                decides which XPath expression is used
    :return: list of text nodes matched by the price XPath; an empty list
             when the URL contains neither "tmall" nor "taobao"
    """
    driver.get(url)
    # Fixed pause before reading the page source -- presumably to let the
    # page's scripts render the price; confirm before tuning.
    time.sleep(1)
    html=driver.page_source
    selector=etree.HTML(html)
    if "tmall" in url:
        price_xpath = '//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()'
    elif "taobao" in url:
        price_xpath = '//em[@class="tb-rmb-num"]/text()'
    else:
        # Unknown host: previously this path hit an unbound local variable.
        return []
    return selector.xpath(price_xpath)

def get_price_interval(price):
    """
    Derive the price range text and the price spread of a product.

    Some products are priced as a range such as "12.00-25.00"; others have a
    single price.

    :param price: non-empty sequence whose first item is the price text
    :return: tuple ``(price_interval, interval)`` where ``price_interval`` is
             the price text with surrounding whitespace removed and
             ``interval`` is the upper bound minus the lower bound, or 0 when
             the text is not a range
    :raises IndexError: if ``price`` is empty
    :raises ValueError: if a bound of the range is not a valid number
    """
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(price)
    # Scraped text nodes can carry surrounding whitespace; drop it so the
    # stored range text is clean.
    price_interval = ''.join(price[0]).strip()
    if "-" in price_interval:
        bounds = price_interval.split("-")
        interval = float(bounds[1]) - float(bounds[0])
    else:
        interval = 0
    return price_interval, interval

def get_url_test():
    """
    Crawl the search result pages and record every product found.

    For each of the first ``page`` result pages the title, listed price,
    detail URL and shipping fee are pulled out of the response text with
    regular expressions. Each product's detail page is then loaded to get its
    price range and spread. Every successfully processed product is appended
    to the text dump; products with a shipping fee of "0.00" and a spread
    below ``price_interval_max`` are also written to the Excel workbook.

    :return: None
    """
    j = 0   # index of the last Excel row written; row 0 holds the header
    header_cells = (
        (url_lineindex, u"商品链接"),
        (title_lineindex, u"商品标题"),
        (price_lineindex, u"最低价格"),
        (price_interval_lineindex, u"价格范围"),
        (interval_lineindex, u"价格差"),
        (fee_lineindex, u"运费"),
    )
    for column, caption in header_cells:
        Write_Excel(j, column, caption)

    for k in range(0, page):        # one iteration per result page
        # The "s" parameter selects the page: 1 is the first page, 45 the
        # second, 89 the third, and so on.
        payload['s'] = 44 * k + 1
        resp = requests.get(search_url, params=payload)
        # Collect each field of every product on the page.
        title = re.findall(r'"raw_title":"([^"]+)"', resp.text, re.I)
        price = re.findall(r'"view_price":"([^"]+)"', resp.text, re.I)
        url = re.findall(r'"detail_url":"([^"]+)"', resp.text, re.I)
        fee = re.findall(r'"view_fee":"([^"]+)"', resp.text, re.I)
        # Bound the loop by the shortest list so that a field missing from
        # some product cannot raise IndexError and abort the whole crawl.
        x = min(len(title), len(price), len(url), len(fee))

        for i in range(0, x):
            print(i)
            print('商品标题:' + title[i])
            print('最低价格:' + price[i])
            print('运费:' + fee[i])
            # Build the product link: undo the escaped "=" and "&" (raw
            # strings keep the six-character escape text literal) and add the
            # scheme.
            url[i] = url[i].replace(r"\u003d","=").replace(r"\u0026","&")
            url[i] = "https:" + url[i]
            print('商品链接:' + url[i])
            # Fetch the price range from the detail page.
            try:
                detail_price = get_detail_price(url[i])
                data = get_price_interval(detail_price)
                price_interval = data[0]
                interval = data[1]
                print('price_interval:' + price_interval)
                print('interval:' + str(interval))
                # Append the product to the text dump.
                file.write(
                    str(k * 44 + i + 1) +
                    '商品链接:' + url[i] + '\n' +
                    '商品标题:' + title[i] + '\n' +
                    '最低价格:' + price[i] + '\n' +
                    '价格范围:' + str(price_interval) + '\n' +
                    '价格差:' + str(interval) + '\n' )
                # Write only the products passing the filter to the workbook.
                if fee[i] == "0.00" and interval < int(price_interval_max):
                    print("该商品符合要求:包邮,且最大价格与最小价格差小于%s" % price_interval_max)
                    j = j + 1
                    Write_Excel(j, url_lineindex, url[i])
                    Write_Excel(j, title_lineindex, title[i])
                    Write_Excel(j, price_lineindex, price[i])
                    Write_Excel(j, price_interval_lineindex, price_interval)
                    Write_Excel(j, interval_lineindex, interval)
                    Write_Excel(j, fee_lineindex, fee[i])
            except Exception as err:
                # Best effort: skip this product but keep Ctrl-C working and
                # show why it failed.
                print("该商品信息获取失败,跳过")
                print(repr(err))
                continue


try:
    get_url_test()
finally:
    # Restore the environment even when the crawl fails: close the text dump
    # and shut the browser down.
    file.close()
    driver.quit()
    # Kept from the original script to kill any PhantomJS process still running.
    os.system("taskkill /im phantomjs.exe")
time2 = time.time()
print(u'ok,结束!')
print(u'总共耗时:' + str((time2 - time1)/60) + '分钟')

猜你喜欢

转载自www.cnblogs.com/channy14/p/9266979.html