Python爬取淘宝商品价格销量信息

用到了requests库和re库 

import requests
import re
# Module-level accumulator for scraped rows; getGoodsinfo appends
# [title, price, sales] lists to it and printGoodsinfo reads it back.
lis = []
# Example search URLs for consecutive result pages. The first has no "s"
# parameter; the following ones end in s=44 and s=88.
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=荣耀V20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=88
# HTTP request headers sent with every page fetch (passed as headers=kv).
kv = {
    "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    # Placeholder value ("replace with your own Cookie"): substitute the
    # Cookie header of a logged-in browser session before running.
    "Cookie":
        '换成自己的Cookie'
}

# Fetch an HTML page
def getHTMLpages(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``kv`` headers and a
    30-second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, HTTP error status, ...).
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into an exception
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only request failures are mapped to "".
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit
        # and hide programming errors.
        return ""


# Store product information in the list
def _field_value(pair):
    """Return the decoded value of a matched ``"key":"value"`` fragment."""
    import json  # local import so the helper does not rely on file-level imports

    # Split on the first colon only: the value (e.g. a title) may itself
    # contain ':' characters.
    quoted = pair.split(':', 1)[1]
    try:
        # The value is a JSON string literal; json.loads decodes escapes
        # without executing scraped text the way eval() would.
        return json.loads(quoted)
    except ValueError:
        # Not valid JSON (e.g. an unusual escape): fall back to the raw
        # text between the surrounding quotes.
        return quoted[1:-1]


def getGoodsinfo(lis, html):
    """Extract title, price and sales of every product found in ``html``.

    The page embeds fragments such as::

        "raw_title":"honor/荣耀 荣耀play ..."
        "view_price":"6280.00"
        "view_sales":"7人付款"   (also "2.0万+人付款" or "6000+人付款")

    Args:
        lis: List that receives one ``[title, price, sales]`` entry per
            product; it is modified in place.
        html: Page source to scan.

    Returns:
        None. Results are appended to ``lis``.
    """
    # .*? is a non-greedy match of any characters up to the next quote.
    titles = re.findall(r'\"raw_title\"\:\".*?\"', html)
    # [\d.]* matches any run of digits and dots.
    prices = re.findall(r'\"view_price\"\:\"[\d.]*\"', html)
    # [\u4e00-\u9fa5] matches one CJK character (e.g. the unit "万").
    sales = re.findall(r'\"view_sales\"\:\"[\d\.]*[\u4e00-\u9fa5]?\+?人付款\"',
                       html)
    # zip() stops at the shortest list, so a page where one field is
    # missing for some item no longer raises IndexError.
    for title, price, sale in zip(titles, prices, sales):
        lis.append([_field_value(title), _field_value(price),
                    _field_value(sale)])


# Print the product information held in the list
def printGoodsinfo(lis):
    """Print the collected products as a tab-separated table.

    A header line and a blank line are written first, followed by one
    numbered line per entry (numbering starts at 1).

    Args:
        lis: Sequence of rows; elements 0, 1 and 2 of each row are shown
            as title, price and sales.
    """
    row_template = "{:^2}\t{:<50}\t{:>8}\t{:>8},"
    print(row_template.format("序号", "商品名称", "商品价格", "商品销量"))
    print()
    for index, row in enumerate(lis, start=1):
        print(row_template.format(index, row[0], row[1], row[2]))


if __name__ == "__main__":
    # Local import: only the script entry point needs URL quoting.
    from urllib.parse import quote

    # Debugging tip: save one fetched page to a text file and pass its
    # contents to getGoodsinfo() instead of requesting the site repeatedly.
    s = input("请输入要查询的商品名称:")
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    start_url = "https://s.taobao.com/search?q=" + quote(s)
    print(start_url)
    # int() rather than eval(): eval would execute whatever the user types.
    count = int(input("请输入要查询的页数:"))
    for i in range(count):
        # The "s" offset advances by 44 per page, matching the example
        # URLs (s=44, s=88) in the comments near the top of the script.
        url = start_url + '&s=' + str(i * 44)
        html = getHTMLpages(url)
        getGoodsinfo(lis, html)
    printGoodsinfo(lis)

运行结果:

注意:如果用同一个 Cookie 和 IP 反复爬取,会触发淘宝的反爬限制,要求进行滑动验证。此时如果不处理这个滑动验证,爬取到的数据就会为空,因为获取到的其实是滑动验证页面的 HTML。我的做法是把一次爬取下来的网页保存到一个文本文件里,调试代码时直接从该文件读取网页内容,这样就不需要反复爬取了。上面贴的代码是我调试后的最终代码。

发布了143 篇原创文章 · 获赞 78 · 访问量 4万+

猜你喜欢

转载自blog.csdn.net/KK_2018/article/details/104107778