Crawling Taobao product price and sales information with Python

This script uses the requests library and the re library.

import json
import re

import requests
# Shared accumulator for the scraped rows; each entry is [title, price, sales].
lis = list()
# Example search URLs; the `s` query parameter is 44 and 88 on the second and
# third URL, matching the 44-per-page offset used when building request URLs.
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306
# https://s.taobao.com/search?q=荣耀v20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=荣耀V20&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=88
# HTTP headers attached to every page request.
kv = dict()
kv["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
# Placeholder text ("replace with your own Cookie"): substitute your own
# Cookie value here before running.
kv["Cookie"] = '换成自己的Cookie'

# Fetch an HTML page
def getHTMLpages(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``kv`` headers and a 30 second
    timeout.  The body is decoded with the encoding detected from its content
    (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Best effort: any failure yields an empty page so the caller can
        # carry on with the next one.  Unlike a bare ``except:``, this still
        # lets KeyboardInterrupt and SystemExit propagate, so a running crawl
        # can be aborted.
        return ""


# Store the product information in the list
def _decode_title(quoted):
    """Return the text of a double-quoted string token taken from the page."""
    try:
        # json.loads resolves escapes such as \u003d without executing
        # anything.
        return json.loads(quoted)
    except ValueError:
        # Not a well-formed JSON string: keep the raw text between the quotes.
        return quoted[1:-1]


def getGoodsinfo(lis, html):
    """Extract title, price and sales figures from *html* and append them.

    The page text is searched for fields of these forms:

        "raw_title":"honor/荣耀 荣耀play 微商手机 营销手机 小V  不凡 霸屏推"
        "view_price":"6280.00"
        "view_sales":"7人付款", "2.0万+人付款" or "6000+人付款"

    Args:
        lis: List that receives one ``[title, price, sales]`` entry (all
            strings) per product; it is modified in place.
        html: Page text to search.

    Returns:
        None.
    """
    # `.*?` is a non-greedy match; the capture group keeps the surrounding
    # quotes so the token can be decoded as a JSON string.
    titles = re.findall(r'"raw_title":(".*?")', html)
    # [\d.]* matches any run of digits and dots.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # [\u4e00-\u9fa5] matches one Chinese character (e.g. the unit in "2.0万+").
    sales = re.findall(r'"view_sales":"([\d.]*[\u4e00-\u9fa5]?\+?人付款)"', html)
    # The three result lists are paired by position.  zip() stops at the
    # shortest one, so a field missing from the page cannot raise IndexError.
    for tit, pri, sal in zip(titles, prices, sales):
        lis.append([_decode_title(tit), pri, sal])


# Print the product information held in the list
def printGoodsinfo(lis):
    """Print the collected products as a tab-separated table.

    A header line comes first, then an empty line, then one numbered line per
    entry of *lis* (numbering starts at 1).  Every printed table line ends
    with a comma, as dictated by the format string.

    Args:
        lis: Sequence of rows; the first three items of each row are printed
            as product name, price and sales.
    """
    row_fmt = "{:^2}\t{:<50}\t{:>8}\t{:>8},"
    header = row_fmt.format("序号", "商品名称", "商品价格", "商品销量")
    print(header)
    print()
    for number, row in enumerate(lis, start=1):
        print(row_fmt.format(number, *row[:3]))


if __name__ == "__main__":
    # Debugging tip: instead of crawling, a previously saved page can be read
    # from a text file (opened with encoding='utf-8') and passed straight to
    # getGoodsinfo(lis, <file contents>).
    s = input("请输入要查询的商品名称:")
    start_url = "https://s.taobao.com/search?q=" + s
    print(start_url)
    # int() rather than eval(): the page count is typed by the user and must
    # be parsed as a number, never executed as Python code.
    count = int(input("请输入要查询的页数:"))
    for i in range(count):
        # The `s` query parameter advances by 44 for each further page.
        url = start_url + '&s=' + str(i * 44)
        html = getHTMLpages(url)
        getGoodsinfo(lis, html)
    printGoodsinfo(lis)

Result of running the program:

Note: If you crawl repeatedly with the same cookie and the same IP address, Taobao's anti-crawling protection kicks in and asks for a slider verification. If you do not solve the slider verification, the crawl returns no data, because the page you receive is the slider verification page rather than the search results. My approach is to save a successfully crawled page to a text file and, while debugging, read the page source directly from that file, so there is no need to crawl repeatedly. The code posted above is the final version after debugging.

Published 143 original articles · won praise 78 · views 40000 +

Guess you like

Origin blog.csdn.net/KK_2018/article/details/104107778