Crawling Taobao product information (while logged in)

Thanks to the linked guide on logging in to Taobao for the help.
The crawl now succeeds; the source code follows:

# 目标:获取淘宝搜索页面的信息 提取其中的商品名称和价格
# 理解:1、淘宝的搜索接口 2、翻页处理
# 技术路线:requests re
# http://s.taobao.com/search?q=书包&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
# https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20170105&ie=utf8&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
# 程序的结构设计
# 步骤1:提交商品搜索请求,循环获取页面
# 步骤2:对于每个页面,提取商品名称和价格信息
# 步骤3:将信息输出到屏幕上

import requests
import re


def getHtmlText(url):
    """Download ``url`` with browser-like headers and return the page text.

    The request replays a fixed set of headers, including a session cookie,
    that look like they were copied from a logged-in Chrome session
    (presumably Taobao's search page needs them to return results --
    TODO confirm).

    Args:
        url: Full URL of the page to download.

    Returns:
        The decoded response body, or an empty string if the request fails
        (connection error, timeout, or non-2xx status).
    """
    # NOTE(review): the cookie below is a hard-coded personal session
    # credential. It will expire and should not live in source control;
    # consider loading it from an environment variable or a config file.
    headers = {
        'authority': 's.taobao.com',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': 'https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200312&ie=utf8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'miid=253146491399089190; thw=cn; cna=o391FTBhsgUCAdrD5AnR+QjC; hng=CN%7Czh-CN%7CCNY%7C156; tracknick=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; tg=0; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; t=a7362f9a33953472d0663cbb296a53fd; enc=jXmwR%2BzLAdHsQMP0d%2F9pId1Dn%2BW%2FtphblF4%2FsFXBDkrBKBLkGD4tksHXhn5%2BPTVPEfnLekpkIzYNZENT3sCchA%3D%3D; _samesite_flag_=true; cookie2=1bb1c10d29aca83180c0390f0d1fb1fc; _tb_token_=33b66b43e3335; sgcookie=EBnq%2FRUJ6VqRYs7GbgSt0; unb=2269301708; uc3=id2=UUpngTzc2Y13gg%3D%3D&lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dBxd7CT335hOeshsM%3D&nk2=o688bJ2t2lOAtK8MXRkc9rS1xdZODw%3D%3D; csg=bb101fa2; lgc=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; cookie17=UUpngTzc2Y13gg%3D%3D; dnk=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; skt=d1b053176dd80fc6; existShop=MTU4Mzk4NzY5Ng%3D%3D; uc4=id4=0%40U2gtEEjyK2oSMoVT0y3FwsMQmuba&nk4=0%40ofkQgMIYV1w9DtinNnKgQnwEY%2B9mIA3bhXzykl%2Bh56ME; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=%E9%A2%968e; _nk_=%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu5566%5Cu9896; cookie1=UNcJvhnwxmsnc%2BQXwkd900lS4%2BSdsCS20k5pE5xYJ2Q%3D; alitrackid=login.taobao.com; lastalitrackid=login.taobao.com; tfstk=cFMhBQjc8XPBkECPG6wIVOcoz-9AZZIURSFZQvf4ybSgh8kNiq5Nge_NRPune_1..; mt=ci=20_1; v=0; Hm_lvt_eaa57ca47dacb4ad4f5a257001a3457c=1583980126,1583987707; uc1=cookie16=Vq8l%2BKCLySLZMFWHxqs8fwqnEw%3D%3D&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=VFC%2FuZ9ayeYq2g%3D%3D&existShop=false&pas=0&cookie14=UoTUOafM2gaxgg%3D%3D&tag=8&lng=zh_CN; JSESSIONID=AF10B48836FA6F017CE4F6D93629F9C4; Hm_lpvt_eaa57ca47dacb4ad4f5a257001a3457c=1583989886; isg=BM3NGh5pr8U80wicTAKS00EO3OlHqgF8Ot6V2g9TpGTZBuy41_jeTHTUcJpgnhk0; l=dBEklFpHqPrWkOUQBOfNqASSiU_ONIdb8SFy7q0UFICPO7CHlfwOWZq1YKLMCnGVH6kWR3Rp9jjQBqLO1yCrnxv9-3k_J_DmndC..',
    }
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # Let requests guess the charset from the body instead of trusting
        # the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are expected here; anything else is a
        # programming error and should surface.
        print("爬取失败")
        return ""

def parsePage(ilist, html):
    """Extract product titles and prices from a search page and append them.

    The page source is scanned for ``"view_price":"<number>"`` and
    ``"raw_title":"<text>"`` fields. Matches are paired in order of
    appearance and each pair is appended to ``ilist`` as ``[title, price]``.
    If the two fields occur a different number of times, the surplus
    matches are ignored.

    Args:
        ilist: Result list, extended in place.
        html: Page source as a string. Any other value (for example
            ``None`` from a failed download) is reported as a parse error
            and leaves ``ilist`` untouched.
    """
    if not isinstance(html, str):
        print("解析出错")
        return

    # The dot is escaped so only real decimal numbers match, and the value is
    # captured directly so it can go through float() rather than eval(),
    # which must never run on text scraped from the network.
    prices = re.findall(r'"view_price":"(\d+(?:\.\d*)?)"', html)
    # Non-greedy match: stop at the first closing quote after the title.
    titles = re.findall(r'"raw_title":"(.*?)"', html)

    for title, price in zip(titles, prices):
        ilist.append([title, float(price)])


def printGoodsList(ilist, num):
    """Print the leading entries of ``ilist`` as a numbered table.

    A separator line and a header row are printed first, then one row per
    entry (1-based index, ``entry[0]`` as the name, ``entry[1]`` as the
    price) for at most ``num`` entries, then a closing separator line.

    Args:
        ilist: Sequence of ``[title, price]`` entries.
        num: Maximum number of rows to print.
    """
    row_format = "{0:<3}\t{1:<30}\t{2:>6}"

    print("=====================================================================================================")
    print(row_format.format("序号", "商品名称", "价格"))
    for position, entry in enumerate(ilist, start=1):
        if position <= num:
            print(row_format.format(position, entry[0], entry[1]))
        else:
            # Positions only grow, so no later entry can qualify.
            break
    print("=====================================================================================================")


def main():
    """Crawl Taobao search results for a fixed keyword and print a table.

    Fetches ``depth`` result pages for the keyword, collects the
    ``[title, price]`` pairs found on each page, and prints the first
    ``num`` of them.
    """
    goods = "书包"
    depth = 2   # number of result pages to fetch
    num = 20    # number of rows to print
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []

    for i in range(depth):
        # The page offset is the ``s`` query parameter; the sample URLs in
        # the header comment step it by 44 per page. It has to be joined
        # with "&", otherwise it becomes part of the search keyword.
        url = start_url + "&s=" + str(44 * i)
        try:
            html = getHtmlText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: a bad page should not abort the whole crawl,
            # but the failure is reported instead of silently dropped.
            print("第{}页处理失败: {}".format(i + 1, exc))
            continue

    printGoodsList(infoList, num)


# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger network requests.
if __name__ == "__main__":
    main()

The results are:

No. Product Name Price
1 backpack shoulder bag custom printed logo design company custom gift customized 128.0
2 custom printed logo backpack shoulder bag casual and simple portable bag customized 118.0
3 Business backpack shoulder bag men working business computer bag 49.0 custom logo
4 kk tree schoolbag pupils Girl children 6-12 years of age one hundred twenty-three to six-grade girls' shoulder bags spinal care burdens 119.0
5 kk tree bag boy pupils 1--3--4--5 grade children backpack shoulder bag girls 6-12 years of age Guards 119.0
6 JanSport flagship store Jess Peter shoulder bag computer bag schoolbag men and women leather soled 448.0 TYP7
7 Kipling handbags large capacity canvas bag campus casual bags fashion simple shoulder bag | CAYENNE S 749.0
8 Ma Shalan Ti schoolbag female Korean tidal 2019 new high-capacity shoulder bag lady fashion wild theft bag 299.0
9 JanSport official website flagship store in Jasper shoulder bag men and women 15 inch computer bag schoolbag fashion 448.0 leather TYP7
10 Dickies2019 brand new wave of female to male high-capacity portable shoulder bag backpack schoolbag campus 189.0 S014
11 Dickies2020 new wave of brand shoulder bag large capacity Lesbian portable backpack schoolbag campus 219.0 S018
12 primary school girls schoolbag 1--3--4--6-grade boy burdens spinal care Children will be large-capacity shoulder bag 99.0
13 CAMS suspended burdens schoolbag male high school students junior high schoolgirl backpack shoulder bag large capacity weight loss 398.0
14 OMI Omi shoulder bag 2020 new wild female Korean fashion Oxford College Wind satchel bag 459.0
15 Caran 1--4 Grade-one can open a good cleaning easy to incorporate care burdens schoolbag ridge 239.0
16 Caran junior high school female students bag large capacity shoulder bag Korean high school students lightweight backpack 109.0
17 sunearth shoulder bag women Rakuten Super fire backpack schoolbag male college students away from home alleno package 158.0
18 North bag shoulder bag female 2019 new backpack College Wind large capacity bag cute female students travel bag tide 75.0
19 business men shoulder bag Korean wave of minimalist backpack computer bag casual female high school students travel bag bag fashion 199.0
20 Japanese CILOCALA shoulder bag transparent jelly bag female high school students travel bag travel backpack cloth 330.0

Published 15 original articles · won praise 5 · Views 7642

Guess you like

Origin blog.csdn.net/qq_39926861/article/details/104825109