第一个淘宝搜索爬虫

在上次的基础上，自己摸索着调试出了对淘宝商品的搜索和整理功能，其中主要使用了正则表达式，基本代码如下：

# CrowTaobaoPrice
import json
import re
import traceback

import requests


def getHtmlText(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # guessed from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text


def _extractField(html, key, valuePattern=r'(?:[^"\\\n]|\\.)*'):
    """Return the decoded value of every ``"key":"value"`` pair in ``html``."""
    pattern = '"' + re.escape(key) + '":"(' + valuePattern + ')"'
    values = []
    for raw in re.findall(pattern, html):
        try:
            # The page embeds JSON, so decode the string literal as JSON
            # instead of eval()-ing text that came from the network.
            values.append(json.loads('"' + raw + '"'))
        except ValueError:
            # Not a valid JSON string body: keep the raw text rather than
            # dropping the item.
            values.append(raw)
    return values


def parsePage(html):
    """Extract product records from the text of a search result page.

    The page is scanned for ``"raw_title"``, ``"view_price"``, ``"nick"`` and
    ``"item_loc"`` string pairs; the n-th match of each key is combined into
    the n-th record.

    Args:
        html: Page source as ``str``.

    Returns:
        A list of ``[name, price, seller, location]`` lists (all strings).
        Always a list: it is empty when nothing matches or ``html`` is not a
        string. If the four keys occur a different number of times, only the
        records for which all four values exist are returned.
    """
    if not isinstance(html, str):
        print("解析出错了")
        return []

    names = _extractField(html, "raw_title")
    # Prices are restricted to digits and dots, as in the original pattern.
    prices = _extractField(html, "view_price", r'[\d.]*')
    sellers = _extractField(html, "nick")
    locations = _extractField(html, "item_loc")

    if not len(names) == len(prices) == len(sellers) == len(locations):
        # Report the inconsistency but still return what could be paired up.
        print("解析出错了")

    # zip() stops at the shortest list, so a missing field can no longer
    # raise IndexError and discard the whole page.
    return [list(row) for row in zip(names, prices, sellers, locations)]


def printGoodsList(goodsList):
    """Print a header line followed by one numbered line per product.

    Args:
        goodsList: List of pages, where each page is a list of
            ``[name, price, seller, location]`` records. Pages that are
            ``None`` or empty (e.g. a page that failed to parse) are skipped.
    """
    print("序号", "商品名称", "价格", "商家", "所在地", chr(12288))
    # Flatten all pages into one stream so numbering runs across pages.
    rows = (item for page in goodsList if page for item in page)
    for count, item in enumerate(rows, start=1):
        print(count, *item[:4])


def main():
    """Search Taobao for a fixed keyword and print the products found.

    Fetches ``searchdepth`` result pages, parses each one with ``parsePage``
    and prints the combined list with ``printGoodsList``.
    """
    goodsName = "cat男鞋"
    searchdepth = 2
    baseUrl = "https://s.taobao.com/search?q=" + goodsName
    img_0 = '&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180513&ie=utf8&bcoffset=3&ntoffset=0&p4ppushleft=1%2C48&s='
    gList = []
    for i in range(searchdepth):
        # Build every page URL from the unchanged base. Appending to the
        # previous URL would stack a second query string (and a second
        # ``s=`` offset) onto each later request.
        # The ``s`` offset advances by 44 per page (presumably the number of
        # items per result page).
        url = baseUrl + img_0 + str(44 * i)
        html = getHtmlText(url)
        gList.append(parsePage(html))
    printGoodsList(gList)


if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Catch ordinary errors only, so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still terminate the script instead of being reported
        # as a scraping failure.
        print("出现错误")
        traceback.print_exc()

猜你喜欢

转载自blog.csdn.net/kevinqt/article/details/80300313