Python爬虫学习笔记(实例:淘宝商品信息定向爬虫)

淘宝商品信息定向爬虫:先贴代码,后看解析。
#淘宝商品信息定向爬虫
import json
import re

import requests


def getHTMLText(url):
    """Fetch a page and return its decoded text.

    The request is sent with a fixed set of browser-like headers (including a
    logged-in session cookie) and query parameters copied from a real browser
    session.  The cookie and the ``nekot`` parameter belong to one specific
    login session, so they must be replaced with values from your own session
    before this will work.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with the encoding detected from its content,
        or an empty string when the request fails (connection error, timeout,
        or a non-2xx status code).
    """
    headers = {
        'authority': 'i.taobao.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        'sec-fetch-user': '?1',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://login.taobao.com/member/login.jhtml?from=taobaoindex&f=top&style=&sub=true&redirect_url=https%3A%2F%2Fi.taobao.com%2Fmy_taobao.htm%3Fspm%3Da21bo.2017.201864-1.1.2e8e11d9vo2Ast%26ad_id%3D%26am_id%3D%26cm_id%3D%26pm_id%3D1501036000a02c5c3739',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 't=6a14d922a5ccdf410cf61b8a7d676cff; cna=dGmbFhCoQXcCAXAgYmoPXQLr; lgc=%5Cu8FC7%5Cu5BA2ljd; tracknick=%5Cu8FC7%5Cu5BA2ljd; tg=0; mt=ci=2_1; thw=cn; v=0; cookie2=110991b8eefb85d5f8c765d284e787cd; _tb_token_=e3eaee3da6fe4; _samesite_flag_=true; dnk=%5Cu8FC7%5Cu5BA2ljd; enc=1B3F381ziwGsMftxhP8Fe6csX9dar6plw4P71IaQJ%2BwoL4FaQtO2LaG5xMP476EEK8tcF%2FsHesoU4T4SqIqCRw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; _mw_us_time_=1579680174094; unb=2567573738; uc1=lng=zh_CN&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&existShop=false&cookie14=UoTblAGu7t2ZtA%3D%3D&pas=0&cookie21=Vq8l%2BKCLjhS4UhJVbhgU&tag=8&cookie15=U%2BGCWk%2F75gdr5Q%3D%3D; uc3=nk2=2nLNM%2FFIuw%3D%3D&vt3=F8dBxdrOu2PtC2x4eRY%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UU20sr67TFYA3A%3D%3D; csg=806ca7d3; cookie17=UU20sr67TFYA3A%3D%3D; skt=aaab3aae58e61948; existShop=MTU3OTY4MDE5Nw%3D%3D; uc4=nk4=0%402EiIau6LxxDAk%2BCx7wx2HFFW&id4=0%40U2%2Fz99G3B5JVS0pXyBxPeNKUyd5l; _cc_=WqG3DMC9EA%3D%3D; _l_g_=Ug%3D%3D; sg=d86; _nk_=%5Cu8FC7%5Cu5BA2ljd; cookie1=B0avTxixm8wz3r%2FxbcUWzku9pi7fZ0YpXHusB8qx7OA%3D; isg=BCcnChNwNRZEd7GUQmPrlk1KtlvxrPuOzgwha_mUQ7bd6EeqAXyL3mXqDuj2G9MG; l=cBQO09_mQaXsIxtsBOCanurza77OSIRYYuPzaNbMi_5dE6Ts_e7Oocn__F96VjWd9NLB43ral1J9-etkZaA4-aR8E5vP.',
    }

    params = (
        ('spm', 'a21bo.2017.201864-1.1.2e8e11d9vo2Ast'),
        ('ad_id', ''),
        ('am_id', ''),
        ('cm_id', ''),
        ('pm_id', '1501036000a02c5c3739'),
        ('nekot', 'uf2/zWxqZA==1579680197588'),
    )

    try:
        r = requests.get(url, timeout=30, headers=headers, params=params)
        # Turn 4xx/5xx responses into an HTTPError so they are treated as failures.
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: any network/HTTP failure yields an empty page, which the
        # caller treats as "nothing to parse".  Other exceptions (programming
        # errors, KeyboardInterrupt) are deliberately not swallowed.
        return ""

    # Decode with the encoding guessed from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def parsePage(ilt, html):
    """Extract (price, title) pairs from a search-result page into ``ilt``.

    The page embeds its data as JSON-like text; prices are read from
    ``"view_price":"..."`` fields and titles from ``"raw_title":"..."`` fields.
    The n-th price is paired with the n-th title.  If the two lists differ in
    length, the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry (both strings) per
            item found; it is modified in place.
        html: Page text to scan.  An empty or ``None`` value adds nothing.
    """
    if not html:
        return

    # Capture groups pull out the values directly, so no eval() is needed on
    # text that came from the network.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern accepts backslash escapes so that an escaped quote
    # (\") inside a title does not end the match early.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode JSON string escapes (\", \uXXXX, ...) safely.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text exactly as found.
            title = raw_title
        ilt.append([price, title])

def printGoodlist(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is printed first, followed by one row per entry holding a
    1-based running number, the price and the title.

    Args:
        ilt: Sequence of entries whose element 0 is the price and element 1 is
            the title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    for position, entry in enumerate(ilt, start=1):
        line = row_format.format(position, entry[0], entry[1])
        print(line)


def main():
    """Search Taobao for a fixed keyword and print the prices and titles found.

    Downloads ``depth`` consecutive result pages for the keyword ``goods``,
    collects the items from each page and prints them as one table.  A page
    that fails is reported and skipped; the remaining pages are still
    processed.
    """
    goods = '书包'
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for page in range(depth):
        # Each result page is addressed through the `s` query parameter, which
        # advances by 44 per page (presumably the number of items per page).
        url = start_url + '&s=' + str(44 * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Keep going with the next page, but say what went wrong instead
            # of discarding the error silently.
            print("page {} skipped: {!r}".format(page + 1, exc))
            continue
    printGoodlist(infoList)


# Run the crawler only when this file is executed as a script, so that
# importing it does not trigger network requests.
if __name__ == "__main__":
    main()

代码解析:

由于淘宝具有反爬虫机制,所以在爬取商品信息的时候需要注意以下几点:

1.首先打开淘宝网页,然后登录你的淘宝账号

2.按下F12查看控制台(我使用的是Chrome浏览器),点击Network->All,刷新页面,点击下面Name列的第一行,右键Copy->Copy as cURL(bash)

3.进入以下网站:https://curl.trillworks.com/,将刚才copy的内容粘贴到左边的框,下面按钮选中python,右边的框中就会出现你需要的headers和params,用它们替换文中相应的代码部分即可。

cURL

发布了33 篇原创文章 · 获赞 15 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_33360009/article/details/104070504