爬虫(七)基于requests‐bs4‐re的淘宝&股票数据爬虫

1.淘宝商品信息定向爬虫

(1)实例介绍

程序的结构设计

(2)实例编写

import requests
import re

def getHTMLText(url):
    """Fetch ``url`` with browser-like headers (Taobao search requires a logged-in cookie).

    Returns the page text decoded with the apparent encoding, or "" on any
    request failure (connection error, timeout, HTTP error status).
    """
    # Header copied from a real browser session; the cookie is what lets the
    # request past Taobao's login wall (see the note after the code).
    header = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 's.taobao.com',
        'cookie': 'thw=cn; tracknick=%5Cu8748%5Cu8748%5Cu4E0A%5Cu6E05%5Cu534E; tg=0; enc=ExCpUIZ5v513Pqh%2B10%2FV1W0ozagi9e0s%2Fohw2E9dbBkiE4g%2BOTIa9CWvD4B%2Bd7apKloAsTEIpGfQdgKooUY4YA%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; miid=78926411508356534; __guid=154677242.1543442992267341300.1578395296708.383; lgc=%5Cu8748%5Cu8748%5Cu4E0A%5Cu6E05%5Cu534E; t=48d82d1fd3153d29657f2fd5f5e37f48; uc3=vt3=F8dBxdsSHe7TDgCcVu8%3D&nk2=kcISB5EuBmYW8w%3D%3D&id2=UUGk1y%2Bf6KVCmQ%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; uc4=id4=0%40U2OT53FN4LOs8gi4f%2FZlkGle8Ori&nk4=0%40kyT3A%2F2wjZYrauk2fa8nyNePTVep; _cc_=VT5L2FSpdA%3D%3D; mt=ci=-1_0; _m_h5_tk=ff60a1bc1b103f0796974e2b8ec5cf1e_1581308704085; _m_h5_tk_enc=6069ab606c5d113bd17879c9a4ea4cea; v=0; cookie2=116b4f2753af1e2125fa30b6f848c067; _tb_token_=eedeee77575e6; cna=MnL7FH3z1EwCAdNhA4WodNyG; JSESSIONID=CA5D004A576DBD8068C6957684815204; monitor_count=3; uc1=cookie14=UoTUO8VqByokSA%3D%3D; l=cBPsjRTcvwFcqqFzBOCwNuIRGobtjIRYHuPRwVYXi_5aZ6L1La_OoSXixFp6DjWd9pTB4IFj7TJ9-etkiKy06Pt-g3fP.; isg=BDEx7Whdy4Nn4WWtrqzL7hdFQL3LHqWQ8hNQIRNGJfgXOlGMW2_LYScYWc5cqD3I',
    }
    try:
        # timeout added so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare `except:` — only network/HTTP failures are expected here
        return ""

def parsePage(ilt, html):
    """Parse one Taobao search-result page.

    Extracts the ``view_price`` / ``raw_title`` pairs embedded as JSON in the
    page source and appends ``[price, title]`` (both strings) to ``ilt``.
    """
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for price_kv, title_kv in zip(plt, tlt):
            # Take the part after the first ':' (the value of the key/value pair).
            # maxsplit=1 so titles that themselves contain ':' are not truncated.
            # BUG FIX: original assigned `titile` but appended `title`, raising a
            # NameError that the bare except swallowed, so nothing was collected.
            # strip('"') replaces the original eval() — never eval scraped data.
            price = price_kv.split(':', 1)[1].strip('"')
            title = title_kv.split(':', 1)[1].strip('"')
            ilt.append([price, title])
    except Exception:
        # preserve the original best-effort behavior: print a blank line and move on
        print("")

def printGoodsList(ilt):
    tplt="{:4}\t{:8}\t{:16}"
    print(tplt.format("序号","价格","商品名称"))
    cout=0
    for g in ilt:
        count=count+1
        print(tplt.format(count,g[0],g[1]))
        
def main():
    """Crawl ``depth`` result pages for ``goods`` on Taobao and print the table."""
    goods = '口红'
    depth = 2
    # BUG FIX: original URL read 's.taotao.com' (typo); Taobao search lives at s.taobao.com
    start_url = 'http://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # each result page is offset by 44 items (Taobao's page size)
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # best-effort: a failed page must not abort the whole crawl
            continue
    printGoodsList(infoList)

main()

注意:上面的header获取方法如下——在口红搜索页面打开浏览器开发者工具,点击Network->All,刷新页面,点击下方Name列的第一行,右键Copy->Copy as cURL(bash),然后在https://curl.trillworks.com/的左边输入框粘贴,把右边框生成的header复制到上面代码中;注意header作为requests.get的headers参数传入。(有点奇怪,爬虫几次后就爬不了了,后面再研究下。。。)

结果输出:

48
=====================================================================================================
序号 	商品名称                          	    价格
1  	Gella\u0026#39;s星愿钻闪丝绒口红 雾感唇妆 	  25.0
2  	ChristianLouboutin萝卜丁CL 黑管三支口红限量套装001M/001/001s	2799.0
3  	Christian Louboutin萝卜丁女王限量口红3支装001/001s/001m 3.5g	2528.0
4  	CL口红唇膏 女王权杖萝卜丁口红 情人节限量礼盒 送女友001M 001s	 680.0
5  	【官方正品】安娜苏 魔漾晶灵唇膏S系列滋润保湿口红     	 235.0
6  	Oden\u0027s eye Alva系列精灵果霜丝绒唇釉6ml哑光口红复古红豆沙色	 119.0
7  	小萝卜丁女王的权杖口红001S/001M正红色薄纱限量豹纹礼盒滋润	 728.0
8  	CL口红 Christian Louboutin萝卜丁黑管口红套装001#001s/001m	2099.0
9  	Anna sui/安娜苏魔漾晶灵唇膏口红女 S系列400滋润保湿李佳琦推荐	 228.0
10 	Christian Louboutin 女王权杖CL萝卜丁口红唇膏 001M/001/001s	 599.0
11 	CL口红 萝卜丁套盒 三只装金管黑管口红 女王权杖001#001s/001m	 685.1
12 	罗伯cl小萝卜丁口红黑管001m s女王的权杖官方旗舰限量版礼盒套装	 530.0
13 	Lofree洛斐EH112S无线蓝牙限量口红绽放彩妆色mini键盘女神限量款	 658.0
14 	顺丰英购CT现货Charlotte Tilbury豹纹星色口红carina\u0027s star嘉玲	 195.0
15 	Christian Louboutin CL萝卜丁口红女王的权杖黑管口红001M 001S	 748.0
16 	单向书衣口红系列S号小32开手帐书衣单向空间文艺简约手工书皮	  88.0
17 	Burt\u0027s Bees伯特小蜜蜂淡彩口红保湿有色润唇膏玫瑰红女 孕妇可用	  41.0
18 	Burt\u0027s Bees (伯特小蜜蜂) - 口红 Blush 盆地 - 0.12 盎司	 106.0
19 	Burt\u0027s Bees (伯特小蜜蜂) - 口红 Blush 盆地 - 0.12 盎司	  99.0
20 	Burt\u0027s Bees伯特小蜜蜂淡彩口红保湿有色润唇膏豆沙红女 孕妇可用	  41.0
=====================================================================================================

2.股票数据定向爬虫

(1)实例介绍

股票网站:(不过现在的网站好像变了。。)

(2)实例编写

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url):
    """Fetch ``url`` and return its text decoded with the apparent encoding.

    Returns "" on any request failure (connection error, timeout, HTTP error).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare `except:` — only network/HTTP failures are expected here
        return ""

def getStockList(lst, stockURL):
    """Append every stock code (``sh``/``sz`` + 6 digits) linked from ``stockURL`` to ``lst``."""
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # narrowed from a bare `except:` — anchor has no href, or href
            # contains no stock code; skip it
            continue
    return ""

def getStockInfo(lst, stockURL, fpath):
    """For each code in ``lst``, fetch its detail page and append a field dict to ``fpath``.

    Pages that fail to download or parse are skipped (the parse traceback is printed).
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # BUG FIX: original wrote to the undefined name `infoDick`, so every
            # iteration raised NameError and no file was ever written
            infoDict.update({'股票名称': name.text.split()[0]})

            # the <dt>/<dd> elements pair up as field name / field value
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            traceback.print_exc()
            continue


def main():
    """Entry point: collect the stock-code list, then crawl and save each stock's details."""
    list_page = 'http://quote.eastmoney.com/stocklist.html'
    detail_base = 'https://gupiao.baidu.com/stock/'
    out_path = 'D://BaiduStockInfo.txt'

    codes = []
    getStockList(codes, list_page)
    getStockInfo(codes, detail_base, out_path)

main()

(3)实例优化

优化点:

(1)编码识别的优化:r.apparent_encoding需要分析文本,运行较慢,可辅助人工分析。

(2)增加动态进度显示。

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url, code='utf-8'):
    """Fetch ``url`` and return its text decoded as ``code``.

    Taking the encoding as a parameter skips the slow ``apparent_encoding``
    analysis (the optimization this version introduces). Returns "" on any
    request failure (connection error, timeout, HTTP error).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # narrowed from a bare `except:` — only network/HTTP failures are expected here
        return ""

def getStockList(lst, stockURL):
    """Append every stock code (``sh``/``sz`` + 6 digits) linked from ``stockURL`` to ``lst``."""
    # the list page is known to be GB2312-encoded; passing it avoids apparent_encoding
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # narrowed from a bare `except:` — anchor has no href, or href
            # contains no stock code; skip it
            continue
    return ""

def getStockInfo(lst, stockURL, fpath):
    """For each code in ``lst``, fetch its detail page, append a field dict to ``fpath``,
    and show a live progress percentage on one console line.
    """
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # BUG FIX: original wrote to the undefined name `infoDick`, so every
            # iteration raised NameError and no file was ever written
            infoDict.update({'股票名称': name.text.split()[0]})

            # the <dt>/<dd> elements pair up as field name / field value
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                # '\r' + end='' rewrites the same console line (dynamic progress)
                # BUG FIX: message said 当前速度 ("speed") but the value is a
                # completion percentage — 当前进度 ("progress")
                print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
        except Exception:
            traceback.print_exc()
            print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
            continue


def main():
    """Entry point: collect the stock-code list, then crawl and save each stock's details."""
    list_page = 'http://quote.eastmoney.com/stocklist.html'
    detail_base = 'https://gupiao.baidu.com/stock/'
    out_path = 'D://BaiduStockInfo.txt'

    codes = []
    getStockList(codes, list_page)
    getStockInfo(codes, detail_base, out_path)

main()
发布了219 篇原创文章 · 获赞 13 · 访问量 9797

猜你喜欢

转载自blog.csdn.net/qq_35812205/article/details/104281522
今日推荐