Python 学习之股票信息爬取

技术路线：requests-bs4-re

使用场景：股票信息存储在静态页面中，而不是由 JS 动态加载输出；本例使用东方财富网、百度股票。

from bs4 import BeautifulSoup

import requests

import re

# Fetch the HTML of a page.
def getHtmlText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text on success. If the request fails (connection
        error, timeout after 30 seconds, or an HTTP error status reported by
        ``raise_for_status``), the sentinel string "获取html异常" is returned
        instead of raising, so callers always receive a string.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body content rather
        # than the one announced by the server.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are swallowed; KeyboardInterrupt and
        # programming errors are no longer hidden as they were by a bare
        # ``except:``.
        return "获取html异常"

# Parse the stock list page; matching links look like
# http://quote.eastmoney.com/sh500029.html
def resolveGupiaoList(glist, htext):
    """Append the stock codes found in a stock list page to ``glist``.

    Every ``<a>`` element whose ``href`` contains a link of the form
    ``http://quote.eastmoney.com/sh<6 digits>.html`` (or ``sz...``)
    contributes its code, e.g. ``"sh500029"``.

    Args:
        glist: List that receives the codes; it is mutated in place.
        htext: HTML text of the stock list page.

    Returns:
        None. If parsing fails, "resolveGupiaoList异常" is printed and any
        codes appended before the failure stay in ``glist``.
    """
    # Dots are escaped so that they match a literal "." only; the capture
    # group is the stock code itself (exchange prefix + six digits).
    pat = re.compile(r'http://quote\.eastmoney\.com/(s[hz]\d{6})\.html')
    try:
        soup = BeautifulSoup(htext, "html.parser")
        for anchor in soup.find_all("a"):
            href = anchor.get('href')
            if not href:
                # Anchors without a (non-empty) href cannot carry a code.
                continue
            link = pat.search(href)
            if link:
                glist.append(link.group(1))
    except Exception:
        # Best-effort: report the failure and keep what was collected.
        print("resolveGupiaoList异常")

# Parse the trading details of a single stock and append them to a file.
def getGupiaoDetail(htext, fileAddress):
    """Extract one stock's details from ``htext`` and append them to a file.

    The stock name (``<a class="bets-name">``) and current price
    (``<strong class="_close">``) are stored when present, followed by every
    ``<dt>``/``<dd>`` pair found inside ``<div class="stock-bets">``. The
    resulting dict is written as one line (its ``str()`` form) to
    ``fileAddress``.

    Args:
        htext: HTML text of the stock detail page.
        fileAddress: Path of the output file; opened in append mode, UTF-8.

    Returns:
        None. If the page does not contain the ``stock-bets`` block, or if
        parsing or writing fails, "获取个股信息失败" is printed and nothing
        is written for this stock.
    """
    try:
        gpdict = {}
        soup = BeautifulSoup(htext, "html.parser")

        gpname = soup.find('a', attrs={'class': 'bets-name'})
        if gpname:
            # Keep only the first whitespace-separated token of the text.
            gpdict['股票名称'] = gpname.text.split()[0]

        gpprice = soup.find('strong', attrs={'class': '_close'})
        if gpprice:
            gpdict['当前股价'] = gpprice.text.split()[0]

        stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
        if stockInfo is None:
            # Without the details block there is nothing useful to record.
            print("获取个股信息失败")
            return

        # Pair each <dt> label with the <dd> value at the same position;
        # zip stops at the shorter list, so unmatched entries are ignored.
        for dt, dd in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
            gpdict[dt.text] = dd.text

        # ``with`` guarantees the file is flushed and closed after each
        # record, even if the write raises.
        with open(fileAddress, 'a', encoding='utf-8') as f:
            f.write(str(gpdict) + '\n')
    except Exception:
        # Best-effort: one bad page must not abort the whole crawl.
        print("获取个股信息失败")

def main():
    """Crawl the stock list, then fetch and store the details of each stock.

    The stock codes are collected from the list page at ``gupiaoListUrl``.
    For every code, the detail page ``baiduGupiaoUrl + code + '.html'`` is
    downloaded and its parsed details are appended to ``fileAddress``.
    """
    glist = []
    baiduGupiaoUrl = "https://gupiao.baidu.com/stock/"
    gupiaoListUrl = "http://quote.eastmoney.com/stocklist.html"
    fileAddress = 'D:/gupiao.txt'

    # Step 1: collect all stock codes from the list page.
    htext = getHtmlText(gupiaoListUrl)
    resolveGupiaoList(glist, htext)

    # Step 2: fetch each stock's detail page and append it to the file.
    for gpitem in glist:
        baiduhtext = getHtmlText(baiduGupiaoUrl + gpitem + '.html')
        getGupiaoDetail(baiduhtext, fileAddress)

# Run the crawl only when executed as a script, so that importing this
# module does not trigger network requests and file writes.
if __name__ == "__main__":
    main()

优化方向：1、优化requests编码解析；

   r.apparent_encoding 会遍历 url 返回的整个 html 文本来推断可能使用的编码，因此执行效率会受到影响。 
 

   改进：当返回的页面内容较大，或者需要反复请求同一类页面时，可以提前确认页面的编码，从而直接给 encoding 赋值。 
 

   code="utf-8" 
 

   r.encoding = code 
 

2、动态显示股票解析进度，提高用户体验（下面代码中的 count 为已处理的股票数量，需要在循环中自行累加）。

print("\r当前进度: {:.2f}%".format(count*100/len(glist)),end="")

Python 学习之股票信息爬取

猜你喜欢