In the previous blog post, we introduced a crawler program that scrapes university rankings. In this post, we introduce a program that scrapes stock data.
Source of the program: the "Crawler and Information Extraction Course" on China University MOOC.
Purpose of the program: to obtain selected stock information from the Shanghai and Shenzhen stock exchanges and write it to a file.
To understand the following program, you should first be familiar with the requests, BeautifulSoup, and re libraries; the "Crawler and Information Extraction Course" covers the relevant background.
import requests
from bs4 import BeautifulSoup
import re


def getHTMLText(url, code="utf-8"):
    """Fetch *url* and return the decoded page text, or "" on any failure.

    code: character encoding used to decode the response body
    (East Money pages use GB2312; most other pages use UTF-8).
    """
    try:
        # timeout so a dead host cannot hang the whole crawl indefinitely
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = code  # decode with the caller-specified encoding
        return r.text
    except requests.RequestException:
        # Best-effort crawler: any network/HTTP error yields an empty page.
        return ""


def getStockList(lst, stockURL):
    """Append the 6-digit stock codes found on *stockURL* to *lst*.

    Only the first 200 <a> tags are scanned; each code is the first run of
    six consecutive digits inside the tag's href attribute.
    """
    html = getHTMLText(stockURL, "GB2312")  # East Money uses GB2312 encoding
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a")[:200]:  # limit the scan to the first 200 anchors
        href = a.attrs.get("href", "")  # anchors without href are simply skipped
        match = re.search(r"\d{6}", href)
        if match:
            lst.append(match.group())


def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of each stock in *lst* and append its data to *fpath*.

    For every stock, the stock name plus the first four <dt>/<dd> key/value
    pairs are collected into a dict and written as one line. An in-place
    progress percentage is printed after each stock.
    """
    count = 0
    total = len(lst)
    for stock in lst:
        url = stockURL + stock  # detail page for this stock
        html = getHTMLText(url)
        try:
            if html == "":
                continue  # fetch failed; still counted below so progress reaches 100%
            infoDict = {}  # per-stock information accumulates here
            soup = BeautifulSoup(html, "html.parser")
            stockInfo = soup.find("div", attrs={"class": "stock-info"})
            name = stockInfo.find_all(attrs={"class": "stock-name"})[0]
            # .text extracts the string inside the tag
            infoDict.update({"Stock name": name.text.split()[0]})
            keyList = stockInfo.find_all("dt")[:4]  # keep only the first 4 fields
            valueList = stockInfo.find_all("dd")[:4]
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, "a", encoding="utf-8") as f:
                f.write(str(infoDict) + "\n")
        except (AttributeError, IndexError):
            # Page layout unexpected (e.g. the stock-info div or name is
            # missing); skip this stock but still report progress.
            pass
        finally:
            count += 1
            # \r returns to the start of the line so each update overwrites
            # the previous one; end="" suppresses the trailing newline.
            print("\rCurrent progress: {:.2f}%".format(count * 100 / total), end="")


def main():
    """Crawl stock codes from East Money, then per-stock details, and save them."""
    # Source of the stock code list (East Money)
    stock_list_url = 'https://quote.eastmoney.com/stock_list.html'
    # Source of per-stock detail pages (Tiger community)
    stock_info_url = 'https://www.laohu8.com/stock/'
    output_file = '/Users/wangpeng/Desktop/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":  # guard so importing this module does not start a crawl
    main()
Contents of the output file: