A small Python crawler example: scraping stock data

In the last blog post, we introduced a crawler that scrapes university rankings. In this post, we will introduce a program that scrapes stock data.

Source of the program: " Crawler and Information Extraction Course " of China University MOOC .

The purpose of the program: to obtain part of the stock information of the Shanghai Stock Exchange and Shenzhen Stock Exchange, and output it to a file.

To understand the program below, you should first be familiar with the requests, BeautifulSoup, and re libraries. You can learn the relevant background in the "Crawler and Information Extraction Course".

import requests
from bs4 import BeautifulSoup
import re


def getHTMLText(url, code="utf-8"):
    """Fetch *url* and return the decoded page text, or "" on failure.

    Parameters:
        url: the page to download.
        code: character encoding used to decode the response body
              (default utf-8; pass e.g. "GB2312" for GB2312-encoded sites).

    Returns the page text, or "" when the request fails for any
    network/HTTP reason — callers treat "" as "skip this page".
    """
    try:
        # A timeout prevents the crawler from hanging forever on a
        # single unresponsive host.
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = code  # decode with the caller-specified encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only request-related failures are expected here;
        # anything else (e.g. KeyboardInterrupt) should propagate.
        return ""


def getStockList(lst, stockURL):
    """Append 6-digit stock codes scraped from the listing page to *lst*.

    Parameters:
        lst: list mutated in place; each appended item is a 6-digit code string.
        stockURL: the East Money stock-list page (GB2312-encoded).
    """
    html = getHTMLText(stockURL, "GB2312")  # East Money uses GB2312 encoding
    soup = BeautifulSoup(html, 'html.parser')
    # Stock codes appear in the href attribute of <a> tags; only the
    # first 200 anchors are scanned to keep this demo fast.
    anchors = soup.find_all('a')[:200]
    for a in anchors:
        try:
            href = a.attrs['href']
            lst.append(re.findall(r"\d{6}", href)[0])
        except (KeyError, IndexError):
            # KeyError: anchor has no href; IndexError: href contains
            # no 6-digit run. Either way, skip this anchor.
            continue


def getStockInfo(lst, stockURL, fpath):
    """Scrape the detail page of every stock code in *lst* and append the
    parsed fields to the file at *fpath* (one dict literal per line).

    Parameters:
        lst: list of 6-digit stock code strings.
        stockURL: base URL; the code is appended to form each detail page URL.
        fpath: output file path, opened in append mode with utf-8 encoding.

    Progress is printed in place on one line using a carriage return.
    """
    count = 0
    total = len(lst)
    for stock in lst:
        url = stockURL + stock  # detail page for this stock
        html = getHTMLText(url)
        try:
            if html == "":
                continue  # fetch failed; skip (still counted in `finally`)
            infoDict = {}  # one dict of scraped fields per stock
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-info'})
            # .text extracts the string content inside a tag
            name = stockInfo.find_all(attrs={'class': 'stock-name'})[0]
            infoDict.update({'Stock name': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')[:4]  # keep the first 4 fields
            valueList = stockInfo.find_all('dd')[:4]
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # A malformed page (missing div/tags) should not abort the
            # whole run; count it and move on.
            pass
        finally:
            # Count every processed code — including fetch failures — so
            # the progress figure actually reaches 100%. "\r" returns to
            # the start of the line so each update overwrites the last;
            # end="" suppresses the newline print would normally add.
            count += 1
            print("\rCurrent progress: {:.2f}%".format(count * 100 / total), end="")


def main(output_file='/Users/wangpeng/Desktop/BaiduStockInfo.txt'):
    """Crawl stock codes, then scrape each stock's details into a file.

    Parameters:
        output_file: path of the results file (new parameter with the
            original hard-coded path as its default, so existing callers
            are unaffected while other users can redirect the output).
    """
    # Stock codes come from the East Money listing page.
    stock_list_url = 'https://quote.eastmoney.com/stock_list.html'
    # Per-stock details come from the Tiger Community (laohu8) site.
    stock_info_url = 'https://www.laohu8.com/stock/'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()

Sample output file contents:

 

Guess you like

Origin www.cnblogs.com/picassooo/p/12670835.html