Python crawling financial statements

In this article, we will introduce how to use Python to write a simple data grabber for crawling various financial statement data on the Oriental Fortune website. We will use requestsand lxmllibrary for data requesting and parsing, and save the scraped data to a CSV file.

1. Preparations

First, make sure you have the following Python libraries installed:

pip install requests
pip install lxml

2. Create a data grabber

We will create a DataScraperclass called , which encapsulates all data fetching related methods. Key features of the data grabber include:

Get report data
Parse and extract header information
Write data to CSV file

2.1 Initialization

In the methods DataScraperof the class __init__, we will initialize some necessary properties such as report type, report name, etc. In addition, we also need to set the request URL and request headers for data requests later.

class DataScraper:
    def __init__(self):
        self.pagename_type = {
    
    
            # ...
        }

        self.pagename_en = {
    
    
            # ...
        }

        self.en_list = []

        self.url = 'https://datacenter-web.eastmoney.com/api/data/v1/get'
        self.headers = {
    
    
            # ...
        }

2.2 Get report data

We define a get_tablemethod called , which is used to send requests to Dongfang Fortune.com and obtain report data. Pass in the number of pages as a parameter, and return the report data of the current page.

    def get_table(self, page):
        # ...

2.3 Parsing header

Before grabbing the data, we need to parse the header information. We create a get_headermethod called , passing in a list of all table headers in English. This method will request the report page, use lxmlthe library to parse the HTML, and extract the Chinese header information.

    def get_header(self, all_en_list):
        # ...

2.4 Write header

Next, we create a write_headermethod named , which writes the parsed header information to a CSV file. In this method, we first call get_headerthe method to get the header information, and then csv.writerwrite it to the CSV file using

    def write_header(self, table_data):
        # ...

2.5 Write report data

Define a write_tablemethod called to write the captured report data line by line to the CSV file. In this method, we iterate through the fetched data and write each row to a CSV file.

    def write_table(self, table_data):
        # ...

2.6 Get time list

In order to allow users to choose the report time to crawl, we define a get_timeListmethod called . This method will send a request to Oriental Fortune, parse and extract the optional time list.

    def get_timeList(self):
        # ...

3 Using the data grabber

After creating the DataScraper class, we can use the following code to instantiate it and crawl the required report data:

if __name__ == '__main__':
    scraper = DataScraper()

    timeList = scraper.get_timeList()
    for index, value in enumerate(timeList):
        if (index + 1) % 5 == 0:
            print(value)
        else:
            print(value, end=' ; ')
    timePoint = str(input('\n请选择时间（可选项如上）:'))
    pagename = str(input('请输入报表类型（业绩报表;业绩快报;业绩预告;预约披露时间;资产负债表;利润表；现金流量表）:'))
    # 校验输入
    assert timePoint in timeList, '时间输入错误'
    assert pagename in list(scraper.pagename_type.keys()), '报表类型输入错误'
    table_type = scraper.pagename_type[pagename]
    filename = f'{
      
      pagename}_{
      
      timePoint}.csv'

    # 写入表头
    scraper.write_header(scraper.get_table(1))

    # 循环遍历所有页数
    page = 1
    while True:
        table = scraper.get_table(page)
        if table:
            scraper.write_table(table)
        else:
            break
        page += 1

4 Complete code and result screenshot

import csv
import json
import requests
from lxml import etree


class DataScraper:
    def __init__(self):
        self.pagename_type = {
    
    
            "业绩报表": "RPT_LICO_FN_CPD",
            "业绩快报": "RPT_FCI_PERFORMANCEE",
            "业绩预告": "RPT_PUBLIC_OP_NEWPREDICT",
            "预约披露时间": "RPT_PUBLIC_BS_APPOIN",
            "资产负债表": "RPT_DMSK_FN_BALANCE",
            "利润表": "RPT_DMSK_FN_INCOME",
            "现金流量表": "RPT_DMSK_FN_CASHFLOW"
        }

        self.pagename_en = {
    
    
            "业绩报表": "yjbb",
            "业绩快报": "yjkb",
            "业绩预告": "yjyg",
            "预约披露时间": "yysj",
            "资产负债表": "zcfz",
            "利润表": "lrb",
            "现金流量表": "xjll"
        }

        self.en_list = []

        self.url = 'https://datacenter-web.eastmoney.com/api/data/v1/get'
        self.headers = {
    
    
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'closed',
            'Referer': 'https://data.eastmoney.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }

    def get_table(self, page):
        params = {
    
    
            'sortTypes': '-1,-1',
            'reportName': self.table_type,
            'columns': 'ALL',
            'filter': f'(REPORT_DATE=\'{
      
      self.timePoint}\')'
        }

        if self.table_type in ['RPT_LICO_FN_CPD']:
            params['filter'] = f'(REPORTDATE=\'{
      
      self.timePoint}\')'
        params['pageNumber'] = str(page)
        response = requests.get(url=self.url, params=params, headers=self.headers)
        data = json.loads(response.text)
        if data['result']:
            return data['result']['data']
        else:
            return

    def get_header(self, all_en_list):
        ch_list = []
        url = f'https://data.eastmoney.com/bbsj/{
      
      self.pagename_en[self.pagename]}.html'
        response = requests.get(url)
        res = etree.HTML(response.text)
        for en in all_en_list:
            ch = ''.join(
                [i.strip() for i in res.xpath(f'//div[@class="dataview"]//table[1]//th[@data-field="{
      
      en}"]//text()')])
            if ch:
                ch_list.append(ch)
                self.en_list.append(en)
        return ch_list

    def write_header(self, table_data):
        with open(self.filename, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            headers = self.get_header(list(table_data[0].keys()))
            writer.writerow(headers)

    def write_table(self, table_data):
        with open(self.filename, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for item in table_data:
                row = []
                for key in item.keys():
                    if key in self.en_list:
                        row.append(str(item[key]))
                print(row)
                writer.writerow(row)

    def get_timeList(self):
        headers = {
    
    
            'Referer': 'https://data.eastmoney.com/bbsj/202206.html',
        }
        response = requests.get('https://data.eastmoney.com/bbsj/202206.html', headers=headers)
        res = etree.HTML(response.text)
        return res.xpath('//*[@id="filter_date"]//option/text()')

    def run(self):
        self.timeList = self.get_timeList()
        for index, value in enumerate(self.timeList):
            if (index + 1) % 5 == 0:
                print(value)
            else:
                print(value, end=' ; ')

        self.timePoint = str(input('\n请选择时间（可选项如上）:'))
        self.pagename = str(
            input('请输入报表类型（业绩报表;业绩快报;业绩预告;预约披露时间;资产负债表;利润表；现金流量表）:'))
        assert self.timePoint in self.timeList, '时间输入错误'
        assert self.pagename in list(self.pagename_type.keys()), '报表类型输入错误'
        self.table_type = self.pagename_type[self.pagename]
        self.filename = f'{
      
      self.pagename}_{
      
      self.timePoint}.csv'
        self.write_header(self.get_table(1))
        page = 1
        while True:
            table = self.get_table(page)
            if table:
                self.write_table(table)
            else:
                break
            page += 1


if __name__ == '__main__':
    scraper = DataScraper()
    scraper.run()

insert image description here
For more reptiles from Dongfang Fortune.com, Juchao.com and CNKI, welcome to visit my github warehouse