In this article, we will show how to use Python to write a simple data scraper that crawls the various financial statement datasets on the Eastmoney (东方财富) website. We will use the requests
and lxml
libraries to request and parse the data, and save the scraped results to a CSV file.
1. Preparations
First, make sure you have the following Python libraries installed:
pip install requests
pip install lxml
2. Create a data grabber
We will create a class called DataScraper
that encapsulates all of the data-fetching methods. Key features of the scraper include:
- Get report data
- Parse and extract header information
- Write data to CSV file
2.1 Initialization
In the __init__ method of the DataScraper
class, we initialize some necessary properties such as the report type and report name.
In addition, we also set the request URL and request headers to be used for data requests later.
class DataScraper:
    def __init__(self):
        # Chinese report name -> report-type identifier used by the data API.
        self.pagename_type = {
            # ...
        }
        # Chinese report name -> English URL slug of the report page.
        self.pagename_en = {
            # ...
        }
        # English field keys that have a matching Chinese header
        # (presumably filled in later by get_header — see section 2.3).
        self.en_list = []
        # Endpoint of the Eastmoney data-center API.
        self.url = 'https://datacenter-web.eastmoney.com/api/data/v1/get'
        # HTTP request headers sent with every API call.
        self.headers = {
            # ...
        }
2.2 Get report data
We define a method called get_table
, which sends a request to Eastmoney and fetches the report data. It takes the page number as a parameter and returns the report data for that page.
def get_table(self, page):
    """Request one page of report data from the Eastmoney API and return it."""
    # ...
2.3 Parsing header
Before grabbing the data, we need to resolve the header information. We create a method called get_header
, which takes the list of English table-header keys. This method requests the report page, parses the HTML with the lxml
library, and extracts the Chinese header text.
def get_header(self, all_en_list):
    """Request the report page and extract the Chinese header text for each English field key."""
    # ...
2.4 Write header
Next, we create a method named write_header
, which writes the resolved header information to a CSV file. In this method we first call get_header
to obtain the header titles, and then write them to the CSV file with csv.writer
.
def write_header(self, table_data):
    """Resolve the Chinese header via get_header() and write it as the first CSV row."""
    # ...
2.5 Write report data
We define a method called write_table
to write the fetched report data to the CSV file line by line. In this method we iterate over the fetched records and write each row to the CSV file.
def write_table(self, table_data):
    """Append the fetched report records to the CSV file, one line per record."""
    # ...
2.6 Get time list
To let the user choose which report date to crawl, we define a method called get_timeList
. It sends a request to Eastmoney, then parses and extracts the list of selectable dates.
def get_timeList(self):
    """Request the report page and return the list of selectable report dates."""
    # ...
3 Using the data grabber
After creating the DataScraper class, we can use the following code to instantiate it and crawl the required report data:
if __name__ == '__main__':
    scraper = DataScraper()
    timeList = scraper.get_timeList()
    # Show the available dates, five per line.
    for index, value in enumerate(timeList):
        if (index + 1) % 5 == 0:
            print(value)
        else:
            print(value, end=' ; ')
    timePoint = str(input('\n请选择时间(可选项如上):'))
    pagename = str(input('请输入报表类型(业绩报表;业绩快报;业绩预告;预约披露时间;资产负债表;利润表;现金流量表):'))
    # 校验输入 — raise instead of assert so the checks survive `python -O`.
    if timePoint not in timeList:
        raise ValueError('时间输入错误')
    if pagename not in scraper.pagename_type:
        raise ValueError('报表类型输入错误')
    # get_table/get_header/write_* read these via `self`, so they must be
    # stored on the scraper instance (locals alone raise AttributeError).
    scraper.pagename = pagename
    scraper.timePoint = timePoint
    scraper.table_type = scraper.pagename_type[pagename]
    scraper.filename = f'{pagename}_{timePoint}.csv'
    # 写入表头 (the first page supplies the field keys).
    scraper.write_header(scraper.get_table(1))
    # 循环遍历所有页数 until an empty page is returned.
    page = 1
    while True:
        table = scraper.get_table(page)
        if not table:
            break
        scraper.write_table(table)
        page += 1
4 Complete code and result screenshot
import csv
import json
import requests
from lxml import etree
class DataScraper:
    """Interactive scraper for financial reports on the Eastmoney data center.

    Prompts the user for a report date and report type, then downloads every
    page of the chosen report via the data-center API and writes it to a
    local CSV file named ``<报表类型>_<时间>.csv``.
    """

    def __init__(self):
        # Chinese report name -> `reportName` parameter of the data API.
        self.pagename_type = {
            "业绩报表": "RPT_LICO_FN_CPD",
            "业绩快报": "RPT_FCI_PERFORMANCEE",
            "业绩预告": "RPT_PUBLIC_OP_NEWPREDICT",
            "预约披露时间": "RPT_PUBLIC_BS_APPOIN",
            "资产负债表": "RPT_DMSK_FN_BALANCE",
            "利润表": "RPT_DMSK_FN_INCOME",
            "现金流量表": "RPT_DMSK_FN_CASHFLOW"
        }
        # Chinese report name -> URL slug of the HTML page that carries the headers.
        self.pagename_en = {
            "业绩报表": "yjbb",
            "业绩快报": "yjkb",
            "业绩预告": "yjyg",
            "预约披露时间": "yysj",
            "资产负债表": "zcfz",
            "利润表": "lrb",
            "现金流量表": "xjll"
        }
        # English field keys that have a matching Chinese header; filled by get_header().
        self.en_list = []
        self.url = 'https://datacenter-web.eastmoney.com/api/data/v1/get'
        # Browser-like headers so the data center accepts the request.
        self.headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'closed',
            'Referer': 'https://data.eastmoney.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }

    def get_table(self, page):
        """Fetch one page of rows for the currently selected report and date.

        Requires ``self.table_type`` and ``self.timePoint`` to be set (done in
        :meth:`run`). Returns the list of row dicts, or ``None`` when the page
        has no data (i.e. we paged past the last record).
        """
        # The RPT_LICO_FN_CPD dataset names its date column REPORTDATE
        # (no underscore); every other dataset uses REPORT_DATE.
        date_field = 'REPORTDATE' if self.table_type == 'RPT_LICO_FN_CPD' else 'REPORT_DATE'
        params = {
            'sortTypes': '-1,-1',
            'reportName': self.table_type,
            'columns': 'ALL',
            'filter': f"({date_field}='{self.timePoint}')",
            'pageNumber': str(page),
        }
        response = requests.get(url=self.url, params=params, headers=self.headers)
        data = json.loads(response.text)
        # `result` is null/falsy once there is no more data.
        if data['result']:
            return data['result']['data']
        return None

    def get_header(self, all_en_list):
        """Translate English field keys into Chinese column titles.

        Scrapes the report's HTML page and keeps only the keys that actually
        appear as ``<th data-field=...>`` headers; each matched key is
        recorded in ``self.en_list`` so that :meth:`write_table` later emits
        the same columns.
        """
        ch_list = []
        url = f'https://data.eastmoney.com/bbsj/{self.pagename_en[self.pagename]}.html'
        response = requests.get(url)
        res = etree.HTML(response.text)
        for en in all_en_list:
            fragments = res.xpath(
                f'//div[@class="dataview"]//table[1]//th[@data-field="{en}"]//text()')
            ch = ''.join(fragment.strip() for fragment in fragments)
            if ch:
                ch_list.append(ch)
                self.en_list.append(en)
        return ch_list

    def write_header(self, table_data):
        """Create/overwrite the CSV file and write the Chinese header row.

        ``table_data`` is the first page of rows; its first record supplies
        the full set of English field keys to translate.
        """
        if not table_data:
            # Fail with a clear message instead of `None[0]` blowing up.
            raise ValueError('未获取到数据,无法写入表头')
        with open(self.filename, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            headers = self.get_header(list(table_data[0].keys()))
            writer.writerow(headers)

    def write_table(self, table_data):
        """Append one page of rows to the CSV, keeping only headered columns."""
        # Set lookup: O(1) membership instead of scanning en_list per key.
        wanted = set(self.en_list)
        with open(self.filename, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for item in table_data:
                row = [str(value) for key, value in item.items() if key in wanted]
                print(row)  # progress feedback on the console
                writer.writerow(row)

    def get_timeList(self):
        """Return the list of selectable report dates scraped from the site."""
        headers = {
            'Referer': 'https://data.eastmoney.com/bbsj/202206.html',
        }
        response = requests.get('https://data.eastmoney.com/bbsj/202206.html', headers=headers)
        res = etree.HTML(response.text)
        return res.xpath('//*[@id="filter_date"]//option/text()')

    def run(self):
        """Interactive entry point: prompt for date/type, then scrape all pages."""
        self.timeList = self.get_timeList()
        # Show the available dates, five per line.
        for index, value in enumerate(self.timeList):
            print(value, end='\n' if (index + 1) % 5 == 0 else ' ; ')
        self.timePoint = str(input('\n请选择时间(可选项如上):'))
        self.pagename = str(
            input('请输入报表类型(业绩报表;业绩快报;业绩预告;预约披露时间;资产负债表;利润表;现金流量表):'))
        # Validate with explicit raises: `assert` is stripped under `python -O`.
        if self.timePoint not in self.timeList:
            raise ValueError('时间输入错误')
        if self.pagename not in self.pagename_type:
            raise ValueError('报表类型输入错误')
        self.table_type = self.pagename_type[self.pagename]
        self.filename = f'{self.pagename}_{self.timePoint}.csv'
        # Header first, then page through the data until an empty page appears.
        self.write_header(self.get_table(1))
        page = 1
        while True:
            table = self.get_table(page)
            if not table:
                break
            self.write_table(table)
            page += 1
if __name__ == '__main__':
    # Entry point: build the scraper and start the interactive workflow.
    DataScraper().run()
For more crawlers for Eastmoney, CNINFO (juchao.com) and CNKI, feel free to visit my GitHub repository.