import requests
import time
import re
# Write the data to the "excel" output: start a fresh tab-separated file and
# emit the header row. Opening in 'w' mode truncates any previous output.
with open('data_xls.xls', 'w', encoding='utf-8') as f:
    f.write('代码\t名称\t网址\t标题\t开始年份\t结束年份\t发布日期\n')

# HTTP headers passed by post_html() on every request; the User-Agent is a
# desktop browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40',
}
def post_html(url, data, timeout=30):
    """POST form data to ``url`` and return the response.

    Args:
        url: Address the request is sent to.
        data: Form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests`` response object when the status code is 200.

    Raises:
        Exception: If the server answers with any status other than 200.
        requests.exceptions.RequestException: Connection errors and timeouts
            raised by ``requests`` propagate unchanged.
    """
    r = requests.post(url, headers=headers, data=data, timeout=timeout)
    if r.status_code != 200:
        raise Exception("网页出问题了啊")
    return r
def time_process(time_stamp):
    """Convert a millisecond Unix timestamp to a ``YYYY-MM-DD`` date string.

    Args:
        time_stamp: Timestamp in milliseconds (int, float, or anything that
            supports division by 1000).

    Returns:
        The calendar date of the timestamp in the machine's local time zone,
        formatted as ``YYYY-MM-DD``.
    """
    seconds = float(time_stamp / 1000)
    # Format only the date part instead of building a full datetime string
    # and slicing off the first ten characters.
    return time.strftime("%Y-%m-%d", time.localtime(seconds))
# Regular expression that extracts four-digit years from a title.
# The lookarounds require the four digits to stand alone, so fragments of
# longer digit runs (stock codes, compact dates such as 20200101) are not
# reported as years.
pattern = re.compile(r'(?<!\d)\d{4}(?!\d)')
def json_process(r):
    """Append one tab-separated row per announcement to ``data_xls.xls``.

    Args:
        r: Response object whose ``json()`` returns a mapping with an
            ``'announcements'`` entry. Each announcement is read for the keys
            ``announcementTime``, ``secCode``, ``secName``,
            ``announcementId``, ``orgId`` and ``announcementTitle``.

    Returns:
        None. Nothing is written when the announcement list is null or empty.

    Raises:
        KeyError: If ``'announcements'`` or one of the per-announcement keys
            is missing.
    """
    ajax_json = r.json()
    # A null or empty list means there is nothing to write; iterating over
    # None would raise TypeError.
    announcements = ajax_json['announcements'] or []
    if not announcements:
        return

    with open('data_xls.xls', 'a', encoding='utf-8') as f:
        for i in announcements:
            # Named publish_date (not `time`) so the imported `time` module
            # is not shadowed inside this function.
            publish_date = time_process(i['announcementTime'])
            stack_code = i['secCode']
            stack_name = i['secName']
            website = 'http://www.cninfo.com.cn/new/disclosure/detail?stockCode={}&announcementId={}&orgId={}&announcementTime={}'.format(
                stack_code, i['announcementId'], i['orgId'], publish_date)
            title = i['announcementTitle']

            # The first two years found in the title become the start and
            # end year; -1 marks "not found" when fewer than two are present.
            years = pattern.findall(title)
            start_year, end_year = -1, -1
            if len(years) >= 2:
                start_year = years[0]
                end_year = years[1]

            # NOTE(review): the leading '#' presumably keeps spreadsheet
            # software from stripping leading zeros of the code — confirm.
            line = '#{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                stack_code, stack_name, website, title, start_year, end_year, publish_date)
            f.write(line)
# 按年份遍历
for year in range(20, 10, -1):
pageNum = 1
totalPages = None
while True:
print('正在爬第{}页'.format(pageNum))
data = {'pageNum': pageNum, 'pageSize': 30, 'column': 'szse', 'tabName': 'fulltext',
'searchkey': '回报规划', 'seDate': '20{}-01-01~20{}-12-31'.format(year, year), 'isHLtitle': True}
r = post_html(
url='http://www.cninfo.com.cn/new/hi
爬巨潮的公告
猜你喜欢
转载自blog.csdn.net/qq_49335502/article/details/121184905
今日推荐
周排行