# Scrape announcements from cninfo (巨潮资讯网, www.cninfo.com.cn).

import requests
import time
import re

# Start a fresh output file and write the tab-separated header row.
# The content is plain tab-delimited text even though the name ends in .xls.
# NOTE(review): Excel may need a BOM (encoding 'utf-8-sig') to decode the
# Chinese column names correctly — confirm before changing.
with open('data_xls.xls', 'w', encoding='utf-8') as header_file:
    header_file.write('代码\t名称\t网址\t标题\t开始年份\t结束年份\t发布日期\n')

# HTTP headers sent with every request; the User-Agent imitates a desktop
# Edge/Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
    ),
}


def post_html(url, data, timeout=30):
    """POST form data to ``url`` and return the response if it succeeded.

    Args:
        url: Address to send the POST request to.
        data: Form payload handed to ``requests.post`` as ``data``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl. Pass ``None`` to get the
            old wait-forever behaviour back.

    Returns:
        The ``requests.Response`` object when the status code is 200.

    Raises:
        Exception: If the server answers with any status other than 200.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    r = requests.post(url, headers=headers, data=data, timeout=timeout)
    if r.status_code != 200:
        raise Exception("网页出问题了啊")
    return r


def time_process(time_stamp):
    """Convert a millisecond timestamp into a ``YYYY-MM-DD`` date string.

    The timestamp is interpreted in the local time zone of the machine
    running the script.

    Args:
        time_stamp: Number of milliseconds since the Unix epoch.

    Returns:
        The first ten characters of the ``%Y-%m-%d %H:%M:%S`` rendering,
        i.e. the date part.
    """
    seconds = float(time_stamp / 1000)
    full_text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
    # Keep only the leading date portion.
    return full_text[:10]

# Matches four consecutive digits; used to pull year numbers out of
# announcement titles.
pattern = re.compile(r'\d{4}')


def json_process(r):
    """Append one row per announcement in a query response to ``data_xls.xls``.

    Each row holds, tab-separated: stock code (prefixed with ``#``), stock
    name, detail-page URL, title, start year, end year and publication date.
    The two years are the first two four-digit numbers found in the title;
    both stay ``-1`` when the title contains fewer than two.

    Args:
        r: Response object whose ``json()`` method returns the decoded query
            result, a mapping with an ``'announcements'`` entry.

    Raises:
        KeyError: If the result has no ``'announcements'`` entry, or an
            announcement lacks one of the fields read below.
    """
    # A null 'announcements' value would make the loop below raise TypeError,
    # so it is treated like an empty list.
    # NOTE(review): the endpoint appears to send null for a page with no
    # hits — confirm against a live response.
    announcements = r.json()['announcements'] or []
    if not announcements:
        return

    with open('data_xls.xls', 'a', encoding='utf-8') as f:
        for item in announcements:
            # Named publish_date rather than `time` so the local does not
            # shadow the imported `time` module.
            publish_date = time_process(item['announcementTime'])
            stock_code = item['secCode']
            stock_name = item['secName']
            website = 'http://www.cninfo.com.cn/new/disclosure/detail?stockCode={}&announcementId={}&orgId={}&announcementTime={}'.format(
                stock_code, item['announcementId'], item['orgId'], publish_date)
            title = item['announcementTitle']

            years = pattern.findall(title)
            start_year, end_year = -1, -1
            if len(years) >= 2:
                start_year, end_year = years[0], years[1]

            # NOTE(review): the leading '#' presumably keeps spreadsheet tools
            # from reading the code as a number and dropping leading zeros —
            # confirm.
            f.write('#{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                stock_code, stock_name, website, title, start_year, end_year, publish_date))


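# Iterate year by year, newest first: range(20, 10, -1) yields the two-digit suffixes 20 down to 11, i.e. 2020 back to 2011.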
for year in range(20, 10, -1):
    pageNum = 1
    totalPages = None
    while True:
        print('正在爬第{}页'.format(pageNum))
        data = {'pageNum': pageNum, 'pageSize': 30, 'column': 'szse', 'tabName': 'fulltext',
                'searchkey': '回报规划', 'seDate': '20{}-01-01~20{}-12-31'.format(year, year), 'isHLtitle': True}
        r = post_html(
            url='http://www.cninfo.com.cn/new/hi

猜你喜欢

转载自blog.csdn.net/qq_49335502/article/details/121184905
今日推荐