Code for Downloading SZSE Annual Reports
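The script below queries the SZSE disclosure API for annual-report announcements page by page, collects each report's security code, name, title, publish time, and PDF link into a pandas DataFrame, and then downloads every PDF with throttled requests.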

# -*- coding: utf-8 -*-
"""
Created on Sat Sep 14 15:34:11 2019

@author: gogogo
"""

import requests
import time
import pandas as pd
import random
import os
import json
# Define the scraping function: fetch one page of annual-report announcements
def get_pdf_address(pageNum, start_date, end_date):
    url = 'http://www.szse.cn/api/disc/announcement/annList?random=%s' % random.random()
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Content-Type': 'application/json',
        'Host': 'www.szse.cn',
        'Origin': 'http://www.szse.cn',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://www.szse.cn/disclosure/listed/fixed/index.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'X-Request-Type': 'ajax',
        'X-Requested-With': 'XMLHttpRequest'}
    # channelCode "fixed_disc" is the periodic-report channel; "010301" appears
    # to be the annual-report category on the SZSE disclosure page
    payload = {"seDate": ["{}-12-31".format(start_date), "{}-12-31".format(end_date)],
               "channelCode": ["fixed_disc"],
               "bigCategoryId": ["010301"],
               "pageSize": 30,
               "pageNum": int(pageNum)}
    r = requests.post(url, headers=headers, data=json.dumps(payload))
    return r.json()
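
# For reference, a sketch of the response shape this script relies on (inferred
# from the field accesses below, not from any official API documentation):
# {"announceCount": 1234,
#  "data": [{"secCode": ["000001"], "secName": ["平安银行"],
#            "attachPath": "/disc/disk02/finalpage/2019-.../xxxx.PDF",
#            "title": "...：2018年年度报告", "publishTime": "2019-..."}]}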

# Create a DataFrame to store the scraped download links
data_download_pdf = pd.DataFrame(columns =['secCode','secName','url','title','publishTime'])
count = 0
url_head = 'http://disc.static.szse.cn/download/'  # attachPath in the response is relative to this host
end = 2018
# Walk backwards in two-year windows; with these bounds the loop body runs once,
# covering 2016-12-31 through 2018-12-31. Lower the 2017 bound to go further back.
while end > 2017:
    start = end - 2
    # first request only to learn the total announcement count
    result = get_pdf_address(1, start, end)
    print("Scraping annual reports from {}-12-31 to {}-12-31".format(start, end))
    pages = (result['announceCount'] + 29) // 30  # ceiling division, 30 records per page
    print("%s pages in total" % pages)
    for i in range(1, pages + 1):  # range(1, pages) would skip the last page
        print("Fetching page {} of SZSE annual-report download links".format(i))
        print("{}% done".format(round(i / pages * 100, 2)))
        result = get_pdf_address(i, start, end)
        num = len(result['data'])
        for each in range(num):
            # secCode and secName appear to come back as single-element lists
            secCode = result['data'][each]['secCode'][0]
            secName = result['data'][each]['secName'][0]
            url = url_head + result['data'][each]['attachPath']
            title = result['data'][each]['title']
            publishTime = result['data'][each]['publishTime']
            dict1 = {'secCode': secCode, 'secName': secName, 'url': url,
                     'title': title, 'publishTime': publishTime}
            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
            data_download_pdf = pd.concat([data_download_pdf, pd.DataFrame([dict1])],
                                          ignore_index=True)
        print('Page fetched')
        print("========================================")
        time.sleep(random.uniform(1, 2))  # throttle the request rate
    end = end - 2



# Extract the four-digit year from the announcement title,
# e.g. a title like "……：2018年年度报告" yields "2018"
data_download_pdf['Year'] = data_download_pdf['title'].str.extract('([0-9]{4})')
# Save directory; create the folder (e.g. F:\深交所年报) before running. Since
# os.path.join is used below, the trailing backslashes are optional.
file_path = "F:\\深交所年报\\"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
for each in range(data_download_pdf.shape[0]):
    # A full download URL looks like:
    # http://disc.static.szse.cn/download//disc/disk02/finalpage/2019-07-05/dde0ce5e-e2c7-4c09-b6f4-a03ad9d593ee.PDF
    code = data_download_pdf.at[each, 'secCode']
    # strip "*" from ST-stock names: it is not allowed in Windows file names
    name = data_download_pdf.at[each, 'secName'].replace("*", "")
    year = data_download_pdf.at[each, 'Year']
    print("Downloading the {} annual report of {} (stock code {})".format(year, name, code))
    file_name = "{}{}{}.pdf".format(code, name, year)
    file_full_name = os.path.join(file_path, file_name)
    pdf_url = data_download_pdf.at[each, 'url']
    rs = requests.get(pdf_url, headers=headers, stream=True)
    with open(file_full_name, "wb") as fp:
        for chunk in rs.iter_content(chunk_size=10240):
            if chunk:
                fp.write(chunk)
    time.sleep(random.uniform(1, 2))  # throttle the request rate
    print("=================== Download finished ==========================")
