# -*- coding: utf-8 -*-
"""
author:suxuer
date:2018/6/18
version-1.0
"""
import pandas as pd
import requests
import json
import time
import csv
"""
#获取每一家公司的业绩说明会基本信息
#title:业绩说明会标题
#speaktime:说明会举行时间
#speakcontent:说明会基本内容
"""
#获取所有的公司业绩说明会基本信息
def all_companies(path,top):
trainfile = pd.read_csv(path)
trainfile = trainfile[96:top]
PIDs = trainfile['company_PIDs']
DATAs = trainfile['show_datas']
TITLEs = trainfile['show_titles']
return PIDs,DATAs,TITLEs
#获取每一家公司的业绩说明会问答内容
def get_each_company(page_nums): #num = pages,PID,公司ID
#用于存储数据
Questions = []
Answers = []
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
}
url = "http://rs.p5w.net/roadshowLive/getNInteractionDatas.shtml"
for page in range(page_nums):
data = {
"roadshowId": "16995",
#pid, #pid of each company
"isPagination": 0, # each value can success
"type": 2, #1 reprents all, 2 reprents QA, we must set 2
"page": page, #page location
"rows": 10,
}
r = requests.post(url=url, headers=header, data=data)
web_data = json.loads(r.text)
#print (web_data)
for i in web_data['rows']:
#print(i.keys())
#print ('Question:')
Questions.append(i['speakContent'])
Answers.append(i['replyList'][0]['speakContent'])
print ("第%d页数据已抓取完成...." % (page+1))
time.sleep(5) #延迟设置,防止被禁止
return Questions,Answers
if __name__ == '__main__':
#path = './companies.csv'
#top =2# 10 #小数量用于测试
#PIDs, DATAs, TITLEs = all_companies(path,top)
#提取每一次业绩说明会的内容
page_nums = 5
Questions, Answers = get_each_company(page_nums)
# question_column = pd.Series(Questions, name='Question')
# answer_column = pd.Series(Answers, name='Answer')
# predictions = pd.concat([question_column, answer_column], axis=1)
# df1 = pd.DataFrame({'Question': Questions, 'Answer': Answers})
# df1.to_csv('./新莱应材2015年度业绩说明会.csv' )
# print("finish")
for i in range(len(Questions)):
print ("Q:%s" %Questions[i])
print("A:%s" %Answers[i])
each_company(note)
猜你喜欢
转载自blog.csdn.net/weixin_40411446/article/details/80889056
今日推荐
周排行