金融情感分析,股市预测(二)

"""
以周大生为例;
2018/4/10-2018/6/20
抓取业绩说明会前后的股民评论

"""

#!/usr/bin/env python
# -*-coding:utf-8 -*-
import urllib
import urllib2
import re
#导入对excel文件进行操作的库
import xlwt
# Scrape the first 19 listing pages of the Eastmoney "guba" board 002867 and
# store one spreadsheet row per post.
#
# Column layout (names follow the original variables):
#   0 title, 1 author, 2 publish time (fabiaotime), 3 update time (gengxintime),
#   4 read count (yuedu), 5 comment count (pinglun)

# Create the workbook (UTF-8 encoded) and the single sheet that receives rows.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('dede', cell_overwrite_ok=True)

# Number of rows reserved per listing page: page j owns the rows starting at
# (j - 1) * ROWS_PER_PAGE, so a short page leaves a gap instead of shifting
# the rows of the following pages.
ROWS_PER_PAGE = 80

# NOTE(review): the URL below scrapes board 002867 while the file is named
# 002868 -- looks like a typo in the file name; confirm before renaming.
OUTPUT_PATH = 'G://project/sentimation_analysis/data/002868.xls'

# The patterns do not depend on the page, so compile them once up front.
TITLE_RE = re.compile('<span class.*?title=(.*?)>', re.S)
AUTHOR_RE = re.compile('<span class.*?<a href.*?data-popper.*?>(.*?)</a>', re.S)
TIME_RE = re.compile('<span class.*?data-popper.*?</span><span class.*?>(.*?)</span>.*?<span class.*?>(.*?)</span>', re.S)
NUM_RE = re.compile('<div class.*?articleh.*?<span.*?>(.*?)</span>.*?<span class.*?>(.*?)</span>', re.S)

# j is the page number substituted into the URL to walk through the listing.
for j in range(1, 20):
    print(j)
    url = 'http://guba.eastmoney.com/list,002867,5,f_' + str(j) + '.html'

    # Only the download can raise URLError, so keep the try body minimal.
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            content = response.read().decode('utf-8')
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except urllib2.URLError as e:
        # HTTPError carries a status code; plain URLError only a reason.
        if hasattr(e, "code"):
            print(e.code)

        if hasattr(e, "reason"):
            print(e.reason)
        # Skip this page and carry on with the next one.
        continue

    titles = TITLE_RE.findall(content)
    authors = AUTHOR_RE.findall(content)
    times = TIME_RE.findall(content)    # list of (publish, update) tuples
    nums = NUM_RE.findall(content)      # list of (reads, comments) tuples

    # Titles are read at index i + 1 (the first title match is skipped, as in
    # the original -- presumably a non-post element; TODO confirm), so one
    # fewer title is usable.  Clamp to the shortest list so a page with fewer
    # than ROWS_PER_PAGE posts no longer raises IndexError.
    row_count = min(ROWS_PER_PAGE, len(titles) - 1, len(authors),
                    len(times), len(nums))
    base_row = (j - 1) * ROWS_PER_PAGE

    for i in range(row_count):
        row = base_row + i
        sheet.write(row, 0, titles[i + 1])
        sheet.write(row, 1, authors[i])
        sheet.write(row, 2, times[i][0])
        sheet.write(row, 3, times[i][1])
        sheet.write(row, 4, nums[i][0])
        sheet.write(row, 5, nums[i][1])

    # Save once per page rather than once per row: progress is still
    # persisted if a later page fails, without rewriting the file 80x a page.
    book.save(OUTPUT_PATH)

代码说明:

网站选取:东方财富网---股吧:链接戳这里:周大生--股吧

只抓取业绩说明会前后的文本信息,保存为xls格式,内容形式如下:

猜你喜欢

转载自blog.csdn.net/weixin_40411446/article/details/81185162
今日推荐