"""
以周大生为例;
2018/4/10-2018/6/20
抓取业绩说明会前后的股民评论
"""
#!/usr/bin/env python
# -*-coding:utf-8 -*-
import urllib
import urllib2
import re
#导入对excel文件进行操作的库
import xlwt
# Scrape list pages 1-19 of one Eastmoney "guba" board and store, for every
# post found, six columns in an .xls sheet: title, author, two timestamps and
# two counters. Each page owns a fixed band of ROWS_PER_PAGE sheet rows, so a
# failed or short page leaves blank rows instead of shifting later pages.

# Workbook (UTF-8, no style compression) and the single sheet that receives
# all rows; cell_overwrite_ok lets a re-run overwrite already written cells.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('dede', cell_overwrite_ok=True)

# Not referenced anywhere in the visible script; kept so the module-level
# name continues to exist.
length1 = 1192

# Number of sheet rows reserved for (and maximum rows taken from) each page.
ROWS_PER_PAGE = 80

# NOTE(review): the URL below uses board code 002867 while this file is named
# 002868 -- confirm which one is intended.
OUTPUT_PATH = 'G://project/sentimation_analysis/data/002868.xls'

# The patterns do not depend on the page, so compile them once.
TITLE_RE = re.compile('<span class.*?title=(.*?)>', re.S)
AUTHOR_RE = re.compile('<span class.*?<a href.*?data-popper.*?>(.*?)</a>', re.S)
TIME_RE = re.compile('<span class.*?data-popper.*?</span><span class.*?>(.*?)</span>.*?<span class.*?>(.*?)</span>', re.S)
COUNT_RE = re.compile('<div class.*?articleh.*?<span.*?>(.*?)</span>.*?<span class.*?>(.*?)</span>', re.S)

# The page number is substituted into the URL to walk through the list pages.
for page in range(1, 20):
    print(page)
    url = 'http://guba.eastmoney.com/list,002867,5,f_' + str(page) + '.html'

    try:
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
    except urllib2.URLError as e:
        # Report whatever detail the error carries, then move on to the next
        # page; this page's band of rows stays empty.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        continue

    # The first title match is skipped, as in the original indexing
    # (title[i + 1]); presumably it is the list header -- verify against the
    # page markup.
    titles = TITLE_RE.findall(content)[1:]
    authors = AUTHOR_RE.findall(content)
    times = TIME_RE.findall(content)      # (first time, second time) pairs
    counts = COUNT_RE.findall(content)    # (first count, second count) pairs

    # zip() stops at the shortest list, so a page with fewer than
    # ROWS_PER_PAGE complete posts no longer raises IndexError and aborts the
    # whole run before anything is saved.
    rows = list(zip(titles[:ROWS_PER_PAGE],
                    authors[:ROWS_PER_PAGE],
                    times[:ROWS_PER_PAGE],
                    counts[:ROWS_PER_PAGE]))
    if len(rows) < ROWS_PER_PAGE:
        print('page %d: only %d complete rows found' % (page, len(rows)))

    first_row = (page - 1) * ROWS_PER_PAGE
    for offset, (title, author, time_pair, count_pair) in enumerate(rows):
        # Column order matches the original script: title, author, the two
        # captured times (originally named fabiaotime / gengxintime), then
        # the two captured counters (originally named yuedu / pinglun).
        cells = (title, author,
                 time_pair[0], time_pair[1],
                 count_pair[0], count_pair[1])
        for column, value in enumerate(cells):
            sheet.write(first_row + offset, column, value)

    # Save after every page so pages scraped so far are on disk if a later
    # page fails.
    book.save(OUTPUT_PATH)
代码说明:
网站选取:东方财富网的股吧栏目,本例为周大生股吧(列表页地址形如 http://guba.eastmoney.com/list,002867,5,f_1.html,末尾数字为页码)。
只抓取业绩说明会前后的文本信息,保存为xls格式,内容如下形式: