import csv
import re
import time
import urllib.request

from lxml import etree
def url_open(url, timeout=30):
    """Fetch *url* with browser-like request headers and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on blocking network operations before
            ``urlopen`` gives up. Without a timeout a stalled connection
            would block the scraper indefinitely.

    Returns:
        The response body decoded as UTF-8; bytes that are not valid UTF-8
        are dropped (``errors='ignore'``).

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urllib.request.urlopen``).
    """
    page_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Host': 'movie.douban.com',
        # Placeholder session cookie copied from a browser session.
        'Cookie': 'bid=2lW_lQN3anM; ll="108288";xxxx',
    }
    req = urllib.request.Request(url, headers=page_headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode('utf-8', 'ignore')
def get_nextpage(url):
    """Return the ``href`` values of the "next" link in the page's paginator.

    The page at *url* is downloaded with ``url_open`` and parsed as HTML.
    The result is the list produced by the XPath query; it is empty when
    the paginator has no ``<a class="next">`` element.
    """
    tree = etree.HTML(url_open(url))
    return tree.xpath('//div[@id="paginator"]/a[@class="next"]/@href')
def get_comment(url):
    """Download one comment page and append its comments to ``douban.csv``.

    Each appended row has the columns: comment id, nickname, rating title
    (the literal string ``'None'`` when no rating is found), vote count,
    comment time, and comment text. In the comment text all spaces are
    removed and newlines are replaced by ``';'``.

    Rows are written with ``csv.writer`` so commas or quotes inside a
    nickname or comment are quoted instead of silently shifting columns.

    Args:
        url: Address of the comment page to scrape.

    NOTE(review): the fields are extracted as independent parallel lists and
    paired by position. If the lists differ in length, only as many rows as
    the shortest list are written (previously this raised ``IndexError``
    part-way through the page).
    """
    data = url_open(url)
    html = etree.HTML(data)

    cid_list = html.xpath('//div[@class="comment-item"]/@data-cid')
    name_list = html.xpath('//span[@class="comment-info"]/a/text()')
    vote_list = html.xpath('//span[@class="comment-vote"]/span[@class="votes"]/text()')
    # The trailing space in "comment-time " is kept exactly as written;
    # presumably it mirrors the site's markup -- verify before changing.
    time_list = html.xpath('//span[@class="comment-time "]/@title')

    # Comment text and the rating span are pulled from the raw HTML by regex.
    content_list = re.findall('<p class="">(.*?)<', data, re.S)
    rate_span_list = re.findall(
        '<span class="comment-info">.*?(<span class=.*?</span>)', data, re.S
    )
    # Compiled once per page rather than once per comment.
    rating_re = re.compile('rating" title="(.*?)">', re.S)

    # Open the output once per page instead of once per row.
    # lineterminator='\n' with the default text-mode newline handling keeps
    # line endings identical to the plain fh.write(... + '\n') calls that
    # write the other lines of this file.
    with open('douban.csv', 'a', encoding='utf-8') as fh:
        writer = csv.writer(fh, lineterminator='\n')
        rows = zip(rate_span_list, cid_list, name_list, vote_list,
                   time_list, content_list)
        for rate_span, cid, name, vote, comment_time, content in rows:
            found = rating_re.findall(rate_span)
            rating = found[0] if found else 'None'
            text = str(content).replace(' ', '').replace('\n', ';')
            writer.writerow(
                [str(cid), str(name), rating, str(vote), str(comment_time), text]
            )
if __name__ == '__main__':
    # Walk the comment pages of one Douban subject, following the "next"
    # link until the paginator no longer offers one.
    base_url = 'https://movie.douban.com/subject/26363254/comments'
    url = base_url

    # Header row: comment id, nickname, rating, votes, time, content.
    with open('douban.csv', 'a', encoding='utf-8') as out:
        out.write('评论ID,豆瓣昵称,评价,赞同人数,评论时间,评论内容\n')

    while True:
        print(url)
        # Marker row recording which page the following comments came from.
        with open('douban.csv', 'a', encoding='utf-8') as out:
            out.write(url + ',,,,,\n')
        get_comment(url)

        next_page = get_nextpage(url)
        if not next_page:
            break
        # The "next" href is appended directly to the base comments URL.
        url = base_url + next_page[0]
        # Pause between pages.
        time.sleep(2)
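# Douban movie review scraper.
# (Web-page residue: "You may also like")
# Reposted from blog.csdn.net/d1240673769/article/details/79859102
# (Web-page residue: "Today's picks")
# (Web-page residue: "Weekly ranking")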