豆瓣影评信息爬取

import csv
import re
import time
import urllib.request

from lxml import etree


def url_open(url):
    """Fetch *url* with browser-like headers and return the body as text.

    The request is sent with a fixed User-Agent, a ``Host`` header of
    ``movie.douban.com`` and a hard-coded Cookie header.

    Args:
        url: Address to request.

    Returns:
        The response body decoded as UTF-8; bytes that cannot be decoded
        are silently dropped.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (for example
        ``urllib.error.URLError``) propagates to the caller.
    """
    page_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Host': 'movie.douban.com',
        'Cookie': 'bid=2lW_lQN3anM; ll="108288";xxxx',
    }
    req = urllib.request.Request(url, headers=page_headers)
    # Use the response as a context manager so the underlying connection is
    # closed deterministically, even if read() or decode() raises.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', 'ignore')

def get_nextpage(url):
    """Return the href(s) of the "next" link in the paginator of *url*.

    The page is downloaded with ``url_open`` and parsed with lxml.

    Args:
        url: Address of the comments page to inspect.

    Returns:
        The list produced by the XPath query: the ``href`` attribute of each
        ``<a class="next">`` directly under ``<div id="paginator">``. The
        list is empty when no such link exists.
    """
    tree = etree.HTML(url_open(url))
    return tree.xpath('//div[@id="paginator"]/a[@class="next"]/@href')


def get_comment(url):
    """Scrape the comments on page *url* and append them to ``douban.csv``.

    The page is downloaded with ``url_open``. Comment id, nickname, vote
    count and timestamp are extracted with XPath; the comment text and the
    rating label are extracted with regular expressions on the raw HTML.
    One CSV row is written per comment, in the column order
    id, nickname, rating, votes, time, content.

    Fields are written through ``csv.writer`` so that commas or quotes in a
    nickname or comment are quoted instead of splitting the row into extra
    columns.

    Args:
        url: Address of the comments page to scrape.

    Returns:
        None. The rows are appended to ``douban.csv`` (UTF-8).
    """
    data = url_open(url)
    html = etree.HTML(data)
    cid_list = html.xpath('//div[@class="comment-item"]/@data-cid')
    name_list = html.xpath('//span[@class="comment-info"]/a/text()')
    vote_list = html.xpath('//span[@class="comment-vote"]/span[@class="votes"]/text()')
    # The trailing space inside "comment-time " is part of the class value
    # being matched; do not trim it.
    posted_list = html.xpath('//span[@class="comment-time "]/@title')

    content_list = re.compile('<p class="">(.*?)<', re.S).findall(data)
    # First <span ...>...</span> after each comment-info header; it holds the
    # rating when the commenter gave one.
    rating_spans = re.compile(
        '<span class="comment-info">.*?(<span class=.*?</span>)', re.S
    ).findall(data)
    # Compiled once here rather than once per comment.
    rating_re = re.compile('rating" title="(.*?)">', re.S)

    # Open the output file once per page instead of once per row.
    with open('douban.csv', 'a', encoding='utf-8') as fh:
        # lineterminator='\n' keeps line endings consistent with the rows
        # the rest of the script writes with plain fh.write(... + '\n').
        writer = csv.writer(fh, lineterminator='\n')
        # zip() walks the parallel lists in lockstep and stops at the
        # shortest one, so a page where one query matched fewer items no
        # longer aborts with IndexError part-way through.
        rows = zip(rating_spans, cid_list, name_list, vote_list,
                   posted_list, content_list)
        for span, cid, name, vote, posted, content in rows:
            found = rating_re.search(span)
            # Comments without a rating get the literal text 'None'.
            rat = found.group(1) if found else 'None'
            # Strip spaces and flatten line breaks so each comment stays on
            # a single CSV line.
            text = str(content).replace(' ', '').replace('\n', ';')
            writer.writerow([str(cid), name, rat, str(vote), str(posted), text])


if __name__ == '__main__':
    # Script entry point: write the CSV header, then walk the comment pages
    # of one movie, following the paginator's "next" link until it is absent.
    base_url = 'https://movie.douban.com/subject/26363254/comments'
    url = base_url

    with open('douban.csv', 'a', encoding='utf-8') as fh:
        fh.write('评论ID,豆瓣昵称,评价,赞同人数,评论时间,评论内容\n')

    while True:
        print(url)
        # Record which page the following rows came from.
        with open('douban.csv', 'a', encoding='utf-8') as fh:
            fh.write(url + ',,,,,\n')
        get_comment(url)

        next_page = get_nextpage(url)
        if len(next_page) == 0:
            break
        # The "next" href is appended directly to the base comments URL.
        url = base_url + next_page[0]
        # Pause between pages.
        time.sleep(2)
豆瓣影评信息爬取

猜你喜欢