爬虫综合大作业---分析《我不是药神》豆瓣电影短评

作业要求来源：https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/3075

《我不是药神》是这几年我看过的电影里让我感触最深的一部，它也是豆瓣上多年来仅有的六部评分在 9.0 以上的华语电影之一。

影片讲述了神油店老板程勇从一个交不起房租的男性保健品商贩，一跃成为印度仿制药“格列宁”独家代理商的故事。

打开豆瓣电影《我不是药神》的短评网页，右键选择“检查”或者按 F12，然后选中用户名和评论，就会显示出对应的代码部分。

获取用户名和短评

# XPath for the user-name link text of the i-th entry under the element with id="comments".
# NOTE(review): `x` and `i` are not defined in this snippet; presumably `x` is a parsed
# HTML tree exposing .xpath() and `i` is the 1-based position of the comment div — confirm.
name=x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
# XPath for the paragraph text (the short review) of the same entry.
content=x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/text()'.format(i))

主要代码

// Module dependencies. `iconv` is loaded here but not referenced in the visible code.
const fs = require('fs');
const iconv = require('iconv-lite');

// Record of URLs that have already been written to the log (URL -> true).
const logMap = {};

// Log stream opened in append mode ('a') so earlier log content is preserved.
const logger = fs.createWriteStream('./urlLog.log', { flags: 'a' });
/**
 * Write a URL to the log stream the first time it is seen.
 *
 * Repeated calls with a URL already recorded in `logMap` write nothing.
 *
 * @param {string} url - URL to record.
 */
function logPageFile(url) {
    var seenBefore = Boolean(logMap[url]);
    if (seenBefore) {
        return;
    }

    logMap[url] = true;
    var line = url + '\r\n';
    logger.write(line);
}
/**
 * POST an already-serialized JSON payload to the local collector at 127.0.0.1:9999.
 *
 * Failures of the outgoing request (e.g. the collector is not running) are
 * written to the log stream instead of being raised.
 *
 * @param {string} post_data - Request body, already serialized by the caller
 *     (e.g. with JSON.stringify).
 * @param {string} path - Endpoint path without the leading slash.
 * @param {function(string): void} [cb] - Optional. Called with each UTF-8
 *     decoded chunk of the response body.
 */
function postData(post_data, path, cb) {
    var http = require('http');

    // Where to post to. Content-Length is the byte length, not the character
    // count, so multi-byte (e.g. Chinese) payloads are framed correctly.
    var post_options = {
        host: '127.0.0.1',
        port: '9999',
        path: '/' + path,
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'Content-Length': Buffer.byteLength(post_data)
        }
    };

    // Set up the request
    var post_req = http.request(post_options, function (res) {
        res.setEncoding('utf8');
        if (typeof cb === 'function') {
            res.on('data', cb);
        } else {
            // No consumer: drain the response so it does not stay paused.
            res.resume();
        }
    });

    // Without an 'error' listener a refused or reset connection is emitted as
    // an unhandled 'error' event, which terminates the whole process.
    post_req.on('error', function (err) {
        logger.write('request post data failed: ' + err.message + '\r\n');
    });

    logger.write('request post data 1\r\n');

    // post the data
    post_req.write(post_data);

    logger.write('request post data 2\r\n');
    post_req.end();
}

module.exports = {
    summary: 'a rule to modify response',
    * beforeSendResponse(requestDetail, responseDetail) {

      if (/movie\/1200486/i.test(requestDetail.url)) {
          logger.write('matched: ' + requestDetail.url + '\r\n');
          if (responseDetail.response.toString() !== "") {
              logger.write(responseDetail.response.body.toString());
              var post_data = JSON.stringify({
                  'url': requestDetail.url,
                  'body': responseDetail.response.body.toString()
              });
              logger.write("post comment to server -- ext");
              postData(post_data, 'douban_comment', function (chunk) {
              });
         }
      }
    },
};

import requests
from bs4 import BeautifulSoup
import time
import jieba
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plot
 
def getHtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    A browser-like User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or None if the request failed, timed out, or the
        server answered with an error status ("Failed!!!" is printed then).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # Catch only request failures so Ctrl-C and programming errors still surface.
        print("Failed!!!")
        return None
 
# Module-level output file that getData() writes the scraped comment text to.
# Mode 'wb+' truncates any existing file and opens it for binary read/write.
# The handle is not closed anywhere in the visible code.
f = open("E:/movieComment.txt",'wb+')
def getData(html):
    """Extract the short-review texts from a comments page and write them to ``f``.

    Each ``div.comment-item`` inside the ``div.mod-bd`` container contributes
    the text of its ``span.short`` element, UTF-8 encoded, to the module-level
    file ``f``.

    Args:
        html: Page source as a string. None or an empty string (e.g. after a
            failed download) is ignored.
    """
    if not html:
        return

    soup = BeautifulSoup(html, "html.parser")
    comment_list = soup.find('div', attrs={'class': 'mod-bd'})
    if comment_list is None:
        # Page has no comment container (blocked request, layout change, ...).
        return

    for comment in comment_list.find_all('div', attrs={'class': 'comment-item'}):
        short = comment.find('span', attrs={'class': 'short'})
        if short is None:
            # Skip entries without a short-review span instead of crashing.
            continue
        f.write(short.get_text().encode('UTF-8'))
 
def seg_sentence():
    """Segment the scraped comments with jieba and drop stop words.

    Reads ``E:/stopwords.txt`` (one stop word per line) and
    ``E:/movieComment.txt`` (the scraped text, UTF-8), then writes the
    remaining tokens to ``E:/new.txt``, each followed by a single space.
    Tokens that are stop words or a tab character are discarded.
    """
    # Build the stop-word collection; a set makes each membership test O(1).
    filefath = 'E:/stopwords.txt'
    with open(filefath, 'r') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    # Load the scraped content.
    with open("E:/movieComment.txt", 'r', encoding='utf-8') as source:
        fn1 = source.read()

    # Segment the text; cut_all=False selects jieba's precise mode.
    sentence_seged = jieba.cut(fn1, cut_all=False)
    final = ''.join(
        word + " "
        for word in sentence_seged
        if word not in stopwords and word != '\t'
    )

    # Write the text with stop words removed; the context manager guarantees
    # the data is flushed before the file is read again elsewhere.
    with open("E:/new.txt", "w", encoding='utf-8') as fn2:
        fn2.write(final)
 
def wordcloud():
    """Render a word cloud from ``E:/new.txt`` and display it.

    The image ``E:/wc.jpg`` is used as the mask, and the result is shown in a
    matplotlib window.
    """
    # Load the mask image.
    image = Image.open("E:/wc.jpg", 'r')
    img = np.array(image)

    # Load the text with stop words already removed.
    with open('E:/new.txt', 'r', encoding='utf-8') as text_file:
        cut_text = text_file.read()

    # Build the word cloud.
    cloud = WordCloud(
        mask=img,  # when a mask is given, height and width are ignored
        height=2000,  # image height
        width=4000,  # image width
        background_color='white',
        max_words=1000,  # maximum number of words
        max_font_size=400,
        # Raw string: "\W", "\F" and "\m" are not valid escape sequences.
        # If characters render as empty boxes, switch to another font in this directory.
        font_path=r"C:\Windows\Fonts\msyh.ttc",
    ).generate(cut_text)

    # Display the image.
    plot.imshow(cloud, interpolation='bilinear')
    plot.axis('off')  # hide the axes
    plot.show()        # show directly
    #plot.savefig('E:/wc1.jpg') # save as an image instead
 
def main():
    """Crawl ten pages of short reviews, then segment them and show the word cloud.

    The ``start`` query parameter goes 0, 20, ..., 180 (20 comments per page).
    Pages that fail to download are skipped.
    """
    # Pagination: start advances in steps of 20 and stays below 200.
    for i, k in enumerate(range(0, 200, 20), start=1):
        url = 'https://movie.douban.com/subject/26752088/comments?start=' + str(k) + '&limit=20&sort=new_score&status=P'
        print("正在爬取第" + str(i) + "页的数据")
        #time.sleep(2) # optional delay between requests
        html = getHtml(url)
        if html is None:
            # Download failed; there is nothing to parse for this page.
            continue
        getData(html)

    # The comments are written through the buffered module-level file `f`.
    # Flush it so seg_sentence(), which re-opens the same path, sees everything.
    f.flush()
    seg_sentence()
    wordcloud()
# Run the crawl only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
   main()