《我不是药神》 (Dying to Survive) reviews: automatically generating a word cloud from each page of comments

Notes:

  1. The picture used as the word-cloud mask must be a black-and-white photo (see the sketch after this list)
  2. Without logging in, Douban only serves the first 12 pages of comment data
  3. Douban only exposes 24 pages of comment data in total
  4. This method is an alternative implementation of the one in the previous blog post
  5. Crawling all of Douban's comments is still not easy at the moment; I have yet to see a truly working approach, and I hope someone manages it and shares the result.
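
On note 1: WordCloud treats pure-white pixels in the mask as background and only draws words on the non-white area, so a color photo should be converted first. A minimal sketch using Pillow, with hypothetical file names ('photo.png' in, 'fbb1.png' out):

from PIL import Image

img = Image.open('photo.png').convert('L')       # hypothetical input file; convert to grayscale
bw = img.point(lambda p: 255 if p > 128 else 0)  # threshold to pure black/white
bw.convert('RGB').save('fbb1.png')               # mask file name expected by wordcloud() below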

import requests
from bs4 import BeautifulSoup
import time
import jieba
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import re
 
def getHtml(url):
    """Fetch one page of comments; return None once Douban stops serving pages."""
    try:
        r = requests.get(url,
                         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                                'Chrome/63.0.3239.132 Safari/537.36 QIHU 360EE'},
                         cookies={'cookie': '1012'})  # placeholder; see the sketch after the program
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        print("Stopping here. Without logging in, Douban only serves the first 12 pages "
              "of comment data; logged-in users can see the first 24 pages "
              "(Douban only exposes 24 pages of comment data).")
        return None
 

def getData(html):
    """Extract the short comments from one page and overwrite new.txt with them."""
    soup = BeautifulSoup(html, "html.parser")
    comment_list = soup.find('div', attrs={'class': 'mod-bd'})
    with open("new.txt", 'wb') as f:
        for comment in comment_list.find_all('div', attrs={'class': 'comment-item'}):
            comment_content = comment.find('span', attrs={'class': 'short'}).get_text()
            f.write(comment_content.encode('utf-8'))
 
def seg_sentence():
    """Keep only the Chinese characters of the crawled text and write them back."""
    with open("new.txt", 'r', encoding='utf-8') as fn1:  # load the crawled comments
        raw = fn1.read()
    # Keeping only CJK characters also strips whitespace, punctuation and digits.
    newtxt = ''.join(re.findall(r'[\u4e00-\u9fa5]', raw))
    with open("new.txt", 'w', encoding='utf-8') as fn2:
        # Closing the file here guarantees the text is flushed before wordcloud() reads it.
        fn2.write(newtxt)
 
def wordcloud(m):
    # Load the black-and-white mask image for this page.
    image = Image.open('fbb' + str(m) + '.png')
    img = np.array(image)
    # Read the cleaned text and segment it with jieba.
    with open('new.txt', 'r', encoding='utf-8') as cut:
        cut_txt = cut.read()
    newtxtls = jieba.lcut(cut_txt)
    # Drop single-character tokens, which are mostly noise.
    txtls = [w for w in newtxtls if len(w) != 1]
    newtxt = ' '.join(txtls)
    wc = WordCloud(
        mask=img,  # when mask is set, height and width are ignored
        background_color='white',
        max_words=500,  # maximum number of words to draw
        max_font_size=40,
        font_path="simhei.ttf").generate(newtxt)

    # Display the result, then save it.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')  # hide the axes
    plt.show()       # show the figure directly
    wc.to_file('wc' + str(m) + '.png')  # save as an image
    
def main():
    k = 0  # start offset in the comments URL
    i = 0  # page counter
    while k < 300:
        url = ('https://movie.douban.com/subject/26752088/comments?start='
               + str(k) + '&limit=20&sort=new_score&status=P')
        k += 20
        i += 1
        print("Crawling page " + str(i))
        time.sleep(1)  # pause between requests to be polite to the server
        html = getHtml(url)
        if html is None:  # Douban stopped serving pages; see getHtml
            break
        getData(html)
        seg_sentence()
        wordcloud(i)

if __name__ == "__main__":
    main()
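
The cookies value '1012' passed in getHtml is only a placeholder; to actually reach pages 13-24 you need the cookies of a logged-in session. A minimal sketch, assuming you copy the Cookie header value from your own logged-in browser session (the value and cookie names below are placeholders, not real credentials):

import requests

LOGGED_IN_COOKIE = 'bid=...; dbcl2=...'  # placeholder; paste your browser's Cookie header here

def getHtmlLoggedIn(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': LOGGED_IN_COOKIE,  # send the raw Cookie header instead of the cookies= dict
    }
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = 'utf-8'
    return r.text

Even when logged in, expect the crawl to stop at page 24 (note 3 above).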



Reposted from blog.csdn.net/weixin_41809469/article/details/85146942