批量爬取豆瓣短评并批量生成词云

批量爬取豆瓣短评并批量制作为词云

我分为两步实现:第一步获取短评,第二步制作词云


1.批量爬取豆瓣短评

from bs4 import BeautifulSoup
import requests
import threading


# 获取网页信息
def moviesInfo():
    """Fetch Douban's "now playing in Xi'an" page and return the listed movies.

    Returns:
        list[dict]: one dict per movie with keys 'title' (display name)
        and 'id' (Douban subject id, used to build comment URLs).
    """
    # 1). Fetch the page. A timeout keeps a stalled connection from
    # hanging the whole crawl; raise_for_status surfaces HTTP errors
    # instead of silently parsing an error page.
    url = "https://movie.douban.com/cinema/nowplaying/xian/"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    content = response.text

    # 2). Parse the page; each <li class="list-item"> carries the movie
    # metadata in data-* attributes. (Removed leftover debug print of
    # every raw <li> element.)
    soup = BeautifulSoup(content, 'lxml')
    nowplaying_movie_list = soup.find_all('li', class_='list-item')

    movies_info = []
    for item in nowplaying_movie_list:
        movies_info.append({
            'title': item['data-title'],
            'id': item['data-subject'],
        })
    return movies_info

# 获得指定电影的影评
def getOnePageComment(id, pageNum):
    """Fetch one page of short comments for a movie and append it to disk.

    Args:
        id: Douban subject id of the movie.
        pageNum: 1-based page number; page N starts at offset (N-1)*20.

    Side effects:
        Appends the page's comment text to ./doc/<id>.txt and prints
        progress to stdout.
    """
    # 1). Build the comments URL for this page (20 comments per page).
    start = (pageNum - 1) * 20
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' % (id, start)

    # 2). Fetch the comment page; timeout avoids hanging a worker thread.
    content = requests.get(url, timeout=10).text

    # 3). Extract comment bodies (<span class="short">). str.join is
    # linear, unlike repeated += which is quadratic in the worst case.
    soup = BeautifulSoup(content, 'lxml')
    commentList = soup.find_all('span', class_='short')
    pageComments = "".join(tag.text for tag in commentList)
    print("%s page" % (pageNum))
    print(pageComments)
    # Explicit UTF-8 so writing doesn't depend on the platform default
    # encoding (e.g. GBK on Chinese Windows, which can't encode emoji).
    with open('./doc/%s.txt' % (id), 'a', encoding='utf-8') as f:
        f.write(pageComments)


movies_info = moviesInfo()
for movie in movies_info:
    # Fetch the first 100 comment pages concurrently, one thread per page.
    # The list is re-created per movie: previously it was shared across
    # movies, so every iteration re-joined all already-finished threads
    # from earlier movies.
    threads = []
    for pageNum in range(1, 101):
        t = threading.Thread(target=getOnePageComment, args=(movie['id'], pageNum))
        threads.append(t)
        t.start()
    # Wait for this movie's pages to finish before starting the next movie.
    for t in threads:
        t.join()
    print("执行结束")

.txt文件即为所爬取电影的豆瓣短评(.png文件为生成的词云)

在这里插入图片描述

选取部分短评进行展示

《红高粱》电影短评

在这里插入图片描述

《无双》电影短评

在这里插入图片描述

2.批量生成电影词云

import re
import requests
import wordcloud
import jieba
from PIL import Image
import numpy as np
from bs4 import BeautifulSoup


def moviesInfo():
    """Scrape Douban's now-playing page for Xi'an.

    Returns a list of dicts, each holding a movie's 'title' and its
    Douban subject 'id'.
    """
    # Fetch the now-playing listing page.
    page_url = "https://movie.douban.com/cinema/nowplaying/xian/"
    html = requests.get(page_url).text

    # Every <li class="list-item"> exposes the movie's metadata
    # through data-title / data-subject attributes.
    parsed = BeautifulSoup(html, 'lxml')
    listing = parsed.find_all('li', class_='list-item')

    return [
        {'title': li['data-title'], 'id': li['data-subject']}
        for li in listing
    ]

movies_info = moviesInfo()
for movie in movies_info:
    # 1). Clean the scraped comments: keep only Chinese characters and
    #     ASCII letters, dropping punctuation, digits and emoji.
    #     Explicit UTF-8 on read — assumes the crawler saved the .txt
    #     files as UTF-8 (TODO confirm on non-UTF-8-default platforms).
    with open('./doc/%s.txt' % (movie['id']), encoding='utf-8') as f:
        comments = f.read()
    if comments != '':
        # Fixed misspelled local ("paatern"); join the matches in one
        # linear pass instead of quadratic += concatenation.
        pattern = re.compile(r'[\u4e00-\u9fa5]+|[a-zA-Z]+')
        cleaned = ''.join(pattern.findall(comments))

        # 2). Segment the Chinese text; lcut returns a list
        #     (cut would return a generator).
        words = jieba.lcut(cleaned)

        # 3). Render the word cloud and save it next to the .txt file.
        wc = wordcloud.WordCloud(
            background_color='snow',
            font_path='./font/msyh.ttf',
            min_font_size=10,
            max_font_size=30,
            width=500,
            height=250,
        )
        wc.generate(','.join(words))
        wc.to_file('./doc/%s.png' % (movie['id']))

《红高粱》电影词云

在这里插入图片描述

《影》电影词云

在这里插入图片描述

《嗝嗝老师》电影词云

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_41179709/article/details/83210806