Further Understanding of Word Cloud Analysis

  1. Douban movies
    Douban comment analysis:
    1). Fetch the first 10 pages of comments for every movie currently showing on Douban;
    2). Clean the data;
    3). Analyze each movie's comments, render them as a word cloud, and save it as a PNG image named after the movie (movie title.png);
import requests
from bs4 import BeautifulSoup
import re
import jieba
import wordcloud
import numpy
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

def get_movie(url):
    # Parse the "now playing" page and collect each movie's title and subject id
    response = requests.get(url)
    content = response.text
    soup = BeautifulSoup(content, 'html.parser')
    nowplaying_movie_list = soup.find_all('li', class_='list-item')
    movies_info = []
    for item in nowplaying_movie_list:
        nowplaying_movie_dict = {}
        nowplaying_movie_dict['title'] = item['data-title']
        nowplaying_movie_dict['id'] = item['id']
        movies_info.append(nowplaying_movie_dict)
    return movies_info

def get_info(movie_id, pageNum):
    # Fetch one page (20 items) of short comments for the given movie id
    start = 20 * (pageNum - 1)
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' % (movie_id, start)
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')
    commentsList = soup.find_all('span', class_='short')
    comments = ''
    for commentTag in commentsList:
        comments += commentTag.text
    return comments

def word_cloud(comment,name):
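    # Data cleaning: keep only runs of Chinese characters and English letters from the raw comments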
    pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
    deal_comments = re.findall(pattern, comment)
    newComments = ''
    for item in deal_comments:
        newComments += item
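    # Segment the cleaned text into a list of words with jieba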
    result = jieba.lcut(newComments)
    imageObj = Image.open('./image.jpg')
    cloud_mask = numpy.array(imageObj)
    wc = wordcloud.WordCloud(
        background_color='snow',
        mask=cloud_mask,
        font_path='./msyh.ttf',
        min_font_size=5,
        max_font_size=50,
        width=260,
        height=260,
    )
    wc.generate(','.join(result))
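    # Note: the ./img output directory must already exist; to_file will fail if it does not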
    wc.to_file('./img/%s.png' % (name))

def main():
    url = 'https://movie.douban.com/cinema/nowplaying/xian/'
    movies_info = get_movie(url)
    with ThreadPoolExecutor(max_workers=10) as pool:
        for movie in movies_info:
            movie_id = movie['id']
            name = movie['title']
            # Fetch the first 10 comment pages for this movie concurrently, then join them
            comments = ''.join(pool.map(get_info, [movie_id] * 10, range(1, 11)))
            word_cloud(comments, name)


main()
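
Douban sometimes serves empty pages or errors to clients that use the default requests User-Agent, so if the script above returns no comments, passing a browser-like header is the usual workaround. A minimal sketch (the header value and the get_page name are illustrative, not part of the original code):

import requests

# Illustrative browser-like header; Douban tends to reject the default requests User-Agent
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def get_page(url):
    # requests.get accepts a headers dict; raise_for_status surfaces HTTP errors early
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    return response.text

get_movie and get_info above could then call get_page instead of requests.get directly.
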
  2. Imooc
    Crawl the names and descriptions of all Python-related courses on Imooc (慕课网), then analyze and display them with a word cloud;
import re
import requests
from bs4 import BeautifulSoup
import jieba
import numpy
from PIL import Image
import wordcloud

def get_html(url):
    return requests.get(url).text

def get_name(text):
    # Extract course names and descriptions from one page of search results
    soup = BeautifulSoup(text, 'html5lib')
    nametag_li = soup.find_all('div', class_="course-item-detail")
    info_li = []
    for tag in nametag_li:
        info = {}
        # Crude cleaning: keep only runs of Chinese characters from the item's HTML;
        # the first run is taken as the course name, the rest as its description
        words = re.findall(r'[\u4E00-\u9FA5]+', str(tag))
        if not words:
            continue
        info['name'] = words[0]
        info['info'] = words[1:]
        info_li.append(info)
    return info_li

def word_cloud(text):
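    # Use image.jpg as the mask that defines the word cloud's shape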
    imgobj = Image.open('./image.jpg')
    cloud_mask = numpy.array(imgobj)
    result = jieba.lcut(text)
    wc = wordcloud.WordCloud(
        width=500,
        mask=cloud_mask,
        max_font_size=50,
        min_font_size=5,
        background_color='snow',
        font_path = './msyh.ttf',
    )
    wc.generate(','.join(result))
    wc.to_file('./muke.png')


def main():
    li = []
    # Walk the first two pages of Imooc's Python course search results
    for i in range(2):
        url = 'https://www.imooc.com/search/course?words=python&page=%d' % (i + 1)
        info = get_name(get_html(url))
        for j in info:
            li.append(j['name'])
            li.append(''.join(j['info']))
    word_cloud(''.join(li))

main()
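
The task asks for all Python courses, but main() above only walks the first two result pages. A minimal sketch of paging until an empty result page, assuming the get_html and get_name helpers defined above (max_pages is just an arbitrary safety cap):

def collect_all(words='python', max_pages=50):
    # Request result pages one by one until a page yields no courses (or the cap is hit)
    collected = []
    for page in range(1, max_pages + 1):
        url = 'https://www.imooc.com/search/course?words=%s&page=%d' % (words, page)
        info = get_name(get_html(url))
        if not info:
            break
        collected.extend(info)
    return collected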

Execution result: (word cloud image omitted)

  3. Crawling today's top 10 Baidu hot-search news items with Python
from bs4 import BeautifulSoup
from urllib.request import urlopen


def get_html(url):
    # The page is GB-encoded; 'gbk' is a superset of gb2312 and decodes it more robustly
    return urlopen(url).read().decode('gbk')


def get_info(text):
    soup = BeautifulSoup(text, 'html5lib')
    info_li = soup.find_all('a', class_='list-title')
    # Keep only the first 10 headlines
    news_li = [tag.string for tag in info_li[:10]]
    return news_li


def main():
    url = 'http://top.baidu.com/buzz?b=341'
    news_li = get_info(get_html(url))
    for news in news_li:
        print(news)


main()
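
Since the theme of this post is word cloud analysis, the headlines could also be visualized the same way as the earlier examples. A minimal sketch, assuming jieba, wordcloud, and the ./msyh.ttf font from the previous sections are available:

import jieba
import wordcloud

def news_word_cloud(news_li, out_path='./baidu_news.png'):
    # Segment the joined headlines and render them as a word cloud
    words = jieba.lcut(''.join(news_li))
    wc = wordcloud.WordCloud(
        background_color='snow',
        font_path='./msyh.ttf',  # assumed to exist, as in the earlier examples
        width=500,
        height=300,
    )
    wc.generate(','.join(words))
    wc.to_file(out_path)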

Execution result: (screenshot omitted)


Reposted from blog.csdn.net/weixin_42668123/article/details/83185307