Scraping the Weibo Hot Search List with Python and Building a Word Cloud

The script below fetches Weibo's real-time hot search page, extracts the entry titles with XPath, segments them with jieba, counts word frequencies with pandas, and renders the counts as a word cloud shaped by a mask image.

import jieba
from lxml import etree
import urllib.request
from wordcloud import WordCloud
import pandas as pd
from imageio import imread  # on imageio >= 3 this still works but warns; imageio.v2.imread is the drop-in
import matplotlib.pyplot as plt

def getpage(url):
    # Fetch the page HTML, sending a browser User-Agent so the request is not rejected
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    data = urllib.request.urlopen(req).read().decode('utf-8')
    return data
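To sanity-check the fetch step in isolation, here is a minimal sketch, assuming the page is still served without login; the Cookie line is a hypothetical placeholder for the case where Weibo redirects to its login page instead.

import urllib.request

url = "https://s.weibo.com/top/summary?cate=realtimehot"
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0")
# Hypothetical: if the response is a login redirect, attach a session
# cookie copied from a logged-in browser, e.g.
# req.add_header("Cookie", "SUB=<your session cookie>")
print(urllib.request.urlopen(req).read().decode("utf-8")[:300])  # peek at the raw HTML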

def getdata(data):
    # Parse the HTML and pull every hot-search title out of the ranking table
    html = etree.HTML(data)
    top_search = html.xpath('//td[@class="td-02"]/a[@href]/text()')
    return top_search
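To see what that XPath actually selects, here is a self-contained sketch run against a hand-written fragment that mimics the ranking table's markup (the sample titles are made up):

from lxml import etree

sample = """
<table>
  <tr><td class="td-02"><a href="/weibo?q=a">示例话题一</a></td></tr>
  <tr><td class="td-02"><a href="/weibo?q=b">示例话题二</a></td></tr>
</table>
"""
html = etree.HTML(sample)
print(html.xpath('//td[@class="td-02"]/a[@href]/text()'))
# ['示例话题一', '示例话题二'] -- one string per hot-search title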

def cut_words(top_search):
    top_cut = []
    for top in top_search:
        top_cut.extend(list(jieba.cut(top)))  # segment each title with jieba's precise (default) mode
    return top_cut
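jieba.cut defaults to precise mode, which splits a phrase into non-overlapping words; a quick check (the exact split depends on jieba's dictionary version):

import jieba

print(list(jieba.cut("微博热搜榜")))
# something like ['微博', '热搜', '榜'], one token per word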

if __name__=="__main__":
    url="https://s.weibo.com/top/summary?cate=realtimehot"
    top_search = getdata(getpage(url))
    all_words = cut_words(top_search)

    # Stop words: function words and stray tokens to drop before counting
    stop = ['的','你','了','将','为','例',' ','多','再','有','是','等','天','次']
    words_cut = []
    for word in all_words:
        if word not in stop:
            words_cut.append(word)
    word_count = pd.Series(words_cut).value_counts()  # frequency of each word, sorted descending
    back_ground = imread("E:\\python\\flower.jpg")  # mask image; its outline defines the cloud's shape
    wc = WordCloud(
                   font_path="C:\\Windows\\Fonts\\simhei.ttf",  # a font with Chinese glyphs, or the words render as boxes
                   background_color="white",  # background color of the canvas
                   max_words=1000,  # maximum number of words in the cloud
                   mask=back_ground,  # shape mask: words fill the non-white area
                   max_font_size=200,  # font size of the most frequent word
                   random_state=50  # random seed, so layout and colors are reproducible
                  )
    wc1 = wc.fit_words(word_count)  # generate the cloud from the frequency counts (fit_words returns the WordCloud itself)
    plt.figure()
    plt.imshow(wc1)
    plt.axis("off")
    plt.show()
    wc.to_file("ciyun.png")

(Figure: the generated word cloud, saved as ciyun.png)
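A note on fit_words: in the wordcloud library it is an alias for generate_from_frequencies, which expects a word-to-frequency mapping; the pandas Series above works because it is dict-like. A minimal sketch with made-up counts (the font path assumes Windows, as in the script):

from wordcloud import WordCloud

freqs = {"微博": 12, "热搜": 9, "词云": 5}  # made-up counts standing in for word_count
wc = WordCloud(font_path="C:\\Windows\\Fonts\\simhei.ttf", background_color="white")
wc.fit_words(freqs)  # equivalent to wc.generate_from_frequencies(freqs)
wc.to_file("demo.png")  # write the rendered cloud to disk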


Reposted from blog.csdn.net/qq_38883271/article/details/104497482