代码参考:爬取了陈奕迅新歌《我们》10万条评论数据发现:原来,有些人只适合遇见
1. 找到评论url与请求方式:
header与form data(忽略信息解密)
2. 抓取热门评论
3. 热评词云
4. wordcloud练习:按图片的形状和颜色布局生成词云
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread was removed in SciPy 1.2. matplotlib is already a
    # dependency of this script and its reader also returns an integer
    # array for JPEG files, so it is used as the fallback.
    from matplotlib.pyplot import imread
# Word-cloud exercise: lay the words out following the shape and colours of
# a background picture.

# Location of the stop-word list (one entry per line).
stopwords_path = 'D:/workspace/my exercises/netmusic/stopwords.txt'
# Location of the background picture used as mask and colour source.
back_coloring_path = 'D:/workspace/my exercises/netmusic/leslie.jpg'
# Font file able to render Chinese glyphs. Raw string so that the
# backslashes are not read as (invalid) escape sequences.
font_path = r'C:\Fonts\simkai.ttf'

# Background picture loaded as an array; it is passed both as the word-cloud
# mask and to ImageColorGenerator below.
back_coloring = imread(back_coloring_path)


def jiebaclearText(text):
    """Segment *text* with jieba and drop stop words and short tokens.

    The stop-word list is read from ``stopwords_path`` on every call.
    A token is kept when, after stripping surrounding whitespace, it is
    longer than one character and is not in the stop-word list.

    Args:
        text: The raw text to segment.

    Returns:
        The kept tokens joined by single spaces (an empty string when
        nothing is kept).

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # NOTE(review): the file is opened with the platform default encoding,
    # exactly as before. Confirm the encoding of the stop-word file and pass
    # ``encoding=...`` explicitly if it is not the platform default.
    with open(stopwords_path) as f_stop:
        # A set gives O(1) membership tests; entries are stripped because
        # the tokens they are compared against are stripped as well.
        stopwords = {line.strip() for line in f_stop.read().split('\n')}

    words = (token.strip() for token in jieba.cut(text, cut_all=False))
    return ' '.join(
        word for word in words if len(word) > 1 and word not in stopwords
    )


# NOTE(review): content_text is not defined in this snippet; presumably it is
# the comment text collected in the earlier steps -- confirm it is in scope.
text = jiebaclearText(content_text)

wc = WordCloud(
    font_path=font_path,       # font used to draw the words
    background_color="white",  # canvas background colour
    max_words=5000,            # upper bound on the number of words drawn
    mask=back_coloring,        # picture that shapes the cloud
    max_font_size=100,         # largest font size used
    random_state=84,           # fixed seed for a reproducible layout
    # NOTE(review): the wordcloud documentation states that width/height are
    # ignored when a mask is supplied; kept unchanged from the original call.
    width=1000,
    height=860,
    margin=2,
)
wc.generate(text)

# Colour function that takes its colours from the background picture.
image_colors = ImageColorGenerator(back_coloring)

# First figure: the word cloud recoloured with the picture's colours.
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")

# Second figure: the background picture itself, for comparison.
plt.figure()
plt.imshow(back_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
原图与词云对比:哈哈,真爱粉也看不出来……