【NLP】NO3:文本可视化

常用可视化图:词云、分布图、Document Cards、树状图、网络图、力导向图、叠式图、Word Tree、地理热力图、ThemeRiver、SparkClouds、TextFlow、基于矩阵视图的情感分析可视化。

前端可视化库:ECharts(原百度 Echarts,现已成为 Apache 项目)https://echarts.apache.org/examples/

一、词云

分词、去停用词、统计词频、绘制词云

# ------------------------------------------------------------------
# Word-cloud pipeline: segment text -> count word frequencies ->
# drop stop words -> render two word clouds (default style + mask).
# ------------------------------------------------------------------
import jieba
import pandas as pd
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator  # word-cloud rendering
import matplotlib.pyplot as plt  # display images; also reads the mask image


# Base directory holding corpus, stop-word list, font and mask image.
# (Renamed from `dir`, which shadowed the builtin of the same name.)
base_dir = "/Users/jianliu/Downloads/chat1/wordcloud/"
# Corpus file path
file = "".join([base_dir, "z_m.csv"])
# Stop-word file path
stop_words = "".join([base_dir, "stopwords.txt"])
# TrueType font used by wordcloud (required to render CJK glyphs)
simhei = "".join([base_dir, "simhei.ttf"])

# Load the corpus; drop any rows containing NaN
df = pd.read_csv(file, encoding='utf-8')
df.dropna(inplace=True)
# Turn the `content` column into a plain list of strings
content = df.content.values.tolist()

# Segment every line with jieba (search-engine mode)
segment = []
for line in content:
    try:
        segs = jieba.cut_for_search(line)
        segs = [v for v in segs if not str(v).isdigit()]  # drop pure digits
        segs = list(filter(lambda x: x.strip(), segs))    # drop whitespace-only tokens
        for seg in segs:
            # keep only multi-character tokens; skip line-break artifacts
            if len(seg) > 1 and seg != '\r\n':
                segment.append(seg)
    except Exception:
        # Original used a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; narrow it and keep the best-effort
        # behavior of logging the offending line and moving on.
        print(line)
        continue

# Collect the segments into a one-column DataFrame
words_df = pd.DataFrame({'segment': segment})
# Load the stop-word list (one word per line)
stopwords = pd.read_csv(stop_words, index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='utf-8')
# Group by word, count occurrences, sort by count descending
words_stat = words_df.groupby(by=['segment'])['segment'].agg(np.size)
words_stat = words_stat.to_frame()
words_stat.columns = ['计数']
words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
# Remove stop words after counting
words_stat = words_stat[~words_stat.segment.isin(stopwords.stopword)]

# --- Word cloud #1: default style, driven directly by frequencies ---
wordcloud = WordCloud(font_path=simhei, background_color="white", max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
# generate_from_frequencies is the documented API; fit_words is a legacy alias
wordcloud = wordcloud.generate_from_frequencies(word_frequence)
plt.imshow(wordcloud)
wordcloud.to_file(base_dir + 'wordcloud_1.jpg')  # save result

# --- Word cloud #2: shaped by a custom mask image ---
text = " ".join(words_stat['segment'].head(100).astype(str))
print(base_dir + "china.jpg")
# scipy.misc.imread was removed in SciPy 1.2; matplotlib's imread is a
# drop-in replacement and pyplot is already imported above.
abel_mask = plt.imread(base_dir + "china.jpg")  # mask image: a map of China
wordcloud2 = WordCloud(background_color='white',  # background color
                       mask=abel_mask,            # shape mask
                       max_words=3000,            # max number of words shown
                       font_path=simhei,          # font (required for CJK)
                       width=2048,
                       height=1024,
                       scale=4.0,
                       max_font_size=300,         # largest font size
                       random_state=42).generate(text)
# Recolor the cloud using the mask image's own colors
image_colors = ImageColorGenerator(abel_mask)
wordcloud2.recolor(color_func=image_colors)
# Display and save
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()
wordcloud2.to_file(base_dir + 'wordcloud_2.jpg')  # save result

(此处原有词云效果图,转载时图片丢失)

二、ER图

# Build a directed graph linking classes to classrooms and draw it.
# Fix: the original `import Matplotlib as plt` / `import NetworkX as nx`
# raise ModuleNotFoundError — Python module names are lowercase, and
# pyplot must be imported explicitly.
import matplotlib.pyplot as plt
import networkx as nx

# assumes `df` (defined earlier in the article) has 'class' and
# 'classroom' columns — TODO confirm against the data file
classes = df['class'].values.tolist()
classrooms = df['classroom'].values.tolist()
# Nodes: deduplicated union of all classes and classrooms
nodes = list(set(classes + classrooms))
# Edges: deduplicated (class, classroom) pairs, one per row
weights = [(df.loc[index, 'class'], df.loc[index, 'classroom']) for index in df.index]
weights = list(set(weights))

# Make matplotlib render Chinese labels correctly
plt.rcParams['font.sans-serif'] = ['SimHei']   # use SimHei for CJK text
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign properly
colors = ['red', 'green', 'blue', 'yellow']

# Directed graph
DG = nx.DiGraph()
# Add all nodes at once (expects a list)
DG.add_nodes_from(nodes)
# Add all edges at once (list of 2-tuples)
DG.add_edges_from(weights)
# Draw with node labels, fixed node size and per-node colors.
# NOTE(review): `colors` has only 4 entries — if the graph has more than
# 4 nodes, the node_color list will not match the node count; verify.
nx.draw(DG, with_labels=True, node_size=1000, node_color=colors)
plt.show()
发布了60 篇原创文章 · 获赞 55 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/MARY197011111/article/details/99876458
今日推荐