数据可视化词云

自选一部小说（txt格式），参照PPT绘制小说的高频词云和社交网络图。

《帝皇书》词云

import jieba
import wordcloud
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib

def getText(filepath):
    """Read a whole UTF-8 text file and return its contents.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The complete file contents as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(filepath, "r", encoding='utf-8') as f:
        return f.read()

# Read the entries of a stop-word file into a list.
def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list holding each line with surrounding whitespace stripped, in
        file order. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Use a context manager so the handle is closed once the lines are read.
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords


def wordFreq(filepath, text, topn,
             stopwords_path=r'D:\StudyResource\dataVisual\stop.txt'):
    """Count word frequencies in ``text`` and write the top entries to a file.

    The text is segmented with ``jieba.lcut``. Single-character tokens and
    stop words are dropped, and several titles/nicknames are merged into the
    character name they are mapped to below. The result is written as
    ``<word>\\t<count>`` lines, most frequent first, to a file named after
    ``filepath`` with its last four characters (e.g. ``.txt``) replaced by
    ``_词频.txt``.

    Args:
        filepath: Path of the source novel; only used to derive the output
            file name.
        text: Full text to analyse.
        topn: Maximum number of entries to write. Fewer are written when the
            text contains fewer distinct words.
        stopwords_path: Path of the stop-word file (one word per line).
            Defaults to the location used by the original script.
    """
    # Titles and nicknames that are counted under a single character name.
    aliases = {
        "殿下": "韩烨",
        "太子": "韩烨",
        "梓元": "安乐",
        "帝梓元": "安乐",
        "小姐": "安乐",
        "公主": "安宁",
        "陛下": "嘉宁帝",
    }
    # A set gives O(1) membership tests; the token list can be very long.
    stopwords = set(stopwordslist(stopwords_path))

    counts = {}
    for word in jieba.lcut(text.strip()):
        # Skip single characters and stop words.
        if len(word) == 1 or word in stopwords:
            continue
        # Aliases are resolved after the stop-word check, so the check is
        # made on the token as it appears in the text.
        word = aliases.get(word, word)
        counts[word] = counts.get(word, 0) + 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing (rather than indexing up to topn) avoids an IndexError when
    # there are fewer than topn distinct words.
    # NOTE: the file is written with the platform default encoding; whoever
    # reads it back must open it the same way.
    with open(filepath[:-4] + '_词频.txt', 'w') as f:
        f.writelines("{}\t{}\n".format(word, count)
                     for word, count in ranked[:topn])

if __name__=='__main__':
    # Raw string so the backslashes of the Windows path stay literal.
    filepath=r'D:\StudyResource\dataVisual\帝皇书_星零.txt'
    text=getText(filepath)
    wordFreq(filepath,text,100)

    # wordFreq() writes "<word>\t<count>" lines to this file, using the
    # platform default encoding, so it is read back the same way.
    freq_path=filepath[:-4] + '_词频.txt'
    frequencies={}
    with open(freq_path,'r') as f:
        for line in f:
            word, sep, count = line.rstrip('\n').partition('\t')
            # Ignore blank or malformed lines.
            if sep and count.strip().isdigit():
                frequencies[word]=int(count)

    # Build the word cloud from the parsed counts. Feeding the raw file text
    # to generate() would re-tokenise it and discard the numbers, so every
    # word would end up with the same weight.
    wcloud=wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\STKAITI.TTF',
                               background_color="white",width=1000,
                               # mask=,
                               max_words=500,height=860,margin=2)
    wcloud.generate_from_frequencies(frequencies)
    wcloud.to_file("帝皇书cloud.png")

《帝皇书》主要人物社交关系图

import re
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import networkx as nx

# Use SimHei as the sans-serif font (presumably so that Chinese labels render
# in the figures below).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Find every chapter heading of the form "第…章" with a regular expression.
# \u4e00-\u9fa5 is the code-point range commonly used to match Chinese
# characters, so the pattern matches "第", one or more Chinese characters,
# then "章".

# Read the whole novel; the context manager closes the file afterwards.
with open(r'D:\StudyResource\dataVisual\帝皇书_星零.txt', 'r', encoding='utf-8') as f:
    s = f.read()
chapter = re.findall("第[\u4E00-\u9FA5]+章", s)
# print(chapter)
# Keep each heading once, in order of first appearance. Matches longer than
# five characters are discarded (the greedy pattern can swallow ordinary
# sentences that merely start with "第" and end with "章").
lst_chapter = list(dict.fromkeys(x for x in chapter if len(x) <= 5))
# print(lst_chapter)
# Start offset of every chapter: the first position of its heading in the text.
# NOTE(review): if a heading is also quoted earlier in the text (e.g. in a
# table of contents), that earlier position is used - confirm against the file.
lst_start_chapterindex = [s.index(x) for x in lst_chapter]
# print(lst_start_chapterindex)
# A chapter ends where the next one starts; the last chapter ends at the end
# of the text. Each (start, end) pair is stored in lst_chapterindex.
lst_end_chapterindex = lst_start_chapterindex[1:] + [len(s)]
lst_chapterindex = list(zip(lst_start_chapterindex, lst_end_chapterindex))
# 统计每一章中韩烨的出场次数
# cnt_hanye = []
# for ii in range(96):
#     start = lst_chapterindex[ii][0]
#     end = lst_chapterindex[ii][1]
#     cnt_hanye.append(s[start:end].count("韩烨"))
# # print(cnt_handy)
# fig = plt.figure(figsize=(10, 6))
# x = list(range(1, 97))
# y = cnt_hanye
# matplotlib.rc("font",family='FangSong') #使用代码帮助matplotlib识别中文字体仿宋
# plt.plot(x, y,c='b')
# plt.xlabel('章节',fontsize=14)
# plt.ylabel('出场次数',fontsize=14)
# plt.title('韩烨每章出场次数折线图',fontsize=14)#" "内的文字为标题，fontsize的值则为文字的大小，若不设置，系统会给出一个默认值
# plt.plot()
# plt.show()
# fig.savefig("画布")

# Character social network of the novel.
names = ['韩烨', '安乐', '洛铭西', '嘉宁帝', '温朔', '帝承恩', '安宁', '苑琴', '苑书', '赵福', '太后']
relations = {}
# Split into paragraphs; two characters named in the same paragraph are
# assumed to be related.
lst_para = s.split('\n')
for para in lst_para:
    present = [name for name in names if name in para]
    # Each unordered pair is counted once per paragraph, keyed in the order
    # the two names appear in `names`.
    for i, name1 in enumerate(present):
        for name2 in present[i + 1:]:
            relations[(name1, name2)] = relations.get((name1, name2), 0) + 1
print(relations.items())

# Turn co-occurrence counts into weights in (0, 1] by dividing by the largest
# count. Skipped when no pair co-occurs, where max() would raise ValueError.
if relations:
    maxrela = max(relations.values())
    relations = {k: v / maxrela for k, v in relations.items()}

G = nx.Graph()
# Add one weighted edge per character pair.
for (name1, name2), weight in relations.items():
    G.add_edge(name1, name2, weight=weight)

# Bucket the edges by weight so each bucket gets its own line style.
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.6]
emidle = [(u, v) for (u, v, d) in G.edges(data=True) if 0.3 < d['weight'] <= 0.6]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.3]

# Draw the network with networkx on a single figure.
fig = plt.figure(figsize=(20, 20))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='r', style='dashed')
nx.draw_networkx_labels(G, pos, font_size=12)
plt.axis('off')
plt.title('《帝皇书》主要人物社交关系图')
# Save before show(): the image is written even if the window is closed.
fig.savefig('《帝皇书》主要人物社交关系图')
plt.show()

《帝皇书》主要人物韩烨每章出场次数折线图

# Count how often "韩烨" appears in each chapter.
# The analysis covers at most the first 96 chapters; clamping to the number of
# chapters actually detected avoids an IndexError on shorter texts.
chapter_total = min(96, len(lst_chapterindex))
cnt_hanye = [s[start:end].count("韩烨")
             for start, end in lst_chapterindex[:chapter_total]]
# print(cnt_hanye)
fig = plt.figure(figsize=(10, 6))
x = list(range(1, chapter_total + 1))  # chapter numbers, starting at 1
y = cnt_hanye
matplotlib.rc("font", family='FangSong')  # FangSong font for the Chinese labels
plt.plot(x, y, c='b')
plt.xlabel('章节', fontsize=14)
plt.ylabel('出场次数', fontsize=14)
plt.title('韩烨每章出场次数折线图', fontsize=14)  # fontsize sets the title text size
# Save before show(): the image is written even if the window is closed.
fig.savefig("画布")
plt.show()

猜你喜欢