Data visualization: word cloud and social network graph

Task: choose a novel (in txt format) and, following the reference PPT, draw the novel's high-frequency word cloud and a social network graph of its characters.

  1. Book of Emperors word cloud

import jieba
import wordcloud

def getText(filepath):
    # Read the whole novel into a single string
    with open(filepath, "r", encoding='utf-8') as f:
        return f.read()

# Read the stop words (one per line) into the list stopwords
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords


def wordFreq(filepath, text, topn):
    # Tokenize the text with jieba's lcut()
    words = jieba.lcut(text.strip())
    stopwords = stopwordslist(r'D:\StudyResource\dataVisual\stop.txt')
    counts = {}
    for word in words:
        if len(word) == 1:  # skip single-character tokens
            continue
        elif word not in stopwords:
            # Merge aliases and titles into each character's canonical name
            if word == "殿下" or word == "太子":
                word = '韩烨'
            elif word == "梓元" or word == "帝梓元" or word == "小姐":
                word = "安乐"
            elif word == "公主":
                word = "安宁"
            elif word == "陛下":
                word = "嘉宁帝"
            counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)
    # Write the top-n words to <novel>_词频.txt; slicing avoids an
    # IndexError when there are fewer than topn distinct words
    f = open(filepath[:-4] + '_词频.txt', 'w', encoding='utf-8')
    for word, count in items[:topn]:
        f.write("{}\t{}\n".format(word, count))
    f.close()

if __name__ == '__main__':
    filepath = r'D:\StudyResource\dataVisual\帝皇书_星零.txt'
    text = getText(filepath)
    wordFreq(filepath, text, 100)
    # Build the word cloud from the word-frequency file
    f = open(r'D:\StudyResource\dataVisual\帝皇书_星零_词频.txt', 'r', encoding='utf-8')
    text = f.read()
    wcloud = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\STKAITI.TTF',
                                 background_color="white", width=1000,
                                 # mask=,
                                 max_words=500, height=860, margin=2).generate(text)
    # generate(text) tokenizes the given text and builds the cloud from it
    wcloud.to_file("帝皇书cloud.png")
    f.close()
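
Note that generate() treats its argument as raw text and runs its own tokenizer over it, so feeding it the 词频 dump also tokenizes the count digits. A minimal alternative sketch, assuming the tab-separated frequency file produced above, that passes the saved counts straight to WordCloud via generate_from_frequencies():

import wordcloud

# Rebuild the word -> count mapping from the tab-separated frequency file
freqs = {}
with open(r'D:\StudyResource\dataVisual\帝皇书_星零_词频.txt', 'r', encoding='utf-8') as f:
    for line in f:
        word, count = line.strip().split('\t')
        freqs[word] = int(count)

wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\STKAITI.TTF',
                         background_color="white", width=1000, height=860)
wc.generate_from_frequencies(freqs)  # uses the counts as-is, no re-tokenizing
wc.to_file("帝皇书cloud_freq.png")   # hypothetical output name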

  2. Social network graph of the main characters in Book of Emperors

import re
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx

matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Use Python's re module to find every chapter heading of the form "第**章".
# \u4e00 and \u9fa5 bound the basic block of CJK unified ideographs in Unicode,
# so the character class [\u4E00-\u9FA5] matches runs of Chinese characters.

with open(r'D:\StudyResource\dataVisual\帝皇书_星零.txt', 'r', encoding='utf-8') as f:
    s = f.read()
chapter = re.findall("第[\u4E00-\u9FA5]+章", s)
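# Quick illustration (made-up snippet): re.findall("第[\u4E00-\u9FA5]+章",
# "第一章 风起 第二十三章 归途") returns ['第一章', '第二十三章'].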
# print(chapter)
lst_chapter = []
for x in chapter:
    # keep each heading once; len(x) <= 5 drops overlong accidental matches
    if x not in lst_chapter and len(x) <= 5:
        lst_chapter.append(x)
# print(lst_chapter)
# Find the starting index of each chapter in the full text
lst_start_chapterindex = []
for x in lst_chapter:
    lst_start_chapterindex.append(s.index(x))
# print(lst_start_chapterindex)
# The end of each chapter is the start of the next one, and the last chapter
# ends at the end of the text. zip() pairs each start with its end and stores
# the tuples in lst_chapterindex.
lst_end_chapterindex = lst_start_chapterindex[1:] + [len(s)]
lst_chapterindex = list(zip(lst_start_chapterindex, lst_end_chapterindex))
# print(lst_chapterindex)
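# Illustration (hypothetical offsets): if the chapters started at indexes
# [0, 500, 1200] and len(s) were 1800, zip would yield
# [(0, 500), (500, 1200), (1200, 1800)].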

# Character social network for 帝皇书
names = ['韩烨', '安乐', '洛铭西', '嘉宁帝', '温朔', '帝承恩', '安宁', '苑琴', '苑书', '赵福', '太后']
relations = {}
lst_para = s.split('\n')  # split into paragraphs; assume two characters in the same paragraph are related
for text in lst_para:
    for name1 in names:
        if name1 in text:
            for name2 in names:
                # count each unordered pair once: once (name1, name2) exists,
                # the reversed key (name2, name1) is never created
                if name2 in text and name1 != name2 and (name2, name1) not in relations:
                    relations[(name1, name2)] = relations.get((name1, name2), 0) + 1
print(relations.items())
# Derive relationship weights from co-occurrence, then normalize: find the
# maximum co-occurrence count and map every count into the range (0, 1].
maxrela = max([v for k, v in relations.items()])  # largest co-occurrence count
relations = {k: v / maxrela for k, v in relations.items()}
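# Worked example (hypothetical counts): if maxrela == 40, a pair that
# co-occurred 12 times gets weight 12 / 40 == 0.3, landing in the weak-tie
# bucket defined below.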
G = nx.Graph()
# Add an edge to G for every pair in relations, weighted by co-occurrence
for k, v in relations.items():
    G.add_edge(k[0], k[1], weight=v)
# Bucket the edges by normalized weight: strong / medium / weak ties
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.6]
emidle = [(u, v) for (u, v, d) in G.edges(data=True) if 0.3 < d['weight'] <= 0.6]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.3]
# Draw the character social network with networkx
# Set up the graph layout
fig = plt.figure(figsize=(20, 20))
pos = nx.spring_layout(G)  # pass seed=... for a reproducible layout
# Style the nodes
nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
nx.draw_networkx_edges(G,pos,edgelist=elarge,width=2.5,alpha=0.9,edge_color='g')
nx.draw_networkx_edges(G,pos,edgelist=emidle,width=1.5,alpha=0.6,edge_color='y')
nx.draw_networkx_edges(G,pos,edgelist=esmall,width=1,alpha=0.4,edge_color='r',style='dashed')
nx.draw_networkx_labels(G,pos,font_size=12)
plt.axis('off')
plt.title('《帝皇书》主要人物社交关系图')
plt.show()
fig.savefig('《帝皇书》主要人物社交关系图.png')

  3. Line chart of the protagonist 韩烨 (Han Ye)'s appearances in each chapter of Book of Emperors

# Count 韩烨's appearances in each chapter, reusing s and lst_chapterindex
# from the script above
cnt_hanye = []
for start, end in lst_chapterindex:
    cnt_hanye.append(s[start:end].count("韩烨"))
# print(cnt_hanye)
fig = plt.figure(figsize=(10, 6))
x = list(range(1, len(cnt_hanye) + 1))
y = cnt_hanye
matplotlib.rc("font", family='FangSong')  # render Chinese labels with the 仿宋 (FangSong) font
plt.plot(x, y, c='b')
plt.xlabel('章节', fontsize=14)
plt.ylabel('出场次数', fontsize=14)
plt.title('韩烨每章出场次数折线图', fontsize=14)  # the quoted text is the title; fontsize sets its size (a default is used if omitted)
plt.show()
fig.savefig("画布.png")
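
The same chapter index works for any character in the names list, which makes it easy to compare several protagonists on one chart. A minimal sketch, reusing s, lst_chapterindex, and the plotting setup from above (the chosen subset of names is just an example):

# Per-chapter appearance counts for several characters on one line chart
main_names = ['韩烨', '安乐', '洛铭西']  # any subset of the names list
fig = plt.figure(figsize=(10, 6))
for name in main_names:
    counts = [s[start:end].count(name) for start, end in lst_chapterindex]
    plt.plot(range(1, len(counts) + 1), counts, label=name)
plt.xlabel('章节', fontsize=14)
plt.ylabel('出场次数', fontsize=14)
plt.legend()
plt.show()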
