Data Analysis in Practice (1)

A text data analysis program I wrote for someone today.

import jieba
import jieba.posseg
import jieba.analyse
import re
import nltk
from nltk.text import ContextIndex
from nltk.classify import NaiveBayesClassifier
import numpy as np
import matplotlib.pyplot as plt

def count_word():
    global temp1
    # Open the file in binary mode
    with open("D:/数据/圣女的救济.txt", 'rb') as f:
        # readlines() returns a list of byte strings, one per line, split on \n
        article = f.readlines()
    # Number of Chinese characters
    Chc_sum = 0
    # Number of remaining (ASCII) characters
    noChc_sum = 0
    # Analyse each line of the list
    for line in article:
        # In UTF-8 a Chinese character takes three bytes, and each byte shows up as
        # one '\x..' escape in the repr of the bytes object, so dividing the number
        # of escapes by 3 gives the character count (full-width punctuation is also
        # three bytes, so it ends up in this count as well)
        Chc_sum += len(re.findall(r'\\x..', '%r' % line)) / 3
        # Strip the hex escapes and the remaining backslash escapes; the length of
        # what is left is the number of ASCII characters. Subtract 3 for the b'...'
        # wrapper that repr() puts around a bytes object
        noChc_sum += len(re.sub(r'\\.', '', re.sub(r'\\x..', '', '%r' % line))) - 3
    temp1 = Chc_sum
    print('文中含有 %d 个汉字' % Chc_sum)
    print('文中含有 %d 个标点符号' % noChc_sum)
    print('文中共有 %d 个字' % (Chc_sum + noChc_sum))
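
For reference, a minimal illustration (the sample string below is made up, not from the original script) of why the repr trick works: every Chinese character occupies three bytes in UTF-8, and each byte shows up as one '\x..' escape in the repr of a bytes object.

import re

sample = '圣女的救济'.encode('utf-8')           # hypothetical sample: 5 Chinese characters, 15 bytes
escaped = '%r' % sample                         # "b'\xe5\x9c\xa3\xe5\xa5\xb3\xe7\x9a\x84\xe6\x95\x91\xe6\xb5\x8e'"
print(len(re.findall(r'\\x..', escaped)) / 3)   # 5.0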

# Count the total number of words
def terms_count():
    global temp2
    # Use jieba to segment the Chinese text and count the words
    with open("D:/数据/圣女的救济.txt", 'r', encoding="utf-8") as textopen:
        lines = textopen.readlines()
    word_count = 0
    for line in lines:
        # The raw lines still contain punctuation and other symbols; instead of a
        # stop-word list, simply keep only the Chinese characters with a regex
        clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', line))
        # jieba segmentation returns a list of words
        wordlist = jieba.lcut(clean_data)
        # Accumulate the total number of words
        word_count += len(wordlist)
    temp2 = word_count
    print("文中共有{}个词语".format(word_count))


# Compute the average word length
def average_word_length():
    count_word()
    terms_count()
    # temp1 (character count) and temp2 (word count) are the globals set above
    average_word_length = temp1 / temp2
    print("词语的平均长度是{:.2f}".format(average_word_length))
average_word_length()


# Frequency distribution of part-of-speech tags
def word_character_frequency():
    # Counters for the POS tags of interest:
    # a adjective, d adverb, e interjection, n noun, nr person name,
    # ns place name, p preposition, r pronoun
    number = {'a': 0, 'd': 0, 'e': 0, 'n': 0, 'nr': 0, 'ns': 0, 'p': 0, 'r': 0}
    with open("D:/数据/圣女的救济.txt", 'r', encoding="utf-8") as textopen:
        lines = textopen.readlines()
    for line in lines:
        clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', line))
        words = jieba.posseg.cut(clean_data)
        for word in words:
            st = str(word.flag)
            if st in number:
                number[st] += 1
    # Visualise the result
    plt.style.use('ggplot')  # built-in style for a nicer-looking plot
    # The next two settings make matplotlib render Chinese labels and the minus sign correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.subplot(1, 1, 1)
    x = np.array(['形容词', '副词', '叹词', '名词', '人名', '地名', '介词', '代词'])
    y = np.array(list(number.values()))
    plt.bar(x, y, width=0.5, align='center', label='数量', color='b')
    plt.title("词性频率分布", color='k')
    for a, b in zip(x, y):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=11, color='k')
    plt.xlabel("词性")
    plt.ylabel("数量")
    plt.legend()
    plt.show()
    print(number)
# word_character_frequency()
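
To see what jieba.posseg.cut yields, here is a tiny made-up example; each item has a .word and a .flag attribute, and those flag strings are what the dictionary keys above are matched against:

import jieba.posseg

for pair in jieba.posseg.cut('美丽的花园里有一只小猫'):
    print(pair.word, pair.flag)   # e.g. 美丽 a, 的 uj, 花园 n, ...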

# Mine the high-ranking words of the novel
def word_frequency():
    # Mind the encoding when reading the file
    with open("D:/数据/圣女的救济.txt", 'r', encoding='utf-8') as f:
        data = f.read()
    # Extract the 50 top-ranked keywords (TF-IDF based)
    tag = jieba.analyse.extract_tags(data, topK=50)
    print(tag)
# word_frequency()
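
If the weights are also of interest, extract_tags can return them too; a small sketch on a toy string (not the original data):

import jieba.analyse

toy_text = '凶手 凶手 刑警 砒霜 砒霜 砒霜'
for keyword, weight in jieba.analyse.extract_tags(toy_text, topK=3, withWeight=True):
    print(keyword, weight)   # each keyword together with its TF-IDF weight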

# Look at the contexts in which a word occurs
def look_up():
    with open("D:/数据/圣女的救济.txt", 'r', encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    wordlist = jieba.lcut(clean_data)
    text = nltk.Text(wordlist)
    # Print concordance lines for the word
    text.concordance(word='绫音', width=20, lines=10)
    # Search for contexts shared by both words
    text.common_contexts(['绫音', '若山宏美'])
# look_up()

# Count specific words and plot where they occur
def word():
    with open("D:/数据/圣女的救济.txt", 'r', encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    wordlist = jieba.lcut(clean_data)
    text = nltk.Text(wordlist)
    print(text.count(word='下毒'))
    words = ['凶手', '砒霜', '刑警', '感觉']
    # Make matplotlib render Chinese labels and the minus sign correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Dispersion plot: where each word appears across the text
    text.dispersion_plot(words)
# word()

# Compute context-based similarity to a given word
def Similarity():
    with open("D:/数据/圣女的救济.txt", 'r', encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    wordlist = jieba.lcut(clean_data)
    # Alternative: nltk.Text(wordlist).similar(word='下毒', num=10) only prints similar words;
    # ContextIndex returns a score per word, which lets us filter on a threshold ourselves
    contentindex = ContextIndex(wordlist)
    similarity_scores = contentindex.word_similarity_dict(word='下毒')
    for key, value in similarity_scores.items():
        if value > 0.02:
            print(key, value)
# Similarity()
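
A tiny made-up token list illustrates what word_similarity_dict measures: words that occur in the same (previous word, next word) contexts receive a high score.

from nltk.text import ContextIndex

tokens = ['他', '在', '茶', '里', '下毒', '了', '她', '在', '茶', '里', '加糖', '了']
ci = ContextIndex(tokens)
print(ci.word_similarity_dict('下毒').get('加糖'))   # expected to be high: both words sit between '里' and '了'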

# Sentiment analysis
def feel_analyse():
    def word_feats(words):
        # each character of the word becomes a boolean feature
        return dict([(word, True) for word in words])
    # Prepare the training data
    positive_vocab = ['美丽', '可爱', '自信', '大方', '勇敢', '希望', '包容', '贡献', '诚实', '健康']
    negative_vocab = ['暗淡', '暗示', '傲慢', '懊恼', '罢工', '杀人', '白费', '霸占', '昂贵', '伤心']
    neutral_vocab = ['音乐', '电影', '是', '的', '行动', '做', '词语', '说', '你', '听']
    # Feature extraction
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

    train_set = negative_features + positive_features + neutral_features
    # Train the classifier
    classifier = NaiveBayesClassifier.train(train_set)

    # Classify every word of the novel
    neg = 0
    pos = 0
    with open("D:/数据/圣女的救济.txt", 'r', encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    data_new = jieba.lcut(clean_data)
    for word in data_new:
        classResult = classifier.classify(word_feats(word))
        if classResult == 'neg':
            neg = neg + 1
        if classResult == 'pos':
            pos = pos + 1
    print('积极: ' + str(float(pos) / len(data_new)))
    print('消极: ' + str(float(neg) / len(data_new)))

feel_analyse()
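
Because word_feats iterates over the string it is given, the features are really the individual characters, so the classifier works at character level. A minimal, self-contained sketch (toy names, not from the original):

from nltk.classify import NaiveBayesClassifier

def char_feats(word):
    # every character of the word becomes a boolean feature
    return {ch: True for ch in word}

print(char_feats('美丽'))                      # {'美': True, '丽': True}

toy_train = [(char_feats('美丽'), 'pos'), (char_feats('伤心'), 'neg')]
toy_clf = NaiveBayesClassifier.train(toy_train)
print(toy_clf.classify(char_feats('伤心')))    # expected: 'neg'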

Reposted from blog.csdn.net/qq_44205272/article/details/103149176