Building a Financial-Domain Lexicon

Sentiment analysis has to account for context and the business domain. Using an off-the-shelf sentiment lexicon directly gave poor results, so I decided to build my own financial-domain lexicon.

Corpus: all candidate words come from dict_myself.

1. Compute TF-IDF for every candidate word and rank by score; the resulting words may overlap with existing sentiment lexicons.
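For reference, gensim's TfidfModel used below defaults to weight(t, d) = tf(t, d) * log2(N / df(t)) with per-document L2 normalization, so a term that occurs in every document scores zero and drops out of the ranking.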

#coding=UTF-8
"""
author: susuxuer
function: build a financial-domain vocabulary
reference: https://www.cnblogs.com/en-heng/p/5848553.html
"""
import jieba.posseg as pseg
import time
import csv
import glob
from collections import Counter, defaultdict
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Load the stopword list
def loadPoorEnt(path2='G:/project/sentimation_analysis/data/stopwords.csv'):
    with open(path2, encoding='UTF-8') as csvfile:
        return [line.strip() for line in csvfile]

stop_words = loadPoorEnt()

# Collect the paths of all transcript files
def get_all_content():
    all_files = glob.glob(r'D:/GFZQ/GFZQ/xuesu2018/xuesu/*.csv')
    return all_files

# Read one CSV transcript and return its rows
def get_wenben(path):
    with open(path, 'r', encoding='UTF-8') as csvfile:
        return list(csv.reader(csvfile))

# Segment a sentence and keep adverbs (d), adjectives (a) and verbs (v),
# dropping stopwords
def cut(data):
    words = []
    for item in pseg.cut(data):
        if item.word not in stop_words and item.flag in ('d', 'a', 'v'):
            words.append(item.word)
    return words
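# Illustrative example (actual tokens depend on jieba's segmentation and
# the stopword list):
#   cut("公司营收大幅增长")  ->  ['大幅', '增长']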

# For each transcript, pick out key words: sum every term's tf-idf weight
# across all documents, then rank descending
def extract_features(myQA):
    dictionary = Dictionary(myQA)
    corpus = [dictionary.doc2bow(line) for line in myQA]
    model = TfidfModel(corpus)
    scores = defaultdict(float)
    for bow in corpus:
        for term_id, value in model[bow]:
            scores[term_id] += value  # accumulate tf-idf across documents
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [dictionary[term_id] for term_id, _ in ranked]

def set_ourdict(all_files, length):
    dict_ask = []
    dict_ans = []
    QA1 = []
    QA_all = []
    for i in range(length):
        print("Parsing company %d" % i)
        rows = get_wenben(all_files[i])
        company_QA = []
        for QA in rows:
            # Segment each question and each answer, stopwords removed
            seg_list1 = cut(QA[1])
            seg_list2 = cut(QA[2])
            seg_list = seg_list1 + seg_list2
            dict_ask.append(seg_list1)
            dict_ans.append(seg_list2)
            QA1.append(seg_list)
            company_QA.append(seg_list)
        QA_all.append(company_QA)  # one list of tokenized Q+A pairs per company
    return QA_all, QA1, dict_ans, dict_ask
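# Shape note: QA_all groups the tokenized Q+A lists by company, QA1 flattens
# them across all companies (this is what extract_features consumes), and
# dict_ask / dict_ans keep questions and answers separate.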

# Format an elapsed time given in seconds
def cal_time(secs):
    if secs < 60:
        return str(secs) + ' secs'
    if secs < 60 * 60:
        return str(secs / 60.0) + ' mins'
    return str(secs / 3600.0) + ' hours'

# Load the sentiment and degree dictionaries plus our candidate words
def read_emotion():
    path1 = 'G:/project/sentimation_analysis/dict/emotion_dict.txt'
    path2 = 'G:/project/sentimation_analysis/dict/degree_dict.txt'
    with open(path1, encoding='UTF-8') as txtfile1:
        all_emotion = [line.strip() for line in txtfile1]
    with open(path2, encoding='UTF-8') as txtfile2:
        all_degree = [line.strip() for line in txtfile2]
    path3 = 'G:/project/sentimation_analysis/data_version1/dict_frequency1.csv'
    with open(path3, encoding='UTF-8') as csvfile:
        lines = [line.strip() for line in csvfile]
    return all_emotion, all_degree, lines

# Keep only the words that appear in neither existing dictionary
def cmp_with_dict(all_emotion, all_degree, lines):
    new_dict = []
    for word in lines:
        if word not in all_emotion and word not in all_degree:
            new_dict.append(word)
    return new_dict




if __name__ == '__main__':
    start = time.perf_counter()
    all_files = get_all_content()  # collect all transcript paths
    length = 1800  # len(all_files)
    print("Collecting sentiment words from %d companies" % length)
    QA_all, myQA, answer, question = set_ourdict(all_files, length)
    print("Segmentation finished")
    dict_myself = answer + question
    print("Extracting keywords")
    features = extract_features(myQA)

    #print(Counter(features).most_common(5))
    data = dict(Counter(features))
    print("Raw word count: %d" % len(features))
    print("After deduplication: %d" % len(set(features)))
    # with open('G:/project/sentimation_analysis/data/dict_frequency.csv', 'w', encoding='UTF-8') as fw:
    #     for k, v in data.items():
    #         fw.write('%s,%d\n' % (k, v))

    with open('G:/project/sentimation_analysis/data/dict_frequency1.csv', 'w', encoding='UTF-8') as fw1:
        for k in data:
            fw1.write('%s\n' % k)
    print("Saved all candidate words to dict_frequency1.csv")

    top = features[:2000]
    with open('G:/project/sentimation_analysis/data/dict_top.csv', 'w', encoding='UTF-8') as fw2:
        for item in top:
            fw2.write('%s\n' % item)

    elapsed = time.perf_counter() - start
    print("Time Use %s" % cal_time(elapsed))

2. Compare against the HowNet sentiment lexicon and keep the words it does not already contain (dict_del.py)

import time


# Load the sentiment and degree dictionaries plus our candidate words
def read_emotion():
    path1 = 'G:/project/sentimation_analysis/dict/emotion_dict/all_emotion.txt'
    path2 = 'G:/project/sentimation_analysis/dict/degree_dict/all_degree.txt'
    with open(path1, encoding='UTF-8') as txtfile1:
        all_emotion = [line.strip() for line in txtfile1]
    with open(path2, encoding='UTF-8') as txtfile2:
        all_degree = [line.strip() for line in txtfile2]
    path3 = 'G:/project/sentimation_analysis/data_version1/dict_frequency1.csv'
    with open(path3, encoding='UTF-8') as csvfile:
        lines = [line.strip() for line in csvfile]
    return all_emotion, all_degree, lines

# Keep only the words that appear in neither existing dictionary
def cmp_with_dict(all_emotion, all_degree, lines):
    new_dict = []
    for word in lines:
        if word not in all_emotion and word not in all_degree:
            new_dict.append(word)
    return new_dict

# Format an elapsed time given in seconds
def cal_time(secs):
    if secs < 60:
        return str(secs) + ' secs'
    if secs < 60 * 60:
        return str(secs / 60.0) + ' mins'
    return str(secs / 3600.0) + ' hours'

# Drop single-character entries from the saved candidate list
def del_one():
    path = 'G:/project/sentimation_analysis/data/dict_del_one.csv'
    with open(path, encoding='UTF-8') as txtfile1:
        lines = [line for line in txtfile1 if len(line.strip()) > 1]
    return lines



if __name__ == '__main__':
    start = time.perf_counter()
    #del_one()

    all_emotion, all_degree, lines = read_emotion()
    new_dict = cmp_with_dict(all_emotion, all_degree, lines)
    # with open('G:/project/sentimation_analysis/data/dict_del.csv', 'w', encoding='UTF-8') as fw2:
    #     for item in new_dict:
    #         fw2.write('%s\n' % item)

    # Filter single-character words into a new list rather than calling
    # remove() while iterating, which skips elements
    new_dict = [item for item in new_dict if len(item) > 1]
    with open('G:/project/sentimation_analysis/data/dict_del_one.csv', 'w', encoding='UTF-8') as fw2:
        for item in new_dict:
            fw2.write('%s\n' % item)

    use = time.perf_counter() - start
    print("Time Use %s" % cal_time(use))

3. Select seed words

4. Expand the lexicon with SO-PMI
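The original post ends here without code for this step. Below is a minimal sketch of the standard SO-PMI formulation, SO(w) = Σ_p PMI(w, p) − Σ_n PMI(w, n) with PMI(w, s) = log2(p(w, s) / (p(w) p(s))): co-occurrence is counted at the document level over the tokenized Q&A lists, and the +1 smoothing and the seed lists are illustrative assumptions, not the author's choices.

import math
from collections import Counter

def so_pmi(word, docs, pos_seeds, neg_seeds):
    N = len(docs)
    df = Counter()   # document frequency of every word
    co = Counter()   # documents where `word` co-occurs with each seed
    for doc in docs:
        toks = set(doc)
        for t in toks:
            df[t] += 1
        if word in toks:
            for s in pos_seeds + neg_seeds:
                if s in toks:
                    co[s] += 1

    def pmi(seed):
        # PMI(word, seed) = log2(p(word, seed) / (p(word) p(seed)));
        # +1 smoothing keeps the log defined for unseen pairs
        return math.log2((co[seed] + 1) * N / ((df[word] + 1) * (df[seed] + 1)))

    return sum(pmi(s) for s in pos_seeds) - sum(pmi(s) for s in neg_seeds)

# Illustrative usage: a positive score suggests the candidate leans positive.
# pos_seeds = ['增长', '改善']
# neg_seeds = ['下降', '亏损']
# print(so_pmi('回暖', QA1, pos_seeds, neg_seeds))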


Reposted from blog.csdn.net/weixin_40411446/article/details/81014669