LSTM Model --- Sentiment Analysis

import warnings
#suppress gensim UserWarning messages
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow as tf
import numpy as np
import time
#the random module provides functions for generating random numbers;
#randint() returns an integer within a given range (imported here but unused below)
from random import randint
#shuffle() randomly reorders the elements of a sequence in place
from random import shuffle
#----------------------------------
#build the stopword list by reading the file 停用词.txt
def makeStopWord():
    with open('停用词.txt','r',encoding = 'utf-8') as f:
        lines = f.readlines()
    stopWord = []
    for line in lines:
        words = jieba.lcut(line,cut_all = False)
        for word in words:
            stopWord.append(word)
    return stopWord

def words2Array(lineList):
    linesArray=[]
    wordsArray=[]
    steps = []
    for line in lineList:
        t = 0
        p = 0
        for i in range(MAX_SIZE):#maximum number of words kept per review (25; longer reviews are truncated, shorter ones padded)
            if i<len(line):
                try:#append the word vector of each word in the line
                    wordsArray.append(model.wv.word_vec(line[i]))
                    p = p + 1
                except KeyError:
                    t=t+1
                    continue
            else:#pad reviews shorter than 25 words with 200-dim zero vectors
                wordsArray.append(np.array([0.0]*dimsh))
        #also pad one zero vector per out-of-vocabulary word skipped above, keeping the total at MAX_SIZE
        for i in range(t):
            wordsArray.append(np.array([0.0]*dimsh))
        steps.append(p)#record how many valid (in-vocabulary) words this review contains
        linesArray.append(wordsArray)#each review becomes a 25x200 matrix; repeat for every review
        wordsArray = []
    linesArray = np.array(linesArray)#3-D array: (num_reviews, 25, 200)
    steps = np.array(steps)#one valid-word count per review
    return linesArray, steps
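#Illustrative sketch of the shapes (assuming MAX_SIZE = 25 and a 200-dim model):
#for a two-review input, words2Array returns
#    linesArray.shape == (2, 25, 200)   # one 25x200 matrix per review
#    steps == array([n1, n2])           # in-vocabulary word counts, each <= 25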

def convert2Data(posArray, negArray, posStep, negStep):
    randIt = []
    data = []
    steps = []
    labels = []
    for i in range(len(posArray)):#positive review: 25x200 matrix, valid-word count, one-hot label [1,0]
        randIt.append([posArray[i], posStep[i], [1,0]])
    for i in range(len(negArray)):#negative review: 25x200 matrix, valid-word count, one-hot label [0,1]
        randIt.append([negArray[i], negStep[i], [0,1]])
    shuffle(randIt)#shuffle positive and negative samples together
    for i in range(len(randIt)):
        data.append(randIt[i][0])#the 25x200 matrix of each review goes into data
        steps.append(randIt[i][1])#the valid-word count of each review goes into steps
        labels.append(randIt[i][2])#the label of each review goes into labels
    data = np.array(data)
    steps = np.array(steps)
    return data, steps, labels

def getWords(file):
    wordList = []
    trans = []
    lineList = []
    with open(file,'r',encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        #strip the trailing newline, then tokenize with jieba
        trans = jieba.lcut(line.replace('\n',''), cut_all = False)
        for word in trans:
            if word not in stopWord:
                wordList.append(word)
        lineList.append(wordList)
        wordList = []
    return lineList
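#Tokenization example from jieba's own documentation (accurate mode):
#    jieba.lcut('我来到北京清华大学', cut_all=False)
#    -> ['我', '来到', '北京', '清华大学']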

def makeData(posPath,negPath):
    #tokenize each review; returns [[word1,word2...],[word1,word2...],...]
    pos = getWords(posPath)
    print("The positive data's length is :",len(pos))
    neg = getWords(negPath)
    print("The negative data's length is :",len(neg))
    #convert the reviews into arrays
    posArray, posSteps = words2Array(pos)
    negArray, negSteps = words2Array(neg)
    #mix and shuffle the positive and negative samples into one dataset
    Data, Steps, Labels = convert2Data(posArray, negArray, posSteps, negSteps)
    return Data, Steps, Labels

#----------------------------------------------
# Word60.model      60-dim vectors
# word2vec.model    200-dim vectors
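# A minimal sketch of how a 200-dim model like word2vec.model could be trained;
# the corpus filename and hyperparameters below are illustrative, not the author's:
#
#     sentences = word2vec.LineSentence('corpus.txt')  # one tokenized sentence per line
#     w2v = word2vec.Word2Vec(sentences, size=200, window=5, min_count=5)
#     w2v.save('word2vec/word2vec.model')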

timeA=time.time()
word2vec_path = 'word2vec/word2vec.model'
model=gensim.models.Word2Vec.load(word2vec_path)
dimsh=model.vector_size
MAX_SIZE=25
stopWord = makeStopWord()

print("In train data:")
trainData, trainSteps, trainLabels = makeData('data/B/Pos-train.txt',
                                              'data/B/Neg-train.txt')
print("In test data:")
testData, testSteps, testLabels = makeData('data/B/Pos-test.txt',
                                           'data/B/Neg-test.txt')
trainLabels = np.array(trainLabels)

del model #free the gensim model's memory; note the name model is reused below for the network function

print("-"*30)
print("The trainData's shape is:",trainData.shape)
print("The testData's shape is:",testData.shape)
print("The trainSteps's shape is:",trainSteps.shape)
print("The testSteps's shape is:",testSteps.shape)
print("The trainLabels's shape is:",trainLabels.shape)
print("The testLabels's shape is:",np.array(testLabels).shape)


num_nodes = 128
batch_size = 16
output_size = 2

graph = tf.Graph()
with graph.as_default():
    #tf_train_dataset must have the same rank as trainData: a 3-D tensor of shape (batch_size, MAX_SIZE, dimsh)
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size,MAX_SIZE,dimsh))
    #tf_train_steps holds the valid-word count of each sample in a batch; batches of 16 are sliced from the 19130 training samples
    tf_train_steps = tf.placeholder(tf.int32,shape=(batch_size))
    #tf_train_labels has the same shape as a slice of trainLabels: a 2-D tensor (batch_size, output_size)
    tf_train_labels = tf.placeholder(tf.float32,shape=(batch_size,output_size))

    tf_test_dataset = tf.constant(testData,tf.float32)#constant
    tf_test_steps = tf.constant(testSteps,tf.int32)#constant
#tf.nn.rnn_cell.BasicLSTMCell defines a single basic LSTM cell; num_units is the number of hidden units
#with state_is_tuple=True the state is a tuple, state=(c,h);
#with False it is a single tensor with c and h concatenated, state=tf.concat([c,h],1)
#the forget_bias argument (default 1.0) is added to the forget gate's bias to reduce forgetting early in training
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = num_nodes,
                                             state_is_tuple=True)
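#With state_is_tuple=True the final state unpacks as (c, h), where both c and h
#have shape [batch_size, num_nodes], i.e. [16, 128] here.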
#tf.truncated_normal(shape, mean, stddev): shape is the output shape, mean the mean, stddev the standard deviation.
#It draws w1's initial values from a truncated normal distribution (samples farther than 2 stddev from the mean are redrawn).
    w1 = tf.Variable(tf.truncated_normal([num_nodes,num_nodes // 2], stddev=0.1))
    #bias vector of length num_nodes // 2
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))

    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, 2], stddev=0.1))
    #bias vector of length 2
    b2 = tf.Variable(tf.truncated_normal([2], stddev=0.1))
    
    def model(dataset, steps):
        #dynamic_rnn returns two values: the output at every time step and the final state.
        #sequence_length is each review's valid length (in-vocabulary words only), fed in via trainSteps
        outputs, last_states = tf.nn.dynamic_rnn(cell = lstm_cell,
                                                 dtype = tf.float32,
                                                 sequence_length = steps,
                                                 inputs = dataset)
        #last_states is an LSTMStateTuple (c, h); take the final hidden state h as the review representation
        hidden = last_states[-1]
        hidden = tf.matmul(hidden, w1) + b1
        logits = tf.matmul(hidden, w2) + b2
        return logits
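    #Shape walk-through for one training batch (batch_size=16, MAX_SIZE=25, dimsh=200, num_nodes=128):
    #    outputs: [16, 25, 128] (one output per time step; unused here)
    #    hidden:  [16, 128] -> matmul with w1 [128, 64] -> [16, 64]
    #    logits:  [16, 64]  -> matmul with w2 [64, 2]   -> [16, 2]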
    train_logits = model(tf_train_dataset, tf_train_steps)
#tf.reduce_mean(x) computes the mean over all elements;
#tf.reduce_mean(x, axis=0) computes the mean of each column;
#tf.reduce_mean(x, axis=1) computes the mean of each row.
    loss = tf.reduce_mean(
#logits: the network's final-layer output; with a batch its shape is [batch_size, num_classes], for a single sample just num_classes
#labels: the true one-hot labels, same shape as logits.
#Step 1: apply softmax to the logits to get a class-probability distribution for each sample
#Step 2: compute the cross-entropy against the true labels, giving one loss value per sample
#Step 3: tf.reduce_mean averages those per-sample losses over the batch
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                                logits=train_logits))
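    #Worked example with illustrative numbers: for one sample with logits [2.0, 0.5]
    #and label [1, 0], softmax([2.0, 0.5]) ~= [0.818, 0.182], so the cross-entropy
    #is -ln(0.818) ~= 0.201; confident correct predictions contribute a small loss.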
    
    #gradient descent with a learning rate of 0.01
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))

num_steps = 19131
summary_frequency = 500


with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        offset = (step * batch_size) % (len(trainLabels)-batch_size)
        #feed_dict supplies values for the tensors created with tf.placeholder
        #each step feeds batch_size consecutive samples; the window slides through the (pre-shuffled) training set and wraps around
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}
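        #For example, with batch_size = 16 the window advances as
        #step 0 -> samples [0:16], step 1 -> [16:32], step 2 -> [32:48], ...,
        #wrapping once offset reaches len(trainLabels) - batch_size.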
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        if step >0 and step % summary_frequency == 0:
            mean_loss = mean_loss / summary_frequency
            print("The step is: %d"%(step))
            print("In train data,the loss is:%.4f"%(mean_loss))
            mean_loss = 0
            acrc = 0
            prediction = session.run(test_prediction)
            #count a prediction as correct when the probability it assigns to the true class exceeds 0.5
            for i in range(len(prediction)):
                if prediction[i][testLabels[i].index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data, the accuracy is: %.2f%%"%((acrc/len(testLabels))*100))

timeB=time.time()
print("time cost:",int(timeB-timeA))

The dataset, the stopword file, and the trained word2vec model are available on request in the QQ group: 228735640.

Reposted from blog.csdn.net/qq_41424519/article/details/81739670