Sentiment polarity analysis of car reviews with TensorFlow: a simple neural network

I. Getting the data

import numpy as np
import pandas as pd
import jieba

# 1. Read the data and do basic preprocessing
comment_path = "G:\\myLearning\\pythonLearning201712\\carComments\\01\\myCleanedComments.CSV"
comment_df = pd.read_csv(comment_path, engine = 'python')
print(comment_df.head())

# Drop columns that are not needed for sentiment analysis
del comment_df['ID']
del comment_df['title']
del comment_df['sumUp']
del comment_df['sup_num']

# 2. Load the stop words
# (open(filename).readlines() would also work here)
stopwords_path = "G:\\myLearning\\pythonLearning201712\\myDicts\\新建文件夹\\综合stopword.txt"
stopwords = pd.read_csv(stopwords_path, names=['stopwords'], sep='aaa', encoding='utf-8', engine='python')
stopwords = set(stopwords['stopwords'])   # as a set: `word not in stopwords` must test the words, not the DataFrame's column names
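As the comment above says, the stop-word file can also be read without pandas; a minimal sketch, assuming the same one-word-per-line UTF-8 file:

with open(stopwords_path, encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}   # set of stop words, blank lines skipped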

# 3. Custom tokenizer
import re
def my_cut_func(my_str):
    my_str = re.sub(r'[a-zA-Z]+', '', my_str)   # strip latin letters
    my_str = re.sub(r'\d|\s', '', my_str)       # strip digits and whitespace
    my_str_list = jieba.cut(my_str)
    # drop stop words and single-character tokens
    my_str_list = [word for word in my_str_list if word not in stopwords and len(word) > 1]
    return my_str_list
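A quick check of the tokenizer; the sentence is made up for illustration, and the exact tokens depend on your jieba dictionary and stop-word list:

print(my_cut_func('这款SUV 2018年上市, 动力不错'))
# e.g. ['上市', '动力', '不错'] -- letters, digits, stop words and 1-character tokens are gone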

# 4. Load the Sogou automotive dictionary: the file must be UTF-8 encoded
jieba.load_userdict('G:\\myLearning\\sougou词库\\汽车词汇大全【官方推荐】.txt')

# 5. Merge all texts into one large string per polarity
pos_comments = list(comment_df['advance'])
pos_comments = [str(item) for item in pos_comments]
all_pos_comments = ''.join(pos_comments)

neg_comments = list(comment_df['disadvance'])
neg_comments = [str(item) for item in neg_comments]
all_neg_comments = ''.join(neg_comments)
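With each polarity in one big string, a frequency count is an easy sanity check before vectorizing; a minimal sketch using the tokenizer defined above (collections.Counter is standard library):

from collections import Counter

pos_freq = Counter(my_cut_func(all_pos_comments))   # token -> count for the positive reviews
print(pos_freq.most_common(20))                     # the 20 most frequent tokens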


II. Building the term-frequency matrix: bag-of-words

# 1. Bag-of-words model (unigrams; bigrams, trigrams, etc. also work)
# (1) Assemble the labeled data
pos_comment_df = pd.DataFrame()
neg_comment_df = pd.DataFrame()
pos_comment_df['txt'] = comment_df.advance
pos_comment_df['tag'] = 1.0
neg_comment_df['txt'] = comment_df.disadvance
neg_comment_df['tag'] = 0.0
df0 = pd.concat([pos_comment_df, neg_comment_df], ignore_index=True)   # DataFrame.append is deprecated in newer pandas

# (2) Tokenization and preprocessing

# Load the Sogou automotive dictionary: the file must be UTF-8 encoded
jieba.load_userdict('G:\\myLearning\\sougou词库\\汽车词汇大全【官方推荐】.txt')
# Load the stop words (open(filename).readlines() would also work)
stopwords_path = "G:\\myLearning\\pythonLearning201712\\myDicts\\新建文件夹\\综合stopword.txt"
stopwords = pd.read_csv(stopwords_path, names=['stopwords'], sep='aaa', encoding='utf-8', engine='python')
stopwords = set(stopwords['stopwords'])   # again as a set, for correct membership tests

# Custom tokenizer: returns a space-separated string, as CountVectorizer expects
import re
def cuttxt(my_str):
    my_str = re.sub(r'[a-zA-Z0-9]+', '', str(my_str))  # strip letters and digits
    my_str = re.sub(r'\s', '', my_str)                 # strip spaces, newlines, etc.
    my_str_list = jieba.cut(my_str)
    # drop stop words and single-character tokens
    my_str_list = [word for word in my_str_list if word not in stopwords and len(word) > 1]
    return ' '.join(my_str_list)

df0['cleantxt'] = df0.txt.apply(cuttxt)
df0.head()

# (3) Use sklearn's CountVectorizer to build a sparse term-frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df=10)          # keep terms appearing in at least 10 documents; min_df is a tunable parameter

wordmtx = countvec.fit_transform(df0.cleantxt)       # build the sparse matrix
wordmtx

<1x4266 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>
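The retained vocabulary can be inspected directly; note that get_feature_names() is the sklearn API of this era (newer releases renamed it to get_feature_names_out()):

print(wordmtx.shape)                          # (number of documents, vocabulary size)
print(countvec.get_feature_names()[:20])      # the first 20 terms that survived min_df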


III. Sentiment analysis with a neural network

import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=True)    # sparse output by default; set sparse=False for a dense array


# 1. Build the training and test sets
nn_X_data = wordmtx.todense()
nn_y_data = np.mat(df0.tag).reshape([-1, 1])
ohe_y_data = ohe.fit_transform(nn_y_data)           # sparse by default
ohe_y_data = ohe_y_data.todense()
m, n = wordmtx.shape
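With two classes, the encoder simply maps tag 0.0 to [1, 0] and 1.0 to [0, 1]; a tiny illustration on made-up labels:

demo_tags = np.mat([[0.0], [1.0]])          # hypothetical labels, not from the data
print(ohe.transform(demo_tags).todense())   # [[1. 0.]
                                            #  [0. 1.]]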

from sklearn.model_selection import train_test_split
nn_X_train,nn_X_test,nn_y_train,nn_y_test = train_test_split(nn_X_data, ohe_y_data, test_size = 0.3, random_state = 1)

# 2. Define placeholders for the inputs and the one-hot labels
X_place = tf.placeholder(tf.float32, [None, n])
y_place = tf.placeholder(tf.float32, [None, 2])

# Weight initialization
def weight_variable(shape):
    # initialize weights from a truncated normal distribution
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    # with ReLU activations, a small positive bias works well
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

# 3. First fully connected layer (FC1)
with tf.name_scope('fc1') as scope:
    W_fc1 = weight_variable([n, 128])
    b_fc1 = bias_variable([128])
    h_fc1 = tf.nn.relu(tf.matmul(X_place, W_fc1) + b_fc1)
# 4. Second fully connected layer (FC2)
with tf.name_scope('fc2') as scope:
    W_fc2 = weight_variable([128, 2])
    b_fc2 = bias_variable([2])
    y_pred = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)
    y_pred = tf.clip_by_value(y_pred, 1e-10, 1.0)           # guards tf.log(y_pred) against log(0)
# 5. Define the loss and the training step
cross_entropy = -tf.reduce_sum(y_place * tf.log(y_pred))
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
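Clipping plus a hand-written log works, but TF1 also ships a fused, numerically stable loss. A sketch of the alternative (it takes raw logits, so the softmax and clip in FC2 would be dropped):

logits = tf.matmul(h_fc1, W_fc2) + b_fc2    # raw scores, no softmax
stable_cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_place, logits=logits))
# train_step = tf.train.GradientDescentOptimizer(0.05).minimize(stable_cross_entropy)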

correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_place, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# 6. Train and evaluate the model
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(101):        # 101 training iterations
        sess.run(train_step, feed_dict={X_place: nn_X_train, y_place: nn_y_train})

        if i % 10 == 0:
            # print(i, sess.run(cross_entropy, feed_dict={X_place: nn_X_train, y_place: nn_y_train}))
            print(i, sess.run(accuracy, feed_dict={X_place: nn_X_test, y_place: nn_y_test}))
    print(sess.run(y_pred, feed_dict={X_place: nn_X_test}))


IV. Notes and pitfalls

1. Training can easily produce a loss of NaN. Remedies include:

  (1) normalize the input data;

  (2) when the network has many layers, apply batch normalization in each layer;

  (3) reduce the learning rate lr. (A sketch of (1) and (3) follows these notes.)

2. The statement y_pred = tf.clip_by_value(y_pred, 1e-10, 1.0) protects the computation of tf.log(y_pred): if y_pred contains a zero, tf.log(y_pred) evaluates to -inf and the loss becomes NaN.

3. Experiment with the learning rate, or switch to a different optimizer such as AdamOptimizer (see the sketch below).
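A hedged sketch of remedies (1) and (3) above; the per-feature scaling and the 1e-3 learning rate are illustrative choices, not tuned values:

# (1) Scale each feature of the dense count matrix into [0, 1]
col_max = nn_X_data.max(axis=0)
nn_X_data = nn_X_data / (col_max + 1e-8)    # 1e-8 guards against all-zero columns

# (3) A smaller learning rate, with Adam instead of plain gradient descent
train_step = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(cross_entropy)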


Reposted from blog.csdn.net/zhuzuwei/article/details/80869162