RNN_for_twitter

import pandas as pd
import numpy as np
data = pd.read_csv('Tweets.csv')
data = data[['airline_sentiment','text']]
with open('twee', 'w', encoding='utf-8') as f:  # 'w' rather than 'a' so reruns do not append duplicate lines
    for string in data.text:
        f.write(string + '\n')  # one tweet per line
from gensim.models import word2vec
sentences = word2vec.Text8Corpus("twee")  # load the corpus
model = word2vec.Word2Vec(sentences, size=300)  # train word2vec (gensim defaults to CBOW, window=5); each word becomes a 300-dimensional vector
word_vectors = model.wv  # keep only the per-word vectors
del model  # free memory
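
A quick sanity check of the embeddings can go here (this check is an illustration added to the post, not part of the original; the query word 'flight' is only an example and may not be in the vocabulary):

# Illustrative sanity check of the trained vectors (not in the original notebook)
if 'flight' in word_vectors:
    print(word_vectors.most_similar('flight', topn=5))  # nearest neighbours by cosine similarity
print(word_vectors.vector_size)  # should print 300
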
data['vec'] = data.text.apply(lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])  # map each tweet to the list of vectors of its in-vocabulary words
data['vec']
0        [[0.57637084, 1.0174645, 1.2169149, -0.3582543...
1        [[0.57637084, 1.0174645, 1.2169149, -0.3582543...
2        [[0.57637084, 1.0174645, 1.2169149, -0.3582543...
3        [[0.57637084, 1.0174645, 1.2169149, -0.3582543...
4        [[0.57637084, 1.0174645, 1.2169149, -0.3582543...
                               ...                        
14635    [[0.5811336, 1.1437037, 0.8925604, 0.07530225,...
14636    [[0.5811336, 1.1437037, 0.8925604, 0.07530225,...
14637    [[0.5811336, 1.1437037, 0.8925604, 0.07530225,...
14638    [[0.5811336, 1.1437037, 0.8925604, 0.07530225,...
14639    [[0.5811336, 1.1437037, 0.8925604, 0.07530225,...
Name: vec, Length: 14640, dtype: object
data = data[data['vec'].apply(lambda x: len(x) > 5)]  # keep only tweets with more than five in-vocabulary words
del data['text']  # the raw text is no longer needed
data.airline_sentiment.unique()
array(['positive', 'neutral', 'negative'], dtype=object)
data.airline_sentiment.value_counts()
negative    8920
neutral     2650
positive    1947
Name: airline_sentiment, dtype: int64
dic = {'neutral': np.array([1, 0, 0]),
       'positive': np.array([0, 1, 0]),
       'negative': np.array([0, 0, 1])}  # one-hot encoding of the three classes
data['cat'] = data.airline_sentiment.map(dic)
del data['airline_sentiment']
data = data.reset_index()  # make the index contiguous again after the filtering above (arguably optional)
del data['index']
len(data.vec[0][0])
300
# pad every tweet to the maximum length
max1ength = max([len(x) for x in data.vec])
max1ength
34
def pad(x):
    x1 = np.zeros((max1ength, 300))  # zero matrix of shape (max1ength, 300)
    x1[:len(x)] = x                  # copy the real word vectors into the first len(x) rows
    return x1
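
A toy check (added for illustration, not in the original) makes the behaviour of pad concrete: a tweet with two in-vocabulary words becomes a (max1ength, 300) matrix whose first two rows hold the word vectors and whose remaining rows stay zero.

# Toy check of pad (illustrative only)
toy = [np.ones(300), 2*np.ones(300)]         # pretend tweet with two word vectors
padded = pad(toy)
print(padded.shape)                          # (34, 300), i.e. (max1ength, 300)
print(padded[:2].sum(), padded[2:].sum())    # 900.0 0.0 -> real rows kept, the rest is zero padding
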
dataset = data.vec.apply(pad)
len(dataset),len(data.cat)
(13517, 13517)
dataset
0        [[0.5763708353042603, 1.017464518547058, 1.216...
1        [[0.5763708353042603, 1.017464518547058, 1.216...
2        [[0.5763708353042603, 1.017464518547058, 1.216...
3        [[0.5763708353042603, 1.017464518547058, 1.216...
4        [[0.5763708353042603, 1.017464518547058, 1.216...
                               ...                        
13512    [[0.5811336040496826, 1.1437036991119385, 0.89...
13513    [[0.5811336040496826, 1.1437036991119385, 0.89...
13514    [[0.5811336040496826, 1.1437036991119385, 0.89...
13515    [[0.5811336040496826, 1.1437036991119385, 0.89...
13516    [[0.5811336040496826, 1.1437036991119385, 0.89...
Name: vec, Length: 13517, dtype: object
labels = np.concatenate(data.cat).reshape(len(data.cat), -1)  # stack the one-hot labels into a (n, 3) NumPy array
np.shape(labels)
(13517, 3)
# convert the padded data to a NumPy array as well; the unused slots are already zero
data_ = np.concatenate(dataset).reshape(len(dataset), max1ength, 300)
np.shape(data_)
(13517, 34, 300)

Shuffle the order and split into training and test data

index = np.random.permutation(int(len(data)))
label = labels[index]
dataset = data_[index]
# split into training and test sets (12000 / 1517)
label_train = label[:12000]
dataset_train = dataset[:12000]
label_test = label[12000:]
dataset_test = dataset[12000:]
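
The split above is a plain 12000-versus-rest cut after a random shuffle. Since the classes are heavily skewed (8920/2650/1947), a stratified split would keep the class proportions identical in both sets; the sketch below is an alternative that assumes scikit-learn is installed and is not what the original post does.

# Alternative, stratified split (sketch; assumes scikit-learn, not used by the original)
from sklearn.model_selection import train_test_split
dataset_train, dataset_test, label_train, label_test = train_test_split(
    data_, labels, test_size=1517, stratify=labels.argmax(axis=1), random_state=0)
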

Build the network

import tensorflow as tf
learning_rate = 0.005
batch_size = 300
n_input = 300  # length of each word vector
n_steps = max1ength  # note: the character after "max" in this name is the digit 1, not the letter l
n_hidden = 128
n_classes = 3
x = tf.placeholder(tf.float32, [None,n_steps,n_input])
y = tf.placeholder(tf.float32, [None,n_classes])
output_keep_prob = tf.placeholder("float")   # keep probability for dropout inside the RNN cell
reg = tf.contrib.layers.l2_regularizer(scale=0.01)   # L2 regularizer
WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.
# compute the true length of each sequence (ignoring the all-zero padding rows)
# the input is a 3-D tensor of shape (batch, steps, features)
def length(shuru):
    return tf.reduce_sum(tf.sign(tf.reduce_max(tf.abs(shuru), reduction_indices=2)), reduction_indices=1)
    # reduction_indices=2 takes the max over the feature axis, so padding rows collapse to 0
    # tf.sign maps positive values to 1 and 0 to 0, and the outer sum counts the non-padding steps
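
A small numerical check of length (an illustration added here, not in the original): on a toy batch whose first example has two non-zero steps and whose second has one, it returns [2., 1.].

# Illustrative check of length() on a tiny padded batch
toy_batch = np.zeros((2, 3, 4), dtype=np.float32)
toy_batch[0, :2] = 1.0   # first example: 2 real steps, 1 padding step
toy_batch[1, :1] = 1.0   # second example: 1 real step, 2 padding steps
with tf.Session() as s:
    print(s.run(length(tf.constant(toy_batch))))   # -> [2. 1.]
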
cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(n_hidden,
                kernel_initializer = tf.truncated_normal_initializer(stddev=0.0001),
                bias_initializer = tf.truncated_normal_initializer(stddev=0.0001)),
                output_keep_prob = output_keep_prob)  # wrap the GRUCell in a DropoutWrapper so dropout is applied to its outputs
WARNING:tensorflow:From <ipython-input-35-4ae5b176c4ca>:3: GRUCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
output, _ = tf.nn.dynamic_rnn(
            cell,
            x,
            dtype=tf.float32,
            sequence_length = length(x)
)
WARNING:tensorflow:From <ipython-input-36-6128bbcfb684>:5: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From D:\Code\Miniconda3\envs\tensorflow1.13-gpu\lib\site-packages\tensorflow\python\ops\rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From D:\Code\Miniconda3\envs\tensorflow1.13-gpu\lib\site-packages\tensorflow\python\ops\tensor_array_ops.py:162: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From D:\Code\Miniconda3\envs\tensorflow1.13-gpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py:1259: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
output.get_shape()
TensorShape([Dimension(None), Dimension(34), Dimension(128)])
index = tf.range(0, batch_size)*n_steps + (tf.cast(length(x), tf.int32) - 1)  # flat index of the last valid time step of every sentence in the batch
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])  # flatten to (batch_size*n_steps, n_hidden), i.e. the sentences laid end to end
last = tf.gather(flat, index)  # gather the last valid output of every sentence
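
The index arithmetic is easier to see with a small NumPy analogy (added for illustration, not part of the graph): after flattening, sentence i occupies rows i*n_steps .. i*n_steps + n_steps - 1, so its last valid output sits at row i*n_steps + (true_length_i - 1).

# NumPy analogy for the gather above (illustrative only)
toy_steps, toy_hidden = 4, 2
toy_out = np.arange(2*toy_steps*toy_hidden).reshape(2, toy_steps, toy_hidden)  # fake RNN output for a batch of 2
toy_len = np.array([3, 1])                                                     # true lengths of the two sentences
toy_index = np.arange(2)*toy_steps + (toy_len - 1)                             # -> [2, 4]
toy_flat = toy_out.reshape(-1, toy_hidden)                                     # (2*toy_steps, toy_hidden)
print(toy_flat[toy_index])                                                     # last valid output of each sentence
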
fc_1 = tf.contrib.layers.fully_connected(
                        last,
                        64,
                        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                        activation_fn = tf.nn.relu)
keep_prob = tf.placeholder("float")
fc1_drop = tf.nn.dropout(fc_1, keep_prob)
weight = tf.Variable(tf.truncated_normal([64,n_classes], stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
prediction = tf.nn.softmax(tf.matmul(fc1_drop, weight) + bias)
cross_entropy = -tf.reduce_sum(y * tf.log(prediction))
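
The hand-rolled cross entropy above can blow up when prediction contains exact zeros; a more numerically stable variant (a sketch of a substitution, not what the original uses) feeds the pre-softmax logits to tf.nn.softmax_cross_entropy_with_logits_v2.

# Numerically stable alternative (sketch; the original keeps the hand-rolled loss above)
logits = tf.matmul(fc1_drop, weight) + bias
cross_entropy_stable = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))
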
# collect every trainable variable for regularization
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# apply the L2 regularizer; this also adds the penalty to the REGULARIZATION_LOSSES collection
tf.contrib.layers.apply_regularization(reg, weights_list=weights)
<tf.Tensor 'get_regularization_penalty:0' shape=() dtype=float32>
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)  # the same penalty returned in the previous step
# compute the gradients explicitly so they can be clipped before being applied
optimizer = tf.train.AdamOptimizer(learning_rate,beta1=0.9)
grads = optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws))
for i,(g,v) in enumerate(grads):
    if g is not None:
        grads[i] = (tf.clip_by_norm(g, 5), v)  # clip each gradient to norm 5
train_op = optimizer.apply_gradients(grads)
D:\Code\Miniconda3\envs\tensorflow1.13-gpu\lib\site-packages\tensorflow\python\ops\gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
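
An aside on the clipping step above: tf.clip_by_norm clips each gradient tensor independently, whereas clipping by the global norm across all gradients is a common alternative. The sketch below is such a substitution (train_op_global is a hypothetical name) and is not what the original run used.

# Alternative: clip by the global norm over all gradients (sketch only)
gs, vs = zip(*optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws)))
gs, _ = tf.clip_by_global_norm(gs, 5.0)   # None gradients are ignored and passed through unchanged
train_op_global = optimizer.apply_gradients(zip(gs, vs))
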
def generatebatch(X, Y, n_examples, batch_size):
    for batch_i in range(n_examples // batch_size):
        start = batch_i*batch_size
        end = start + batch_size
        batch_xs = X[start:end]
        batch_ys = Y[start:end]
        yield batch_xs, batch_ys  # yield one batch; the trailing partial batch is dropped, so every batch has exactly batch_size examples, as the index tensor above requires
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()
for step in range(10):
    # reshuffle the training set at the start of every epoch
    index_ = np.random.permutation(int(len(dataset_train)))
    dataset_train = dataset_train[index_]
    label_train = label_train[index_]
    for batch_x, batch_y in generatebatch(dataset_train, label_train, len(label_train), batch_size):
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5, output_keep_prob: 0.5})
    # report loss and accuracy on the last minibatch of the epoch, with dropout switched off
    acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    loss = sess.run(cross_entropy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    saver.save(sess, './saver/lesson0', global_step=step)
    print("Iter " + str(step)+ ",Minibatch Loss= "+ "{:.6f}".format(loss)+ ",Training Accuracy= " + "{:.5f}".format(acc))
print("Optimization Finished!")
Iter 0,Minibatch Loss= 193.702866,Training Accuracy= 0.72000
Iter 1,Minibatch Loss= 175.091751,Training Accuracy= 0.78000
Iter 2,Minibatch Loss= 167.572891,Training Accuracy= 0.75667
Iter 3,Minibatch Loss= 166.535400,Training Accuracy= 0.78667
Iter 4,Minibatch Loss= 141.220993,Training Accuracy= 0.80667
WARNING:tensorflow:From D:\Code\Miniconda3\envs\tensorflow1.13-gpu\lib\site-packages\tensorflow\python\training\saver.py:966: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Iter 5,Minibatch Loss= 128.777084,Training Accuracy= 0.83667
Iter 6,Minibatch Loss= 107.956490,Training Accuracy= 0.86667
Iter 7,Minibatch Loss= 97.454933,Training Accuracy= 0.88000
Iter 8,Minibatch Loss= 115.299561,Training Accuracy= 0.84000
Iter 9,Minibatch Loss= 94.709442,Training Accuracy= 0.87000
Optimization Finished!
acc = []
for batch_x, batch_y in generatebatch(dataset_test, label_test, len(label_test), batch_size):
    accu = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    acc.append(accu)
acc = sum(acc)/len(acc)  # mean accuracy over the full-sized test batches
print(acc)
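
Because a tf.train.Saver checkpoint was written at every epoch, the model can also be restored into a fresh session and evaluated again; the sketch below assumes the checkpoint files under ./saver/ still exist.

# Restore the latest checkpoint and re-run the test evaluation (sketch)
sess2 = tf.Session()
saver.restore(sess2, tf.train.latest_checkpoint('./saver'))
acc2 = []
for batch_x, batch_y in generatebatch(dataset_test, label_test, len(label_test), batch_size):
    acc2.append(sess2.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1}))
print(sum(acc2)/len(acc2))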

Reposted from: blog.csdn.net/Saker__/article/details/108983922