RNN for Binary Classification

Copyright notice: this is an original post by the author; source: https://blog.csdn.net/xwd18280820053/article/details/74298397
import numpy as np
import random
# Load padded feature sequences, labels, and per-sample true lengths
with np.load('rnn_data/file_name.npz') as data:
    feature = data['feature']
    label = data['label']
    rea_lenth = data['true_lenth'] # actual (unpadded) sequence lengths
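
For reference, a minimal sketch of how such a file could be built from variable-length sequences (the toy shapes and sequence counts here are assumptions for illustration; the key names match the load above):

import numpy as np

seqs = [np.random.randn(np.random.randint(5, 20), 23) for _ in range(8)]  # toy variable-length sequences, 23 features each
labels = np.random.randint(0, 2, size=len(seqs))   # binary labels
true_lenth = np.array([len(s) for s in seqs])      # actual lengths before padding

max_len = max(true_lenth)
feature = np.zeros((len(seqs), max_len, 23))       # zero-pad every sequence to max_len
for i, s in enumerate(seqs):
    feature[i, :len(s), :] = s

np.savez('rnn_data/file_name.npz', feature=feature, label=labels, true_lenth=true_lenth)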

# Data iterator
class SimpleDataIterator():
    def __init__(self, X,y,true_lenth):
        self.X = X
        self.y = y
        self.true_lenth = true_lenth
        self.size = len(self.y)
        self.epochs = 0
        self.cursor = 0

    def shuffle(self):
        shuffle_index = np.random.permutation(self.size)
        self.X = [self.X[i] for i in shuffle_index]
        self.y = [self.y[i] for i in shuffle_index]
        self.true_lenth = [self.true_lenth[i] for i in shuffle_index]
        self.cursor = 0

    def next_batch(self, batch_size):    # batch_size is passed in as a parameter
        if self.cursor + batch_size > self.size:
            # not enough samples left for a full batch: count an epoch,
            # reshuffle, and restart from the beginning
            self.epochs += 1
            self.shuffle()
        resX = self.X[self.cursor:self.cursor+batch_size]
        resX = np.array(resX)
        resy = self.y[self.cursor:self.cursor+batch_size]
        resy = np.array(resy)
        res_len = self.true_lenth[self.cursor:self.cursor+batch_size]
        res_len = np.array(res_len)
        self.cursor += batch_size
        return resX,resy,res_len
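
A quick sanity check of the iterator on synthetic data (shapes chosen arbitrarily for illustration):

X_demo = [np.random.randn(10, 23) for _ in range(6)]
y_demo = [0, 1, 0, 1, 1, 0]
len_demo = [10] * 6
it = SimpleDataIterator(X_demo, y_demo, len_demo)
bx, by, bl = it.next_batch(2)
print(bx.shape, by.shape, bl.shape)   # (2, 10, 23) (2,) (2,)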

# Training

import tensorflow as tf
import sys
import random  
# Note: sklearn.cross_validation was removed in scikit-learn 0.20; newer
# versions provide these in sklearn.model_selection (with a different StratifiedKFold API)
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from scipy import interp
# %matplotlib inline is needed for inline plotting in Jupyter
%matplotlib inline

# hyperparameters  
lr = 0.0001  
keep_prob = 0.5
lambda_l2_reg = 0.01
training_iters = 100000  
train_batch_size = 200   # 3200/200 = 16 batches for training, 800/200 = 4 for testing
test_batch_size = 200
n_steps = 4950 # maximum (padded) sequence length
n_inputs = 23   # dimension of the input vector at each time step
n_hidden_units = 64
n_classes = 2  

with tf.Graph().as_default():

    # tf Graph input  
    x = tf.placeholder(tf.float32, [None, n_steps, n_inputs])  # None lets the batch size vary: batch_size x max_length x features
    y = tf.placeholder(tf.float32, [None, n_classes])  
    true_lenth = tf.placeholder(tf.int32)   # per-sample true sequence lengths
    

    # Define weights (the 'in' weights/biases are declared but never used by this model)
    weights = {  
        'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units])),  
        'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]))  
    }  
    biases = {  
        'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ])),  
        'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]))  
    }

    # One-hot encode the integer labels into shape [num_samples, 2]
    indices = label
    depth = 2
    on_value = 1
    off_value = 0
    output_ = tf.one_hot(indices, depth, on_value, off_value, axis=1)
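    # e.g. label = [0, 1, 1] -> output_ = [[1, 0], [0, 1], [0, 1]]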

    def length(sequence):
        # Infer true lengths from zero padding: a step counts as used if any of
        # its features is non-zero (helper; unused here since lengths are fed directly)
        used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length
    def last_relevant(output, length):
        # Select the output at the last valid time step of each sequence
        batch_size = tf.shape(output)[0]
        max_length = tf.shape(output)[1]
        out_size = int(output.get_shape()[2])
        # Flatten [batch, time, out] to [batch*time, out], then gather row
        # i*max_length + (length_i - 1) for each sample i
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        result = tf.gather(flat, index)
        return result
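    # Illustration (toy shapes): with output of shape [2, 3, 4] and length = [2, 3],
    # flat has shape [6, 4] and index = [1, 5], i.e. row 0*3+(2-1) for sample 0
    # and row 1*3+(3-1) for sample 1.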
    def RNN(X, weights, biases,true_lenth):

        with tf.variable_scope('init_name', initializer=tf.orthogonal_initializer()):   # orthogonal initialization
            cell = tf.contrib.rnn.GRUCell(n_hidden_units)
            # Learnable initial state, tiled across the batch; note this ties the graph
            # to train_batch_size (test_batch_size is the same here, 200)
            init_state = tf.get_variable('init_state', [1, n_hidden_units], initializer=tf.constant_initializer(0.0))
            init_state = tf.tile(init_state, [train_batch_size, 1])

            # outputs: [batch, n_steps, n_hidden_units]; states: [batch, n_hidden_units] for a GRU
            outputs, states = tf.nn.dynamic_rnn(
                cell, X, dtype=tf.float32, sequence_length=true_lenth, initial_state=init_state)

        # Caveat: keep_prob is a Python constant, so this dropout also fires at
        # evaluation time (a placeholder-based fix is sketched after this listing)
        outputs = tf.nn.dropout(outputs, keep_prob)
        last = last_relevant(outputs, true_lenth)
        results = tf.matmul(last, weights['out']) + biases['out']
        return results

    # (Unused: per-frame masked cross entropy for sequence labeling; the name
    # `cost` is shadowed by the scalar loss defined below)
    def cost(output, target):
        # Compute cross entropy for each frame.
        cross_entropy = target * tf.log(output+ 1e-10)
        cross_entropy = -tf.reduce_sum(cross_entropy, axis=2)
        mask = tf.sign(tf.reduce_max(tf.abs(target), axis=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
        cross_entropy /= tf.reduce_sum(mask, axis=1)
        return tf.reduce_mean(cross_entropy)

    pred = RNN(x, weights, biases, true_lenth)
    predict_prob = tf.nn.softmax(pred)  # probability assigned to each class
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
    # L2 regularization; the filter skips variables whose names contain "Bias"
    # (the biases dict above keeps TF's default "Variable_*" names, so it is not skipped)
    l2 = lambda_l2_reg * sum(
        tf.nn.l2_loss(tf_var)
        for tf_var in tf.trainable_variables()
        if not ("Bias" in tf_var.name)
    )
    cost += l2
    train_op = tf.train.AdamOptimizer(lr).minimize(cost)  

    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))  # elementwise True/False
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))  

    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    with tf.Session() as sess:  
        labelR = sess.run(output_)
        
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        cv = StratifiedKFold(label, n_folds=5)
        finalRes = []
        for numFold,(train_index,test_index) in enumerate(cv):
            sess.run(init)
            x_train = [feature[i] for i in train_index]
            y_train = [labelR[i] for i in train_index]
            train_true_lenth = [rea_lenth[i] for i in train_index]
            x_test = [feature[i] for i in test_index]
            y_test = [labelR[i] for i in test_index]
            test_true_lenth = [rea_lenth[i] for i in test_index]
            print('train_index length:', len(train_index))
            print('test_index length:', len(test_index))
            
            trainingData = SimpleDataIterator(x_train,y_train,train_true_lenth)
            testingData = SimpleDataIterator(x_test,y_test,test_true_lenth)

            epoch = 0 # number of complete passes over the training set
            maxAccuracy = 0 # early stopping: quit after 5 consecutive checks without beating the best validation accuracy
            failNum = 0 # consecutive checks without a new best
            count_ = 0
            while epoch<training_iters:
                batch_xs,batch_ys,batch_xs_len = trainingData.next_batch(train_batch_size)
                batch_xs = batch_xs.reshape([train_batch_size, n_steps, n_inputs])
                sess.run([train_op], feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                })
                count_ += 1
                if (epoch % 30) == 0 and (count_ >= int(len(y_train)/train_batch_size)):  # report accuracy every 30 epochs
                    accur = sess.run(accuracy, feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                    })
                    print('%s%d%s%f'%('At ',epoch,'th accuracy:',accur) )
                    valiTem = x_test[0:train_batch_size];valiTem = np.array(valiTem)
                    vali_y = y_test[0:train_batch_size];vali_y = np.array(vali_y)
                    vali_len = test_true_lenth[0:train_batch_size]
                    valiAccur = sess.run(accuracy,feed_dict={x:valiTem.reshape([-1, n_steps, n_inputs]),
                                                             y:vali_y,true_lenth:vali_len,}) # a slice of the test set reused as a validation set (so early stopping is not fully held out)
                    if valiAccur > maxAccuracy:
                        maxAccuracy = valiAccur
                        failNum = 0
                    else :
                        failNum += 1
                    costVal = sess.run(cost, feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                    })
                    print('%s%f'%('cost:',costVal))

                if failNum >= 5:
                    print('%s%f'%('Accuracy on validation set:',valiAccur))
                    break
                if trainingData.epochs>epoch:   # the iterator finished a full pass over the training set
                    epoch += 1
                    count_ = 0
            result = []
            prob = [] # predicted probability of the positive class for each sample
            final_label = []
            while testingData.epochs == 0:  # note: the rollover call also returns a batch, so one extra re-drawn batch gets scored
                batch_xt,batch_yt,batch_xt_len  = testingData.next_batch(test_batch_size)
                batch_xt = np.array(batch_xt)
                batch_yt = np.array(batch_yt)
                batch_xt = batch_xt.reshape([test_batch_size, n_steps, n_inputs])

                temp_prob = sess.run(predict_prob,feed_dict={x:batch_xt,y:batch_yt,true_lenth:batch_xt_len,})
                temp_label = np.argmax(batch_yt, axis=1)  # NumPy argmax avoids adding new graph ops inside the loop
                final_label.extend(temp_label)
                temp_prob2 = np.array(temp_prob)
                prob.extend(temp_prob2[:,1])
                result.append(sess.run(accuracy,feed_dict={x:batch_xt,y:batch_yt,true_lenth:batch_xt_len,}))

            fpr, tpr, thresholds = roc_curve(final_label, prob, pos_label=1)
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.6f)' % (numFold, roc_auc))

            print('%d%s%f'%(numFold,"th fold accuracy:",np.mean(result)))
            finalRes.append(np.mean(result))
        print("Testing accuracy:",np.mean(finalRes))

        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') # chance diagonal
        mean_tpr /= len(cv)                     # average the per-fold interpolated TPR at the 100 mean_fpr points
        mean_tpr[-1] = 1.0                      # force the last point to (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)      # mean AUC
        # plot the mean ROC curve
        plt.plot(mean_fpr, mean_tpr, 'k--',label='Mean ROC (area = %0.6f)' % mean_auc, lw=2)

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.show()
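
One caveat in the listing above: keep_prob is a plain Python constant, so dropout stays active during validation and testing. A minimal sketch of the usual fix with tf.placeholder_with_default (a standalone toy example; keep_prob_ph and h are names invented here for illustration):

import tensorflow as tf

keep_prob_ph = tf.placeholder_with_default(1.0, shape=[])  # default 1.0 means "dropout off"
h = tf.ones([2, 4])                                        # stand-in for the RNN outputs
h_drop = tf.nn.dropout(h, keep_prob_ph)

with tf.Session() as sess:
    print(sess.run(h_drop))                                 # evaluation: nothing fed, dropout disabled
    print(sess.run(h_drop, feed_dict={keep_prob_ph: 0.5}))  # training: dropout active

In the model above, tf.nn.dropout(outputs, keep_prob) would take the placeholder instead, with keep_prob_ph: 0.5 added only to the training feed_dict.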

The code above does not wire early stopping to checkpointing. A reference implementation follows; it works by saving the model parameters each time a new best accuracy is reached, then restoring that checkpoint before testing:

    DISPLAY_EVERY = 30          # check interval in epochs (30, as in the first listing)
    MAX_STEPS_SINCE_SAVE = 5    # patience: stop after 5 checks without a new best (as failNum above)
    with tf.Session() as sess:
        labelR = sess.run(output_)
        #Create a saver object which will save all the variables
        saver = tf.train.Saver() 
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        cv = StratifiedKFold(label, n_folds=5)
        finalRes = []
        for numFold,(train_index,test_index) in enumerate(cv):
            highest_accuracy = 0
            steps_since_save = 0
            sess.run(init)
            x_train = [feature[i] for i in train_index]
            y_train = [labelR[i] for i in train_index]
            train_true_lenth = [rea_lenth[i] for i in train_index]
            x_test = [feature[i] for i in test_index]
            y_test = [labelR[i] for i in test_index]
            test_true_lenth = [rea_lenth[i] for i in test_index]
            print('train_index length:', len(train_index))
            print('test_index length:', len(test_index))

            trainingData = SimpleDataIterator(x_train,y_train,train_true_lenth)
            testingData = SimpleDataIterator(x_test,y_test,test_true_lenth)

            epoch = 0 # number of complete passes over the training set
            count_ = 0
            while epoch<training_iters:
                batch_xs,batch_ys,batch_xs_len = trainingData.next_batch(train_batch_size)
                batch_xs = batch_xs.reshape([train_batch_size, n_steps, n_inputs])
                sess.run([train_op], feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                })
                count_ += 1
                if (epoch % DISPLAY_EVERY) == 0 and (count_ >= int(len(y_train)/train_batch_size)):  # report accuracy every DISPLAY_EVERY epochs
                    accur = sess.run(accuracy, feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                    })
                    print('%s%d%s%f'%('At ',epoch,'th accuracy:',accur) )
                    
                    if accur > highest_accuracy:   # note: keyed on training-batch accuracy; valiAccur below would be the held-out signal
                        print(">> New Highest Accuracy, Saving Model <<")
                        saver.save(sess, 'saved_model_{0}'.format(numFold))
                        print(">> Model Saved <<")
                        highest_accuracy = accur
                        steps_since_save = 0
                    else:
                        steps_since_save += 1
                        
                    valiTem = x_test[0:train_batch_size];valiTem = np.array(valiTem)
                    vali_y = y_test[0:train_batch_size];vali_y = np.array(vali_y)
                    vali_len = test_true_lenth[0:train_batch_size]
                    valiAccur = sess.run(accuracy,feed_dict={x:valiTem.reshape([-1, n_steps, n_inputs]),
                                                             y:vali_y,true_lenth:vali_len,}) # a slice of the test set reused as a validation set
                    costVal = sess.run(cost, feed_dict={  
                    x: batch_xs,  
                    y: batch_ys,  
                    true_lenth:batch_xs_len,
                    })
                    print('%s%f'%('cost:',costVal))
                    
                if steps_since_save > MAX_STEPS_SINCE_SAVE:
                    print("\n\n**** MODEL CONVERGED, STOPPING EARLY ****")
                    print('%s%f'%('Accuracy on validation set:',valiAccur))
                    break
                    
                if trainingData.epochs>epoch:   # the iterator finished a full pass over the training set
                    epoch += 1
                    count_ = 0
            # Restore the best checkpoint for this fold before testing
            saver.restore(sess, 'saved_model_{0}'.format(numFold))
            # Testing
            result = []
            prob = [] # predicted probability of the positive class for each sample
            final_label = []
            while testingData.epochs == 0:
                batch_xt,batch_yt,batch_xt_len  = testingData.next_batch(test_batch_size)
                batch_xt = np.array(batch_xt)
                batch_yt = np.array(batch_yt)
                batch_xt = batch_xt.reshape([test_batch_size, n_steps, n_inputs])

                temp_prob = sess.run(predict_prob,feed_dict={x:batch_xt,y:batch_yt,true_lenth:batch_xt_len,})
                temp_label = np.argmax(batch_yt, axis=1)
                final_label.extend(temp_label)
                temp_prob2 = np.array(temp_prob)
                prob.extend(temp_prob2[:,1])
                result.append(sess.run(accuracy,feed_dict={x:batch_xt,y:batch_yt,true_lenth:batch_xt_len,}))
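
(The ROC aggregation and plotting would then proceed exactly as in the first listing.)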

Questions and discussion are welcome.

