Learning and Using RNNs

The core idea is "reviewing the old to learn the new": the network retains information from earlier steps and applies it when computing the current output, i.e. the nodes within the hidden layer are no longer unconnected but linked to each other across time steps. Because part of the output is fed back in as part of the next input, an RNN has a degree of memory.
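
To make the recurrence concrete, here is a minimal NumPy sketch of a single vanilla RNN cell; all sizes and weights are illustrative and not taken from the examples below:

import numpy as np

rng = np.random.RandomState(0)
W_xh = rng.randn(8, 4) * 0.1          # input-to-hidden weights (4-dim input)
W_hh = rng.randn(8, 8) * 0.1          # hidden-to-hidden weights: the recurrent link
b_h = np.zeros(8)

h = np.zeros(8)                       # initial hidden state, the network's "memory"
for x_t in rng.randn(5, 4):           # a toy sequence of 5 time steps
    # the previous hidden state h re-enters the computation at every step
    h = np.tanh(np.dot(W_xh, x_t) + np.dot(W_hh, h) + b_h)
print(h)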

1. Recognizing Captchas

# -*- coding: utf-8 -*-

from __future__ import division, print_function, absolute_import

import numpy as np
import tflearn
from sklearn import metrics
import tflearn.datasets.mnist as mnist

# mnist.load_data downloads MNIST to the default directory automatically; one_hot enables
# one-hot encoding, which simplifies the network design by turning each label 0-9 into a
# 10-dim 0/1 vector. For an a*b-pixel image, the DNN flattens it into a single a*b-dim
# feature vector, while the RNN sees a sequence of a consecutive b-dim feature vectors.
X, Y, testX, testY = mnist.load_data(one_hot=True)

def do_DNN(X, Y, testX, testY):
    # Building deep neural network
    input_layer = tflearn.input_data(shape=[None, 784])
    dense1 = tflearn.fully_connected(input_layer, 64, activation='tanh',
                                     regularizer='L2', weight_decay=0.001)
    dropout1 = tflearn.dropout(dense1, 0.8)
    dense2 = tflearn.fully_connected(dropout1, 64, activation='tanh',
                                     regularizer='L2', weight_decay=0.001)
    dropout2 = tflearn.dropout(dense2, 0.8)
    softmax = tflearn.fully_connected(dropout2, 10, activation='softmax')

    # Regression using SGD with learning rate decay and Top-3 accuracy
    sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.96, decay_step=1000)
    top_k = tflearn.metrics.Top_k(3)
    net = tflearn.regression(softmax, optimizer=sgd, metric=top_k,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(X, Y, n_epoch=20, validation_set=(testX, testY),
              show_metric=True, run_id="dense_model")
# Processing the images with an RNN
def do_rnn(X, Y, testX, testY):
    # Reshape each flat 784-dim vector into a 28-step sequence of 28-dim vectors
    X = np.reshape(X, (-1, 28, 28))
    testX = np.reshape(testX, (-1, 28, 28))
    # Stack two LSTM layers; the first returns the full sequence to feed the second
    net = tflearn.input_data(shape=[None, 28, 28])
    net = tflearn.lstm(net, 128, return_seq=True)
    net = tflearn.lstm(net, 128)
    # Fully connected output layer
    net = tflearn.fully_connected(net, 10, activation='softmax')
    # Output node: Adam optimizer with categorical cross-entropy loss
    net = tflearn.regression(net, optimizer='adam',
                             loss='categorical_crossentropy', name="output1")
    # Instantiate the network
    model = tflearn.DNN(net, tensorboard_verbose=2)
    # Train on the samples with fit()
    model.fit(X, Y, n_epoch=1, validation_set=(testX, testY), show_metric=True,
              snapshot_step=100)


# Run the RNN version; uncomment do_DNN to compare with the plain DNN
#do_DNN(X, Y, testX, testY)
do_rnn(X, Y, testX, testY)

The downloaded dataset summary:

---------------------------------

Training samples: 55000

Validation samples: 10000

The run may abort with OMP: Error #15: Initializing libomp.dylib, but found libiomp5.dylib already initialized. This suggests that a stray copy of libiomp5.dylib is causing the conflict.
One fix is to go into the anaconda/lib directory and delete libiomp5.dylib;

alternatively, add

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

at the top of the script.
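
As a side note, once training completes, the held-out accuracy can be queried directly from the trained model; a minimal sketch, assuming do_rnn is modified to end with "return model" (evaluate is a standard tflearn.DNN method):

model = do_rnn(X, Y, testX, testY)          # assumes do_rnn returns its trained model
score = model.evaluate(np.reshape(testX, (-1, 28, 28)), testY)
print("test accuracy: %.4f" % score[0])     # evaluate() returns a list of metric values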

2. Recognizing Malicious Comments

In the Movie Review Data dataset, every review is stored as a separate text file, with positive and negative reviews placed in different folders, and the text is vectorized with a bag-of-words model. Reading the two folders yields the labels 0 and 1. The RNN treats each review as time-series data: once the target sequence format is set (length and padding value), sequences are truncated or padded to match. The LSTM layer's dropout setting then passes only the configured fraction of activations on to the next layer.
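
A small demonstration of what the preprocessing below does; the token indices are made up, and whether the padding lands at the front or the back of each row depends on your tflearn version's default:

from tflearn.data_utils import to_categorical, pad_sequences

seqs = [[3, 7, 2], [9, 5, 1, 4, 8, 6]]            # two toy token-index sequences
padded = pad_sequences(seqs, maxlen=4, value=0.)  # pad/truncate every row to length 4
print(padded.shape)                               # (2, 4)
print(to_categorical([0, 1], nb_classes=2))       # [[1. 0.] [0. 1.]]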

# -*- coding: utf-8 -*-
"""
Simple example using LSTM recurrent neural network to classify IMDB
sentiment dataset.
References:
    - Long Short Term Memory, Sepp Hochreiter & Jurgen Schmidhuber, Neural
    Computation 9(8): 1735-1780, 1997.
    - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng,
    and Christopher Potts. (2011). Learning Word Vectors for Sentiment
    Analysis. The 49th Annual Meeting of the Association for Computational
    Linguistics (ACL 2011).
Links:
    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    - http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
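
After fit() returns, the model emits a probability per class for each padded review; a hedged sketch of scoring a single review, where the word indices are invented for illustration (real ones come from the dataset's 10000-word vocabulary):

review = pad_sequences([[17, 25, 10, 406, 26, 14, 56, 61]], maxlen=100, value=0.)
probs = model.predict(review)[0]          # [class-0 probability, class-1 probability]
print("predicted class: %d" % int(probs[1] > probs[0]))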

3. Recognizing WebShells

Each file is read line by line while recording the largest system-call number seen so far. System-call sequences from normal runs are labeled as normal (0); sequences collected while a WebShell was running are labeled as WebShell (1). The normal and malicious data are mixed and then randomly split into a training set and a test set. The RNN is built with LSTM layers, then instantiated, trained, and validated.
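
For reference, each ADFA-LD trace file holds a single line of space-separated system-call numbers; a hedged sketch of the parsing step (the numbers are invented, not real ADFA-LD data):

line = "6 6 63 6 42 120 6 195"                     # illustrative trace line
seq = [int(v) for v in line.split(' ') if len(v) > 0]
print(seq, max(seq))                               # max(seq) feeds max_sys_call below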

# -*- coding:utf-8 -*-

import os
import re
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

max_sequences_len=300
max_sys_call=0

def load_one_flle(filename):
    # Each trace file holds one line of space-separated system-call numbers;
    # track the global maximum so the embedding layer can be sized later.
    global max_sys_call
    x=[]
    with open(filename) as f:
        line=f.readline()
        line=line.strip('\n')
        line=line.split(' ')
        for v in line:
            if len(v) > 0:
                x.append(int(v))
                if int(v) > max_sys_call:
                    max_sys_call=int(v)
    return x

def load_adfa_training_files(rootdir):
    # Normal traces: every file in the training directory gets label 0
    x=[]
    y=[]
    files = os.listdir(rootdir)
    for i in range(0, len(files)):
        path = os.path.join(rootdir, files[i])
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x,y

def dirlist(path, allfile):
    # Recursively collect every file path under path
    filelist = os.listdir(path)

    for filename in filelist:
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile

def load_adfa_webshell_files(rootdir):
    # WebShell traces: files under the Web_Shell_* subdirectories get label 1
    x=[]
    y=[]
    allfile=dirlist(rootdir,[])
    for file in allfile:
        if re.match(r"/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/Web_Shell_\d+/UAD-W*",file):
            x.append(load_one_flle(file))
            y.append(1)
    return x,y

def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding

    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old = testY  # keep the integer labels for the sklearn metrics below
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print "GET max_sequences_len embedding %d" % max_sequences_len
    print "GET max_sys_call embedding %d" % max_sys_call

    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call+1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32, run_id="maidou")

    y_predict_list = model.predict(testX)
    #print(y_predict_list)

    # Threshold the class-0 probability at 0.5 to obtain hard labels
    y_predict = []
    for i in y_predict_list:
        #print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    #y_predict=to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print(metrics.confusion_matrix(testY_old, y_predict))
    #print(metrics.recall_score(testY_old, y_predict))
    #print(metrics.accuracy_score(testY_old, y_predict))


if __name__ == '__main__':
    x1,y1=load_adfa_training_files("/Users/zhanglipeng/Data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_webshell_files("/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/")
    x=x1+x2
    y=y1+y2

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    do_rnn(x_train, x_test, y_train, y_test)

Training samples: 570

Validation samples: 381

--

Training Step: 18  | total loss: 1.46542 | time: 136.322s

| Adam | epoch: 001 | loss: 1.46542 - acc: 0.8094 | val_loss: 0.98402 - val_acc: 0.8583 -- iter: 570/570

--

Training Step: 36  | total loss: 1.05053 | time: 13.147s

| Adam | epoch: 002 | loss: 1.05053 - acc: 0.7672 | val_loss: 0.90229 - val_acc: 0.8714 -- iter: 570/570

--

Training Step: 54  | total loss: 0.79773 | time: 13.205s

| Adam | epoch: 003 | loss: 0.79773 - acc: 0.8482 | val_loss: 0.57015 - val_acc: 0.8845 -- iter: 570/570

--

Training Step: 72  | total loss: 0.79644 | time: 10.192s

| Adam | epoch: 004 | loss: 0.79644 - acc: 0.8277 | val_loss: 0.00000 - val_acc: 0.0000 -- iter: 570/570

--

Training Step: 90  | total loss: 0.75222 | time: 13.096s

| Adam | epoch: 005 | loss: 0.75222 - acc: 0.8212 | val_loss: 0.67396 - val_acc: 0.8609 -- iter: 570/570

--

The capability of RNNs is truly impressive.
