CNN tmp

# coding: utf-8
# len white data: 1282285
# len black data: 81903


from __future__ import division, print_function, absolute_import

import os
import pickle

import numpy as np
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, max_pool_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import batch_normalization
from tflearn.data_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, average_precision_score,
                             recall_score, precision_score, f1_score)

# filename = "data/99666.pcap_svKcnF"
# with open(filename, "rb") as tmp_file:
#     ans = pickle.load(tmp_file)
#     # print(ans)
#     for k, v in ans.items():
#         print(k, type(v[0]), v)
#         if v[0] != 0 and v[1] != 0:
#             out_flow, in_flow = list(v[0]), list(v[1])
#             print(out_flow, in_flow)
#             print(len(out_flow), len(in_flow))


def report_evaluation_metrics(y_true, y_pred):
    average_precision = average_precision_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    recall = recall_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    f1 = f1_score(y_true, y_pred, labels=[0, 1], pos_label=1)

    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Precision: {0:0.2f}'.format(precision))
    print('Recall: {0:0.2f}'.format(recall))
    print('F1: {0:0.2f}'.format(f1))

    # def plot_confusion_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("confusion matrix:", conf_matrix)

    # plt.figure(figsize=(12, 12))
    # sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    # plt.title("Confusion matrix")
    # plt.ylabel('True class')
    # plt.xlabel('Predicted class')
    # plt.show()
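
# Quick sanity check for report_evaluation_metrics on hypothetical toy labels
# (not from the real dataset); for y_true=[0,0,1,1], y_pred=[0,1,1,1] the
# positive (black) class gets precision 0.67, recall 1.00, F1 0.80:
# report_evaluation_metrics([0, 0, 1, 1], [0, 1, 1, 1])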

FLOW_SIZE = 1024

def extract_flows(filename):
    # Each pickled file maps a connection key to (out_payload, in_payload);
    # keep the first FLOW_SIZE/2 bytes of each direction, zero-padded, so
    # every flow becomes a fixed-length vector of FLOW_SIZE byte values.
    ans = []
    with open(filename, "rb") as tmp_file:
        pkl_data = pickle.load(tmp_file)
        for k, v in pkl_data.items():
            if v[0] != 0 and v[1] != 0:
                out_flow, in_flow = list(v[0]), list(v[1])
                half_size = FLOW_SIZE // 2
                padding_flow = (out_flow[:half_size] + [0] * (half_size - len(out_flow))
                                + in_flow[:half_size] + [0] * (half_size - len(in_flow)))
                assert len(padding_flow) == FLOW_SIZE
                ans.append([filename + ":" + k, padding_flow])
    return ans
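
# Minimal usage sketch for extract_flows; the file name and payload values
# below are hypothetical, assuming each pickle maps a connection key to an
# (outbound_bytes, inbound_bytes) pair as the loop above expects:
#
# with open("example.flows.pkl", "wb") as f:
#     pickle.dump({"10.0.0.1:443": ([22, 3, 1, 0], [22, 3, 3, 0, 5])}, f)
# flows = extract_flows("example.flows.pkl")
# print(flows[0][0])       # example.flows.pkl:10.0.0.1:443
# print(len(flows[0][1]))  # 1024, i.e. FLOW_SIZE after zero-padding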



# def get_cnn_model(max_len=1024, volcab_size=256):
#     # Building convolutional network
#     network = tflearn.input_data(shape=[None, max_len], name='input')
#     network = tflearn.embedding(network, input_dim=volcab_size, output_dim=32)
#     network = conv_1d(network, 64, 3, activation='relu', regularizer="L2")
#     network = max_pool_1d(network, 2)
#     network = conv_1d(network, 64, 3, activation='relu', regularizer="L2")
#     network = max_pool_1d(network, 2)
#     network = batch_normalization(network)
#     network = fully_connected(network, 64, activation='relu')
#     network = dropout(network, 0.5)
#     network = fully_connected(network, 2, activation='softmax')
#     sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.96, decay_step=1000)
#     network = regression(network, optimizer=sgd, loss='categorical_crossentropy')
#     model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path='model.tfl.ckpt')
#     return model


def get_cnn_model(max_len=FLOW_SIZE, volcab_size=256):
    # Building convolutional network, following the DeepTraffic 1D-CNN layout:
    # refer: https://github.com/echowei/DeepTraffic/blob/master/2.encrypted_traffic_classification/4.TrainAndTest/2d_cnn/encrypt_traffic_cnn_2d.py ==> 5*5 conv
    # refer: https://github.com/echowei/DeepTraffic/blob/master/2.encrypted_traffic_classification/4.TrainAndTest/1d_cnn_25%2B3/encrypt_traffic_cnn_1d.py ==> 25 conv
    # refer: https://github.com/echowei/DeepTraffic/blob/master/1.malware_traffic_classification/4.TrainAndTest/traffic_cnn.py
    network = tflearn.input_data(shape=[None, max_len], name='input')
    # network = tflearn.embedding(network, input_dim=volcab_size, output_dim=32)
    # network = tflearn.input_data(shape=[None, 1, max_len], name='input')
    # network = tflearn.reshape(network, (-1, max_len, 1))
    network = tf.expand_dims(network, 2)  # add a channel dim: [batch, max_len, 1]
    network = conv_1d(network, nb_filter=32, filter_size=25, strides=1, padding='same', activation='relu')
    network = max_pool_1d(network, kernel_size=3, strides=3)
    network = conv_1d(network, nb_filter=32, filter_size=25, strides=1, padding='same', activation='relu')
    network = max_pool_1d(network, kernel_size=3, strides=3)
    network = fully_connected(network, n_units=1024, activation='relu')
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')  # white vs black
    sgd = tflearn.SGD(learning_rate=0.0001, lr_decay=0.96, decay_step=1000)
    network = regression(network, optimizer=sgd, loss='categorical_crossentropy')
    model = tflearn.DNN(network, tensorboard_verbose=0, checkpoint_path='model.tfl.ckpt')
    return model
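
# Untrained smoke test (hypothetical; only checks that the layers wire up):
#
# m = get_cnn_model()
# dummy = np.zeros((4, FLOW_SIZE))
# print(np.array(m.predict(dummy)).shape)  # (4, 2): softmax over {white, black}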

"""
# dns tunnel
# black detect rate is ZERO!!!!
def get_cnn_model(max_len, volcab_size):
# Building convolutional network
network = tflearn.input_data(shape=[None, max_len], name='input')
network = tflearn.embedding(network, input_dim=volcab_size, output_dim=64)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 4, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
loss='categorical_crossentropy', name='target')
model = tflearn.DNN(network, tensorboard_verbose=0)
return model
"""


WHITE_DIR = "/home/langjihai/resolve_pcap_for_NN/white/SSL_PAYLOAD_PER_DIR"
BLACK_DIR = "/home/langjihai/resolve_pcap_for_NN/black/SSL_PAYLOAD_PER_DIR"

def get_files(directory):
    for dirpath, _, filenames in os.walk(directory):
        for f in filenames:
            yield os.path.abspath(os.path.join(dirpath, f))

def get_data(dirname):
    ans = []
    for file in get_files(dirname):
        flows = extract_flows(file)
        if len(ans) >= 2000000:  # hard cap on the number of flows loaded
            break
        if flows:
            ans.extend(flows)
    print(len(ans), "flows in", dirname)
    return ans

def save_data(data):
    with open('data.pickle', 'wb') as handle:
        pickle.dump(data, handle)

def load_data():
    with open('data.pickle', 'rb') as handle:
        return pickle.load(handle)

data_file = "data.pickle"
if os.path.exists(data_file):
print("load data file data.pickle!!!")
data = load_data()
white_data, black_data = data['white_data'], data['black_data']
else:
black_data = get_data(BLACK_DIR)
white_data = get_data(WHITE_DIR)
save_data({"white_data": white_data, "black_data": black_data})
# np.savez(data_file, white_data=white_data, black_data=black_data)
print("len white data:", len(white_data))
print("len black data:", len(black_data))


dataX = []
dataY = []
for flow in white_data:
    dataX.append(flow[1])
    dataY.append(0)
for flow in black_data:
    dataX.append(flow[1])
    dataY.append(1)
trainX, testX, trainY, testY = train_test_split(dataX, dataY, test_size=0.2, random_state=666)
# trainX = np.reshape(trainX, [-1, 1, FLOW_SIZE])
# testX = np.reshape(testX, [-1, 1, FLOW_SIZE])
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
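
# Note: the classes are heavily imbalanced (~1.28M white vs ~82K black flows),
# so a stratified split keeps the white/black ratio identical in both splits.
# A minimal variant of the split above (stratify is a standard sklearn
# train_test_split parameter):
#
# trainX, testX, trainY, testY = train_test_split(
#     dataX, dataY, test_size=0.2, random_state=666, stratify=dataY)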

model = get_cnn_model()
# Train model, with model checkpoint every epoch and every 10000 training steps.
# model.fit(trainX, trainY, n_epoch=10,
#           validation_set=(testX, testY),
#           show_metric=True,
#           snapshot_epoch=True,   # Snapshot (save & evaluate) model every epoch.
#           snapshot_step=10000,   # Snapshot (save & evaluate) model every 10000 steps.
#           run_id='model_and_weights')
#
# model.save("ECA_CNN.model")
#
# model.load("ECA_CNN.model")
# test = np.linspace(1, 101, 100).reshape(1, 100)
# print("test result:", model.predict(test))


model_file = "ECA_CNN.model"
if os.path.exists(model_file + ".meta"):
print("Load a model from local!!!")
model.load(model_file)
# else:
# pass
# model.fit({'input_x': trainX}, {'target_out': trainX}, n_epoch=30,
# validation_set=(testX, testX), batch_size=256, run_id="vae")
model.fit(trainX, trainY, n_epoch=10,
validation_set=(testX, testY),
show_metric=True,
snapshot_epoch=True, # Snapshot (save & evaluate) model every epoch.
# snapshot_step=10000, # Snapshot (save & evalaute) model every 500 steps.
batch_size=256,
run_id='model_and_weights')

model.save(model_file)

Ypred = []
L = len(dataX)
i = 0
N = 10000  # predict in chunks of N flows to keep memory bounded
while i < L:
    p = model.predict(dataX[i:i + N])
    for p1, p2 in p:
        # p1 is the softmax probability of class 0 (white)
        if p1 > 0.5:
            Ypred.append(0)
        else:
            Ypred.append(1)
    i += N
report_evaluation_metrics(dataY, Ypred)
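
# Caveat: the loop above scores the whole dataset, which includes the 80%
# training split, so these metrics are optimistic. A held-out-only variant
# (testY is already one-hot, so recover the integer labels first):
#
# test_true = np.argmax(testY, axis=1)
# test_pred = [0 if p1 > 0.5 else 1 for p1, p2 in model.predict(testX)]
# report_evaluation_metrics(test_true, test_pred)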

# Reposted from www.cnblogs.com/bonelee/p/10482413.html