KDD_cup99 pytorch

提示:这个数据集我是简单归一化处理的,只是为了简单跑下实验学习用的,你如果用于跑论文的实验什么的,这个不能用,处理的效果不好。

KDD_Cup99数据集,由于全部的数据集太大,训练集我们只取官网给的10%数据,即kddcup.data_10_percent.gz。

二分类和多分类的完整代码和数据集:下载地址,链接如果失效,请在下方评论,我会及时更新链接:https://www.lanzouw.com/iLBaOmyq3gb

官网源数据下载地址:KDD Cup 1999 Data

个人下载地址:数据集下载地址

训练集:23种标签,包含normal正常和22种攻击类型标签。包含494021条数据

测试集:38种标签,包含normal正常和37种攻击标签。包含311029条数据

必看说明:

'spy.', 'warezclient.'这两种攻击只存在于训练集中,17种攻击只存在于测试集中,分别为{'apache2.', 'httptunnel.', 'mailbomb.', 'mscan.', 'named.', 'processtable.', 'ps.', 'saint.', 'sendmail.', 'snmpgetattack.', 'snmpguess.', 'sqlattack.', 'udpstorm.', 'worm.', 'xlock.', 'xsnoop.', 'xterm.'}

大意:训练集独有两种攻击,测试集独有17种攻击。因此标签的数值化必须跟任务相匹配,如果是二分类,训练集和测试集可以统一编码,normal归一类,其他归一类。如果是多分类,那么你必须保证两者的标签类别一致才行,也就是必须要去除训练集和测试集中独有的标签数据。就是要去除2+17种类别的数据,只去除测试集中独有的17种类型应该也是可以,但感觉没必要。下面我们的处理就按照二分类和多分类两种类别处理数据。

41特征和class标签:['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class']

样本:0 'tcp' 'http' 'SF' 181 5450 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 8 8 0.0 0.0 0.0 0.0 1.0 0.0 0.0 9 9 1.0 0.0 0.11 0.0 0.0 0.0 0.0 0.0 'normal.'

0 duration: continuous.   
1 protocol_type: symbolic.  
2 service: symbolic. 
3 flag: symbolic.
4 src_bytes: continuous.
5 dst_bytes: continuous.
6 land: symbolic.
7 wrong_fragment: continuous.
8 urgent: continuous.
9 hot: continuous.
10 num_failed_logins: continuous.
11 logged_in: symbolic.
12 num_compromised: continuous.
13 root_shell: continuous.
14 su_attempted: continuous.
15 num_root: continuous.
16 num_file_creations: continuous.
17 num_shells: continuous.
18 num_access_files: continuous.
19 num_outbound_cmds: continuous.
20 is_host_login: symbolic.
21 is_guest_login: symbolic.
22 count: continuous.
23 srv_count: continuous.
24 serror_rate: continuous.
25 srv_serror_rate: continuous.
26 rerror_rate: continuous.
27 srv_rerror_rate: continuous.
28 same_srv_rate: continuous.
29 diff_srv_rate: continuous.
30 srv_diff_host_rate: continuous.
31 dst_host_count: continuous.
32 dst_host_srv_count: continuous.
33 dst_host_same_srv_rate: continuous.
34 dst_host_diff_srv_rate: continuous.
35 dst_host_same_src_port_rate: continuous.
36 dst_host_srv_diff_host_rate: continuous.
37 dst_host_serror_rate: continuous.
38 dst_host_srv_serror_rate: continuous.
39 dst_host_rerror_rate: continuous.
40 dst_host_srv_rerror_rate: continuous.

简单数据处理-第19列所有值都为0,所以该特征无用,删去。对非数值特征编码,然后对整个数据集特征进行归一化。(提示:这只是演示,实际这么数据处理肯定不合理。)

实验代码:

下面是二分类的代码,如果想做多分类实验,就将num_outputs设置为23,将标签重新数值化

import pandas as pd
import torch
import torchvision
import torch.nn as nn
import numpy as np
import torch.utils.data as Data
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Training hyperparameters.
epochs = 20
batch_size = 64
lr = 0.001


# The official data files were converted to CSV beforehand (no header row,
# so columns are addressed by integer position 0..41).
train_data = pd.read_csv('./data/KDD_cup99/train_10_percent.csv', header=None)
test_data = pd.read_csv('./data/KDD_cup99/test.csv', header=None)

# 数据简单处理:特征和标签数值化,特征归一化

# Drop the 17 attack classes that appear only in the test set, so the train
# and test sets share one label space, then stack them into a single frame
# (column 41 holds the class label).
known_labels = set(train_data[41])
test_data = test_data[test_data[41].isin(known_labels)]
data = pd.concat((train_data, test_data), ignore_index=True)


# Integer-encode the three symbolic feature columns
# (1: protocol_type, 2: service, 3: flag); the encoder is refitted per column.
le = preprocessing.LabelEncoder()
for col in (1, 2, 3):
    data[col] = le.fit_transform(data[col])

# Binary labels: 'normal.' -> 1, every attack type -> 0.
data[41] = (data[41] == 'normal.').astype('int64')

# Column 19 (num_outbound_cmds) is all zeros in this dataset, so it carries
# no information — drop it and renumber the remaining 41 columns 0..40.
del data[19]
data.columns = list(range(41))


# Min-max scale each of the 40 feature columns into [0, 1] as float32.
for i in range(40):
    col = data.loc[:, i]
    col_min, col_max = col.min(), col.max()
    if col_max == col_min:
        # Constant column: the plain formula would divide by zero and fill
        # the column with NaN — map it to 0.0 instead.
        data.loc[:, i] = np.float32(0.0)
    else:
        data.loc[:, i] = ((col - col_min) / (col_max - col_min)).astype('float32')

# 制作pytorch数据集和定义模型结构

#制作pytorch识别的数据集和定义模型
train_data, train_label = torch.Tensor(data.loc[:494021,:39].values), torch.Tensor(data.loc[:494021,40].values).long()
test_data, test_label = torch.Tensor(data.loc[494021:, :39,].values), torch.Tensor(data.loc[494021:, 40].values).long()

train_dataset = Data.TensorDataset(train_data, train_label)
test_dataset = Data.TensorDataset(test_data, test_label)

#制作Dataloder数据集,可迭代
train_loader = Data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = Data.DataLoader(test_dataset, batch_size=128)


#如果是用gpu,就用gpu训练
device = torch.device('cuda'if torch.cuda.is_available() else 'cpu')
#定义模型
num_inputs, num_hiddens, num_outputs = 40, 128, 2
net = nn.Sequential(
    nn.Linear(num_inputs, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, 2*num_hiddens),
    nn.ReLU(),
    nn.Linear(2*num_hiddens, num_outputs)
).to(device)

# 训练模型

#定义损失函数
loss = torch.nn.CrossEntropyLoss()
#定义优化器
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


#训练
def train():
    net.train()
    batch_loss, correct, total = 0.0,0.0 ,0.0
    for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            net.zero_grad()
            output = net(data)
            l = loss(output, label)
            l.backward()
            optimizer.step()
            
            predict_label = torch.argmax(output, dim=1)
            correct += torch.sum(predict_label==label).cpu().item()
            total +=len(label)
            batch_loss +=l.cpu().item()
            
    return correct/total, batch_loss/len(train_loader)

#绘图
def pltfigure(x,y,title,id, data):
    plt.subplot(2,2,id)
    plt.plot(range(len(data)),data)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)
    plt.show()
        
#测试            
def test():
    net.eval()
    batch_loss, correct, total = 0.0,0.0 ,0.0
    for data, label in test_loader:
        data, label = data.to(device), label.to(device)
        
        output = net(data)
        batch_loss +=loss(output, label).cpu().item()
        predict_label = torch.argmax(output, dim=1)
        correct += torch.sum(predict_label==label).cpu().item()
        total +=len(label)
        
    return correct/total, batch_loss/len(test_loader)

#主程序
def main():        
    print('training on: ',device)
    print('batch_size:',batch_size)
    print('epochs:',epochs)
    print('learning_rate:',lr)
    plt.figure()
    
    train_acc_list, train_loss_list, test_acc_list, test_loss_list = [],[],[],[]
    for epoch in range(epochs):
        train_acc, train_loss = train()
        test_acc, test_loss = test()

        print('epoch %d:  train acc: %.2f%% train loss:%.4f,  test acc: %.2f%%, test loss:%.4f'
              %(epoch, 100*train_acc, train_loss, 100*test_acc, test_loss))
        
        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)
        test_acc_list.append(test_acc)
        test_loss_list.append(test_loss)
    
#     #绘图
#     pltfigure(x='epoch', y='acc',  title='epoch-train_acc', id=1, data=train_acc_list)
#     pltfigure(x='epoch', y='loss', title='epoch-train_loss',id=2, data= train_loss_list)
#     pltfigure(x='epoch', y='acc',  title='epoch-test_acc',  id=3, data=test_acc_list)
#     pltfigure(x='epoch', y='loss', title='epoch-test_loss', id=4, data=test_loss_list)
    
main()

# 实验结果

Guess you like

Origin blog.csdn.net/stay_zezo/article/details/114700757