恶意代码的分类

首先，从权限文本中提取出特征。

# Walk every decoded benign sample's manifest and collect its permission tokens.
f = glob.iglob(r'C:/project/ML/train_ben/*/AndroidManifest.xml')
i = 0
for manifest_path in f:
    manifest_root = ET.parse(manifest_path).getroot()
    # t carries the features of the current sample: for each
    # <uses-permission> element, every attribute value is reduced to the
    # text after its last '.'.
    # NOTE(review): all attributes of the element are taken, not only the
    # permission name, so any extra attribute value would also become a
    # feature token -- confirm this is intended.
    t = [
        attr_value.split('.')[-1]
        for node in manifest_root.iter('uses-permission')
        for attr_value in node.attrib.values()
    ]
    # Grow the global vocabulary and keep this sample's own token list.
    all_attr.extend(t)
    x1_attr[i] = t
    i += 1

接着，处理特征。

# Encode the permission features.
# Fit a label binarizer on every permission token collected so far.
lb = preprocessing.LabelBinarizer()
lb.fit(all_attr)

# Width of one encoded row, measured from the fitted encoder instead of being
# hard-coded: it depends on how many distinct tokens were seen (and
# LabelBinarizer emits a single column when only two classes exist).
n_features = lb.transform(lb.classes_[:1]).shape[1]

# NOTE(review): 1000 is the assumed number of samples held in x1_attr --
# confirm it matches the data set size.
for i in range(1000):
    try:
        # One binarized row per token; summing over rows collapses them into
        # a single per-sample count vector.
        t = lb.transform(x1_attr[i])
        x1_attr[i] = np.sum(t, axis=0)
    except Exception:
        # Best-effort fallback: a sample that cannot be encoded (for example
        # one without any permission entries) becomes an all-zero vector.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate.
        x1_attr[i] = np.zeros(n_features)

然后，合并数据集。

# Build the label columns and stack both classes into one data set.
# Benign ("ben") samples are labelled 0, malicious ("mal") samples 1;
# each label array is a (1000, 1) column.
y1 = np.zeros(shape=(1000, 1))
y2 = np.ones(shape=(1000, 1))
# Stack vertically: benign rows first, malicious rows after them.
y = np.concatenate((y1, y2), axis=0)
x = np.vstack([x1_attr, x2_attr])

接着划分训练集与测试集，并进行数据降维。

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

# Dimensionality reduction: keep the top 25% of features ranked by chi2.
# NOTE(review): the selector is fitted on, and applied to, the full `x` only;
# the x_train / x_test arrays produced above are left at full width --
# confirm whether they were meant to be reduced as well.
slc = SelectPercentile(chi2, percentile=25)
slc.fit(x, y)
x = slc.transform(x)

最后扔进模型做测试。

# Random-forest model: 100 trees, trained on all available cores (n_jobs=-1).
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
# Labels are flattened to 1-D before fitting: passing an (n, 1) column vector
# makes scikit-learn emit a DataConversionWarning.
clf.fit(x_train, np.ravel(y_train))
# Report the classifier's score on the held-out split.
print('clf.score = ', clf.score(x_test,y_test))

发现特征太多可能过拟合，于是先通过随机森林的特征重要性选择部分重要的特征。

# Rebuild the full feature matrix and labels (stacked vertically), then
# re-split with 35% of the samples held out.
x = np.vstack((x1_attr,x2_attr))
y = np.vstack((y1,y2))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=0)

# Rank features by random-forest importance.
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)
# Flatten the (n, 1) label column to 1-D to avoid a DataConversionWarning.
rfc.fit(x_train, np.ravel(y_train))

importances = rfc.feature_importances_
# Column indices sorted from most to least important.
indices = np.argsort(importances)[::-1]

# Tunable: how many of the top-ranked features to keep (capped at the number
# of available columns).
n_selected = min(70, x.shape[1])
tmp = list(indices[:n_selected])
# Only column indices are known here, so the selected "names" are recorded as
# those indices.
selected_feat_names = list(tmp)
print(len(selected_feat_names), "features are selected")

# Bar chart of the selected features' importances. Only the selected columns
# are drawn so that tick positions and tick labels have the same length
# (a mismatch makes matplotlib raise).
plt.title("Feature Importance")
plt.bar(range(n_selected),
        importances[tmp],
        color='lightblue',
        align='center')
plt.xticks(range(n_selected),
           tmp,
           rotation=90)
plt.xlim([-1, n_selected])
plt.tight_layout()
plt.show()


# Score of the importance-ranking forest on the held-out split.
print(rfc.score(x_test,y_test))

最后提交txt分类结果。

# Predict labels for the submission set and dump them as
# "<row index>\t<label>" lines.
flag = clf.predict(x_test_attr)
# The context manager guarantees the file is closed even if a write fails.
with open('C:/project/result.txt','w+') as fp:
    for i, label in enumerate(flag):
        fp.write(str(i)+'\t'+str(int(label))+'\n')

猜你喜欢