# 机器学习——简单的数据处理 (Machine learning: simple data processing)

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
# Data loading: file-system locations of the feature file and the label file.
features_path = 'C:\\Users\\zou\\Desktop\\features.dat'
label_path = 'C:\\Users\\zou\\Desktop\\labels.dat'
def data_load(path):
    """Load a comma-separated numeric text file into a NumPy array.

    Every non-blank line is split on commas and each field is converted
    to ``float``; the resulting rows are stacked into one array.

    Args:
        path: Location of the text file to read.

    Returns:
        numpy.ndarray: One row per non-blank line of the file. A file
        with no data lines yields an empty array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even when parsing raises.
    with open(path) as f:
        rows = [
            [float(field) for field in line.split(',')]
            for line in map(str.strip, f)
            if line  # skip blank lines, e.g. a trailing newline at EOF
        ]
    return np.array(rows)
# Only the first 2556 rows of each file are used.
data_x = data_load(features_path)[:2556]
label = data_load(label_path)[:2556]

# Inspect missing-value information.
temp_data = pd.DataFrame(data_x)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
temp_data.info()

# Inspect class balance: ratio of samples labelled 1 to all remaining samples.
label_1 = np.sum(label[label == 1])
label_0 = len(label) - label_1
print(label_1/label_0)

#建立模型的类
class Model(object):
    """Hold one train/test split and fit one of several classifiers on it.

    Each public method fits a fresh model on ``train_x``/``train_y`` and
    returns the predicted labels for ``test_x``.
    """

    def __init__(self, train_x, train_y, test_x):
        """Store the split.

        Args:
            train_x: Training features.
            train_y: Training labels.
            test_x: Features to predict on.
        """
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x

    def svm(self):
        """Fit an RBF-kernel SVC (C=0.8, gamma=20) and predict ``test_x``.

        Returns:
            The labels predicted by the fitted classifier.
        """
        # Inside the method body ``svm`` resolves to the module-level
        # ``sklearn.svm`` import, not to this method.
        clf = svm.SVC(C=0.8, kernel='rbf', gamma=20)
        clf.fit(self.train_x, self.train_y)
        return clf.predict(self.test_x)

    def rfc(self):
        """Fit a random forest and predict ``test_x``.

        Returns:
            The labels predicted by the fitted forest.
        """
        rfc_model = RandomForestClassifier(
            n_estimators=30,
            max_features=10,
            max_depth=10,
            min_samples_split=20,
        )
        rfc_model.fit(self.train_x, self.train_y)
        return rfc_model.predict(self.test_x)

    def xgboost(self):
        """Fit the boosted-tree model with the fixed sampling ratios 0.75/0.75.

        Returns:
            numpy array of 0/1 labels (probability thresholded at 0.5).
        """
        return self._xgb_predict(subsample=0.75, colsample_bytree=0.75)

    def xgboost_3d(self, subsample=0.1, colsample_bytree=0.1):
        """Fit the boosted-tree model with caller-chosen sampling ratios.

        Args:
            subsample: Value passed as the xgboost ``subsample`` parameter.
            colsample_bytree: Value passed as the xgboost
                ``colsample_bytree`` parameter.

        Returns:
            numpy array of 0/1 labels (probability thresholded at 0.5).
        """
        return self._xgb_predict(subsample, colsample_bytree)

    def _xgb_predict(self, subsample, colsample_bytree):
        """Train xgboost for 500 rounds and return thresholded predictions.

        Shared implementation behind ``xgboost`` and ``xgboost_3d``, which
        differ only in the two sampling ratios.
        """
        dtrain = xgb.DMatrix(self.train_x, label=self.train_y)
        dtest = xgb.DMatrix(self.test_x)
        # NOTE(review): 'silent' appears to be deprecated in newer xgboost
        # releases in favour of 'verbosity' -- confirm against the installed
        # version before changing it.
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'mae',
            'max_depth': 8,
            'lambda': 5,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'min_child_weight': 2,
            'eta': 0.025,
            'silent': 1,
        }
        bst = xgb.train(params, dtrain, num_boost_round=500)
        probabilities = bst.predict(dtest)
        # Convert the predicted scores into hard 0/1 labels.
        return np.int64(probabilities >= 0.5)

# Build and evaluate the model with 5-fold cross-validation.
kf = KFold(n_splits = 5,shuffle = True)


rfc_auc = 0
xgb_auc = 0
# Running total of the per-fold SVM AUC; averaged over the folds after the
# loop. (Overwriting one variable per fold would report only the last fold.)
svm_auc_sum = 0
for train_index,test_index in kf.split(data_x):
    train_x,test_x = data_x[train_index],data_x[test_index]
    train_y,test_y = label[train_index],label[test_index]
    # Flatten the label arrays to 1-D before fitting and scoring.
    train_y = train_y.reshape(train_y.shape[0],)
    test_y = test_y.reshape(test_y.shape[0],)
    model = Model(train_x,train_y,test_x)
    svm_predict = model.svm()
    # The random-forest and xgboost runs are currently disabled.
    #rfc_predict = model.rfc()
    #xgb_predict = model.xgboost()
    fold_auc = roc_auc_score(test_y,svm_predict)
    svm_auc_sum += fold_auc
    # Fraction of correct predictions in this fold (not the raw count).
    print('accuracy:',np.mean(test_y==svm_predict))
    print('auc:',fold_auc)
    #rfc_auc += roc_auc_score(test_y,rfc_predict)
    #xgb_auc += roc_auc_score(test_y,xgb_predict)
# Divide by the splitter's own fold count instead of a hard-coded 5.
n_folds = kf.get_n_splits()
svm_auc = svm_auc_sum/n_folds
#rfc_auc = rfc_auc/n_folds
#xgb_auc = xgb_auc/n_folds
print('svm_auc:%f'%svm_auc)

# Disabled code kept inside a bare string literal; none of it runs.
# Part 1: grid search over the xgboost `subsample` / `colsample_bytree`
#         values, scoring each pair by the mean AUC over the folds of `kf`
#         and drawing the result as a 3-D surface.
# Part 2: a stand-alone 3-D surface plot of sin(sqrt(X**2 + Y**2)).
# NOTE(review): `auc` is filled with subsample on the outer loop (rows) and
# colsample_bytree on the inner loop (columns), while np.meshgrid's default
# indexing varies its first argument along columns -- the surface looks
# transposed relative to its axes; confirm before re-enabling.
# NOTE(review): `fig.gca(projection='3d')` may not be accepted by recent
# matplotlib releases -- confirm against the installed version.
'''
#网格搜索法
fig = plt.figure()
ax = fig.gca(projection='3d')
subsample_list = np.linspace(0.01,1,10)
colsample_bytree_list = np.linspace(0.01,1,10)
#auc = np.array(pd.read_csv('result.csv',header = None,index_col = False).iloc[:,1:])

auc = []
for i in subsample_list:
    auc_1 = []
    for j in colsample_bytree_list:
        xgb_auc = 0
        for train_index,test_index in kf.split(data_x):
            train_x,test_x = data_x[train_index],data_x[test_index]
            train_y,test_y = label[train_index],label[test_index]
            train_y = train_y.reshape(train_y.shape[0],)
            test_y = test_y.reshape(test_y.shape[0],)
            model = Model(train_x,train_y,test_x)
            xgb_predict = model.xgboost_3d(i,j)
            xgb_auc += roc_auc_score(test_y,xgb_predict)
        xgb_auc = xgb_auc/5
        auc_1.append(xgb_auc)
    auc.append(auc_1)
    print('i =',i)

subsample_list,colsample_bytree_list = np.meshgrid(subsample_list,colsample_bytree_list)
xgb_auc_result = np.array(auc)
surf = ax.plot_surface(subsample_list,colsample_bytree_list,xgb_auc_result,cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
ax.set_zlim(0.2, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()  


import numpy as np


fig = plt.figure()
ax = fig.gca(projection='3d')

# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.set_zlim(-1.01, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()  
'''
# 机器学习——简单的数据处理 (Machine learning: simple data processing)

# 猜你喜欢 (You may also like)