Hands-on: Training a Fraud-Detection Model with the Logistic Regression Model from Python's sklearn Library --- Building the Model

This case study builds a model on an imbalanced dataset of roughly 280,000 records to detect fraudulent users. Logistic regression passes its linear score through the sigmoid function to produce a probability (the model itself is fit by minimizing log loss). Model selection uses cross-validation with L1 regularization, comparing the recall achieved under different penalty parameters, and confusion matrices of predicted versus actual values give a direct view of each kind of prediction outcome.

The model's precision and recall are also compared at different thresholds applied to the sigmoid output.
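To make the thresholding idea concrete: logistic regression scores each sample with a linear function and squashes the score through the sigmoid, sigmoid(z) = 1/(1 + e^(-z)), into a probability in (0, 1); a threshold on that probability then decides the class. A minimal sketch of the idea (illustration only; the score value is made up):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

score = 0.8                 # hypothetical linear score w·x + b
prob = sigmoid(score)       # probability of the positive (fraud) class
print(int(prob > 0.5))      # 0.5 is the default threshold; lowering it raises recall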

A sample of the data format is shown below, and the code explains in detail how and why each step is done. Note that some library versions may raise errors.

Data format (sample screenshot not shown):

Python source code:

#!/usr/bin/env python
# encoding: utf-8
"""
@Company: Institute of Fusion and Plasma Research, School of Electrical Engineering, Huazhong University of Science and Technology
@version: V1.0
@author: Victor
@contact: [email protected] or [email protected] 2018--2020
@software: PyCharm
@file: LG.py
@time: 2018/11/16 16:32
@Desc:
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv("creditcard.csv")
data.head()
##In this case Class is the label: 1 means the user is fraudulent, 0 means a normal user

count_classes = data['Class'].value_counts(sort=True).sort_index()##count each distinct value in the Class column and order by class label
print(count_classes)
plt.figure(1)
count_classes.plot(kind='bar')###use pandas' built-in plot directly to draw the bar chart
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

from sklearn.preprocessing import StandardScaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))##Amount's scale is far larger than the other features', which would bias the model, so standardize it
data = data.drop(['Time', 'Amount'], axis=1)
data.head(3)

X = data.loc[:, data.columns != 'Class']##all columns except Class
y = data.loc[:, data.columns == 'Class']
#X.head()
#y.head()
number_records_fraud = len(data[data.Class == 1])##number of rows with Class == 1
fraud_indices = np.array(data[data.Class == 1].index)###row indices of all fraud records in the original data
##print(fraud_indices)
normal_indices = data[data.Class == 0].index

##Undersampling: cut the majority class down so both classes are equally small
random_normal_indices = np.random.choice(normal_indices,number_records_fraud,replace=False)
#print(random_normal_indices)
random_normal_indices = np.array(random_normal_indices)##convert to an array for easier use
#print(random_normal_indices)

##merge the equally sized sets of fraud (Class 1) and normal (Class 0) indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
#print(under_sample_indices)
##select the actual rows by these indices
under_sample_data = data.iloc[under_sample_indices,:]
under_sample_data.head()

###split the undersampled data into features and labels
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))##输入总的样本数

##Train/test split for building the model
from sklearn.model_selection import train_test_split ##(lived in sklearn.cross_validation in older sklearn versions)

##whole dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)##split the original data: 30% for testing, 70% for model building

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))


##Undersampled dataset
X_train_undersample,X_test_undersample,y_train_undersample,y_test_undersample = train_test_split(X_undersample,
                                                                                                 y_undersample,
                                                                                                 test_size=0.3,
                                                                                                 random_state=0)
print("==============================================")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))


##Recall=TP/(TP+FN)
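##Worked example with made-up numbers: if 90 frauds are caught (TP = 90) and 10 are
##missed (FN = 10), recall = 90/(90+10) = 0.9, i.e. 90% of the true frauds are found.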
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report



###Cross-validation
def printing_Kfold_scores(x_train_data, y_train_data):
    ##split the training set into 5 folds; each fold in turn serves as the validation set,
    ##and the scores are averaged
    fold = KFold(n_splits=5, shuffle=False)

    ##candidate penalty parameters: the regularization strength strongly affects how the
    ##coefficients fluctuate, so a penalty term C is needed to control them
    c_param_range = [0.01, 0.1, 1, 10, 100]

    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])  ##store the results
    results_table['C_parameter'] = c_param_range
    # print(results_table)

    ### each k-fold split gives 2 index lists: train_indices = indices[0], test_indices = indices[1]
    j = 0  ##row counter into results_table
    ##run the cross-validation loop for every penalty parameter
    for c_param in c_param_range:
        print("=====================")
        print("current c parameter:", c_param)
        print("=====================")
        print("\n")

        recall_accs = []  ##recall scores for this penalty parameter

        #####enumerate the folds starting from 1 (the default is 0)
        for iteration, indices in enumerate(fold.split(x_train_data), start=1):
            ###iteration is the fold number; indices is the (train, validation) index pair
            # >>> seasons = ['Spring', 'Summer', 'Fall', 'Winter']
            # >>> list(enumerate(seasons))
            # [(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
            # >>> list(enumerate(seasons, start=1))       # index starts from 1
            # [(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]

            ##instantiate logistic regression with the current penalty parameter
            ##lr is the model instance; penalty selects the regularization norm
            ##(the L1 penalty requires the liblinear solver)
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            ##train the model
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            ##evaluate the fitted model on the validation fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            ##recall_score computes the recall; collect the result
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print("iteration:", iteration, "  recall score:", recall_acc)

        ##mean recall for this penalty parameter
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)  ##store it in row j, column 'Mean recall score'
        j += 1
        print('=======')
        print("Mean recall score:", np.mean(recall_accs))
        print("=============================")

    ##the table now holds the mean recall for each penalty parameter;
    ##the best C is the one with the highest mean recall
    return results_table


############Cross-validation on the undersampled data#############
results_table = printing_Kfold_scores(X_train_undersample,y_train_undersample)
print(results_table)
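##One way to read the best penalty parameter off the returned table (a sketch; the
##recall column holds Python objects, hence the cast to float):
#best_c = results_table.loc[results_table['Mean recall score'].astype(float).idxmax(), 'C_parameter']
#print("Best C:", best_c)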
#print("**********************************")
############Cross-validation on the original dataset########
###recall there is very, very low
#results_table1 = printing_Kfold_scores(X_train,y_train)
#print(results_table1)


##############Build the confusion matrix from the model's predictions##############
import itertools
def plot_confusion_matrix(cm, classes,title='Confusion matrix',cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],horizontalalignment="center",color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


###Fit the model and use its predictions to draw the confusion matrix
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)###predict returns the class labels 0/1 directly

# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)##confusion matrix of true vs. predicted labels
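##sklearn's confusion_matrix puts true labels on rows and predicted labels on columns,
##i.e. [[TN, FP], [FN, TP]] here, so cnf_matrix[1,1] is TP and cnf_matrix[1,0] is FN.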
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure(2)
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
#plt.show()

#####The threshold applied to the sigmoid output has a large effect on the predictions,
#####so look at the recall and precision at each threshold to find the most suitable one,
#####using the confusion matrices
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
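##predict_proba returns one probability column per class; column 1 is P(Class == 1),
##which is compared against each candidate threshold below instead of the default 0.5.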

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))

j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix
                          , classes=class_names
                          , title='Threshold >= %s' % i)

plt.show()

'''Oversampling: make the imbalanced classes equally large, usually with the SMOTE algorithm
   (pick a point among the minority-class samples, compute its distances to the other minority
   samples, and interpolate along the sorted neighbours to generate new synthetic points).
   Oversampling can lower recall while raising precision and lowering the false-alarm rate.'''
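##A rough sketch of the interpolation idea described above (illustration only, not the
##imblearn implementation): a synthetic sample lies on the segment between a minority
##point x and one of its minority-class nearest neighbours x_neighbor.
def smote_sketch(x, x_neighbor):
    gap = np.random.rand()                # random interpolation factor in [0, 1)
    return x + gap * (x_neighbor - x)     # new synthetic minority sample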
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards=pd.read_csv('creditcard.csv')

columns=credit_cards.columns
# The labels are in the last column ('Class'). Simply remove it to obtain features columns
features_columns=columns.delete(len(columns)-1)

features=credit_cards[features_columns]
labels=credit_cards['Class']

features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                            labels,
                                                                            test_size=0.2,
                                                                            random_state=0)
oversampler=SMOTE(random_state=0)
os_features,os_labels=oversampler.fit_resample(features_train,labels_train)##(fit_sample in older imblearn versions)

os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(os_features,os_labels)

lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(os_features,os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test,y_pred)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()



There are many result figures, so they are not shown here.


Reposted from blog.csdn.net/qq_25948717/article/details/84145109