import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Load the credit-card transactions dataset from the working directory.
data = pd.read_csv("creditcard.csv")
# Preview the first five rows (rendered by the notebook).
data.head()
# The data has 31 columns: Time, V1-V28, Amount and Class. The last column,
# Class, is the label: 0 marks a normal transaction, 1 marks a fraudulent one.
# As a first step, plot the class distribution to see how rare fraud is.

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

5 rows × 31 columns

# Resampling is one way to handle an imbalanced label distribution. Two
# common approaches: under-sampling (e.g. EasyEnsemble) and over-sampling
# (e.g. SMOTE).
# First check how far apart the positive and negative label counts are.

data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

在这里插入图片描述

# Rows per class, ordered by class label (0 first, then 1), drawn as bars.
class_column = data['Class']
count_classes = class_column.value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

Text(0,0.5,'Frequency')

在这里插入图片描述

可以看到Class=0的数据大概有28W，欺诈数据Class=1极少，极度不均匀的分布状态。
通常有两种处理方法：

过采样（让1变得和0一样多）；
下采样（在0中取出部分数据，数量与1一致）

标准化

在这里插入图片描述
在特征数据中，Amount与其他特征数据的取值范围相比，太大了，应该是还没有标准化。所以，需要先对这一列进行标准化：

from sklearn.preprocessing import StandardScaler
# StandardScaler is the z-score method: it rescales a column to zero mean and
# unit variance. The scaled values are stored in a new 'normAmount' column.
# The scaler expects a 2-D array, so reshape the underlying NumPy values;
# calling reshape directly on the Series is deprecated in pandas.
amount_2d = data['Amount'].values.reshape(-1, 1)
# ravel() flattens the (n, 1) result back to one value per row.
data['normAmount'] = StandardScaler().fit_transform(amount_2d).ravel()
# Drop the raw Amount column (superseded by normAmount) and the Time column,
# leaving a new frame with only V1-V28, Class and normAmount.
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  after removing the cwd from sys.path.

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V21	V22	V23	V24	V25	V26	V27	V28	normAmount
0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	0.244964
1	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	-0.342475
2	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	1.160686
3	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	0.140534
4	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	-0.073403

5 rows × 30 columns

随机下采样

下采样相对简单，所以我们先进行下采样。现在，分别取出特征和标签：

# Separate the frame into the feature matrix X (every column except 'Class')
# and the label frame y (only the 'Class' column).
# .loc with a boolean column mask replaces the deprecated .ix indexer.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated

为了保证拿到的是数据的原始分布，我们采用的是随机的下采样：

# Random under-sampling: keep every fraud row and pair it with an equally
# sized random sample of normal rows, producing a balanced dataset.

# Index labels of the minority (fraud) class, and how many there are.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of the majority (normal) class.
normal_indices = data[data.Class == 0].index

# Draw as many normal rows as there are fraud rows, without replacement.
# np.random.choice already returns an ndarray, so no extra conversion is
# needed. The draw is unseeded, so the selected rows differ between runs.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Join the two 1-D arrays of index labels end to end.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The arrays hold index *labels*, so select rows with label-based .loc
# (positional .iloc only gives the same rows while the index is 0..n-1).
under_sample_data = data.loc[under_sample_indices, :]

# Features and labels of the under-sampled frame (.loc replaces the
# deprecated .ix indexer).
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class ratio; both classes should make up 0.5 of the sample.
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  984


C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:39: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated

数据切分

将数据集切分为训练集和测试集：

# train_test_split is imported from sklearn.model_selection; the older
# sklearn.cross_validation module is deprecated and scheduled for removal.
from sklearn.model_selection import train_test_split

# Whole dataset: hold out 30% of the rows as a test set. random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

# Under-sampled dataset: same 70/30 split and the same random_state, applied
# to the balanced data.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample,
    y_undersample,
    test_size=0.3,
    random_state=0,
)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))

C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)


Number transactions train dataset:  199364
Number transactions test dataset:  85443
Total number of transactions:  284807

Number transactions train dataset:  688
Number transactions test dataset:  296
Total number of transactions:  984

模型效果评估

在这里插入图片描述

在建模之前，我们还要先考虑一下：选定哪些参数，用什么指标作为评估标准？

TP(true positives)：预测为真，实际为真

FN(false negatives)：预测为假，实际为真

FP(false positives)：预测为真，实际为假

TN(true negatives)：预测为假，实际为假

由于我们是要尽可能将所有信用卡欺诈的数据找出来，所以有个很重要的衡量标准：

召回率：Recall = TP/(TP+FN)

召回率有别于准确率：假设1000条信用卡数据中有10条是欺诈数据，召回率只关注这10条欺诈数据——如果模型找出了其中3条，那么召回率为 3/10 = 0.3。

建模

接下来就是建模了，很多时候我们也不知道参数设置为多少比较合适，所以最好的办法是写一个脚本让机器分别去跑，我们根据各个模型的结果再做选择，比较省心。

# Recall = TP/(TP+FN)
# 模型评估：对模型选择较合适的参数
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report

训练模型，实例化逻辑回归模型，指定不同的惩罚系数，利用交叉验证找到最合适的参数，打印每个结果

# Cross-validated search for the regularisation strength C.
def printing_Kfold_scores(x_train_data,y_train_data):
    """Pick the logistic-regression ``C`` with the best mean recall over 5 folds.

    For every candidate ``C`` in ``[0.01, 0.1, 1, 10, 100]`` an L1-penalised
    ``LogisticRegression`` is trained on four folds and scored with
    ``recall_score`` (Recall = TP / (TP + FN)) on the held-out fold. The
    per-fold recalls, their mean and the winning ``C`` are printed.

    The folds are contiguous blocks of rows in their existing order
    (``shuffle=False``).

    Args:
        x_train_data: Feature table (DataFrame or 2-D array-like), one row per
            sample.
        y_train_data: Binary labels for the same rows (single-column
            DataFrame, Series or array-like); flattened to 1-D internally.

    Returns:
        The candidate ``C`` (as a float) with the highest mean recall. On a
        tie the first, i.e. smallest, candidate wins.
    """
    # Imported here so the function relies on the model_selection KFold API
    # (n_splits + split()) regardless of which KFold is bound at module level.
    from sklearn.model_selection import KFold

    # Work on plain arrays so fit and predict always see the same input type
    # and the fold indices can be applied positionally.
    features = np.asarray(x_train_data)
    labels = np.asarray(y_train_data).ravel()

    # 5-fold cross-validation over the rows in their current order.
    fold = KFold(n_splits=5, shuffle=False)

    # Candidate values of C, the inverse regularisation strength.
    c_param_range = [0.01,0.1,1,10,100]

    # Mean recall per candidate, in the same order as c_param_range.
    mean_recalls = []

    # Outer loop: one pass per candidate C.
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        # Inner loop: each split yields (train positions, validation
        # positions); iteration numbering starts at 1 for the printout.
        for iteration, (train_idx, test_idx) in enumerate(fold.split(features), start=1):
            # liblinear is named explicitly because it supports the L1
            # penalty; not every solver does.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Train on four folds, predict the held-out fold.
            lr.fit(features[train_idx], labels[train_idx])
            y_pred = lr.predict(features[test_idx])

            # Recall on the held-out fold: share of true positives found.
            recall_acc = recall_score(labels[test_idx], y_pred)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # The mean over the five folds is the score for this C.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # Build the table from complete numeric columns so idxmax works without
    # any dtype conversion afterwards.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': mean_recalls,
    })
    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    # Finally, report which C parameter is the best amongst the chosen.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c

可能遇到的错误如下图：
在这里插入图片描述
解决方案：https://stackoverflow.com/questions/48719937/getting-typeerror-reduction-operation-argmax-not-allowed-for-this-dtype-when

# Choose the regularisation strength C by cross-validated recall on the
# under-sampled training set.
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)

-------------------------------------------
C parameter:  0.01
-------------------------------------------

Iteration  1 : recall score =  0.958904109589041
Iteration  2 : recall score =  0.9315068493150684
Iteration  3 : recall score =  1.0
Iteration  4 : recall score =  0.972972972972973
Iteration  5 : recall score =  0.9545454545454546

Mean recall score  0.9635858772845076

-------------------------------------------
C parameter:  0.1
-------------------------------------------

Iteration  1 : recall score =  0.8493150684931506
Iteration  2 : recall score =  0.863013698630137
Iteration  3 : recall score =  0.9152542372881356
Iteration  4 : recall score =  0.9459459459459459
Iteration  5 : recall score =  0.9090909090909091

Mean recall score  0.8965239718896557

-------------------------------------------
C parameter:  1
-------------------------------------------

Iteration  1 : recall score =  0.863013698630137
Iteration  2 : recall score =  0.8767123287671232
Iteration  3 : recall score =  0.9661016949152542
Iteration  4 : recall score =  0.9459459459459459
Iteration  5 : recall score =  0.9090909090909091

Mean recall score  0.9121729154698739

-------------------------------------------
C parameter:  10
-------------------------------------------



C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:76: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Iteration  1 : recall score =  0.863013698630137
Iteration  2 : recall score =  0.9178082191780822
Iteration  3 : recall score =  0.9830508474576272
Iteration  4 : recall score =  0.9459459459459459
Iteration  5 : recall score =  0.9090909090909091

Mean recall score  0.9237819240605403

-------------------------------------------
C parameter:  100
-------------------------------------------

Iteration  1 : recall score =  0.863013698630137
Iteration  2 : recall score =  0.9178082191780822
Iteration  3 : recall score =  0.9830508474576272
Iteration  4 : recall score =  0.9459459459459459
Iteration  5 : recall score =  0.9090909090909091

Mean recall score  0.9237819240605403

*********************************************************************************
Best model to choose from cross validation is with C parameter =  0.01
*********************************************************************************

由以上结果可以看到，C = 0.01 时平均召回率最高，约为0.96。
接下来，画一个更直观的混淆矩阵图出来

# Confusion-matrix heat map.
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw ``cm`` as an annotated heat map on the current matplotlib figure.

    Args:
        cm: 2-D confusion matrix; indexed as ``cm[row, col]`` and queried for
            ``shape`` and ``max()``.
        classes: Tick labels, used for both axes.
        title: Title of the plot.
        cmap: Colour map passed to ``plt.imshow``.
    """
    # Heat map with a colour scale beside it.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the rest black,
    # so the number stays readable on both dark and light cells.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            if count > midpoint:
                text_colour = "white"
            else:
                text_colour = "black"
            # x is the column (predicted label), y is the row (true label).
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

import itertools
# Fit the final model on the under-sampled training set using the C chosen by
# cross-validation. liblinear is named explicitly because it supports the L1
# penalty; not every solver does.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
# Hard class predictions at the default decision threshold.
y_pred_undersample = lr.predict(X_test_undersample.values)
# To tune the threshold instead, use predict_proba: it returns an n-by-k
# array whose entry (i, j) is the predicted probability that sample i belongs
# to class j; every row sums to 1.
# y_pred_undersample = lr.predict_proba(X_test_undersample.values)


# Compute the confusion matrix on the under-sampled test set.
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Row 1 holds the true fraud cases: recall = TP / (FN + TP).
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()

Recall metric in the testing dataset:  0.9319727891156463

在这里插入图片描述

一目了然的图，可以看到，137个真实的欺诈被模型找出来了，但是有10个漏网之鱼，同时有21个正常数据被误杀。Recall值能达到0.93，看起来挺高的，这就是我们要的结果吗？并非如此，这是用的下采样数据计算的混淆矩阵。

接下来，我们用原始数据画出混淆矩阵图，看看结果：

# Train on the under-sampled training set, then evaluate on the test split of
# the original, imbalanced data. liblinear is named explicitly because it
# supports the L1 penalty; not every solver does.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

# Compute the confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Row 1 holds the true fraud cases: recall = TP / (FN + TP).
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()

Recall metric in the testing dataset:  0.9115646258503401

![png](output_23_1.png)

这里我们能看到，模型出现一个很大的问题，误杀数量竟然达到了9576条，这无疑对业务产生了重大影响。为什么会出现这个问题呢？这是根据下采样模型得到的效果，而在下采样数据中，数据量太少，正常的少，异常的同样也少，样本是有局限的，出现这种情况也很正常。

那么如何解决这个问题呢？
如果我们一开始没有对数据进行任何预处理操作，我们能不能得到好的结果呢？

# Repeat the cross-validated search for C on the training split of the
# original, imbalanced data for comparison.
best_c = printing_Kfold_scores(X_train,y_train)

-------------------------------------------
C parameter:  0.01
-------------------------------------------

Iteration  1 : recall score =  0.4925373134328358
Iteration  2 : recall score =  0.6027397260273972
Iteration  3 : recall score =  0.6833333333333333
Iteration  4 : recall score =  0.5692307692307692
Iteration  5 : recall score =  0.45

Mean recall score  0.5595682284048672

-------------------------------------------
C parameter:  0.1
-------------------------------------------



C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:76: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Iteration  1 : recall score =  0.5671641791044776
Iteration  2 : recall score =  0.6164383561643836
Iteration  3 : recall score =  0.6833333333333333
Iteration  4 : recall score =  0.5846153846153846
Iteration  5 : recall score =  0.525

Mean recall score  0.5953102506435158

-------------------------------------------
C parameter:  1
-------------------------------------------

Iteration  1 : recall score =  0.5522388059701493
Iteration  2 : recall score =  0.6164383561643836
Iteration  3 : recall score =  0.7166666666666667
Iteration  4 : recall score =  0.6153846153846154
Iteration  5 : recall score =  0.5625

Mean recall score  0.612645688837163

-------------------------------------------
C parameter:  10
-------------------------------------------

Iteration  1 : recall score =  0.5522388059701493
Iteration  2 : recall score =  0.6164383561643836
Iteration  3 : recall score =  0.7333333333333333
Iteration  4 : recall score =  0.6153846153846154
Iteration  5 : recall score =  0.575

Mean recall score  0.6184790221704963

-------------------------------------------
C parameter:  100
-------------------------------------------

Iteration  1 : recall score =  0.5522388059701493
Iteration  2 : recall score =  0.6164383561643836
Iteration  3 : recall score =  0.7333333333333333
Iteration  4 : recall score =  0.6153846153846154
Iteration  5 : recall score =  0.575

Mean recall score  0.6184790221704963

*********************************************************************************
Best model to choose from cross validation is with C parameter =  10.0
*********************************************************************************

可以看到，直接用极度不均衡数据建模的话，效果都很差。所以对数据进行预处理是非常有必要的。
数据决定上限，参数决定下限。

我们还是先看看它的混淆矩阵结果：

# Train and evaluate on the original, imbalanced split. liblinear is named
# explicitly because it supports the L1 penalty; not every solver does.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train.values, y_train.values.ravel())
# NOTE: despite the name, these are predictions for the full test split; the
# variable name is kept unchanged from the earlier cells.
y_pred_undersample = lr.predict(X_test.values)

# Compute the confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)

# Row 1 holds the true fraud cases: recall = TP / (FN + TP).
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()

Recall metric in the testing dataset:  0.6190476190476191

在这里插入图片描述

从结果看到，误杀少了，但是很多欺诈数据没有找出来。

之前我们使用的是Sigmoid函数中默认的阈值：0.5，如果我们自己指定阈值，会对结果产生什么影响呢？

# Keep C = 0.01 fixed and vary the decision threshold instead.
# liblinear is named explicitly because it supports the L1 penalty; not every
# solver does.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
# Ask for class probabilities rather than hard labels; column 1 is used below
# as the predicted probability of the fraud class.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
# Thresholds to compare.
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize=(10,10))
np.set_printoptions(precision=2)
class_names = [0,1]

# One subplot per threshold, laid out on a 3x3 grid (positions start at 1).
for position, threshold in enumerate(thresholds, start=1):
    # A sample is flagged as fraud when its probability exceeds the threshold.
    y_test_predictions_high_recall = y_pred_undersample_proba[:,1] > threshold
    plt.subplot(3,3,position)

    # Compute the confusion matrix for this threshold.
    cnf_matrix = confusion_matrix(y_test_undersample,y_test_predictions_high_recall)

    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot the non-normalized confusion matrix. The title uses '>' to match
    # the strict comparison applied above.
    plot_confusion_matrix(cnf_matrix
                          , classes=class_names
                          , title='Threshold > %s'%threshold)

Recall metric in the testing dataset:  1.0
Recall metric in the testing dataset:  1.0
Recall metric in the testing dataset:  1.0
Recall metric in the testing dataset:  0.9727891156462585
Recall metric in the testing dataset:  0.9319727891156463
Recall metric in the testing dataset:  0.8571428571428571
Recall metric in the testing dataset:  0.8231292517006803
Recall metric in the testing dataset:  0.7278911564625851
Recall metric in the testing dataset:  0.5850340136054422

在这里插入图片描述

当阈值为0.1-0.3时，recall值为1：只要预测为欺诈的概率超过这个很低的阈值就会被判为欺诈，欺诈数据一个不漏，但被误杀的正常数据也最多，说明判定太过严苛。随着阈值越来越大，模型判定欺诈所需的把握越来越高，误杀减少，但漏掉的欺诈变多，recall随之下降。这里需要根据实际业务需求，权衡利弊，选定一个代价最低的模型。

过采样-SMOTE样本生成策略

既然下采样有局限性，误杀这么高，那过采样呢？

说到过采样，那么就有个问题，怎么生成数据呢？

在机器学习中，有这么个套路，即SMOTE样本生成策略：

其中k值为要翻的倍数，假设少数类样本为100，你想变成500，K就取5。对每个少数类样本x，先算x到其他少数类样本的距离，然后找出离它最近的5个样本；对每个近邻，将它与x的差值乘上一个0-1之间的随机数，再加上样本x本身，得到一条新数据。相当于对样本进行了微调的过程。
常见问题：1. Jupyter: No module named 'imblearn' after installation
解决方案：https://stackoverflow.com/questions/47606873/jupyter-no-module-named-imblearn-after-installation

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Reload the raw transactions for the over-sampling experiment.
credit_cards = pd.read_csv('creditcard.csv')

# The labels are in the last column ('Class'); every column before it is a
# feature column.
columns = credit_cards.columns
features_columns = columns[:-1]

labels = credit_cards['Class']
features = credit_cards[features_columns]

交叉验证

随机划分训练集和测试集：features为除了class之外的其他的值，labels为最终的结果列；

test_size:样本占比；

从原始集中获取到训练集与测试集：

train_test_split：features,labels按照test_size的尺寸随机提取数据，然后划分到四个数据集中

对全部数据集进行切分，注意使用相同的随机策略

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# Generate synthetic minority samples from the training split only; the test
# split is left untouched.
oversampler = SMOTE(random_state=0)
# imbalanced-learn renamed fit_sample to fit_resample; prefer the new name
# and fall back to the old one on releases that predate the rename.
if hasattr(oversampler, 'fit_resample'):
    os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
else:
    os_features, os_labels = oversampler.fit_sample(features_train, labels_train)

# Sanity check: after over-sampling, the minority class should have been
# filled up to the same count as the majority class.
len(os_labels[os_labels==1])

len(os_labels[os_labels==0])

os_features.shape

(454908, 30)

# Shape of the over-sampled label array, for comparison with os_features.
os_labels.shape

(454908,)

# Convert the over-sampled features and labels to DataFrames before running
# the cross-validation helper on them.
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
# Choose the regularisation strength C by cross-validated recall on the
# over-sampled training data.
best_c = printing_Kfold_scores(os_features,os_labels)

-------------------------------------------
C parameter:  0.01
-------------------------------------------

Iteration  1 : recall score =  0.8903225806451613
Iteration  2 : recall score =  0.8947368421052632
Iteration  3 : recall score =  0.968861347792409
Iteration  4 : recall score =  0.9577933854321232
Iteration  5 : recall score =  0.9575735593145822

Mean recall score  0.9338575430579077

-------------------------------------------
C parameter:  0.1
-------------------------------------------



C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:76: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Iteration  1 : recall score =  0.8903225806451613
Iteration  2 : recall score =  0.8947368421052632
Iteration  3 : recall score =  0.9690826601748368
Iteration  4 : recall score =  0.959683890042976
Iteration  5 : recall score =  0.9604312988426155

Mean recall score  0.9348514543621705

-------------------------------------------
C parameter:  1
-------------------------------------------

Iteration  1 : recall score =  0.8903225806451613
Iteration  2 : recall score =  0.8947368421052632
Iteration  3 : recall score =  0.9704547969458891
Iteration  4 : recall score =  0.9602224640309516
Iteration  5 : recall score =  0.9605192292896318

Mean recall score  0.9352511826033794

-------------------------------------------
C parameter:  10
-------------------------------------------

Iteration  1 : recall score =  0.8903225806451613
Iteration  2 : recall score =  0.8947368421052632
Iteration  3 : recall score =  0.9701228283722474
Iteration  4 : recall score =  0.9602994031720908
Iteration  5 : recall score =  0.9601675075015662

Mean recall score  0.9351298323592656

-------------------------------------------
C parameter:  100
-------------------------------------------

Iteration  1 : recall score =  0.8903225806451613
Iteration  2 : recall score =  0.8947368421052632
Iteration  3 : recall score =  0.9705654531371031
Iteration  4 : recall score =  0.9593101856431563
Iteration  5 : recall score =  0.9608709510776975

Mean recall score  0.9351612025216763

*********************************************************************************
Best model to choose from cross validation is with C parameter =  1.0
*********************************************************************************

看结果，与下采样对比，误杀比例明显小得多，也就是说，当我们用过采样策略，模型效果最好。

# Build the logistic-regression model with the best C found above, using the
# L1 penalty. liblinear is named explicitly because it supports the L1
# penalty; not every solver does.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Train on the SMOTE over-sampled training set. Fit and predict on plain
# arrays so both calls see the same input type.
lr.fit(os_features.values, os_labels.values.ravel())
# Predict the untouched test split held out before over-sampling.
y_pred = lr.predict(features_test.values)

# Compute the confusion matrix.
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Row 1 holds the true fraud cases: recall = TP / (FN + TP).
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()

Recall metric in the testing dataset:  0.900990099009901

在这里插入图片描述

唐宇迪机器学习实战——交易数据异常检测

标准化

随机下采样

数据切分

模型效果评估

建模

过采样-SMOTE样本生成策略

交叉验证

猜你喜欢