Logistic Regression
Logistic regression is a classic binary-classification algorithm; when handed a classification task, it is usually worth trying logistic regression first as a baseline.
In short, logistic regression feeds a linear combination of the features through a non-linear squashing function to produce a value between 0 and 1, then classifies by comparing that value against a threshold.
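That one-sentence description can be sketched in a few lines (the weights, bias, and feature vector below are made up for illustration; the squashing function is the standard sigmoid):

```python
import numpy as np

def sigmoid(z):
    # squashes any real number into the open interval (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

# hypothetical weights w, bias b, and one feature vector x
w = np.array([0.8, -1.5])
b = 0.2
x = np.array([1.0, 0.5])

p = sigmoid(w @ x + b)       # predicted probability of class 1
label = 1 if p > 0.5 else 0  # classify by comparing against a 0.5 threshold
```

Training amounts to fitting `w` and `b`; prediction is just this computation plus the threshold comparison.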
1. Reading the data: a first look
```python
# The three staples of data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('creditcard.csv')

print(data.shape)       # (284807, 31)
print(data.describe())

data.head()             # note: renders differently from print(data.head()) in a notebook
```
1. In this file, the first column is Time (unique per row, like a primary key) and V1-V28 are anonymized feature columns.
2. The values in the Amount column are on a very different scale from the rest, so it may need preprocessing.
3. Class is the sample label: 0 = normal, 1 = fraud. The ratio between the two classes matters a great deal for modelling.
2. Data preprocessing
2.1 Counting normal vs. fraud samples: pd.value_counts
```python
count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
print(count_classes)

count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('class')
plt.ylabel('frequency')
```
Conclusion: the two classes are highly imbalanced. There are two ways to deal with this during training: undersampling (keep only as many normal samples as there are fraud samples) or oversampling (generate fraud samples until they are as numerous as the normal ones).
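The two options can be illustrated on toy counts (the 95/5 split below is made up; the real dataset is far more skewed):

```python
import numpy as np
import pandas as pd

# a toy imbalanced label column: 95 normal (0), 5 fraud (1)
labels = pd.Series([0] * 95 + [1] * 5)

# undersampling: keep all 5 frauds, draw 5 random normals
normal_idx = labels[labels == 0].index
fraud_idx = labels[labels == 1].index
keep_normal = np.random.choice(normal_idx, len(fraud_idx), replace=False)
under = labels.loc[np.concatenate([fraud_idx, keep_normal])]

# oversampling (naive duplication): resample frauds until both classes have 95
over = pd.concat([labels[labels == 0],
                  labels[labels == 1].sample(95, replace=True, random_state=0)])
```

Undersampling throws away most of the majority class; naive oversampling duplicates minority rows (SMOTE, used later, synthesizes new ones instead).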
2.2 Standardizing the Amount column
Because the scale of a feature affects training, this step is worth doing. That said, standardization is not mandatory; sometimes the unscaled values actually fit the problem better, but that is a discussion for later. Let's see how to standardize, and drop the Time column, which is irrelevant to the model, while we are at it.
```python
from sklearn.preprocessing import StandardScaler

data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
data.head()
```
Note: you have to call values.reshape() on the Series, otherwise it raises an error! The code in the original video did not have this, so watch out.
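Standardization itself is just (x − mean) / std; a numpy-only sketch of what StandardScaler computes on a toy column (population std, ddof=0, as sklearn uses):

```python
import numpy as np

amount = np.array([1.0, 2.0, 3.0, 4.0])  # toy "Amount" values
norm_amount = (amount - amount.mean()) / amount.std()  # ddof=0, like StandardScaler

# the result has zero mean and unit variance, regardless of the original scale
```

The reshape in the real code is needed only because StandardScaler expects a 2-D array of shape (n_samples, n_features), not a 1-D Series.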
3. Building the model
3.1 Training the model on undersampled data
1. Undersample to obtain a class-balanced sample for training
```python
# Undersampling: balance comes at the cost of discarding samples
# Without undersampling, X and y would simply be:
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Undersampling
number_records_fraud = len(data[data['Class'] == 1])   # number of fraud samples
fraud_indices = np.array(data[data.Class == 1].index)  # row indices of the fraud samples (numpy.ndarray)
normal_indices = data[data.Class == 0].index           # row indices of the normal samples (Int64Index)

# Randomly draw as many normal samples as there are fraud samples
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)  # back to ndarray so it concatenates cleanly

under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])  # indices of both classes
under_sample_data = data.iloc[under_sample_indices, :]   # pick the training samples by index

X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']  # features
Y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']  # labels
```
2. Split the data into training and test sets

```python
# Build the training and test sets
from sklearn.cross_validation import train_test_split  # sklearn >= 0.20: sklearn.model_selection

# Also split the full data, to compare undersampling against no resampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# 30% of the data is held out for testing; fixing random_state makes the split reproducible

X_train_undersample, X_tes_undersamplet, y_train_undersample, y_test_undersample = \
    train_test_split(X_under_sample, Y_under_sample, test_size=0.3, random_state=0)
```
3. Instantiate the model
(Cross-validate, instantiate the model, fit it on the data, evaluate it with a score.)
Cross-validation also lets us compare different parameter settings and pick the most suitable one. In this example, the parameter C controls the regularization penalty; the idea is that among settings with similar recall_score, we prefer the one whose fold-to-fold scores fluctuate least, since it generalizes better and is less prone to overfitting.
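Before the cross-validation loop, it helps to see what C actually does. In scikit-learn, C is the inverse of the regularization strength, so a small C penalizes the weights harder, and with the L1 penalty it drives more of them to exactly zero. A sketch on synthetic data (not the credit-card set; the `liblinear` solver is assumed because it supports L1):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_toy = rng.randn(200, 10)
# only the first two features actually determine the label; the rest are noise
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 0).astype(int)

strong = LogisticRegression(C=0.01, penalty='l1', solver='liblinear').fit(X_toy, y_toy)
weak = LogisticRegression(C=100, penalty='l1', solver='liblinear').fit(X_toy, y_toy)

# the strongly regularized model has at least as many exactly-zero coefficients
sparsity_strong = int((strong.coef_ == 0).sum())
sparsity_weak = int((weak.coef_ == 0).sum())
```

Cross-validation then picks the C that best trades off this shrinkage against fitting the data.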
```python
# Build the model with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score  # sklearn >= 0.20 moved these to sklearn.model_selection
from sklearn.metrics import confusion_matrix, recall_score, classification_report

def print_kfold_scores(x_train_data, y_train_data):
    # number of samples, 5 folds, no shuffling
    fold = KFold(len(y_train_data), 5, shuffle=False)

    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate model parameters

    # A new DataFrame (like a table) whose columns are the parameter value
    # and the mean recall it achieves
    result_table = pd.DataFrame(index=range(len(c_param_range)),
                                columns=['C_parameter', 'Mean recall score'])
    result_table['C_parameter'] = c_param_range

    j = 0
    for c_param in c_param_range:
        # train one model per candidate parameter and record its recall
        print('==========================')
        print('C parameter:', c_param)
        print('--------------------------')

        recall_accs = []
        for iteration, indices in enumerate(fold, start=1):  # cross-validation folds
            lr = LogisticRegression(C=c_param, penalty='l1')  # instantiate the model

            # indices[0]: training rows of this fold, indices[1]: validation rows
            lr.fit(x_train_data.iloc[indices[0], :],
                   y_train_data.iloc[indices[0], :].values.ravel())
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values,
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, ': recall score =', recall_acc)
            # 5 folds, so 5 intermediate results per parameter

        result_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)  # mean recall for this parameter
        j += 1
        print('')
        print('Mean recall score', np.mean(recall_accs))
        print('')

    best_c = result_table.iloc[result_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']

    print('**************************************')
    print('best model to choose from cross validation is with C parameter =', best_c)
    print('**************************************')
    return best_c

best_c = print_kfold_scores(X_train_undersample, y_train_undersample)
```
Note: this differs slightly from the code in the course video. In the line that computes best_c, if you do not cast with astype('float64') first, you get the error: reduction operation 'argmax' not allowed for this dtype.
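The fix can be reproduced on a tiny table (toy values, mirroring how result_table is filled above): a column created empty and then filled inside a loop ends up with dtype `object`, which idxmax cannot reduce over until it is cast to a numeric dtype.

```python
import pandas as pd

t = pd.DataFrame(index=range(3), columns=['C_parameter', 'Mean recall score'])
t['C_parameter'] = [0.01, 0.1, 1]
for j, r in enumerate([0.96, 0.89, 0.91]):
    t.loc[j, 'Mean recall score'] = r  # assigned item by item, so the column stays object dtype

# idxmax on the raw object column would raise; casting to float64 first works:
best_c = t.iloc[t['Mean recall score'].astype('float64').idxmax()]['C_parameter']
```

Here the largest mean recall (0.96) sits in row 0, so best_c comes out as 0.01, just as in the real run below.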
Result:

```
==========================
C parameter: 0.01
--------------------------
Iteration 1 : recall score = 0.958904109589041
Iteration 2 : recall score = 0.9178082191780822
Iteration 3 : recall score = 1.0
Iteration 4 : recall score = 0.9594594594594594
Iteration 5 : recall score = 0.9848484848484849

Mean recall score 0.9642040546150135

==========================
C parameter: 0.1
--------------------------
Iteration 1 : recall score = 0.8356164383561644
Iteration 2 : recall score = 0.863013698630137
Iteration 3 : recall score = 0.9491525423728814
Iteration 4 : recall score = 0.9324324324324325
Iteration 5 : recall score = 0.8939393939393939

Mean recall score 0.8948309011462019

==========================
C parameter: 1
--------------------------
Iteration 1 : recall score = 0.8493150684931506
Iteration 2 : recall score = 0.8767123287671232
Iteration 3 : recall score = 0.9661016949152542
Iteration 4 : recall score = 0.9459459459459459
Iteration 5 : recall score = 0.8939393939393939

Mean recall score 0.9064028864121736

==========================
C parameter: 10
--------------------------
Iteration 1 : recall score = 0.8493150684931506
Iteration 2 : recall score = 0.8767123287671232
Iteration 3 : recall score = 0.9661016949152542
Iteration 4 : recall score = 0.9459459459459459
Iteration 5 : recall score = 0.8787878787878788

Mean recall score 0.9033725833818705

==========================
C parameter: 100
--------------------------
Iteration 1 : recall score = 0.863013698630137
Iteration 2 : recall score = 0.8767123287671232
Iteration 3 : recall score = 0.9830508474576272
Iteration 4 : recall score = 0.9459459459459459
Iteration 5 : recall score = 0.8787878787878788

Mean recall score 0.9095021399177424

**************************************
best model to choose from cross validation is with C parameter = 0.01
**************************************
```
3.2 Building the model with oversampling
1. How the SMOTE algorithm works
Oversampling involves generating synthetic data. SMOTE works as follows:
(1) For each sample in the minority class, compute the Euclidean distance to every other minority-class sample and take its k nearest neighbours.
(2) Based on the class imbalance, set a sampling ratio to determine a multiplication factor N. For each minority-class sample x, randomly pick several of its k nearest neighbours; call a chosen neighbour xn.
(3) For each chosen neighbour xn, build a new sample from the original sample as: x_new = x + rand(0, 1) × (xn − x).
2. Install the imblearn library: pip install imblearn
3. The program
```python
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Read the file and split it into features and labels
credit_cards = pd.read_csv('creditcard.csv')

columns = credit_cards.columns
features_columns = columns.delete(len(columns) - 1)  # drop the last column (Class); the rest are usable features
features = credit_cards[features_columns]            # everything except the Class column (drop would work too)
labels = credit_cards['Class']

# Training/test split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.2, random_state=0)

# Oversampling
oversample = SMOTE(random_state=0)  # instantiate the algorithm
os_features, os_labels = oversample.fit_sample(features_train, labels_train)  # renamed fit_resample in imblearn >= 0.4
# len(os_labels[os_labels == 1])

# Convert to types the logistic-regression routine accepts
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

# Feed into the logistic-regression routine defined above
best_c = print_kfold_scores(os_features, os_labels)
```
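The interpolation formula from step (3) can be sketched in plain numpy (a toy minority class with k = 1 nearest neighbour; this is only an illustration of the idea, not imblearn's implementation):

```python
import numpy as np

rng = np.random.RandomState(0)
minority = np.array([[1.0, 1.0], [2.0, 2.0], [1.5, 1.0]])  # toy minority-class samples

new_samples = []
for x in minority:
    # nearest neighbour by Euclidean distance (excluding the point itself)
    d = np.linalg.norm(minority - x, axis=1)
    d[d == 0] = np.inf
    xn = minority[np.argmin(d)]

    # x_new = x + rand(0, 1) * (xn - x): a random point on the segment from x to xn
    gap = rng.rand()
    new_samples.append(x + gap * (xn - x))

new_samples = np.array(new_samples)
```

Every synthetic point lies between an original sample and one of its neighbours, so SMOTE interpolates inside the minority region rather than merely duplicating rows.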
Result: since the data now totals 400k+ samples, this takes a while to run.
```
==========================
C parameter: 0.01
--------------------------
Iteration 1 : recall score = 0.8903225806451613
Iteration 2 : recall score = 0.8947368421052632
Iteration 3 : recall score = 0.968861347792409
Iteration 4 : recall score = 0.9518031237291302
Iteration 5 : recall score = 0.9584308811729921

Mean recall score 0.9328309550889913

==========================
C parameter: 0.1
--------------------------
Iteration 1 : recall score = 0.8903225806451613
Iteration 2 : recall score = 0.8947368421052632
Iteration 3 : recall score = 0.9703220095164324
Iteration 4 : recall score = 0.9600575944427957
Iteration 5 : recall score = 0.9605082379837548

Mean recall score 0.9351894529386815

==========================
C parameter: 1
--------------------------
Iteration 1 : recall score = 0.8903225806451613
Iteration 2 : recall score = 0.8947368421052632
Iteration 3 : recall score = 0.9701228283722474
Iteration 4 : recall score = 0.9587496290434266
Iteration 5 : recall score = 0.9603763423132302

Mean recall score 0.9348616444958656

==========================
C parameter: 10
--------------------------
Iteration 1 : recall score = 0.8903225806451613
Iteration 2 : recall score = 0.8947368421052632
Iteration 3 : recall score = 0.969768728560363
Iteration 4 : recall score = 0.9603653510073532
Iteration 5 : recall score = 0.9607720293248041

Mean recall score 0.9351931063285889

==========================
C parameter: 100
--------------------------
Iteration 1 : recall score = 0.8903225806451613
Iteration 2 : recall score = 0.8947368421052632
Iteration 3 : recall score = 0.970366271992918
Iteration 4 : recall score = 0.9590903595256153
Iteration 5 : recall score = 0.9598377683252547

Mean recall score 0.9348707645188424

**************************************
best model to choose from cross validation is with C parameter = 10.0
**************************************
```
4. Model evaluation methods
4.1 Accuracy
Accuracy = number of correctly predicted samples (0→0 and 1→1) / total number of samples
4.2 Recall
recall score = TP / (TP + FN); the terms are defined in the table below. Recall is the usual evaluation metric for detection-style problems.
In short: what was actually caught / what should have been caught.
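A quick toy check of the definition, computed by hand (class 1 is the class of interest):

```python
y_true = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
y_pred = [1, 1, 1, 0, 1, 0, 0, 0, 0, 0]

tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)  # caught frauds: 3
fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)  # missed frauds: 1

recall = tp / (tp + fn)  # 3 / 4 = 0.75
```

Note that the false positive at index 4 does not affect recall at all; that is exactly why recall alone can be misleading and why accuracy is reported alongside it below.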
4.3 Confusion matrix:

|              | Positive class                     | Negative class                     |
|--------------|------------------------------------|------------------------------------|
| Detected     | TP: positive predicted as positive | FP: negative predicted as positive |
| Not detected | FN: positive predicted as negative | TN: negative predicted as negative |
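As a toy check of the layout scikit-learn's confusion_matrix actually returns (rows are true classes, columns are predicted classes, in sorted label order, so with labels 0/1 the fraud row is row 1):

```python
from sklearn.metrics import confusion_matrix

y_true = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
y_pred = [1, 1, 1, 0, 1, 0, 0, 0, 0, 0]

cm = confusion_matrix(y_true, y_pred)
# cm == [[TN, FP],
#        [FN, TP]]

recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])  # TP / (TP + FN)
```

Keeping this layout in mind is what makes the index arithmetic in the plotting code below (cm[1, 1] for TP, cm[1, 0] for FN) easy to follow.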
4.4 Code
```python
# Visualizing the confusion matrix; this code can be reused directly as a template
import itertools

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
```
```python
# Logistic regression model built with the best parameter from oversampling
lr = LogisticRegression(C=best_c, penalty='l1')
lr.fit(X_train, y_train.values.ravel())
y_pred_oversample = lr.predict(X_test.values)

# Build the confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred_oversample)
np.set_printoptions(precision=2)

# recall = TP / (TP + FN): row 1 of the matrix is the true fraud class
print('Recall metric in the testing dataset:',
      cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
```
Result: this is the confusion matrix for oversampling; the undersampling one is similar, so it is not repeated here.
Recall: 91 / (91 + 56)
Accuracy: (85284 + 91) / (85284 + 12 + 56 + 91)
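Plugging those entries in (the exact counts vary between runs because the sampling is random):

```python
# entries read off the confusion matrix above
tp, fn, fp, tn = 91, 56, 12, 85284

recall = tp / (tp + fn)                     # ~0.62: about a third of frauds are missed
accuracy = (tn + tp) / (tn + fp + fn + tp)  # ~0.999: dominated by the huge normal class
```

The contrast between the two numbers is the whole point of section 4: on imbalanced data, near-perfect accuracy says very little, while recall exposes the missed frauds.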
4.5 The effect of the threshold on predictions
That is: above what predicted probability do we call a sample fraud (in this problem)?
```python
# The effect of the threshold
# Use the model built from the undersampled data (it is faster)
lr = LogisticRegression(C=1, penalty='l1')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_tes_undersamplet.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))

j = 1
for i in thresholds:
    # a sample is called fraud when its predicted probability exceeds the threshold
    y_test_pred_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1

    cnf_matrix = confusion_matrix(y_test_undersample, y_test_pred_high_recall)
    np.set_printoptions(precision=2)
    # recall = TP / (TP + FN): row 1 of the matrix is the true fraud class
    print('Recall metric in the testing dataset:',
          cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold>%s' % i)
```
Result: