Credit Card Fraud Detection: A Machine Learning Case Study (LR, RF)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
Some pandas basics
frame=pd.DataFrame(np.arange(12).reshape((4,3)),columns=['c','a','b'],index=['D','B','C','A'])
print(frame)
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(frame[['c','a']])
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(frame['a'])
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(frame.loc[:,'a'])#.ix is deprecated (removed in pandas 1.0); use .loc for label-based indexing
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(frame.loc[:,['a','b']])
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(frame.loc[:,frame.columns!='a'])
print('>>>>>>>>>>>>>>>>>>>>>>>>')
print(np.array(frame[frame['b']!=2].index))
   c   a   b
D  0   1   2
B  3   4   5
C  6   7   8
A  9  10  11
>>>>>>>>>>>>>>>>>>>>>>>>
   c   a
D  0   1
B  3   4
C  6   7
A  9  10
>>>>>>>>>>>>>>>>>>>>>>>>
D     1
B     4
C     7
A    10
Name: a, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>
D     1
B     4
C     7
A    10
Name: a, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>>>
    a   b
D   1   2
B   4   5
C   7   8
A  10  11
>>>>>>>>>>>>>>>>>>>>>>>>
   c   b
D  0   2
B  3   5
C  6   8
A  9  11
>>>>>>>>>>>>>>>>>>>>>>>>
['B' 'C' 'A']

1. Loading the dataset
#Read the data
data = pd.read_csv('creditcard.csv')
data.head()
   Time        V1        V2        V3        V4        V5        V6        V7        V8        V9  ...       V21       V22       V23       V24       V25       V26       V27       V28  Amount  Class
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539 -0.189115  0.133558 -0.021053  149.62      0
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170  0.125895 -0.008983  0.014724    2.69      0
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752  378.66      0
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376 -0.221929  0.062723  0.061458  123.50      0
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010  0.502292  0.219422  0.215153   69.99      0

5 rows × 31 columns (V10-V20 elided in the display)

#To protect user privacy, the features have already been transformed with PCA, so we start from ready-made features
#The goal is to detect which samples in the data are fraudulent -- a classic binary classification problem
#Think about the class distribution: in real data, the fraudulent samples are the rare ones
# 1. Check whether the classes are balanced
# 2. Preprocess the samples and extract features (already done here)
count_classes = data['Class'].value_counts(sort=True).sort_index()#pd.value_counts is deprecated in recent pandas
print(count_classes)
count_classes.plot(kind = 'bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
0    284315
1       492
Name: Class, dtype: int64

[Figure: fraud class histogram -- class 0 vastly outnumbers class 1]

The classes are imbalanced; the two remedies are oversampling and undersampling.
Undersampling: shrink the majority class so both classes are equally small.
Oversampling: generate new class-1 samples until there are as many as class 0.

The Amount column spans a much wider range than the other features. A learning algorithm may treat large-valued features as more important and small-valued ones as less important, so to put the features on an equal footing we normalize it.

We start with undersampling.

3. Data standardization
#sklearn.preprocessing is the preprocessing module; StandardScaler standardizes features (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
#reshape(-1,1): the 1 turns it into a single column; -1 lets NumPy infer that dimension
data['normAmount']=StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
data = data.drop(['Time','Amount'],axis=1)
data.head()
         V1        V2        V3        V4        V5        V6        V7        V8        V9       V10  ...       V21       V22       V23       V24       V25       V26       V27       V28  Class  normAmount
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599  0.098698  0.363787  0.090794  ... -0.018307  0.277838 -0.110474  0.066928  0.128539 -0.189115  0.133558 -0.021053      0    0.244964
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803  0.085102 -0.255425 -0.166974  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170  0.125895 -0.008983  0.014724      0   -0.342475
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461  0.247676 -1.514654  0.207643  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752      0    1.160686
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609  0.377436 -1.387024 -0.054952  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376 -0.221929  0.062723  0.061458      0    0.140534
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941 -0.270533  0.817739  0.753074  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010  0.502292  0.219422  0.215153      0   -0.073403

5 rows × 30 columns (V11-V20 elided in the display)
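As a quick sanity check (a sketch, assuming the cell above has run): StandardScaler computes (x - mean) / std per column, so the new column should have roughly zero mean and unit variance.

#Hypothetical check of the standardized column
print(data['normAmount'].mean(), data['normAmount'].std())  # ~0 and ~1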

X = data.loc[:,data.columns != 'Class']
y = data.loc[:,data.columns=='Class']

number_records_fraud = len(data[data.Class==1])
fraud_indices = np.array(data[data.Class==1].index)

#Pull out the indices of the normal (class 0) transactions
normal_indices = data[data.Class==0].index
#arg 1: the pool to choose from; arg 2: how many to choose; arg 3: with replacement or not
random_normal_indices = np.random.choice(normal_indices,number_records_fraud,replace=False)
#Convert the index to a NumPy array
random_normal_indices = np.array(random_normal_indices)

#Concatenate, e.g. np.concatenate([np.array([1,2,3]),np.array([4,5,6])])
under_smaple_indices = np.concatenate([fraud_indices,random_normal_indices])
#Use an index array to select the corresponding rows
# under_smaple_data = data.iloc[[1,4],:]
#A slice selects a contiguous range of rows; note its argument is not an array
# under_smaple_data = data.iloc[1:4,:]
under_smaple_data = data.iloc[under_smaple_indices,:]

under_smaple_data_X = under_smaple_data.loc[:,under_smaple_data.columns != 'Class']
under_smaple_data_y = under_smaple_data.loc[:,under_smaple_data.columns=='Class']

print('Percentage of normal trans:',len(under_smaple_data[under_smaple_data.Class==0])/len(under_smaple_data))
print('Percentage of fraud trans:',len(under_smaple_data[under_smaple_data.Class==1])/len(under_smaple_data))
print('total trans:',len(under_smaple_data))
Percentage of normal trans: 0.5
Percentage of fraud trans: 0.5
total trans: 984

Cross-validation: a model has many hyperparameters to choose, and picking them from experience alone is unreliable, so we cross-validate.

First split the data into a test set and a training set, then split the training set evenly into N folds; train on N-1 of them and validate on the remaining one, repeating N times.

Why N times?

Each repetition uses a different training/validation split, so averaging the N validation errors gives a safer estimate of performance. Choosing parameters against that average is the sound approach.
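As a sketch of the same idea, sklearn can run this whole N-fold loop in one call via cross_val_score, shown here on hypothetical toy data (the notebook below builds the loop by hand instead):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#Hypothetical toy data, just to show the mechanics
X_toy, y_toy = make_classification(n_samples=200, random_state=0)
#cv=5: split into 5 folds, train on 4, validate on 1, repeat 5 times
scores = cross_val_score(LogisticRegression(solver='liblinear'), X_toy, y_toy, cv=5, scoring='recall')
print(scores, scores.mean())  #use the average of the N fold scores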

4. Parameter selection (cross-validation)

#Model-selection utilities; train_test_split splits a dataset
from sklearn.model_selection import train_test_split
#train_test_split shuffles before splitting; random_state=0 makes the split identical on every run
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
print(len(X_train))

under_smaple_data_X_train,under_smaple_data_X_test,under_smaple_data_y_train,under_smaple_data_y_test = \
train_test_split(under_smaple_data_X,under_smaple_data_y,test_size=0.3,random_state=0)
print(len(under_smaple_data_X_train))
199364
688

Cancer-screening example: suppose 10 of 1000 samples have cancer. If the model simply predicts all 1000 as healthy, its accuracy is 99%, yet not a single cancer patient is found, so the model is useless.

This is why the evaluation metric matters: when the classes are heavily imbalanced, accuracy is deceptive. Recall -- here 0/10 = 0 -- is usually the better metric.
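A minimal sketch of that cancer example, with hypothetical labels:

import numpy as np
from sklearn.metrics import accuracy_score, recall_score

y_true = np.array([1]*10 + [0]*990)  #10 cancer patients among 1000 samples
y_pred = np.zeros(1000, dtype=int)   #a useless model that predicts everyone healthy

print(accuracy_score(y_true, y_pred))  # 0.99 -- looks excellent
print(recall_score(y_true, y_pred))    # 0.0  -- not one patient found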

from sklearn.linear_model import LogisticRegression
#KFold sets up k-fold cross-validation; cross_val_score computes cross-validated scores
from sklearn.model_selection import KFold,cross_val_score
#confusion_matrix: the confusion matrix
#sklearn.metrics contains scoring methods, performance metrics, pairwise metrics and distances
from sklearn.metrics import confusion_matrix,recall_score,classification_report
dff = pd.DataFrame(index=list(range(5)),columns=['a','b'])

print(dff)
     a    b
0  NaN  NaN
1  NaN  NaN
2  NaN  NaN
3  NaN  NaN
4  NaN  NaN
def print_KFold_scores(x_train_data,y_train_data):
    #Split the training set into 5 folds
    fold = KFold(n_splits=5,shuffle=False)

    #We don't know the best regularization strength in advance, so we compare candidates by cross-validation
    c_param_range = [0.01,0.1,1,10,100]#C is the inverse regularization strength: smaller C = stronger penalty = smaller theta

    results_table = pd.DataFrame(index=range(len(c_param_range)),columns=['C_parameter','mean recall score'])
    results_table['C_parameter'] = c_param_range

    j = 0
    for c_param in c_param_range:
        print('----------------------------------------')
        print('C parameter:',c_param)
        print('-----------------------------------------')
        print('')

        recall_accs = []
        for iteration,indices in enumerate(fold.split(x_train_data),start=1):

            #Instantiate the model; the l1 penalty requires the liblinear solver
            lr = LogisticRegression(C=c_param,penalty='l1',solver='liblinear')
            #Train on the k-1 training folds
            lr.fit(x_train_data.iloc[indices[0],:],y_train_data.iloc[indices[0],:].values.ravel())

            #Predict on the held-out fold
            y_pred = lr.predict(x_train_data.iloc[indices[1],:].values)

            #Compute the recall on the held-out fold
            recall_acc = recall_score(y_train_data.iloc[indices[1],:].values.ravel(),y_pred)
            recall_accs.append(recall_acc)
            print('Iteration:',iteration,'; recall score = ',recall_acc)
        results_table.loc[j,'mean recall score'] = np.mean(recall_accs)

        j+=1
        print('')
        print('mean recall score',np.mean(recall_accs))
        print('')
    results_table['mean recall score'] = results_table['mean recall score'].astype('float64')

    best_c = results_table.loc[results_table['mean recall score'].idxmax()]['C_parameter']

    print('****************************')
    print('best_c',best_c)
    print('*****************************')

    return best_c
best_c = print_KFold_scores(under_smaple_data_X_train,under_smaple_data_y_train)
----------------------------------------
C parameter: 0.01
-----------------------------------------

Iteration: 1 ; recall score =  0.9315068493150684
Iteration: 2 ; recall score =  0.9178082191780822
Iteration: 3 ; recall score =  1.0
Iteration: 4 ; recall score =  0.9594594594594594
Iteration: 5 ; recall score =  0.9696969696969697

mean recall score 0.955694299529916

----------------------------------------
C parameter: 0.1
-----------------------------------------

Iteration: 1 ; recall score =  0.8493150684931506
Iteration: 2 ; recall score =  0.863013698630137
Iteration: 3 ; recall score =  0.9322033898305084
Iteration: 4 ; recall score =  0.9459459459459459
Iteration: 5 ; recall score =  0.9090909090909091

mean recall score 0.8999138023981302

----------------------------------------
C parameter: 1
-----------------------------------------

Iteration: 1 ; recall score =  0.8493150684931506
Iteration: 2 ; recall score =  0.863013698630137
Iteration: 3 ; recall score =  0.9661016949152542
Iteration: 4 ; recall score =  0.9459459459459459
Iteration: 5 ; recall score =  0.9090909090909091

mean recall score 0.9066934634150794

----------------------------------------
C parameter: 10
-----------------------------------------

Iteration: 1 ; recall score =  0.8493150684931506
Iteration: 2 ; recall score =  0.8904109589041096
Iteration: 3 ; recall score =  0.9661016949152542
Iteration: 4 ; recall score =  0.9459459459459459
Iteration: 5 ; recall score =  0.9090909090909091

mean recall score 0.9121729154698739

----------------------------------------
C parameter: 100
-----------------------------------------

Iteration: 1 ; recall score =  0.863013698630137
Iteration: 2 ; recall score =  0.8904109589041096
Iteration: 3 ; recall score =  0.9661016949152542
Iteration: 4 ; recall score =  0.9459459459459459
Iteration: 5 ; recall score =  0.8939393939393939

mean recall score 0.9118823384669682

****************************
best_c 0.01
*****************************

How to read the confusion matrix: the X axis is the predicted class, the Y axis the true class.
On the undersampled data the recall meets our bar, but the number of false positives (normal transactions wrongly flagged) is rather high.
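For reference, a tiny sketch of how sklearn's confusion_matrix lays this out (rows are true classes, columns are predicted classes):

from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
cm = confusion_matrix(y_true, y_pred)
print(cm)
# [[1 1]   row 0: true negatives, false positives
#  [1 2]]  row 1: false negatives, true positives
# recall = cm[1,1] / (cm[1,0] + cm[1,1]) = 2/3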

best_c = print_KFold_scores(X_train,y_train)
----------------------------------------
C parameter: 0.01
-----------------------------------------

Iteration: 1 ; recall score =  0.4925373134328358
Iteration: 2 ; recall score =  0.6027397260273972
Iteration: 3 ; recall score =  0.6833333333333333
Iteration: 4 ; recall score =  0.5692307692307692
Iteration: 5 ; recall score =  0.45

mean recall score 0.5595682284048672

----------------------------------------
C parameter: 0.1
-----------------------------------------

Iteration: 1 ; recall score =  0.5671641791044776
Iteration: 2 ; recall score =  0.6164383561643836
Iteration: 3 ; recall score =  0.6833333333333333
Iteration: 4 ; recall score =  0.5846153846153846
Iteration: 5 ; recall score =  0.525

mean recall score 0.5953102506435158

----------------------------------------
C parameter: 1
-----------------------------------------

Iteration: 1 ; recall score =  0.5522388059701493
Iteration: 2 ; recall score =  0.6164383561643836
Iteration: 3 ; recall score =  0.7166666666666667
Iteration: 4 ; recall score =  0.6153846153846154
Iteration: 5 ; recall score =  0.5625

mean recall score 0.612645688837163

----------------------------------------
C parameter: 10
-----------------------------------------

Iteration: 1 ; recall score =  0.5522388059701493
Iteration: 2 ; recall score =  0.6164383561643836
Iteration: 3 ; recall score =  0.7333333333333333
Iteration: 4 ; recall score =  0.6153846153846154
Iteration: 5 ; recall score =  0.575

mean recall score 0.6184790221704963

----------------------------------------
C parameter: 100
-----------------------------------------

Iteration: 1 ; recall score =  0.5522388059701493
Iteration: 2 ; recall score =  0.6164383561643836
Iteration: 3 ; recall score =  0.7333333333333333
Iteration: 4 ; recall score =  0.6153846153846154
Iteration: 5 ; recall score =  0.575

mean recall score 0.6184790221704963

****************************
best_c 0.01
*****************************

5. The confusion matrix (model evaluation)

import itertools
def plot_confusion_matrix(cm,classes,title='Confusion matrix',cmap=plt.cm.Blues):
    #interpolation='nearest' renders each cell as one solid block of colour
    plt.imshow(cm,interpolation='nearest',cmap=cmap)
    plt.title(title)
    #Add the colour-bar legend
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks,classes)
    plt.yticks(tick_marks,classes)

    #Write each count into its cell: white text on dark cells, black on light ones
    thresh = cm.max()/2
    for i, j in itertools.product(range(cm.shape[0]),range(cm.shape[1])):
        plt.text(j,i,cm[i,j],horizontalalignment='center',color='white' if cm[i,j]>thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
lr = LogisticRegression(C = best_c, penalty='l1', solver='liblinear')
lr.fit(under_smaple_data_X_train,under_smaple_data_y_train.values.ravel())
y_pred = lr.predict(X_test.values)

cnf_matrix = confusion_matrix(y_test,y_pred)
print(cnf_matrix)
np.set_printoptions(precision=2)

print('Recall metric in the testing dataset:',cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix,classes=class_names)

plt.show()
[[76205  9091]
 [   12   135]]
Recall metric in the testing dataset: 0.9183673469387755

[Figure: confusion matrix for the undersampling-trained model evaluated on the full test set]

Note: the model must be tested at full scale, i.e. on the original dataset.

6. The threshold parameter in logistic regression

lr = LogisticRegression(C=0.01,penalty='l1',solver='liblinear')
#ravel(): flatten a 2-D array into 1-D
lr.fit(under_smaple_data_X_train,under_smaple_data_y_train.values.ravel())
y_pre_proba = lr.predict_proba(under_smaple_data_X_test.values)

thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize=(10,10))

j=1
for i in thresholds:
    y_test_pred_high_recall = y_pre_proba[:,1]>i

    plt.subplot(3,3,j)
    j+=1

    cnf_matrix = confusion_matrix(under_smaple_data_y_test,y_test_pred_high_recall)
    np.set_printoptions(precision=2)

    print('Recall metric in the testing dataset:',cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    class_names = [0,1]
    plot_confusion_matrix(cnf_matrix,classes=class_names,title='Threshold >= %s'%i)
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 0.9931972789115646
Recall metric in the testing dataset: 0.9319727891156463
Recall metric in the testing dataset: 0.8843537414965986
Recall metric in the testing dataset: 0.8435374149659864
Recall metric in the testing dataset: 0.7619047619047619
Recall metric in the testing dataset: 0.5850340136054422

[Figure: 3×3 grid of confusion matrices, one per threshold]
Note: in practice, how to set the threshold still needs careful consideration.
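One hedged way to study that choice is to trace precision and recall across all thresholds with sklearn's precision_recall_curve -- a sketch that assumes y_pre_proba and under_smaple_data_y_test from the cells above:

from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(under_smaple_data_y_test.values.ravel(), y_pre_proba[:,1])
#precision and recall have one more entry than thresholds, hence the [:-1]
plt.plot(thresholds, precision[:-1], label='precision')
plt.plot(thresholds, recall[:-1], label='recall')
plt.xlabel('threshold')
plt.legend()
plt.show()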

Oversampling

The SMOTE sample-generation strategy:
For each sample x in the minority class, compute its Euclidean distance to every sample in the minority set S_min to obtain its k nearest neighbours.
Set a sampling ratio according to the class imbalance to determine the multiplication factor N; for each minority sample x, randomly choose several of its k nearest neighbours, say x_n.
For each randomly chosen neighbour x_n, build a new sample from the original one as
x_new = x + rand(0,1) * (x_n - x)
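A minimal NumPy sketch of that interpolation step for a single sample (illustrative only; below we use imblearn's SMOTE, not this function):

import numpy as np

def smote_one(x, minority, k=5):
    #k nearest neighbours of x inside the minority class (Euclidean distance)
    d = np.linalg.norm(minority - x, axis=1)
    neighbours = minority[np.argsort(d)[1:k+1]]  #[1:] skips x itself
    x_n = neighbours[np.random.randint(k)]       #pick one neighbour at random
    return x + np.random.rand() * (x_n - x)      #x_new = x + rand(0,1) * (x_n - x)

minority = np.random.rand(20, 2)  #hypothetical minority-class samples
print(smote_one(minority[0], minority))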

import pandas as pd
#imblearn: module for handling imbalanced data
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
credit_cards = pd.read_csv('creditcard.csv')
columns = credit_cards.columns
print(len(columns)-1)
features_columns = columns.delete(len(columns)-1)
print(features_columns)
features = credit_cards[features_columns]

labels = credit_cards['Class']
type(labels)
30
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

pandas.core.series.Series
features_train,features_test,labels_train,labels_test = train_test_split(features,labels,
                                                                         test_size=0.2,random_state=0)
oversampler = SMOTE(random_state=0)
os_features,os_labels = oversampler.fit_resample(features_train,labels_train)#older imblearn versions called this fit_sample
len(os_labels[os_labels==1])
print(type(os_labels))
print(type(os_features))
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = print_KFold_scores(os_features,os_labels)
----------------------------------------
C parameter: 0.01
-----------------------------------------

Iteration: 1 ; recall score =  0.8903225806451613
Iteration: 2 ; recall score =  0.8947368421052632
Iteration: 3 ; recall score =  0.9684187230275534
Iteration: 4 ; recall score =  0.9577714028203691
Iteration: 5 ; recall score =  0.9580901506908036

mean recall score 0.93386793985783

----------------------------------------
C parameter: 0.1
-----------------------------------------

Iteration: 1 ; recall score =  0.8903225806451613
Iteration: 2 ; recall score =  0.8947368421052632
Iteration: 3 ; recall score =  0.9704769281841319
Iteration: 4 ; recall score =  0.9588485507963201
Iteration: 5 ; recall score =  0.9605631945131401

mean recall score 0.9349896192488034

----------------------------------------
C parameter: 1
-----------------------------------------

Iteration: 1 ; recall score =  0.8903225806451613
Iteration: 2 ; recall score =  0.8947368421052632
Iteration: 3 ; recall score =  0.9705211906606175
Iteration: 4 ; recall score =  0.9603763423132302
Iteration: 5 ; recall score =  0.9603323770897221

mean recall score 0.935257866562799

----------------------------------------
C parameter: 10
-----------------------------------------

Iteration: 1 ; recall score =  0.8903225806451613
Iteration: 2 ; recall score =  0.8947368421052632
Iteration: 3 ; recall score =  0.9704326657076463
Iteration: 4 ; recall score =  0.9599586726899023
Iteration: 5 ; recall score =  0.9604093162308613

mean recall score 0.9351720154757668

----------------------------------------
C parameter: 100
-----------------------------------------

Iteration: 1 ; recall score =  0.8903225806451613
Iteration: 2 ; recall score =  0.8947368421052632
Iteration: 3 ; recall score =  0.9698351222750913
Iteration: 4 ; recall score =  0.9602334553368286
Iteration: 5 ; recall score =  0.9582550202789594

mean recall score 0.9346766041282606

****************************
best_c 0.01
*****************************
lr = LogisticRegression(C = best_c, penalty='l1', solver='liblinear')
lr.fit(os_features,os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

cnf_matrix = confusion_matrix(labels_test,y_pred)
np.set_printoptions(precision=2)

print('Recall metric in the testing dataset:',cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix,classes=class_names)

plt.show()
Recall metric in the testing dataset: 0.9108910891089109

[Figure: confusion matrix for the SMOTE-trained model on the held-out test set]

When the classes are imbalanced, use oversampling whenever you can; undersampling leads to too many false positives.

Summary: 1. how to handle imbalanced samples; 2. how to evaluate a model.

Reposted from blog.csdn.net/Vinsuan1993/article/details/82224901