数据来源：https://www.kaggle.com/c/titanic

Training

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Load the Kaggle Titanic training set; train.csv is read from the working directory.
train_data = pd.read_csv('train.csv')

# Count the passengers in each 'Survived' class and draw the counts as a bar chart.
count_survivors = train_data['Survived'].value_counts()
survivor_axes = count_survivors.plot(kind='bar')
survivor_axes.set_xlabel('Is_survived')
survivor_axes.set_ylabel('Number of People')
survivor_axes.set_title('Survivor histogram')

survivor_hist

from sklearn.preprocessing import StandardScaler

# --- Feature preparation for the training set --------------------------------

# Encode sex as a binary indicator: female -> 0, male -> 1.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages with the mean of the *observed* ages. Series.mean()
# skips NaN, whereas averaging a list in which NaN was replaced by 0 divides
# by the full row count and drags the imputed age well below the true mean.
age_avg = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_avg)

# Standardise the numeric columns, each with its own scaler. Casting to
# float64 first avoids StandardScaler's DataConversionWarning on the integer
# columns (SibSp, Parch); ravel() turns the (n, 1) result back into a 1-D column.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    column_values = train_data[column].values.astype('float64').reshape(-1, 1)
    train_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the embarkation port (S -> 1, C -> 2, Q -> 3) and fill the missing
# entries with the most frequent observed port. value_counts() ignores NaN,
# so the fill value is computed once and can never be the "missing" marker.
embarked_codes = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
most_common_port = embarked_codes.value_counts().idxmax()
train_data['Embarked'] = embarked_codes.fillna(most_common_port)

# Drop the columns that are not used as model inputs.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning)

# Split the prepared frame into the feature matrix X (every column except
# 'Survived') and the target Y (the 'Survived' column, kept as a one-column
# DataFrame). .loc with a boolean column mask replaces the deprecated .ix
# indexer and selects exactly the same columns.
X = train_data.loc[:, train_data.columns != 'Survived']
Y = train_data.loc[:, train_data.columns == 'Survived']

c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.metrics import recall_score,confusion_matrix

def getBestC(X, Y, c_param_range=(0.01, 0.1, 1.0, 10.0, 100.0), n_splits=5):
    """Pick the inverse regularisation strength C with the best mean recall.

    For every candidate C an L1-penalised logistic regression is fitted on
    each training fold and scored with recall on the matching held-out fold.
    Per-fold scores, per-C means and the winner are printed as the search runs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    Y : pandas.DataFrame or pandas.Series
        Binary target aligned row-for-row with ``X``. A one-column frame is
        flattened before fitting and scoring.
    c_param_range : sequence of float, optional
        Candidate values of C, tried in the given order.
    n_splits : int, optional
        Number of consecutive (unshuffled) cross-validation folds.

    Returns
    -------
    float
        The candidate with the highest mean recall; on ties the earliest
        candidate in ``c_param_range`` wins.

    Raises
    ------
    ValueError
        If ``c_param_range`` is empty.
    """
    # Imported locally: sklearn.cross_validation, where the module-level KFold
    # comes from, is deprecated since 0.18 and has a different interface.
    from sklearn.model_selection import KFold

    c_param_range = list(c_param_range)
    if len(c_param_range) == 0:
        raise ValueError('c_param_range must contain at least one value')

    # Materialise the splits once so every C is scored on identical folds.
    folds = list(KFold(n_splits=n_splits).split(X))

    mean_recalls = []
    for c_param in c_param_range:
        print('******** c_param = %.2f ********' % c_param)
        recall_accs = []
        for iteration, (train_idx, valid_idx) in enumerate(folds, start=1):
            # liblinear is named explicitly because it supports the l1
            # penalty; relying on the library default solver is fragile.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # ravel() hands sklearn a 1-D target instead of a column vector.
            lr.fit(X.iloc[train_idx].values, Y.iloc[train_idx].values.ravel())
            Y_hat = lr.predict(X.iloc[valid_idx].values)
            recall_acc = recall_score(Y.iloc[valid_idx].values.ravel(), Y_hat)
            recall_accs.append(recall_acc)

            print('Iteration %d: recall score = %f' % (iteration, recall_acc))

        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('\nMean recall score %f\n' % mean_recall)

    # np.argmax returns the first maximum, so ties go to the earlier candidate.
    best_c = c_param_range[int(np.argmax(mean_recalls))]
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c

c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

# Run the cross-validated search over the candidate C values in getBestC and
# keep the winner; the models fitted further down reuse it as their C.
best_c = getBestC(X, Y)

******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000

Mean recall score 0.000000

******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413

Mean recall score 0.668273

******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032

Mean recall score 0.701604

******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

--------------------------------
best_c = 10.00


c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw the matrix ``cm`` as an annotated heat map on the current axes.

    Each cell is coloured through ``cmap`` and overlaid with its own value.
    ``classes`` supplies the tick labels for both axes, and ``title`` is used
    as the plot title. Nothing is returned; the caller is expected to create
    and show the figure.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class, identical on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half the largest value get white text so the number stays
    # readable against the darker end of the colour map.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > half_max else "black"
            # x is the column index and y is the row index.
            plt.text(col, row, cell_value,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

import itertools

# Refit on the full training set with the selected C, then score the model on
# that same data. liblinear is named explicitly because it supports the l1
# penalty; ravel() passes sklearn a 1-D target instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat = lr.predict(X.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(Y.values.ravel(), Y_hat)

# Recall for class 1, read off row 1 of the matrix: [1, 1] / ([1, 0] + [1, 1]).
print("Recall value in training dataset: %f" % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

Recall value in training dataset: 0.710526

confusion_matrix

# Fit once on the full training set and keep the predicted probabilities, so
# the effect of the decision threshold can be inspected without refitting.
# liblinear is named explicitly because it supports the l1 penalty; ravel()
# passes sklearn a 1-D target instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

plt.figure(figsize=(10, 10))

# One panel per threshold in a 3x3 grid; subplot positions are 1-based.
for j, i in enumerate(thresholds, start=1):
    # Predict class 1 when the probability in column 1 is strictly above the threshold.
    Y_hat = Y_hat_proba[:, 1] > i

    plt.subplot(3, 3, j)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y.values.ravel(), Y_hat)

    print("Recall value in training dataset: %f, with threshold = %.1f" % ((1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])), i))

    # Plot non-normalized confusion matrix. The title says '>' to match the
    # strict comparison used above.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % i)

Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9

thresholds_cnf

Testing

# --- Feature preparation for the test set ------------------------------------
# Load the Kaggle Titanic test set; test.csv is read from the working directory.
test_data = pd.read_csv('test.csv')

# Encode sex as a binary indicator: female -> 0, male -> 1.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages and fares with the mean of the *observed* values.
# Series.mean() skips NaN, whereas averaging a list in which NaN was replaced
# by 0 divides by the full row count and biases the imputed value downwards.
age_avg = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(age_avg)

fare_avg = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_avg)

# Standardise the numeric columns. Casting to float64 first avoids
# StandardScaler's DataConversionWarning on integer columns; ravel() turns the
# (n, 1) result back into a 1-D column.
# NOTE(review): these scalers are fitted on the test data itself, so the test
# features are not on the scale the model saw during training. Reusing the
# scalers fitted on the training columns would be the consistent choice.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    column_values = test_data[column].values.astype('float64').reshape(-1, 1)
    test_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the embarkation port (S -> 1, C -> 2, Q -> 3) and fill any missing
# entries with the most frequent observed port. value_counts() ignores NaN,
# so the fill value is computed once and can never be the "missing" marker.
embarked_codes = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
most_common_port = embarked_codes.value_counts().idxmax()
test_data['Embarked'] = embarked_codes.fillna(most_common_port)

# PassengerId is kept at this stage: it is needed for the submission file.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])

# Show the first rows of the prepared test frame.
test_data.head()

	PassengerId	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	892	3	1	0.428099	-0.499470	-0.400248	-0.498403	3
1	893	3	0	1.399492	0.616992	-0.400248	-0.513271	1
2	894	2	1	2.565163	-0.499470	-0.400248	-0.465085	3
3	895	3	1	-0.154736	-0.499470	-0.400248	-0.483463	1
4	896	3	0	-0.543293	0.616992	0.619896	-0.418468	1

# Show the first rows of the prepared training frame for comparison.
train_data.head()

	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	0	3	1	-0.494245	0.432793	-0.473674	-0.502445	1.0
1	1	1	0	0.717307	0.432793	-0.473674	0.786845	2.0
2	1	3	0	-0.191357	-0.474545	-0.473674	-0.488854	1.0
3	1	1	0	0.490141	0.432793	-0.473674	0.420730	1.0
4	0	3	1	0.490141	-0.474545	-0.473674	-0.486337	1.0

# Feature matrix for prediction: drop PassengerId, which was also dropped from
# the training frame, and leave test_data itself untouched.
X_test = test_data.drop(['PassengerId'], axis=1)

# Show the first rows of the prediction features.
X_test.head()

	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	3	1	0.428099	-0.499470	-0.400248	-0.498403	3
1	3	0	1.399492	0.616992	-0.400248	-0.513271	1
2	2	1	2.565163	-0.499470	-0.400248	-0.465085	3
3	3	1	-0.154736	-0.499470	-0.400248	-0.483463	1
4	3	0	-0.543293	0.616992	0.619896	-0.418468	1

# Fit the final model on the full training set and score the test passengers.
# liblinear is named explicitly because it supports the l1 penalty; ravel()
# passes sklearn a 1-D target instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X_test.values)

# Predict survival only when the class-1 probability is strictly above 0.6.
Y_hat = [1 if y > 0.6 else 0 for y in Y_hat_proba[:, 1]]

# Assemble the PassengerId / Survived table and write it out. index=False
# keeps the DataFrame's row index from being written as an extra, unnamed
# first column, so the file contains exactly the two intended columns.
results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
results.to_csv('results.csv', index=False)

逻辑回归实战 — Kaggle_Titanic 2

Training

Testing

猜你喜欢