数据来源:https://www.kaggle.com/c/titanic
Training
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Load the Titanic training set from the working directory.
train_data = pd.read_csv('train.csv')

# Count how many rows carry each value of 'Survived' and show the counts
# as a bar chart, labelling the axes through the Axes object returned by
# the plot call.
count_survivors = train_data['Survived'].value_counts()
survivor_axes = count_survivors.plot.bar()
survivor_axes.set_xlabel('Is_survived')
survivor_axes.set_ylabel('Number of People')
survivor_axes.set_title('Survivor histogram')
from sklearn.preprocessing import StandardScaler
# Encode sex as a binary feature (female -> 0, male -> 1).
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Fill missing ages with the mean of the *observed* ages. Series.mean()
# skips NaN; counting missing entries as 0 in the average would drag the
# fill value well below the real mean age.
age_avg = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_avg)

# Standardize each numeric column independently (zero mean, unit variance).
# Casting to float64 first avoids StandardScaler's DataConversionWarning on
# the integer columns, and ravel() stores a flat 1-D column in the frame.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    column_values = train_data[column].values.reshape(-1, 1).astype('float64')
    train_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the embarkation port as an integer code; unmapped/missing values
# become NaN and are filled with the most frequent observed code.
# Series.mode() ignores NaN, so the missing marker itself can never win.
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
most_common_port = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port).astype(int)

# Drop the columns that are not used as model inputs.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning)
# Split the processed frame into features (every column except 'Survived')
# and the target. .loc with a boolean column mask replaces the deprecated
# .ix indexer and selects exactly the same columns.
# Y stays a one-column DataFrame because later cells index it with .iloc
# and read it through .values.
X = train_data.loc[:, train_data.columns != 'Survived']
Y = train_data.loc[:, train_data.columns == 'Survived']
c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated """Entry point for launching an IPython kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.metrics import recall_score,confusion_matrix
def getBestC(X, Y, c_param_range=None, n_folds=5):
    """Pick the inverse-regularization strength C with the best mean CV recall.

    For every candidate C, an L1-penalized logistic regression is fitted on
    the training part of each fold and scored by recall on the held-out
    part. Per-fold and mean scores are printed while the search runs.

    Args:
        X: Feature table (DataFrame or 2-D array), one row per sample.
        Y: Binary labels as a one-column DataFrame, Series or array.
        c_param_range: Candidate C values. Defaults to
            [0.01, 0.1, 1, 10, 100].
        n_folds: Number of consecutive, unshuffled folds. Defaults to 5.

    Returns:
        The candidate C with the highest mean recall; on ties the earliest
        candidate in ``c_param_range`` wins.
    """
    # The file-level KFold comes from the deprecated sklearn.cross_validation
    # module; model_selection is its replacement and uses a different
    # interface, so it is imported locally to leave the file header untouched.
    from sklearn.model_selection import KFold

    if c_param_range is None:
        c_param_range = [0.01, 0.1, 1, 10, 100]
    c_param_range = list(c_param_range)

    features = np.asarray(X)
    # Flatten to shape (n_samples,): passing a column vector to fit() makes
    # scikit-learn emit a DataConversionWarning on every call.
    labels = np.asarray(Y).ravel()

    # Materialize the splits once so every candidate is scored on the
    # same folds.
    folds = list(KFold(n_splits=n_folds).split(features))

    mean_recalls = []
    for c_param in c_param_range:
        print('******** c_param = %.2f ********' % c_param)
        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds, start=1):
            # liblinear is named explicitly because it supports the l1
            # penalty; relying on the library default solver is fragile.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(features[train_idx], labels[train_idx])
            Y_hat = lr.predict(features[test_idx])
            recall_acc = recall_score(labels[test_idx], Y_hat)
            recall_accs.append(recall_acc)
            print('Iteration %d: recall score = %f' % (iteration, recall_acc))
        mean_recalls.append(np.mean(recall_accs))
        print('\nMean recall score %f\n' % mean_recalls[-1])

    # One row per candidate; built after the loop so the index always
    # matches the number of candidates.
    results_table = pd.DataFrame(
        {'C_parameter': c_param_range, 'Mean recall score': mean_recalls},
        columns=['C_parameter', 'Mean recall score'])
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c
c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Run the cross-validated search over C on the training features/labels and
# keep the winning value; the later cells pass it to LogisticRegression.
best_c = getBestC(X, Y)
******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000
Mean recall score 0.000000
******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413
Mean recall score 0.668273
******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032
Mean recall score 0.701604
******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905
Mean recall score 0.710576
******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905
Mean recall score 0.710576
--------------------------------
best_c = 10.00
c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image with per-cell values.

    Everything is rendered through pyplot onto the current figure/axes;
    nothing is returned.

    Args:
        cm: 2-D array of counts, indexed as cm[row, column].
        classes: Tick labels, used for both the x and the y axis.
        title: Title placed above the image.
        cmap: Matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Values above half of the matrix maximum are written in white, the
    # rest in black, so the numbers stay readable on the coloured cells.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            if cm[row, col] > midpoint:
                text_color = "white"
            else:
                text_color = "black"
            # x is the column index and y is the row index.
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools
# Refit on the whole training set with the selected C and inspect how the
# model does on that same data.
# liblinear is named explicitly because it supports the l1 penalty.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Y is a one-column frame; ravel() hands scikit-learn the 1-D label vector
# it expects instead of a column vector (which triggers a warning).
lr.fit(X.values, Y.values.ravel())
Y_hat = lr.predict(X.values)

cnf_matrix = confusion_matrix(Y.values.ravel(), Y_hat)
# Rows of the matrix are true labels and columns are predictions, so
# recall = cm[1, 1] / (cm[1, 0] + cm[1, 1]).
print("Recall value in training dataset: %f" % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall value in training dataset: 0.710526
# Fit once on the training set, then see how recall on that same data moves
# as the probability cut-off for predicting class 1 is varied.
# liblinear is named explicitly because it supports the l1 penalty.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# 1-D label vector; a column vector makes scikit-learn emit a warning.
y_true = Y.values.ravel()
lr.fit(X.values, y_true)
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

plt.figure(figsize=(10, 10))
# One panel per threshold in a 3x3 grid; subplot positions are 1-based.
for panel, threshold in enumerate(thresholds, start=1):
    # Column 1 of predict_proba holds the probability of class 1.
    Y_hat = (Y_hat_proba[:, 1] > threshold).astype(int)
    plt.subplot(3, 3, panel)

    cnf_matrix = confusion_matrix(y_true, Y_hat)
    recall = 1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    print("Recall value in training dataset: %f, with threshold = %.1f" % (recall, threshold))

    # The comparison above is strict, so the panel title says '>' too.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % threshold)
Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9
Testing
# Load the Titanic test set from the working directory.
test_data = pd.read_csv('test.csv')

# Encode sex as a binary feature (female -> 0, male -> 1).
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Fill missing Age / Fare values with the mean of the *observed* values.
# Series.mean() skips NaN; counting missing entries as 0 in the average
# would bias the fill value downwards.
age_avg = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(age_avg)
fare_avg = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_avg)

# Standardize each numeric column independently. Casting to float64 avoids
# StandardScaler's DataConversionWarning on the integer columns.
# NOTE(review): these scalers are fitted on the test set itself, so the test
# features are not on exactly the same scale the model saw during training.
# Reusing the scalers fitted on the training columns would be more correct,
# but they are not kept by the training cell - confirm and refactor.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    column_values = test_data[column].values.reshape(-1, 1).astype('float64')
    test_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the embarkation port as an integer code; unmapped/missing values
# become NaN and are filled with the most frequent observed code
# (Series.mode() ignores NaN).
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
most_common_port = test_data['Embarked'].mode()[0]
test_data['Embarked'] = test_data['Embarked'].fillna(most_common_port).astype(int)

# PassengerId is kept here because it is needed for the submission file.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data.head()
|
PassengerId |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
0 |
892 |
3 |
1 |
0.428099 |
-0.499470 |
-0.400248 |
-0.498403 |
3 |
1 |
893 |
3 |
0 |
1.399492 |
0.616992 |
-0.400248 |
-0.513271 |
1 |
2 |
894 |
2 |
1 |
2.565163 |
-0.499470 |
-0.400248 |
-0.465085 |
3 |
3 |
895 |
3 |
1 |
-0.154736 |
-0.499470 |
-0.400248 |
-0.483463 |
1 |
4 |
896 |
3 |
0 |
-0.543293 |
0.616992 |
0.619896 |
-0.418468 |
1 |
# Show the first rows of the processed training frame so its columns can be
# compared by eye with the processed test frame.
train_data.head()
|
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
0 |
0 |
3 |
1 |
-0.494245 |
0.432793 |
-0.473674 |
-0.502445 |
1.0 |
1 |
1 |
1 |
0 |
0.717307 |
0.432793 |
-0.473674 |
0.786845 |
2.0 |
2 |
1 |
3 |
0 |
-0.191357 |
-0.474545 |
-0.473674 |
-0.488854 |
1.0 |
3 |
1 |
1 |
0 |
0.490141 |
0.432793 |
-0.473674 |
0.420730 |
1.0 |
4 |
0 |
3 |
1 |
0.490141 |
-0.474545 |
-0.473674 |
-0.486337 |
1.0 |
# Feature matrix for prediction: the processed test frame without the
# passenger identifier column. test_data itself is left unchanged.
X_test = test_data.drop(columns=['PassengerId'])
X_test.head()
|
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
Embarked |
0 |
3 |
1 |
0.428099 |
-0.499470 |
-0.400248 |
-0.498403 |
3 |
1 |
3 |
0 |
1.399492 |
0.616992 |
-0.400248 |
-0.513271 |
1 |
2 |
2 |
1 |
2.565163 |
-0.499470 |
-0.400248 |
-0.465085 |
3 |
3 |
3 |
1 |
-0.154736 |
-0.499470 |
-0.400248 |
-0.483463 |
1 |
4 |
3 |
0 |
-0.543293 |
0.616992 |
0.619896 |
-0.418468 |
1 |
# Train the final model on the whole training set.
# liblinear is named explicitly because it supports the l1 penalty.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# ravel(): scikit-learn expects a 1-D label vector, not a column vector.
lr.fit(X.values, Y.values.ravel())

# Column 1 of predict_proba is the probability of class 1; a passenger is
# predicted as survived only when it exceeds the 0.6 cut-off used here
# (predict() would apply 0.5).
Y_hat_proba = lr.predict_proba(X_test.values)
Y_hat = [1 if y > 0.6 else 0 for y in Y_hat_proba[:, 1]]

# Pair every prediction with its passenger id by position.
results = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'].values, 'Survived': Y_hat},
    columns=['PassengerId', 'Survived'])

# index=False keeps the file to exactly the two named columns; without it
# pandas also writes the row index as an extra unnamed first column.
results.to_csv('results.csv', index=False)