Machine Learning - CART Decision Tree - Titanic Survival Prediction

Prediction using the CART classifier

1. Read the data

# Load the Titanic training set; "train.csv" is a relative path, so it is
# resolved against the current working directory.
import pandas as pd
data =pd.read_csv("train.csv")

2. View the data

# Show the first five rows
data.head()

# Show the number of rows and columns
data.shape

# Show the data type and other summary information for every column
data.info()

 

# Show every value of the categorical Embarked column and how often each occurs
data.Embarked.value_counts()

 3. Data processing

1. Missing value processing

When viewing the data, it is found that the features Age and Embarked have missing values

The Age feature is filled with its median

The Embarked feature is filled with its mode, "S"

2. Feature code conversion

Most models can only handle numeric data

Use an encoding that converts non-numeric types into computable numeric values

feature code conversion

One-hot encoding generates N binary feature columns (value 0 or 1), each corresponding to one category value (the code below passes drop_first=True, so only N-1 of them are kept)

Using a decision tree model, there is generally no need to scale the features

# Missing-value handling.
# Assign the filled column back instead of calling fillna(inplace=True) on
# data.Age / data.Embarked: the in-place form operates on an intermediate
# Series, which leaves `data` untouched under pandas Copy-on-Write and already
# emits a warning on pandas 2.x.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna('S')

# Feature encoding.
# Sex becomes a 0/1 integer column.
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
# Embarked is one-hot encoded; drop_first=True omits the first category's
# column. feature_cols below expects the Embarked_Q and Embarked_S columns.
embarked_d = pd.get_dummies(data['Embarked'], prefix='Embarked', drop_first=True)
data = pd.concat([data, embarked_d], axis=1)

# Collect the processed feature columns (X) and the target column (y).
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
X = data[feature_cols]
y = data.Survived

4. Train and select the model

dataset for training

from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 3 with a fixed seed, fitted on the full X / y.
treeclf = DecisionTreeClassifier(
    random_state=1,
    max_depth=3,
)
treeclf.fit(X, y)

5. Visualize the decision tree

import graphviz
from sklearn import tree
from graphviz import Digraph

# Export the fitted tree as DOT source (out_file=None returns it as a string).
# class_names must be a list with one label per class in ascending class order.
# A bare string such as 'Survived' is indexed character by character, which
# labels the two classes "S" and "u".
# NOTE(review): assumes Survived is coded 0 = did not survive, 1 = survived.
dot_data = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['Not survived', 'Survived'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render('xgboost1')  # write the rendered tree to a PDF file
graph

6. Viewing feature importances

# Pair each feature name with the importance the fitted tree assigned to it.
pd.DataFrame(
    dict(
        feature=feature_cols,
        importance=treeclf.feature_importances_,
    )
)

 7. Model Evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=4
)

# Cross-validated search over the tree depth, scored by accuracy.
# The estimator is seeded so repeated runs select the same depth and produce
# the same scores; an unseeded tree can differ from run to run.
parameters = {'max_depth': [1, 3, 5, 10, 15, 20, 30]}
tree_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid=parameters,
    scoring='accuracy',
)
tree_clf.fit(X_train, y_train)

print(tree_clf.best_params_)
print(tree_clf.best_score_)

# Evaluate on the held-out rows.
y_pred = tree_clf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

 Full code:

import pandas as pd

# Load the training set; "train.csv" is resolved against the working directory.
data = pd.read_csv("train.csv")

# Fill missing values by assigning the result back. The chained
# data.Age.fillna(..., inplace=True) form acts on an intermediate Series and
# leaves `data` unchanged under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna('S')

# Encode Sex as 0/1 and one-hot encode Embarked. drop_first=True omits the
# first category's column; feature_cols expects Embarked_Q and Embarked_S.
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
embarked_d = pd.get_dummies(data['Embarked'], prefix='Embarked', drop_first=True)
data = pd.concat([data, embarked_d], axis=1)

# Feature matrix and target.
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
X = data[feature_cols]
y = data.Survived

from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 3 with a fixed seed, fitted on the full X / y.
treeclf = DecisionTreeClassifier(
    random_state=1,
    max_depth=3,
)
treeclf.fit(X, y)

import graphviz
from sklearn import tree
from graphviz import Digraph

# Export the fitted tree as DOT source (out_file=None returns it as a string).
# class_names must be a list with one label per class in ascending class order.
# A bare string such as 'Survived' is indexed character by character, which
# labels the two classes "S" and "u".
# NOTE(review): assumes Survived is coded 0 = did not survive, 1 = survived.
dot_data = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['Not survived', 'Survived'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph



# Pair each feature name with the importance the fitted tree assigned to it.
pd.DataFrame(
    dict(
        feature=feature_cols,
        importance=treeclf.feature_importances_,
    )
)
from sklearn.metrics import accuracy_score
# classification_report is called below, so it must be imported here too;
# without this import the final print raises NameError.
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=4
)

# Cross-validated search over the tree depth, scored by accuracy.
# The estimator is seeded so repeated runs select the same depth and produce
# the same scores; an unseeded tree can differ from run to run.
parameters = {'max_depth': [1, 3, 5, 10, 15, 20, 30]}
tree_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid=parameters,
    scoring='accuracy',
)
tree_clf.fit(X_train, y_train)

print(tree_clf.best_params_)
print(tree_clf.best_score_)

# Evaluate on the held-out rows.
y_pred = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Guess you like

Origin blog.csdn.net/qq_21402983/article/details/124221394