Prediction using the CART classifier
1. Read data
import pandas as pd
data = pd.read_csv("train.csv")
2. View data
# show the first five rows
data.head()
# show the number of rows and columns
data.shape
# show column data types, non-null counts, and other information
data.info()
# show every value of the categorical Embarked feature and how often it occurs
data.Embarked.value_counts()
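Viewing the data also shows which columns contain missing values; a minimal check:
# count missing values per column
data.isnull().sum()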
3. Data processing
1. Missing value handling
When viewing the data, the features Age and Embarked are found to have missing values.
Age is filled with its median.
Embarked is filled with its mode, "S".
2. Feature encoding
Most models can only handle numeric data, so non-numeric features must be converted with an encoding.
Sex is mapped to the numbers 0 (female) and 1 (male).
Embarked is one-hot encoded: N binary feature columns (value 0 or 1) are generated, one for each category value (drop_first=True drops one redundant column).
With a decision tree model there is generally no need to scale the features.
# missing value handling
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna('S')
# feature encoding
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
embarked_d = pd.get_dummies(data.Embarked, prefix='Embarked', drop_first=True)
data = pd.concat([data, embarked_d], axis=1)
# collect the processed feature columns and the label
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
X = data[feature_cols]
y = data.Survived
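Before training, it can be worth verifying the encoding (a minimal check, assuming the column names above):
# every selected feature should now be numeric with no missing values
print(X.dtypes)
print(X.isnull().sum())
print(X.head())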
4. Train and select the model
Fit a CART decision tree classifier on the training data.
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3,random_state=1)
treeclf.fit(X,y)
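As a quick sanity check (an addition of mine, not in the original), the accuracy on the data the tree was fitted on can be printed; this is training accuracy, not an estimate of generalization:
# accuracy on the training data itself - optimistic by construction
print(treeclf.score(X, y))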
5. Visualize the decision tree
import graphviz
from sklearn import tree
# class_names must list one label per class: class 0 = not survived, class 1 = survived
dot_data = tree.export_graphviz(treeclf, out_file=None, feature_names=feature_cols, class_names=['Not survived', 'Survived'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('xgboost1')  # write the rendered tree to a PDF file
graph
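If Graphviz is not available, a minimal alternative sketch (my assumption, not part of the original workflow) uses sklearn's built-in plot_tree with matplotlib:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# draw the same fitted tree without needing the graphviz binaries
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(treeclf, feature_names=feature_cols, class_names=['Not survived', 'Survived'], filled=True, rounded=True, ax=ax)
plt.show()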
6. View feature importances
pd.DataFrame({'feature':feature_cols,'importance':treeclf.feature_importances_})
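To make the table easier to read, the rows can be sorted by importance (a small addition using the same DataFrame):
importances = pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})
print(importances.sort_values('importance', ascending=False))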
7. Model Evaluation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=4)
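# An optional refinement (my assumption, not in the original): stratify=y keeps the
# survived/not-survived ratio identical in the train and test splits, e.g.
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=4, stratify=y)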
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':[1,3,5,10,15,20,30]}
tree_clf=GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,scoring='accuracy')
tree_clf.fit(X_train,y_train)
print(tree_clf.best_params_)
print(tree_clf.best_score_)
y_pred=tree_clf.predict(X_test)
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
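Beyond accuracy and the classification report, a confusion matrix (an extra check, not in the original write-up) shows how the tuned tree confuses the two classes:
from sklearn.metrics import confusion_matrix
# rows are the actual classes (0 = not survived, 1 = survived), columns the predicted classes
print(confusion_matrix(y_test, y_pred))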
Full code:
import pandas as pd
data = pd.read_csv("train.csv")
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna('S')
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
embarked_d=pd.get_dummies(data.Embarked,prefix='Embarked',drop_first=True)
data=pd.concat([data,embarked_d],axis=1)
feature_cols=['Pclass','Sex','Age','Embarked_Q','Embarked_S']
X=data[feature_cols]
y=data.Survived
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3,random_state=1)
treeclf.fit(X,y)
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(treeclf, out_file=None, feature_names=feature_cols, class_names=['Not survived', 'Survived'], filled=True, rounded=True, special_characters=True)
graph=graphviz.Source(dot_data)
graph
pd.DataFrame({'feature':feature_cols,'importance':treeclf.feature_importances_})
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=4)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
parameters = {'max_depth':[1,3,5,10,15,20,30]}
tree_clf=GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,scoring='accuracy')
tree_clf.fit(X_train,y_train)
print(tree_clf.best_params_)
print(tree_clf.best_score_)
y_pred=tree_clf.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))