机器学习 决策树 随机森林算法

决策树

# 决策树API
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
# 字典特征抽取
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
# 导出dot文件 进行树的可视化
from sklearn.tree import export_graphviz

# Predict Titanic passenger survival with a single decision tree.
# Alternative data source (the original download location):
#   http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt
titan = pd.read_csv("./data/Tank_survive/train.txt")

# Split the raw table into feature columns and the target column.
# .copy() makes `x` an independent frame, so the fill below is a plain column
# assignment instead of a chained in-place write on a slice of `titan`
# (which warns, and is a no-op under pandas Copy-on-Write).
x = titan[['pclass', 'age', 'sex']].copy()
y = titan['survived']

# `age` has missing values: replace them with the column mean.
x['age'] = x['age'].fillna(x['age'].mean())

# Hold out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Feature engineering: one-hot encode the categorical columns.
# The vectorizer is not called `dict`, to avoid shadowing the builtin.
vectorizer = DictVectorizer(sparse=False)
# to_dict(orient="records") turns each row into one {column: value} dict.
x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
x_test = vectorizer.transform(x_test.to_dict(orient="records"))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old releases that do not have it yet.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())
print(x_train)
print("***" * 20)
print(x_test)

# Fit the decision tree; max_depth caps the depth of the tree.
dec = DecisionTreeClassifier(max_depth=5)
dec.fit(x_train, y_train)
print(dec.score(x_test, y_test))

# Export the fitted tree as a Graphviz dot file for visualisation.
# NOTE(review): these labels must be in the same order as the feature names
# printed above (expected: age, pclass=1st/2nd/3rd, sex=female, sex=male) —
# confirm against the actual data file.
export_graphviz(dec, "./tree.dot", feature_names=['年龄', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', '女性', '男性'])

随机森林

# 决策树API
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
# 字典特征抽取
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
# 导出dot文件 进行树的可视化
from sklearn.tree import export_graphviz
# 随机森林
from sklearn.ensemble import RandomForestClassifier
# 网格搜索 交叉验证
from sklearn.model_selection import GridSearchCV

# Predict Titanic passenger survival with a random forest whose
# hyper-parameters are chosen by grid search with cross-validation.
# Alternative data source (local copy): ./data/Tank_survive/train.txt
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

# Split the raw table into feature columns and the target column.
# .copy() makes `x` an independent frame, so the fill below is a plain column
# assignment instead of a chained in-place write on a slice of `titan`
# (which warns, and is a no-op under pandas Copy-on-Write).
x = titan[['pclass', 'age', 'sex']].copy()
y = titan['survived']

# `age` has missing values: replace them with the column mean.
x['age'] = x['age'].fillna(x['age'].mean())

# Hold out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Feature engineering: one-hot encode the categorical columns.
# The vectorizer is not called `dict`, to avoid shadowing the builtin.
vectorizer = DictVectorizer(sparse=False)
# to_dict(orient="records") turns each row into one {column: value} dict.
x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
x_test = vectorizer.transform(x_test.to_dict(orient="records"))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old releases that do not have it yet.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())
print(x_train)
print("***" * 20)
print(x_test)

# Random forest tuned with grid search + cross-validation.
rf = RandomForestClassifier()
# Search space:
#   n_estimators - number of trees in the forest
#   max_depth    - maximum depth of each tree
param = {"n_estimators": [120, 200, 300, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}
# cv=2: each parameter combination is evaluated with 2-fold cross-validation.
gc = GridSearchCV(rf, param_grid=param, cv=2)
gc.fit(x_train, y_train)
print("准确率为:", gc.score(x_test, y_test))

print("选择的参数模型:", gc.best_estimator_)

猜你喜欢

转载自blog.csdn.net/qq_41009846/article/details/85220487
今日推荐