本案例利用sklearn自带的加州房价数据集,先选取房子的经纬度作为特征参数,用决策树回归模型对标签(房价)进行预测。
然后将数据切分为训练集和测试集来训练模型,并通过特征参数的选择得到最好的测试得分(回归模型的score是R²决定系数,而不是分类准确度)。
Python源码:
#!/usr/bin/env python
# encoding: utf-8
"""
@Company:华中科技大学电气学院聚变与等离子研究所
@version: V1.0
@author: Victor
@contact: [email protected] or [email protected] 2018--2020
@software: PyCharm
@file: House.py
@time: 2018/11/17 19:36
@Desc:
"""
import matplotlib.pyplot as plt
import pandas as pd
# Load the California housing dataset bundled with scikit-learn.
# Import from the public ``sklearn.datasets`` package: the private
# ``sklearn.datasets.california_housing`` module path was deprecated and
# later removed, so importing through it fails on current releases.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
# Uncomment to print the dataset's bundled description:
# print(housing.DESCR)
print(housing.data.shape)  # dimensions of the feature matrix
print(housing.data[1])  # feature values of the sample at index 1
# Fit a shallow regression tree on a hand-picked pair of features.
from sklearn import tree

# Columns 6 and 7 of housing.data (named by housing.feature_names[6:8]);
# the original note describes them as the house's longitude and latitude.
geo_columns = [6, 7]
geo_features = housing.data[:, geo_columns]

dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(geo_features, housing.target)

# Show the estimator's configuration (parameters not set above are defaults).
print(dtr)
# Visualise the fitted tree. Rendering requires the Graphviz binaries to be
# installed first (https://graphviz.org/download/).
dot_data = tree.export_graphviz(
    dtr,
    out_file=None,  # nothing is written to disk; the result is kept in dot_data
    feature_names=housing.feature_names[6:8],
    filled=True,
    rounded=True,
    impurity=False,
)

# Requires: pip install pydotplus
import pydotplus

graph = pydotplus.graph_from_dot_data(dot_data)
highlighted_node = graph.get_nodes()[7]
highlighted_node.set_fillcolor("#FFF2DD")  # recolour one node of the drawing
graph.write_png("out.png")  # out.png is created in the current directory
# Hold out part of the data, train a tree on every feature, and report how
# well it does on the held-out samples.
from sklearn.model_selection import train_test_split

# test_size=0.1 reserves 10% of the samples for testing; a fixed random_state
# makes the split identical on every run.
data_train, data_test, target_train, target_test = train_test_split(
    housing.data, housing.target, test_size=0.1, random_state=42
)

dtr = tree.DecisionTreeRegressor(random_state=42)  # build the tree model
dtr.fit(data_train, target_train)

print("==============================")
# score() on a regressor is the coefficient of determination (R^2), not a
# classification accuracy, so the printed label says so.
print("测试集R^2得分:", dtr.score(data_test, target_test))
# Random forest: train on the same training split and print its score on the
# held-out test split.
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
forest_score = rfr.score(data_test, target_test)
print(forest_score)
结果展示: