def __init__(self,
             criterion="gini",
             splitter="best",
             max_depth=None,
             min_samples_split=2,
             min_samples_leaf=1,
             min_weight_fraction_leaf=0.,
             max_features=None,
             random_state=None,
             max_leaf_nodes=None,
             min_impurity_decrease=0.,
             min_impurity_split=None,
             class_weight=None,
             presort=False):
    """Construct a decision-tree classifier.

    This is a pure pass-through constructor: every hyper-parameter is
    forwarded unchanged to the parent tree class.  The defaults mirror
    scikit-learn's ``DecisionTreeClassifier``.
    """
    # Delegate all hyper-parameters to the base class in a single call.
    super(DecisionTreeClassifier, self).__init__(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        class_weight=class_weight,
        random_state=random_state,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
# The above shows the default parameter values; the notes below describe the
# available options.
# Decision trees can do both classification and regression.  Either way, the
# main thing to watch for is overfitting; when it appears, rein it in through
# the tree's usual hyper-parameters, which include:
# 1. criterion: "gini" or "entropy" -- the impurity measure used to judge
#    which feature to split on.
# 2. splitter: "best" or "random" -- "best" searches all features for the best
#    split point; "random" searches within a subset of features, which helps
#    when the data set is large and has many features.
# 3. max_features: with fewer than ~50 features, None (i.e. use all features)
#    is the usual choice.
# 4. max_depth: maximum depth of the tree.
# 5. min_samples_split: a node with fewer samples than this is not split
#    further; mainly useful when the sample size is very large.
# 6. min_samples_leaf: if a leaf would end up with fewer samples than this,
#    it and its sibling are pruned.
# 7. max_leaf_nodes: maximum number of leaf nodes; if the tree grows more
#    leaves than this, overfitting may be occurring -- consider lowering it.
# 8. min_weight_fraction_leaf: lower bound on the total sample-weight fraction
#    a leaf must hold; a leaf below it is pruned together with its sibling.
# 9. class_weight: per-class weights, mainly to counter a class whose large
#    share of the samples would otherwise bias the tree toward it; the user
#    can give that class a smaller weight.
# Tuning these hyper-parameters yields the best result.
# Simple regression example:
# from sklearn import tree
# X = [[0,0],[1,1]]
# Y = [0,1]
# clf = tree.DecisionTreeRegressor()
# clf.fit(X,Y)
# print(clf.predict([[2,2]]))
# Note: scikit-learn ships an optimized version of the CART algorithm
# (it does not expose ID3/C4.5 trees directly).
# Classification:
from sklearn import datasets
from sklearn import tree
from sklearn.model_selection import train_test_split

# Load the iris data set: X holds the feature matrix, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing.  Fixing random_state makes the
# split -- and therefore the reported scores -- reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Fit a tree with each impurity criterion on the same split and report the
# training and test accuracy for comparison.
criterions = ['gini', 'entropy']
for criterion in criterions:
    clf = tree.DecisionTreeClassifier(criterion=criterion)
    clf.fit(train_x, train_y)
    print(criterion, clf.score(train_x, train_y))
    print(criterion, "Testing score:%f" % (clf.score(test_x, test_y)))