机器学习1——分类问题
为建模准备数据
在时间场景中,我们一般会另设一个Dev数据集作为开发数据集(测试集),我们在成功建模后用它对模型调优。模型是从训练集中训练出来的,然后用Dev数据来度量它的指标准确度等,基于上述过程的结果,如果需要进一步提高,模型会被更深入的调优。
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
import numpy as np
def get_iris():
data=load_iris()
x=data['data']
y=data['target']
input_dataset=np.column_stack([x,y])
np.random.shuffle(input_dataset)
return input_dataset
#采用8/2分布分割数据集
data=get_iris()
train,test=train_test_split(data,train_size=0.8)
print("train size",train.shape)
print("test size ",test.shape)
# 检测训练集和测试集的类别标签是否分布合理
def get_class_distribution(y):
distribution={}
set_y=set(y)
for y_label in set_y:
no_element=len(np.where(y==y_label)[0])
distribution[y_label]=no_element
return distribution
def print_class_label_split(train,test):
y_train=train[:,-1]
train_distribution=get_class_distribution(y_train)
print("\nTrain data set class label distribution")
print("======================================\n")
for k ,v in train_distribution.items():
print("class label=%d, Percentage records=%0.2f"%(k,v))
y_test=test[:,-1]
test_distribution=get_class_distribution(y_test)
print("\nTest data set class label distribution")
print("======================================\n")
for k ,v in test_distribution.items():
print("class label=%d, Percentage records=%0.2f"%(k,v))
print_class_label_split(train,test)
train size (120, 5)
test size (30, 5)
Train data set class label distribution
======================================
class label=0, Percentage records=36.00
class label=1, Percentage records=42.00
class label=2, Percentage records=42.00
Test data set class label distribution
======================================
class label=0, Percentage records=14.00
class label=1, Percentage records=8.00
class label=2, Percentage records=8.00
如何在训练集和测试集中均匀的分割类别标签
from sklearn.cross_validation import StratifiedShuffleSplit
straified_split=StratifiedShuffleSplit(data[:,-1],test_size=0.2,n_iter=1)
for train_index,test_index in straified_split:
train=data[train_index]
test=data[test_index]
print_class_label_split(train,test)
Train data set class label distribution
======================================
class label=0, Percentage records=40.00
class label=1, Percentage records=40.00
class label=2, Percentage records=40.00
Test data set class label distribution
======================================
class label=0, Percentage records=10.00
class label=1, Percentage records=10.00
class label=2, Percentage records=10.00
查找最近邻
首先需要混淆矩阵,它是类别标签的真实值与预测值的对应排列矩阵。
预测值 | |
---|---|
真实值 | T |
F |
- FP:TRUE Positive的缩写。测试集中真实标签为T,预测值标签也为T的总数
- FN: False Negative的缩写。测试集中真实标签为T,预测值却为F的总数
- FP:False Positive的缩写。测试集中真实标签为F,预测值却为T的总数
- TN:TRUE Negative的缩写。测试集中真实标签为F,预测值也为F的总数
准确度是正确的预测总数的总数。从混淆矩阵中,我们知道TP和TN之和就是正确的预测数。
训练集的准确度总是比较乐观的,但是我们应该看看测试集的准确度指标来判断模型的真正效果
K近邻算法(KNN)把所有的训练集数据加载到内存中,当它需要对测试实例进行分类时,它衡量这个实例和所有训练实例之间的距离,基于距离,它选择训练集里的K个最近的实例。测试集的分类预测值就是基于这K个最近邻的主体分类情况
# 准备数据
from sklearn.datasets import make_classification
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
def get_data():
x,y=make_classification(n_features=4,n_samples=1000)
return x,y
def plot_data(x,y):
"""
Plot a scatter plot fo all variable combinations
"""
subplot_start = 321
col_numbers = range(0,4)
col_pairs = itertools.combinations(col_numbers,2)
plt.subplots_adjust(wspace=0.36,hspace=0.2)
plt.figure(figsize=(16,9))
for col_pair in col_pairs:
plt.subplot(subplot_start)
plt.scatter(x[:,col_pair[0]],x[:,col_pair[1]],c=y)
title_string = str(col_pair[0]) + "-" + str(col_pair[1])
plt.title(title_string)
x_label = str(col_pair[0])
y_label = str(col_pair[1])
plt.xlabel(x_label)
plt.xlabel(y_label)
subplot_start+=1
x,y=get_data()
plot_data(x,y)
<Figure size 432x288 with 0 Axes>
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
def get_train_test(x,y):
train_size=0.7
test_size=1-train_size
input_data=np.column_stack([x,y])
straified_split=StratifiedShuffleSplit(input_data[:,-1],train_size=train_size,n_iter=1)
for train_indx,test_indx in straified_split:
train_x=input_data[train_indx,:-1]
train_y=input_data[train_indx,-1]
test_x=input_data[test_indx,:-1]
test_y=input_data[test_indx,-1]
return train_x,train_y,test_x,test_y
def build_model(x,y,k=2):
knn=KNeighborsClassifier(n_neighbors=k)
knn.fit(x,y)
return knn
def test_model(x,y,knn_model):
y_predicted=knn_model.predict(x)
print(classification_report(y,y_predicted))
x,y=get_data()
plot_data(x,y)
train_x,train_y,test_x,test_y=get_train_test(x,y)
knn_model=build_model(train_x,train_y)
test_model(test_x,test_y,knn_model)
test_model(train_x,train_y,knn_model)
precision recall f1-score support
0.0 0.89 0.94 0.91 50
1.0 0.94 0.88 0.91 50
avg / total 0.91 0.91 0.91 100
precision recall f1-score support
0.0 0.90 1.00 0.95 353
1.0 1.00 0.88 0.94 347
avg / total 0.95 0.94 0.94 700
<Figure size 432x288 with 0 Axes>
用朴素贝叶斯分类文档
这种算法的驱动力来自贝叶斯规则,公式如下:
这个公式展示了我们已经知道事件Y的发生的情况时,事件X发生的概率有多大。在贝叶斯的术语里,必须先定义条件概率:给定评价条件下评价为正面的概率和给定评价条件下评价为负面的概率,写成如下等式:
对于任何一条评价,如果有了上面两个概率值,我们即可通过比较它们来将这条评价归类到正面还是负面:如果负面的条件概率大于正面的条件概率,则评价法是负面分类,反之亦然。
要比较这两个等式来决定最终结果,我们可以忽略分母,因此它只是简单的缩放因此,等式左边称为后验概率。等式右边的分子部分 ,其中 是正面评价的先验概率,它是我们从训练集中获取的正面分类标签的分布信仰,我们通过下面的公式把它从训练集中计算出来:
是一种可能性,它回答了这个问题:给定这个类别是正面的,这个评价是正面的可能性有多大
#加载库和生成数据
from nltk.corpus import movie_reviews,stopwords
from sklearn.cross_validation import StratifiedShuffleSplit
import nltk
from nltk.collocations import BigramCollocationFinder,BigramAssocMeasures
def get_data():
dataset=[]
ylabel=[]
for cat in movie_reviews.categories():
for field in movie_reviews.fileids(cat):
words=list(movie_reviews.words(field))
dataset.append((words,cat))
ylabel.append(cat)
return dataset,ylabel
def get_train_test(input_dataset,ylabel):
stragiht_split=StratifiedShuffleSplit(ylabel,train_size=0.8,n_iter=1)
for train_index,test_index in stragiht_split:
train=[input_dataset[i] for i in train_index]
train_y=[ylabel[i] for i in train_index]
test=[input_dataset[i] for i in test_index]
test_y=[ylabel[i] for i in test_index]
return train,train_y,test,test_y
#模型构建
def build_word_features(instance):
feature_set={}
words=instance[0]
for word in words:
feature_set[word]=1
return (feature_set,instance[1])
def build_negate_features(instance):
words= instance[0]
final_words=[]
negate=False
negate_words=['no','not']
for word in words:
if negate:
word='not_'+word
negate=False
if word not in negate_words:
final_words.append(word)
else:
negate=True
feature_set={}
for word in final_words:
feature_set[word]=1
return (feature_set,instance[1])
def remove_stop_words(in_data):
stopwordlist=stopwords.words('english')
negate_words=['no','not']
New_stopwords=[word for word in stopwordlist if word not in negate_words]
label=in_data[1]
words=[word for word in in_data[0] if word not in New_stopwords]
return (words,label)
def build_keyphrase_features(instance):
feature_set={}
instance=remove_stop_words(instance)
words=instance[0]
bigram_finder=BigramCollocationFinder.from_words(words)
bigrams=bigram_finder.nbest(BigramAssocMeasures.raw_freq,400)
for bigram in bigrams:
feature_set[bigram]=1
return (feature_set,instance[1])
def build_model(features):
model=nltk.NaiveBayesClassifier.train(features)
return model
def probel_model(model,features,dataset_type='Train'):
accuracy=nltk.classify.accuracy(model,features)
print("\n"+ dataset_type+"Accuracy=%0.2f"%(accuracy*100)+"%")
def show_features(model,features=5):
print("\nFeature Importance")
print("=====================")
print(model.show_most_informative_features(features))
#模型调整
def build_model_cycle_1(train_data,dev_data):
train_features=map(build_word_features,train_data)
dev_features=map(build_negate_features,dev_data)
model=build_model(train_features)
probel_model(model,train_features)
probel_model(model,dev_features,"Dev")
return model
def build_model_cycle_2(train_data,dev_data):
train_features=map(build_word_features,train_data)
dev_features=map(build_negate_features,dev_data)
model=build_model(train_features)
probel_model(model,train_features)
probel_model(model,dev_features,"Dev")
return model
def build_model_cycle_3(train_data,dev_data):
train_features=map(build_word_features,train_data)
dev_features=map(build_negate_features,dev_data)
model=build_model(train_features)
probel_model(model,train_features)
probel_model(model,dev_features,"Dev")
return model
#主程序
input_dataset,ylabels=get_data()
train_data,train_y,ALL_test_data,ALL_test_y=get_train_test(input_dataset,ylabels)
dev_data,dev_y,test_data,test_y=get_train_test(ALL_test_data,ALL_test_y)
print("\n Origin Data Size=",len(input_dataset))
print("\n Train Data Size=",len(train_data))
print("\n Dev Data Size=",len(dev_data))
print("]n Test Data Size=",len(test_data))
model_cycle_1=build_model_cycle_1(train_data,dev_data)
show_features(model_cycle_1)
model_cycle_2=build_model_cycle_2(train_data,dev_data)
show_features(model_cycle_2)
model_cycle_3=build_model_cycle_3(train_data,dev_data)
show_features(model_cycle_3)
Origin Data Size= 2000
Train Data Size= 1600
Dev Data Size= 160
]n Test Data Size= 20
TrainAccuracy=0.00%
DevAccuracy=0.00%
Feature Importance
=====================
Most Informative Features
sucks = 1 neg : pos = 16.3 : 1.0
lousy = 1 neg : pos = 13.7 : 1.0
seagal = 1 neg : pos = 11.7 : 1.0
outstanding = 1 pos : neg = 11.4 : 1.0
stupidity = 1 neg : pos = 11.0 : 1.0
None
TrainAccuracy=0.00%
DevAccuracy=0.00%
Feature Importance
=====================
Most Informative Features
sucks = 1 neg : pos = 16.3 : 1.0
lousy = 1 neg : pos = 13.7 : 1.0
seagal = 1 neg : pos = 11.7 : 1.0
outstanding = 1 pos : neg = 11.4 : 1.0
stupidity = 1 neg : pos = 11.0 : 1.0
None
TrainAccuracy=0.00%
DevAccuracy=0.00%
Feature Importance
=====================
Most Informative Features
sucks = 1 neg : pos = 16.3 : 1.0
lousy = 1 neg : pos = 13.7 : 1.0
seagal = 1 neg : pos = 11.7 : 1.0
outstanding = 1 pos : neg = 11.4 : 1.0
stupidity = 1 neg : pos = 11.0 : 1.0
None
构建决策树解决多类问题
理论上,对于给定的决策树可以构建许多决策树,其中一些的准确度要更好一些。现在有一些高效的算法,能够有限的时间内生成较为合理准确的树。比如Hunt算法,ID3,C4.5和CART等算法都是基于它而来的,这种算法的概述如下:
给定一个数据集D,他有n条记录,每条记录具有每个属性,而且每条记录的标签为y1,y2,y3三者中的一个。算法过程如下
- 如果D里的所有记录都属于同一个类别,假定为y1,则y1是树的叶子节点,标签为y1
- 如果D的记录属于多个类别,则采用一个特征测试条件将记录分割成较小的子集。假定第一次运行,我们在所有的属性上执行特征测试条件,从中找出一个属性能够把数据字分割成为3个较小的子集,然后将这个属性变成根节点,在3个子集中应用测试条件来找出下一级节点。这个过程不断迭代执行
决策树有许多的有点,列举如下:
- 易于解释
- 仅需要极少的数据准备和数据-特征转换
- 天然支持多分类问题
决策树也有自己的不足:
- 容易过拟合:训练集的准确度很高而对测试集的效果很差
- 对于一个给定的数据集,能产生成千上万的决策树
- 类别不平衡的影响十分严重:在二元分类问题中,每类的实例数量不同时就会爆发类别不平衡,对多分类问题也是一样
特征测试条件就是基于一个称为“不纯性”的标准或指标,将输入的记录分割成多个自己。不纯性则是通过对实例的每个属性上的类别标签进行相关的计算得出,对它影响最大的属性被分割为数据的基准属性。也就是树里的这一级节点。在决策树中,采用“熵”的概念作为计算不纯度的指标。熵的定义如下
其中 的定义如下:
为了找到最适合用来分割数据的变量,我们选择了熵,首先要做的就是基于类别标签对熵进行计算,公式如下:
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import pprint
def get_data():
data=load_iris()
x=data['data']
y=data['target']
label_names=data['target_names']
return x,y,label_names.tolist()
def get_feature_names():
data=load_iris()
x_feature=data['feature_names']
return data['feature_names']
def get_train_test(x,y):
input_data=np.column_stack([x,y])
straight_split=StratifiedShuffleSplit(input_data[:,-1],train_size=0.7,n_iter=1,random_state=77)
for train_index,test_index in straight_split:
train_x=input_data[train_index,:-1]
train_y=input_data[train_index,-1]
test_x=input_data[test_index,:-1]
test_y=input_data[test_index,-1]
print(train_x.shape)
print(test_x.shape)
return train_x,train_y,test_x,test_y
def build_model(x,y):
model=tree.DecisionTreeClassifier(criterion="entropy")
model=model.fit(x,y)
return model
def test_model(x,y,model,label_names):
y_predicted=model.predict(x)
print("Model Accuracy = %0.2f" %(accuracy_score(y,y_predicted)*100),"%\n")
print("\n Confusion Matrix")
print('====================')
print(pprint.pprint(confusion_matrix(y,y_predicted)))
print('\nClassification Report')
print('========================')
print(classification_report(y,y_predicted,target_names=label_names))
x,y,label_names=get_data()
train_x,train_y,test_x,test_y=get_train_test(x,y)
model=build_model(train_x,train_y)
test_model(test_x,test_y,model,label_names)
tree.export_graphviz(model,out_file='tree.dot',feature_names=get_feature_names(),class_names=label_names)#指定特征值标签和类标签
(105, 4)
(15, 4)
Model Accuracy = 93.33 %
Confusion Matrix
====================
array([[5, 0, 0],
[0, 4, 1],
[0, 0, 5]], dtype=int64)
None
Classification Report
========================
precision recall f1-score support
setosa 1.00 1.00 1.00 5
versicolor 1.00 0.80 0.89 5
virginica 0.83 1.00 0.91 5
avg / total 0.94 0.93 0.93 15
%%cmd
dot -Tpdf tree.dot -o tree.pdf
Microsoft Windows [版本 10.0.16299.371]
(c) 2017 Microsoft Corporation。保留所有权利。
E:\PycharmProjects\JupyterFiles\python>dot -Tpdf tree.dot -o tree.pdf
E:\PycharmProjects\JupyterFiles\python>
机器学习2——回归问题
回归可以被认为是一种函数逼近,它的任务是找到这样一个函数:当一系列随机变量X作为函数的输入时,返回反应变量Y。简单回归框架虽然威力强大,但是仍然你手指与一个缺陷:由于线性回归采用的系数值上限和下限无法控制,对于给定的数据,回归容易过度拟合。对于未知数据,输出的回归模型可能执行效果不佳,缩减方法就是用来解决这个问题的,它也被称为正则化方法。
回归方法预测实数值
一个线性回归模型的定义如下:使用训练集数据来查找系数
这个等式的值越小,线性回归模型的效果越号,因此,这个优化问题就是使得上面的等式最小化,也就是说找出使得等式最小的 系数。
上面这个式子叫做均方误差——用它来衡量回归模型是否可用。我们希望输出模型的真实和预测值的平均平方差尽可能小,这种查找系数的方法叫做最小二乘估计
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
def get_data():
data=load_boston()
x=data['data']
y=data['target']
return x,y
def build_model(x,y):
model=LinearRegression(normalize=True,fit_intercept=True)
model.fit(x,y)
return model
def model_view(model):
print("\n Model Coefficents")
print("======================")
for i,coef in enumerate(model.coef_):
print("\tCoefficients %d %0.3f" %(i+1,coef))
print("\n\t Intercept %0.3f" %(model.intercept_))
def model_worth(true_y,predicted_y):
print("\tMean squared error = %0.2f" %(mean_squared_error(true_y,predicted_y)))
def plot_residuals(y,predicted_y):
diff=y-predicted_y
plt.xlabel('Predicted_Y')
plt.ylabel("Residuals")
plt.plot(predicted_y,diff,'go')
#主程序
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
model=build_model(x_train,y_train)
predicted_y=model.predict(x_train)
plot_residuals(y_train,predicted_y)
model_view(model)
print("\nModel Performance On Train_set")
model_worth(y_train,predicted_y)
predicted_y=model.predict(x_dev)
print("\nModel Performance On Dev_set")
model_worth(y_dev,predicted_y)
Model Coefficents
======================
Coefficients 1 -0.109
Coefficients 2 0.043
Coefficients 3 0.053
Coefficients 4 2.237
Coefficients 5 -15.879
Coefficients 6 3.883
Coefficients 7 0.001
Coefficients 8 -1.321
Coefficients 9 0.284
Coefficients 10 -0.012
Coefficients 11 -0.904
Coefficients 12 0.009
Coefficients 13 -0.529
Intercept 33.288
Model Performance On Train_set
Mean squared error = 23.18
Model Performance On Dev_set
Mean squared error = 18.25
# 准备一些多项式特征
poly_features=PolynomialFeatures(2)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
#用多项式特征建模
model_poly=build_model(x_train_poly,y_train)
predicted_y=model_poly.predict(x_train_poly)
print("\nModel Performance On Train_set(poly_features)")
model_worth(y_train,predicted_y)
predicted_y=model_poly.predict(x_dev_poly)
print("\nModel Performance On Dev_set(Poly_features)")
model_worth(y_dev,predicted_y)
Model Performance On Train_set(poly_features)
Mean squared error = 5.45
Model Performance On Dev_set(Poly_features)
Mean squared error = 13.23
#将模型应用到测试集上
x_test_poly=poly_features.transform(x_test)
predicted_y=model_poly.predict(x_test_poly)
print("\nModel Performance On Test_set(poly_features)")
model_worth(y_test,predicted_y)
predicted_y=model.predict(x_test)
print("\nModel Performance On Test_set(Regular_features)")
model_worth(y_test,predicted_y)
Model Performance On Test_set(poly_features)
Mean squared error = 15.17
Model Performance On Test_set(Regular_features)
Mean squared error = 21.66
L2缩减回归——岭回归
在训练线性回归模型时,有的系数取值会很大,导致模型很不稳定。正则化或者缩减是控制系数权重的一种途径,这样的权重不会使用过大的数值。如果数据集中包含着大量关联的预测器,仅仅微笑的改变就可能导致模型不稳定。此外,我们还要面对如何解释模型的问题。例如,假定有两个负相关的变量,他们对反应变量的影响应该是相反的。我们可以对相关联的变量进行手工检查,并删除其中其主导因素的变量。然后再进行进行建模,当然如果能够自动完成这些操作,将会方便很多。
我们把线性回归的代价函数进行修改,将系数包含进来,如你所知,代价函数的值最小,模型的效果才好。将系数引入到代价函数之后,可以对权重取值太高的系数进行大幅惩罚。这种方法称为缩减方法,因为他们减小了系数的值。
如你所见,代价函数里增加了系数的平方和。这样,在优化过程查找上式最小值的过程中,它必须大大减小系数的值来达到目标。参数 决定了缩减的幅度, 越大,缩减的幅度越大,系数的值越趋向于0
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
def get_data():
data=load_boston()
x=data['data']
y=data['target']
x=x-np.mean(x,axis=0)
return x,y
def build_model(x,y):
model=Ridge(normalize=True,alpha=0.015)
model.fit(x,y)
return model
def model_view(model):
print("\n Model Coefficents")
print("======================")
for i,coef in enumerate(model.coef_):
print("\tCoefficients %d %0.3f" %(i+1,coef))
print("\n\t Intercept %0.3f" %(model.intercept_))
def model_worth(true_y,predicted_y):
print("\tMean squared error = %0.2f" %(mean_squared_error(true_y,predicted_y)))
return mean_squared_error(true_y,predicted_y)
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
#准备一些多项式特征
poly_features=PolynomialFeatures(interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
x_test_poly=poly_features.transform(x_test)
choosen_model=build_model(x_train_poly,y_train)
#应用模型到测试集
predicted_y=choosen_model.predict(x_train_poly)
print("\nModel Performance in Training set(Poly_features)")
mse=model_worth(y_train,predicted_y)
model_view(choosen_model)
#应用模型到Dev
predicted_y=choosen_model.predict(x_dev_poly)
print("\nModel Performance in Dev set(Poly_features)")
mse=model_worth(y_dev,predicted_y)
#应用模型到测试集
predicted_y=choosen_model.predict(x_test_poly)
print("\nModel Performance in Test set(Poly_features)")
mse=model_worth(y_test,predicted_y)
Model Performance in Training set(Poly_features)
Mean squared error = 6.85
Model Coefficents
======================
Coefficients 1 0.000
Coefficients 2 -0.016
Coefficients 3 -0.003
Coefficients 4 0.006
Coefficients 5 4.285
Coefficients 6 -10.623
Coefficients 7 5.401
Coefficients 8 -0.050
Coefficients 9 -1.361
Coefficients 10 0.103
Coefficients 11 -0.009
Coefficients 12 -0.209
Coefficients 13 0.018
Coefficients 14 -0.408
Coefficients 15 0.003
Coefficients 16 -0.004
Coefficients 17 2.027
Coefficients 18 -0.535
Coefficients 19 0.156
Coefficients 20 0.001
Coefficients 21 -0.044
Coefficients 22 -0.006
Coefficients 23 -0.000
Coefficients 24 0.006
Coefficients 25 -0.000
Coefficients 26 0.016
Coefficients 27 0.002
Coefficients 28 -0.066
Coefficients 29 0.035
Coefficients 30 -0.004
Coefficients 31 -0.001
Coefficients 32 0.006
Coefficients 33 0.003
Coefficients 34 0.000
Coefficients 35 -0.000
Coefficients 36 -0.000
Coefficients 37 -0.007
Coefficients 38 -0.030
Coefficients 39 1.945
Coefficients 40 0.098
Coefficients 41 0.002
Coefficients 42 -0.009
Coefficients 43 -0.010
Coefficients 44 0.000
Coefficients 45 -0.018
Coefficients 46 0.001
Coefficients 47 -0.003
Coefficients 48 -36.405
Coefficients 49 -4.002
Coefficients 50 0.001
Coefficients 51 -1.241
Coefficients 52 -0.326
Coefficients 53 0.009
Coefficients 54 0.154
Coefficients 55 0.026
Coefficients 56 -0.177
Coefficients 57 -1.193
Coefficients 58 -0.547
Coefficients 59 1.027
Coefficients 60 -1.362
Coefficients 61 -0.006
Coefficients 62 -3.407
Coefficients 63 -0.002
Coefficients 64 0.434
Coefficients 65 -0.048
Coefficients 66 0.002
Coefficients 67 -0.223
Coefficients 68 -0.011
Coefficients 69 -0.518
Coefficients 70 -0.008
Coefficients 71 -0.292
Coefficients 72 -0.009
Coefficients 73 0.008
Coefficients 74 -0.000
Coefficients 75 -0.011
Coefficients 76 -0.001
Coefficients 77 -0.006
Coefficients 78 -0.091
Coefficients 79 -0.003
Coefficients 80 -0.045
Coefficients 81 -0.004
Coefficients 82 0.094
Coefficients 83 0.000
Coefficients 84 0.004
Coefficients 85 -0.000
Coefficients 86 -0.029
Coefficients 87 0.004
Coefficients 88 -0.000
Coefficients 89 -0.001
Coefficients 90 -0.002
Coefficients 91 0.031
Coefficients 92 -0.001
Intercept 21.027
Model Performance in Dev set(Poly_features)
Mean squared error = 11.54
Model Performance in Test set(Poly_features)
Mean squared error = 9.46
L1缩减回归——LASSO
最小绝对值和选择操作(LASSO)是另一种在回归问题中常用的缩减方法。它和岭回归相比,更倾向于稀疏的结果。如果一个结果的大多数系数被缩减为0,那它被称为稀疏的。LASSO的大多数系数都变成了0,对于相关联的变量,只选择保留其中一个,而不像岭回归那样给这些变量的系数分配相同的权重。LASSO的这种特性可以用来选择变量。
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso,LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
def get_data():
data=load_boston()
x=data['data']
y=data['target']
return x,y
def build_model(x,y):
alpha_range=np.linspace(0,0.5,200)
model=Lasso(normalize=True)
coeffiecients=[]
for alpha in alpha_range:
model.set_params(alpha=alpha)
model.fit(x,y)
coeffiecients.append(model.coef_)
coeff_path(alpha_range,coeffiecients)
def model_view(model):
print("\n Model Coefficents")
print("======================")
for i,coef in enumerate(model.coef_):
print("\tCoefficients %d %0.3f" %(i+1,coef))
print("\n\t Intercept %0.3f" %(model.intercept_))
def model_worth(true_y,predicted_y):
print("\tMean squared error = %0.2f" %(mean_squared_error(true_y,predicted_y)))
return mean_squared_error(true_y,predicted_y)
def coeff_path(alpha_range,coeffiecients):
plt.figure(figsize=(16,9))
plt.xlabel("alpha value")
plt.ylabel("coeffiecient weight")
plt.plot(alpha_range,coeffiecients)
plt.axis('tight')
def get_coeff(x,y,alpha):
model=Lasso(normalize=True,alpha=alpha)
model.fit(x,y)
coefs=model.coef_
indices=[i for i,coef in enumerate(coefs) if abs(coef)>0.0]
return indices
x,y=get_data()
build_model(x,y)
print("\nPredicting using all the variables")
full_model=LinearRegression(normalize=True)
full_model.fit(x,y)
predicted_y=full_model.predict(x)
model_worth(y,predicted_y)
print("\nModels at different alpha values")
alpha_values=[0.22,0.08,0.01]
for alpha in alpha_values:
indices=get_coeff(x,y,alpha)
print("\t Attributes include",indices)
x_new=x[:,indices]
model=LinearRegression(normalize=True)
model.fit(x_new,y)
predicted_y=model.predict(x_new)
model_worth(y,predicted_y)
C:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
C:\Users\Administrator\Anaconda3\lib\site-packages\sklearn\linear_model\coordinate_descent.py:477: UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
positive)
C:\Users\Administrator\Anaconda3\lib\site-packages\sklearn\linear_model\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
ConvergenceWarning)
Predicting using all the variables
Mean squared error = 21.90
Models at different alpha values
Attributes include [5, 12]
Mean squared error = 30.51
Attributes include [5, 10, 12]
Mean squared error = 27.13
Attributes include [0, 1, 3, 4, 5, 7, 10, 11, 12]
Mean squared error = 22.89
L1和L2缩减交叉验证迭代
交叉验证一般也叫K折交叉验证,训练集被划分为k粉,模型在k-1份数据上进行训练,剩下的用来测试,这样就不需要单独划分Dev集
from sklearn.datasets import load_iris
from sklearn.cross_validation import KFold,StratifiedKFold
def get_data():
data=load_iris()
x=data['data']
y=data['target']
return x,y
def class_distribution(y):
class_dist={}
total=0
for entry in y:
try:
class_dist[entry]+=1
except KeyError:
class_dist[entry]=1
total+=1
for k,v in class_dist.items():
print('\tclass %d percentage = %0.2f'%(k,v/(1.0*total)))
#第一种分法
x,y=get_data()
kfolds=KFold(n=y.shape[0],n_folds=3)
fold_count=1
for train,test in kfolds:
print("Fold %d x train shape" %(fold_count),x[train].shape,'x test shape',x[test].shape)
y_train=y[train]
y_test=y[test]
print("Train Class Distribution")
class_distribution(y_train)
print("Test Class Distribution")
class_distribution(y_test)
fold_count +=1
#第二种分法
skfolds=StratifiedKFold(y,n_folds=3)
fold_count=1
for train,test in skfolds:
print("Fold %d x train shape" %(fold_count),x[train].shape,'x test shape',x[test].shape)
y_train=y[train]
y_test=y[test]
print("Train Class Distribution")
class_distribution(y_train)
print("Test Class Distribution")
class_distribution(y_test)
fold_count +=1
Fold 1 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 1 percentage = 0.50
class 2 percentage = 0.50
Test Class Distribution
class 0 percentage = 1.00
Fold 2 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 0 percentage = 0.50
class 2 percentage = 0.50
Test Class Distribution
class 1 percentage = 1.00
Fold 3 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
class 0 percentage = 0.50
class 1 percentage = 0.50
Test Class Distribution
class 2 percentage = 1.00
Fold 1 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Fold 2 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Fold 3 x train shape (102, 4) x test shape (48, 4)
Train Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
Test Class Distribution
class 0 percentage = 0.33
class 1 percentage = 0.33
class 2 percentage = 0.33
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold,train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
def get_data():
data=load_iris()
x=data['data']
y=data['target']
return x,y
def build_model(x,y):
kfold=KFold(y.shape[0],5)
model=Ridge(normalize=True)
alpha_range=np.linspace(0.0015,0.0017,30)
grid_param={'alpha':alpha_range}
grid=GridSearchCV(estimator=model,param_grid=grid_param,cv=kfold,scoring='mean_squared_error')
grid.fit(x,y)
display_param_results(grid.grid_scores_)
print(grid.best_params_)
return grid.best_estimator_
def view_model(model):
print("\n Model Coefficents")
print("======================")
for i,coef in enumerate(model.coef_):
print("\tCoefficients %d %0.3f" %(i+1,coef))
print("\n\t Intercept %0.3f" %(model.intercept_))
def model_worth(true_y,predicted_y):
print("\tMean squared error = %0.2f" %(mean_squared_error(true_y,predicted_y)))
return mean_squared_error(true_y,predicted_y)
def display_param_results(param_results):
fold=1
for param_result in param_results:
print("Fold %d mean squared error %0.2f" %(fold,abs(param_result[1])),param_result[0])
fold+=1
x,y=get_data()
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=9)
#准备一些多项式特征
poly_features=PolynomialFeatures(interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_test_poly=poly_features.transform(x_test)
choosen_model=build_model(x_train_poly,y_train)
predicted_y=choosen_model.predict(x_train_poly)
model_worth(y_train,predicted_y)
view_model(choosen_model)
predicted_y=choosen_model.predict(x_test_poly)
model_worth(y_test,predicted_y)
Fold 1 mean squared error 0.05 {'alpha': 0.0015}
Fold 2 mean squared error 0.05 {'alpha': 0.001506896551724138}
Fold 3 mean squared error 0.05 {'alpha': 0.0015137931034482758}
Fold 4 mean squared error 0.05 {'alpha': 0.001520689655172414}
Fold 5 mean squared error 0.05 {'alpha': 0.0015275862068965518}
Fold 6 mean squared error 0.05 {'alpha': 0.0015344827586206897}
Fold 7 mean squared error 0.05 {'alpha': 0.0015413793103448276}
Fold 8 mean squared error 0.05 {'alpha': 0.0015482758620689655}
Fold 9 mean squared error 0.05 {'alpha': 0.0015551724137931034}
Fold 10 mean squared error 0.05 {'alpha': 0.0015620689655172415}
Fold 11 mean squared error 0.05 {'alpha': 0.0015689655172413794}
Fold 12 mean squared error 0.05 {'alpha': 0.0015758620689655172}
Fold 13 mean squared error 0.05 {'alpha': 0.0015827586206896551}
Fold 14 mean squared error 0.05 {'alpha': 0.001589655172413793}
Fold 15 mean squared error 0.05 {'alpha': 0.001596551724137931}
Fold 16 mean squared error 0.05 {'alpha': 0.001603448275862069}
Fold 17 mean squared error 0.05 {'alpha': 0.001610344827586207}
Fold 18 mean squared error 0.05 {'alpha': 0.0016172413793103448}
Fold 19 mean squared error 0.05 {'alpha': 0.0016241379310344827}
Fold 20 mean squared error 0.05 {'alpha': 0.0016310344827586206}
Fold 21 mean squared error 0.05 {'alpha': 0.0016379310344827587}
Fold 22 mean squared error 0.05 {'alpha': 0.0016448275862068966}
Fold 23 mean squared error 0.05 {'alpha': 0.0016517241379310345}
Fold 24 mean squared error 0.05 {'alpha': 0.0016586206896551724}
Fold 25 mean squared error 0.05 {'alpha': 0.0016655172413793102}
Fold 26 mean squared error 0.05 {'alpha': 0.0016724137931034481}
Fold 27 mean squared error 0.05 {'alpha': 0.001679310344827586}
Fold 28 mean squared error 0.05 {'alpha': 0.0016862068965517241}
Fold 29 mean squared error 0.05 {'alpha': 0.001693103448275862}
Fold 30 mean squared error 0.05 {'alpha': 0.0017}
{'alpha': 0.0015}
Mean squared error = 0.04
Model Coefficents
======================
Coefficients 1 0.000
Coefficients 2 -0.452
Coefficients 3 -0.432
Coefficients 4 0.324
Coefficients 5 0.547
Coefficients 6 0.109
Coefficients 7 0.002
Coefficients 8 -0.021
Coefficients 9 -0.063
Coefficients 10 -0.148
Coefficients 11 0.142
Intercept 1.663
Mean squared error = 0.04
0.03618990322467848
GridSearchCV是sklearn提供的一个便捷的函数,帮助我们采用一个范围内的参数对模型进行训练。我们看一下GridSearchCV的参数
- estimator : 这是指定用给定参数和数据来运行的模型的类型
- param-grid :这个是一个参数字典,用来评估模型效果
- cv : 这个参数定义了感兴趣的交叉验证类型,我们要传递之前创建的k份迭代器作为CV参数
- 评分函数:在本例中,我们使用的是平法误差,也就是用来评估模型的指标
集成方法
现实生活中,面对着不确定情况,却要做出艰难决定时,我们通常会听取多个朋友的意见。然后基于朋友们的集体智慧来做出决定,机器学习里的集成方法就是采用了这种相似的概念。
集成方法的基本思路是拥有大量的模型,每一个在训练集上产生差别不大的结果,一些模型相较于其他的在某些方面的数据效果会更好一些。可以相信,最后从多个模型得到的输出结果肯定比仅从一个模型中得到的结果要好一些。
引导聚集,通常称为挂袋法,是一种简练优雅的方法,它产生了大量的模型,并将它们的输出集成起来获得最终的预测值。挂袋法集成中的每一个模型只使用训练集的一部分,它们的思路是减少对数据产生过拟合。前面规定了每个模型的差别不能太大,在每个模型训练时采用带替换的采样,这样就产生了一定的差异。还有一种方法是对属性进行采样,采用所有的属性,不同的模型采用不同的属性集合。挂袋法很容易实现并行化。当并行处理框架可用时,模型能并行处理不同的训练集样本。挂袋法对如线性回归之类的线性预测器无效
提升法也是一种集成技术,它产生了一个逐步复杂的模型序列。它按顺序基于钱一个模型的错误训练新的模型,每次训练得到的模型被赋予一个权重,这个权重一句模型在给定数据的效果而定。最终的预测值产生时,这些权重值就是每个特定模型对于最终输出结果的影响力的判据。
理解集成——挂袋法
挂袋法也就引导聚集,它只有在潜在的模型能够产生不同的变化时才有效,也就是只有能够让潜在的数据引入变化, 它就能产生有着轻微变化的多种模型。我们使用自举在数据集上产生模型的变化,所谓自举,就是在给定的数据集上随机采样一定数量的实例,无论是否带有替换,在挂袋法里,我们用自举产生m个不同的数据集,然后用他们中的每一个构建一个模型。对于回归问题,最后用所有模型产生的输出来产生最终的预测值
随机化是用来在建模过程中引入变化的另一种技术,一个例子就是在集成的每个模型里随机选择属性的自己,这样不同模型试用不同的属性集合,这种技术被称为随机子空间方法。对于一些很稳定的模型,挂袋法的效果不明显,它适合那些对很小的改变也十分敏感的分类器,如决策树
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
def get_data():
x,y=make_classification(n_samples=500,n_features=30,flip_y=0.03,n_informative=18,n_redundant=3,n_repeated=3,random_state=7)
return x,y
def build_single_model(x,y):
model=KNeighborsClassifier()
model.fit(x,y)
return model
def bagging_model(x,y):
bagging=BaggingClassifier(KNeighborsClassifier(),n_estimators=100,random_state=9,max_samples=1.0,max_features=0.7,
bootstrap=True,bootstrap_features=True)
bagging.fit(x,y)
return bagging
def view_model(model):
print("\n Sampled attributes in top 10 estimators \n")
for i,features in enumerate(model.estimators_features_[0:10]):
print("estimator %d"%(i+1),features)
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
#构建单个模型
model=build_single_model(x_train,y_train)
predicted_y=model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
#构建多个模型
bagging=bagging_model(x_train,y_train)
predicted_y=bagging.predict(x_train)
print("\n Bagging Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
view_model(bagging)
#查看Dev集的运行情况
predicted_y=model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
predicted_y=bagging.predict(x_dev)
print("\n Bagging Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
Single Model Accuracy on training data
precision recall f1-score support
0 0.88 0.87 0.88 181
1 0.87 0.88 0.87 169
avg / total 0.87 0.87 0.87 350
Bagging Model Accuracy on training data
precision recall f1-score support
0 0.93 0.97 0.95 181
1 0.96 0.92 0.94 169
avg / total 0.95 0.95 0.95 350
Sampled attributes in top 10 estimators
estimator 1 [25 20 10 6 17 18 11 17 9 14 3 10 10 23 22 18 17 11 21 20 1]
estimator 2 [14 3 27 28 20 20 27 25 0 21 1 12 20 21 29 1 0 28 16 4 9]
estimator 3 [29 5 23 19 2 16 21 4 13 27 1 15 24 5 14 1 4 25 22 26 29]
estimator 4 [23 10 16 7 22 11 0 14 14 17 8 17 27 12 13 23 8 7 27 0 27]
estimator 5 [ 3 0 26 13 23 7 27 15 18 11 26 18 26 3 22 6 11 21 6 12 19]
estimator 6 [16 5 24 19 21 2 2 22 12 21 14 28 5 29 9 19 24 14 21 8 11]
estimator 7 [ 7 23 2 17 22 2 12 14 25 5 7 10 25 5 17 16 9 0 9 9 15]
estimator 8 [16 10 7 8 8 18 6 3 12 29 13 17 20 9 2 25 6 28 15 0 16]
estimator 9 [22 29 2 5 6 11 18 4 19 27 17 28 20 15 21 26 14 5 28 15 21]
estimator 10 [29 22 17 10 16 10 27 8 2 18 26 1 3 2 1 17 2 12 10 22 26]
Single Model Accuracy on Dev data
precision recall f1-score support
0 0.83 0.84 0.83 51
1 0.85 0.83 0.84 54
avg / total 0.84 0.84 0.84 105
Bagging Model Accuracy on Dev data
precision recall f1-score support
0 0.83 0.88 0.86 51
1 0.88 0.83 0.86 54
avg / total 0.86 0.86 0.86 105
理解集成——提升法
提升法是一种强大的集成技术,在数据科学中得到了广泛的应用,实际上,它是数据科学工具包的最基本工具之一。和挂袋法一样,提升法也使用了一群评估器,但这也是两者之间仅有的相似之处。在深入了解之前,我们先了解一下提升法是如何成为一个高效的集成工具的
以我们熟悉的二元分类问题为例,输入是一系列的预测器(x),输出的取值只能是0或者1的反应变量(y)。这个分类器的输入可以表达为下式:
分类器的任务就是找到一个可以近似的函数。
分类器错误定义的比例定义如下式:
假设我们构建了一个弱分类器,其错误比例进稍好于随机猜测。在提升法里构建一系列若分离器用在进行了微调的数据集合上,每个分类器使用的数据只做了小小的调整,最后结束于第M个分类器。
最后,把各个分类器生成的预测结果集成起来进行加权多数票投票:
上述这种方法就称为AdaBoost
提升法和挂袋法的不同之处就在于权重 和顺序建模。前面说过,提升法构建了一系列的弱分类器,并给每个分类器使用经过微调的数据集。我们来看看数据微调是怎么回事,正式这些微调的影响产生了权重的 。
从第一个分类器初始化开始,m=1,先把每个实例的权重定位1/N,也就是说,如果有100条记录,每条记录获得0.01的权重,我们用w来表示权重,现在有100个这样的权重值,如下
现在所有记录被分类器选中的机会是均等的,我们来创建一个分类器,对训练集进行测试以获取错误分类比例。之前我们提到过错误分类比例计算公式,现在对它进行一些小改动,引入权重,公式如下:
公式里的abs表示取绝对值,根据错误比例,我们采用下面的公式来计算 值的权重
上式中的 是一个非常小的值
假定模型1的错误比例为0.3,也就是说它对70%的记录进行正确分类,因此这个模型的权重将大致为0.8,这是个不错的权重,基于这个结果,我们回头给单独的记录设置权重,方法如下:
如你所见,那么被错误分裂的属性权重值都上升了,这就提高了那些分类错误的记录被下下一个分类器选中的几率。序列中随后的分类器都会选择权重较大的实例,并试着适配它。就这样,后续的分类器都会对钱一个分类器错误分类的实例更加关注
这就是提升法的威力,它将多个弱分类器转化为一个强分类器整体
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,zero_one_loss
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import itertools
def get_data():
x,y=make_classification(n_samples=500,n_features=30,flip_y=0.03,n_informative=18,n_redundant=3,n_repeated=3,random_state=7)
return x,y
def build_single_model(x,y):
model=DecisionTreeClassifier()
model.fit(x,y)
return model
def build_boosting_model(x,y,n_estimators=20):
boosting=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,min_samples_leaf=1),
random_state=9,n_estimators=n_estimators,algorithm="SAMME")
boosting.fit(x,y)
return boosting
def view_model(model):
print("\n Estimator Weights and Error\n")
for i,weight in enumerate(model.estimator_weights_):
print("estimator %d weight =%0.4f error= %0.4f"%(i+1,weight,model.estimator_errors_[i]))
plt.plot(model.estimator_weights_,model.estimator_errors_)
def number_estimators_vs_err_rate(x,y,x_dev,y_dev):
no_estimators=range(20,120,10)
misclassy_rate=[]
misclassy_rate_dev=[]
for no_estimator in no_estimators:
boosting=build_boosting_model(x,y,no_estimator)
predicted_y=boosting.predict(x)
predicted_y_dev=boosting.predict(x_dev)
misclassy_rate.append(zero_one_loss(y,predicted_y))
misclassy_rate_dev.append(zero_one_loss(y_dev,predicted_y_dev))
# no_estimators=np.asarray(no_estimators)
# misclassy_rate=np.asarray(misclassy_rate)
# misclassy_rate_dev=np.asarray(misclassy_rate_dev)
# print(no_estimators,misclassy_rate)
plt.plot(no_estimators,misclassy_rate,label='Train',color='g')
plt.plot(no_estimators,misclassy_rate_dev,label="Dev",color='r')
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
#构建单个模型
model=build_single_model(x_train,y_train)
predicted_y=model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
print("Fraction of misclassfication = %0.2f" %(zero_one_loss(y_train,predicted_y)*100,),'%')
#构建多个模型
boosting=build_boosting_model(x_train,y_train)
predicted_y=boosting.predict(x_train)
print("\n Boosting Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_train,predicted_y)*100),"%")
view_model(boosting)
#查看在Dev集上的运行情况
predicted_y=model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_dev,predicted_y)*100),"%")
predicted_y=boosting.predict(x_dev)
print("\n Boosting Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_dev,predicted_y)*100),"%")
number_estimators_vs_err_rate(x_train,y_train,x_dev,y_dev)
Single Model Accuracy on training data
precision recall f1-score support
0 1.00 1.00 1.00 181
1 1.00 1.00 1.00 169
avg / total 1.00 1.00 1.00 350
Fraction of misclassfication = 0.00 %
Boosting Model Accuracy on training data
precision recall f1-score support
0 0.86 0.94 0.90 181
1 0.93 0.84 0.88 169
avg / total 0.89 0.89 0.89 350
Fraction of misclassification = 10.86 %
Estimator Weights and Error
estimator 1 weight =0.8337 error= 0.3029
estimator 2 weight =0.8921 error= 0.2907
estimator 3 weight =0.6730 error= 0.3378
estimator 4 weight =0.6067 error= 0.3528
estimator 5 weight =0.5746 error= 0.3602
estimator 6 weight =0.5537 error= 0.3650
estimator 7 weight =0.5697 error= 0.3613
estimator 8 weight =0.5538 error= 0.3650
estimator 9 weight =0.5579 error= 0.3640
estimator 10 weight =0.4530 error= 0.3886
estimator 11 weight =0.4530 error= 0.3886
estimator 12 weight =0.3564 error= 0.4118
estimator 13 weight =0.4130 error= 0.3982
estimator 14 weight =0.3679 error= 0.4091
estimator 15 weight =0.3142 error= 0.4221
estimator 16 weight =0.3888 error= 0.4040
estimator 17 weight =0.4902 error= 0.3799
estimator 18 weight =0.2798 error= 0.4305
estimator 19 weight =0.4463 error= 0.3902
estimator 20 weight =0.2645 error= 0.4343
Single Model Accuracy on Dev data
precision recall f1-score support
0 0.62 0.75 0.68 51
1 0.70 0.57 0.63 54
avg / total 0.66 0.66 0.65 105
Fraction of misclassification = 34.29 %
Boosting Model Accuracy on Dev data
precision recall f1-score support
0 0.71 0.86 0.78 51
1 0.84 0.67 0.74 54
avg / total 0.78 0.76 0.76 105
Fraction of misclassification = 23.81 %
理解集成——梯度提升
梯度提升法采用的是梯度,而不是权重来鉴别缺陷。以一个简单的回归问题为例,我们给定所需的预测器变量X和反应变量Y,这二者都是一个实数。
梯度法的执行步骤如下:
先从简单的模型开始,如平均值:
预测值简单的设置为反应变量的平均值,接着开始调整残差,也就是真实值和预测值之间的误差
下一个分类器在如下的数据上进行训练
随后的模型在掐你个模型的残差上进行训练,就这样,算法持续的在集成中构建所需数量的模型。
现在来探究为什么要在残差上进行训练,到目前位置,我们清楚提升法创造了渐进的模型,假设我们构建了两个模型F1(X)和F2(X),来预测Y,依据渐进的原则,可以把两个模型组合成以下的形式
也就是说这两个模型的预测值结合起来得到了预测值Y1.可以推导出等价的公式
残差是模型没有完善处理的部分,简言之就是上一个模型的缺陷。因此我们可以利用残差来提升模型效果,也即改善上一个模型的缺陷
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
def get_data():
data=load_boston()
x=data['data']
y=data['target']
return x,y
def build_model(x,y,n_estimators=500):
"""
Build a Gradient Boost regression model
"""
model = GradientBoostingRegressor(n_estimators=n_estimators,verbose=10,\
subsample=0.7, learning_rate= 0.15,max_depth=3,random_state=77)
model.fit(x,y)
return model
def view_model(model):
print("\n Training Scores")
print("===================")
for i,score in enumerate(model.train_score_):
print("\t Estimator %d score %0.3f" %(i+1,score))
plt.plot(range(1,model.estimators_.shape[0]+1),model.train_score_)
plt.xlabel("model sequence")
plt.ylabel("Model Score")
print("\n Feature Importance ")
print("=======================")
for i,score in enumerate(model.feature_importances_):
print("\t Feature %d score %0.3f" %(i+1,score))
def model_worth(true_y,predicted_y):
print("\tMean squared error = %0.2f" %(mean_squared_error(true_y,predicted_y)))
return mean_squared_error(true_y,predicted_y)
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
#准备一些多项式特征
poly_features=PolynomialFeatures(2,interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
#用多项式特征建模
model_poly=build_model(x_train_poly,y_train)
predicted_y=model_poly.predict(x_train_poly)
print("\n Model Performance in Training set(Polynomial features)\n")
model_worth(y_train,predicted_y)
#查看模型细节
view_model(model_poly)
#把模型应用到Dev集
predicted_y=model_poly.predict(x_dev_poly)
print("\n Model Performance in Dev set(Polynomial features)\n")
model_worth(y_dev,predicted_y)
#把模型应用到测试集中
x_test_poly=poly_features.transform(x_test)
predicted_y=model_poly.predict(x_test_poly)
print("\n Model Performance in Test set(Polynomial features)\n")
model_worth(y_test,predicted_y)
Iter Train Loss OOB Improve Remaining Time
1 58.5196 20.8748 1.50s
2 45.2833 10.3732 1.37s
3 40.1522 8.8467 1.33s
4 27.7772 8.2210 1.37s
5 27.6316 3.9991 1.49s
6 21.0990 4.0621 1.52s
7 17.5833 2.8307 1.59s
8 15.0893 2.3535 1.54s
9 11.9637 1.8070 1.53s
10 10.2020 1.4632 1.50s
11 9.5262 0.7945 1.49s
12 7.2975 0.6774 1.46s
13 6.6403 0.4107 1.46s
14 6.4880 0.2982 1.44s
15 6.0600 0.2326 1.46s
16 5.4829 0.0987 1.48s
17 4.4872 0.4339 1.46s
18 4.5728 -0.0933 1.46s
19 4.4907 -0.1070 1.47s
20 4.2713 0.1204 1.46s
21 3.8139 -0.0294 1.46s
22 4.0569 0.1370 1.47s
23 3.2827 -0.1233 1.47s
24 3.6827 -0.0219 1.46s
25 3.3933 0.0265 1.45s
26 3.0773 0.0355 1.44s
27 2.9170 -0.0794 1.43s
28 2.8000 0.0090 1.42s
29 2.9376 0.0555 1.42s
30 2.9341 -0.0120 1.42s
31 2.7107 -0.0145 1.41s
32 2.4329 -0.0825 1.41s
33 2.2248 -0.0282 1.42s
34 2.3968 0.0218 1.41s
35 2.1704 -0.0340 1.41s
36 2.4265 -0.0543 1.41s
37 2.0856 -0.0292 1.40s
38 1.9023 -0.0714 1.39s
39 2.2966 -0.1218 1.38s
40 2.0939 -0.0538 1.38s
41 1.9168 -0.0141 1.37s
42 1.7684 -0.0197 1.37s
43 1.9962 -0.0659 1.37s
44 1.9309 -0.0236 1.37s
45 1.7975 -0.0312 1.37s
46 1.7221 0.0071 1.36s
47 1.8995 -0.0159 1.36s
48 1.6663 -0.0241 1.35s
49 1.6364 -0.0024 1.34s
50 1.6127 -0.0219 1.34s
51 1.5108 -0.0099 1.33s
52 1.4282 -0.0125 1.34s
53 1.4411 -0.0601 1.34s
54 1.5713 -0.0175 1.33s
55 1.4338 0.0106 1.33s
56 1.2473 -0.0265 1.32s
57 1.3721 -0.0129 1.32s
58 1.3093 -0.0234 1.31s
59 1.2621 -0.0350 1.31s
60 1.2321 -0.0122 1.31s
61 1.2824 -0.0339 1.30s
62 1.2321 -0.0079 1.30s
63 1.1697 -0.0329 1.29s
64 1.1407 -0.0417 1.29s
65 1.1132 -0.0320 1.29s
66 1.1506 -0.0135 1.29s
67 0.9701 -0.0237 1.29s
68 1.0161 -0.0143 1.29s
69 0.9928 -0.0242 1.28s
70 0.9920 -0.0250 1.29s
71 0.9580 -0.0345 1.28s
72 0.8313 -0.0304 1.28s
73 0.9547 -0.0056 1.27s
74 0.7691 -0.0207 1.27s
75 0.9145 0.0017 1.26s
76 0.8059 -0.0348 1.26s
77 0.7404 -0.0211 1.26s
78 0.7966 -0.0302 1.25s
79 0.8268 -0.0111 1.25s
80 0.8224 -0.0075 1.24s
81 0.6483 -0.0431 1.24s
82 0.8112 -0.0243 1.23s
83 0.6981 -0.0066 1.23s
84 0.6891 -0.0102 1.23s
85 0.6803 -0.0283 1.22s
86 0.6356 -0.0080 1.22s
87 0.6988 -0.0047 1.21s
88 0.6938 -0.0316 1.21s
89 0.6239 -0.0091 1.22s
90 0.7027 -0.0166 1.23s
91 0.6343 -0.0096 1.23s
92 0.5790 -0.0290 1.23s
93 0.5781 -0.0199 1.23s
94 0.6280 -0.0085 1.22s
95 0.5657 -0.0167 1.22s
96 0.5596 -0.0061 1.21s
97 0.5814 -0.0080 1.21s
98 0.5678 -0.0053 1.21s
99 0.5716 -0.0108 1.21s
100 0.5283 -0.0096 1.20s
101 0.4609 -0.0274 1.20s
102 0.5268 -0.0105 1.19s
103 0.4963 -0.0030 1.19s
104 0.4829 -0.0097 1.18s
105 0.4424 -0.0143 1.18s
106 0.4895 -0.0222 1.18s
107 0.4521 -0.0135 1.17s
108 0.4749 -0.0016 1.17s
109 0.4321 -0.0138 1.16s
110 0.4301 -0.0109 1.16s
111 0.4250 -0.0049 1.16s
112 0.3632 -0.0089 1.15s
113 0.3951 -0.0094 1.15s
114 0.4158 -0.0189 1.15s
115 0.3850 -0.0041 1.14s
116 0.3878 -0.0069 1.14s
117 0.3758 -0.0118 1.14s
118 0.3982 -0.0035 1.13s
119 0.3775 -0.0047 1.13s
120 0.3300 -0.0058 1.12s
121 0.3339 -0.0058 1.12s
122 0.3446 -0.0113 1.12s
123 0.3177 -0.0064 1.11s
124 0.3026 -0.0008 1.11s
125 0.3065 -0.0033 1.10s
126 0.3337 -0.0036 1.10s
127 0.2870 -0.0100 1.10s
128 0.2903 -0.0077 1.09s
129 0.3062 -0.0055 1.09s
130 0.2854 -0.0061 1.08s
131 0.2862 -0.0039 1.08s
132 0.2437 -0.0052 1.07s
133 0.2479 -0.0105 1.07s
134 0.2616 -0.0051 1.06s
135 0.2704 -0.0113 1.06s
136 0.2330 -0.0096 1.06s
137 0.2263 -0.0079 1.05s
138 0.2517 -0.0053 1.05s
139 0.2440 -0.0102 1.04s
140 0.2561 -0.0018 1.04s
141 0.2234 -0.0037 1.04s
142 0.2313 -0.0012 1.04s
143 0.2226 -0.0034 1.04s
144 0.2326 -0.0029 1.04s
145 0.2462 -0.0034 1.03s
146 0.2043 -0.0014 1.03s
147 0.2156 -0.0055 1.02s
148 0.2051 -0.0062 1.02s
149 0.2068 -0.0064 1.02s
150 0.1821 -0.0069 1.02s
151 0.1884 -0.0046 1.01s
152 0.1747 -0.0040 1.01s
153 0.1733 -0.0049 1.01s
154 0.1967 -0.0044 1.00s
155 0.1949 -0.0086 1.00s
156 0.1783 -0.0013 0.99s
157 0.1683 -0.0058 0.99s
158 0.1728 -0.0052 0.99s
159 0.1717 -0.0046 0.98s
160 0.1743 -0.0037 0.98s
161 0.1591 -0.0018 0.98s
162 0.1659 -0.0059 0.98s
163 0.1582 -0.0018 0.98s
164 0.1488 0.0002 0.97s
165 0.1649 -0.0028 0.97s
166 0.1519 -0.0044 0.97s
167 0.1408 -0.0064 0.96s
168 0.1351 -0.0015 0.96s
169 0.1402 -0.0041 0.96s
170 0.1382 -0.0025 0.96s
171 0.1321 -0.0019 0.96s
172 0.1295 -0.0038 0.95s
173 0.1351 -0.0035 0.95s
174 0.1277 -0.0018 0.95s
175 0.1326 -0.0020 0.94s
176 0.1225 -0.0045 0.94s
177 0.1251 -0.0063 0.94s
178 0.1203 -0.0045 0.94s
179 0.1278 -0.0025 0.94s
180 0.1110 -0.0022 0.94s
181 0.1105 -0.0023 0.93s
182 0.1081 -0.0015 0.93s
183 0.1014 -0.0046 0.93s
184 0.1036 -0.0013 0.92s
185 0.0965 -0.0015 0.92s
186 0.1002 -0.0054 0.92s
187 0.0984 -0.0013 0.92s
188 0.0952 -0.0028 0.91s
189 0.0926 -0.0033 0.91s
190 0.1002 -0.0020 0.91s
191 0.0945 -0.0017 0.91s
192 0.0947 -0.0016 0.91s
193 0.0884 -0.0017 0.91s
194 0.0872 -0.0039 0.90s
195 0.0895 -0.0020 0.90s
196 0.0871 -0.0020 0.90s
197 0.0829 -0.0035 0.90s
198 0.0760 -0.0016 0.90s
199 0.0846 -0.0024 0.89s
200 0.0808 -0.0018 0.89s
201 0.0831 -0.0021 0.89s
202 0.0722 -0.0016 0.89s
203 0.0771 -0.0012 0.89s
204 0.0711 -0.0009 0.88s
205 0.0704 -0.0012 0.88s
206 0.0724 -0.0009 0.88s
207 0.0645 -0.0039 0.88s
208 0.0690 -0.0006 0.87s
209 0.0662 -0.0014 0.87s
210 0.0612 -0.0016 0.87s
211 0.0608 -0.0005 0.87s
212 0.0619 -0.0024 0.87s
213 0.0593 0.0001 0.86s
214 0.0618 -0.0014 0.86s
215 0.0570 -0.0016 0.86s
216 0.0599 -0.0015 0.86s
217 0.0504 -0.0019 0.85s
218 0.0525 -0.0018 0.85s
219 0.0510 -0.0022 0.85s
220 0.0549 -0.0009 0.84s
221 0.0513 -0.0011 0.84s
222 0.0483 -0.0005 0.84s
223 0.0496 -0.0013 0.84s
224 0.0460 -0.0006 0.83s
225 0.0496 -0.0008 0.83s
226 0.0486 -0.0003 0.83s
227 0.0501 -0.0021 0.83s
228 0.0481 -0.0005 0.83s
229 0.0451 -0.0010 0.82s
230 0.0470 -0.0005 0.82s
231 0.0436 -0.0013 0.82s
232 0.0413 -0.0019 0.82s
233 0.0397 -0.0010 0.82s
234 0.0382 -0.0002 0.81s
235 0.0394 -0.0005 0.81s
236 0.0381 -0.0003 0.81s
237 0.0391 -0.0007 0.81s
238 0.0375 -0.0010 0.81s
239 0.0396 -0.0007 0.80s
240 0.0367 -0.0005 0.80s
241 0.0377 -0.0008 0.80s
242 0.0367 -0.0011 0.79s
243 0.0362 -0.0007 0.79s
244 0.0338 -0.0004 0.79s
245 0.0351 -0.0005 0.79s
246 0.0326 -0.0008 0.78s
247 0.0316 -0.0015 0.78s
248 0.0337 -0.0007 0.78s
249 0.0346 -0.0004 0.77s
250 0.0299 -0.0005 0.77s
251 0.0307 -0.0013 0.77s
252 0.0330 -0.0008 0.76s
253 0.0316 -0.0009 0.76s
254 0.0263 -0.0011 0.76s
255 0.0314 -0.0002 0.76s
256 0.0291 -0.0011 0.75s
257 0.0288 -0.0005 0.75s
258 0.0274 -0.0007 0.75s
259 0.0280 -0.0001 0.74s
260 0.0276 -0.0012 0.74s
261 0.0265 -0.0009 0.74s
262 0.0282 -0.0005 0.73s
263 0.0251 -0.0007 0.73s
264 0.0251 -0.0007 0.73s
265 0.0246 -0.0008 0.72s
266 0.0253 -0.0003 0.72s
267 0.0245 -0.0005 0.72s
268 0.0234 -0.0005 0.72s
269 0.0218 -0.0005 0.71s
270 0.0241 -0.0005 0.71s
271 0.0220 -0.0003 0.71s
272 0.0217 -0.0007 0.71s
273 0.0228 -0.0007 0.70s
274 0.0209 -0.0013 0.70s
275 0.0221 -0.0009 0.69s
276 0.0224 -0.0002 0.69s
277 0.0222 -0.0007 0.69s
278 0.0195 -0.0009 0.68s
279 0.0207 -0.0005 0.68s
280 0.0194 -0.0003 0.68s
281 0.0190 -0.0004 0.68s
282 0.0196 -0.0005 0.67s
283 0.0167 -0.0005 0.67s
284 0.0173 -0.0007 0.67s
285 0.0157 -0.0004 0.66s
286 0.0178 -0.0009 0.66s
287 0.0171 -0.0002 0.66s
288 0.0172 -0.0004 0.66s
289 0.0158 -0.0006 0.65s
290 0.0158 -0.0004 0.65s
291 0.0175 -0.0004 0.64s
292 0.0159 -0.0003 0.64s
293 0.0151 -0.0005 0.64s
294 0.0152 -0.0001 0.63s
295 0.0147 -0.0003 0.63s
296 0.0158 -0.0002 0.63s
297 0.0150 -0.0003 0.62s
298 0.0152 -0.0004 0.62s
299 0.0134 -0.0004 0.62s
300 0.0134 -0.0005 0.61s
301 0.0133 -0.0004 0.61s
302 0.0134 -0.0001 0.61s
303 0.0123 -0.0002 0.60s
304 0.0132 -0.0003 0.60s
305 0.0127 -0.0003 0.60s
306 0.0118 -0.0003 0.59s
307 0.0123 -0.0001 0.59s
308 0.0117 -0.0005 0.59s
309 0.0122 -0.0002 0.58s
310 0.0121 -0.0002 0.58s
311 0.0114 -0.0005 0.58s
312 0.0121 -0.0002 0.57s
313 0.0107 -0.0002 0.57s
314 0.0118 -0.0001 0.56s
315 0.0097 -0.0001 0.56s
316 0.0111 -0.0002 0.56s
317 0.0108 -0.0001 0.55s
318 0.0100 -0.0002 0.55s
319 0.0105 -0.0003 0.55s
320 0.0103 -0.0003 0.54s
321 0.0093 -0.0002 0.54s
322 0.0094 -0.0002 0.54s
323 0.0102 -0.0004 0.53s
324 0.0086 -0.0002 0.53s
325 0.0087 -0.0004 0.53s
326 0.0086 -0.0001 0.53s
327 0.0088 -0.0001 0.52s
328 0.0082 -0.0001 0.52s
329 0.0091 -0.0001 0.52s
330 0.0087 -0.0001 0.51s
331 0.0077 -0.0002 0.51s
332 0.0082 -0.0002 0.51s
333 0.0078 -0.0003 0.50s
334 0.0079 -0.0002 0.50s
335 0.0081 -0.0002 0.50s
336 0.0071 -0.0003 0.49s
337 0.0070 -0.0002 0.49s
338 0.0078 -0.0003 0.49s
339 0.0071 -0.0001 0.48s
340 0.0071 -0.0002 0.48s
341 0.0070 -0.0001 0.48s
342 0.0070 -0.0001 0.47s
343 0.0068 -0.0002 0.47s
344 0.0072 -0.0001 0.47s
345 0.0066 -0.0002 0.47s
346 0.0063 -0.0003 0.46s
347 0.0056 -0.0002 0.46s
348 0.0060 -0.0001 0.46s
349 0.0065 -0.0002 0.45s
350 0.0058 -0.0002 0.45s
351 0.0062 -0.0002 0.45s
352 0.0059 -0.0003 0.44s
353 0.0056 -0.0001 0.44s
354 0.0059 -0.0002 0.44s
355 0.0059 -0.0002 0.43s
356 0.0055 -0.0001 0.43s
357 0.0057 -0.0002 0.43s
358 0.0059 -0.0002 0.43s
359 0.0053 -0.0001 0.42s
360 0.0056 -0.0000 0.42s
361 0.0050 -0.0001 0.42s
362 0.0051 -0.0002 0.41s
363 0.0056 -0.0001 0.41s
364 0.0053 -0.0002 0.41s
365 0.0050 -0.0003 0.41s
366 0.0049 -0.0002 0.40s
367 0.0044 -0.0002 0.40s
368 0.0043 -0.0000 0.40s
369 0.0048 -0.0002 0.39s
370 0.0046 -0.0001 0.39s
371 0.0045 -0.0001 0.39s
372 0.0044 -0.0002 0.38s
373 0.0051 -0.0001 0.38s
374 0.0043 -0.0001 0.38s
375 0.0043 -0.0001 0.38s
376 0.0043 -0.0001 0.37s
377 0.0044 -0.0001 0.37s
378 0.0041 -0.0001 0.37s
379 0.0039 -0.0001 0.36s
380 0.0041 -0.0001 0.36s
381 0.0037 -0.0001 0.36s
382 0.0042 -0.0000 0.35s
383 0.0040 -0.0000 0.35s
384 0.0036 -0.0001 0.35s
385 0.0039 -0.0000 0.34s
386 0.0035 -0.0000 0.34s
387 0.0036 -0.0001 0.34s
388 0.0035 -0.0001 0.33s
389 0.0036 -0.0001 0.33s
390 0.0035 -0.0001 0.33s
391 0.0035 -0.0000 0.33s
392 0.0034 -0.0001 0.32s
393 0.0029 -0.0001 0.32s
394 0.0032 -0.0001 0.32s
395 0.0030 -0.0001 0.31s
396 0.0032 -0.0001 0.31s
397 0.0030 -0.0001 0.31s
398 0.0029 -0.0001 0.30s
399 0.0030 -0.0000 0.30s
400 0.0027 -0.0001 0.30s
401 0.0026 -0.0000 0.30s
402 0.0027 -0.0001 0.29s
403 0.0029 -0.0000 0.29s
404 0.0025 -0.0001 0.29s
405 0.0027 -0.0002 0.28s
406 0.0028 -0.0001 0.28s
407 0.0026 -0.0001 0.28s
408 0.0023 -0.0002 0.27s
409 0.0024 -0.0001 0.27s
410 0.0026 -0.0001 0.27s
411 0.0025 -0.0000 0.26s
412 0.0026 -0.0001 0.26s
413 0.0023 -0.0001 0.26s
414 0.0024 -0.0000 0.26s
415 0.0025 -0.0000 0.25s
416 0.0022 -0.0001 0.25s
417 0.0023 -0.0000 0.25s
418 0.0022 -0.0001 0.24s
419 0.0021 -0.0001 0.24s
420 0.0020 -0.0000 0.24s
421 0.0020 -0.0000 0.23s
422 0.0020 -0.0000 0.23s
423 0.0020 -0.0000 0.23s
424 0.0022 -0.0001 0.23s
425 0.0020 -0.0000 0.22s
426 0.0020 -0.0001 0.22s
427 0.0021 -0.0000 0.22s
428 0.0019 -0.0001 0.22s
429 0.0019 -0.0000 0.21s
430 0.0017 -0.0000 0.21s
431 0.0019 -0.0001 0.21s
432 0.0017 -0.0001 0.20s
433 0.0019 -0.0001 0.20s
434 0.0016 -0.0001 0.20s
435 0.0017 -0.0000 0.19s
436 0.0016 -0.0001 0.19s
437 0.0018 -0.0000 0.19s
438 0.0016 -0.0000 0.19s
439 0.0016 -0.0000 0.18s
440 0.0017 -0.0000 0.18s
441 0.0015 -0.0000 0.18s
442 0.0015 -0.0000 0.17s
443 0.0015 -0.0000 0.17s
444 0.0014 -0.0001 0.17s
445 0.0013 -0.0001 0.16s
446 0.0014 -0.0001 0.16s
447 0.0015 -0.0000 0.16s
448 0.0014 -0.0000 0.16s
449 0.0013 -0.0001 0.15s
450 0.0013 -0.0000 0.15s
451 0.0013 -0.0001 0.15s
452 0.0012 -0.0000 0.14s
453 0.0012 -0.0000 0.14s
454 0.0012 -0.0000 0.14s
455 0.0012 -0.0000 0.13s
456 0.0012 -0.0000 0.13s
457 0.0011 -0.0000 0.13s
458 0.0012 -0.0001 0.12s
459 0.0012 -0.0000 0.12s
460 0.0012 -0.0000 0.12s
461 0.0011 -0.0000 0.12s
462 0.0011 -0.0000 0.11s
463 0.0011 -0.0000 0.11s
464 0.0011 -0.0000 0.11s
465 0.0010 -0.0000 0.10s
466 0.0010 -0.0000 0.10s
467 0.0010 -0.0000 0.10s
468 0.0010 -0.0000 0.09s
469 0.0010 -0.0000 0.09s
470 0.0009 -0.0000 0.09s
471 0.0010 -0.0000 0.09s
472 0.0010 -0.0000 0.08s
473 0.0009 -0.0000 0.08s
474 0.0009 -0.0000 0.08s
475 0.0009 -0.0000 0.07s
476 0.0008 -0.0000 0.07s
477 0.0008 -0.0000 0.07s
478 0.0008 -0.0000 0.07s
479 0.0008 -0.0000 0.06s
480 0.0008 -0.0000 0.06s
481 0.0008 -0.0000 0.06s
482 0.0008 -0.0000 0.05s
483 0.0008 -0.0000 0.05s
484 0.0007 -0.0000 0.05s
485 0.0007 -0.0000 0.04s
486 0.0008 -0.0000 0.04s
487 0.0007 -0.0000 0.04s
488 0.0007 -0.0000 0.04s
489 0.0007 -0.0000 0.03s
490 0.0006 -0.0000 0.03s
491 0.0007 -0.0000 0.03s
492 0.0006 -0.0000 0.02s
493 0.0006 -0.0000 0.02s
494 0.0006 -0.0000 0.02s
495 0.0006 -0.0000 0.01s
496 0.0006 -0.0000 0.01s
497 0.0006 -0.0000 0.01s
498 0.0006 -0.0000 0.01s
499 0.0006 -0.0000 0.00s
500 0.0005 -0.0000 0.00s
Model Performance in Training set(Polynomial features)
Mean squared error = 0.00
Training Scores
===================
Estimator 1 score 58.520
Estimator 2 score 45.283
Estimator 3 score 40.152
Estimator 4 score 27.777
Estimator 5 score 27.632
Estimator 6 score 21.099
Estimator 7 score 17.583
Estimator 8 score 15.089
Estimator 9 score 11.964
Estimator 10 score 10.202
Estimator 11 score 9.526
Estimator 12 score 7.298
Estimator 13 score 6.640
Estimator 14 score 6.488
Estimator 15 score 6.060
Estimator 16 score 5.483
Estimator 17 score 4.487
Estimator 18 score 4.573
Estimator 19 score 4.491
Estimator 20 score 4.271
Estimator 21 score 3.814
Estimator 22 score 4.057
Estimator 23 score 3.283
Estimator 24 score 3.683
Estimator 25 score 3.393
Estimator 26 score 3.077
Estimator 27 score 2.917
Estimator 28 score 2.800
Estimator 29 score 2.938
Estimator 30 score 2.934
Estimator 31 score 2.711
Estimator 32 score 2.433
Estimator 33 score 2.225
Estimator 34 score 2.397
Estimator 35 score 2.170
Estimator 36 score 2.427
Estimator 37 score 2.086
Estimator 38 score 1.902
Estimator 39 score 2.297
Estimator 40 score 2.094
Estimator 41 score 1.917
Estimator 42 score 1.768
Estimator 43 score 1.996
Estimator 44 score 1.931
Estimator 45 score 1.797
Estimator 46 score 1.722
Estimator 47 score 1.899
Estimator 48 score 1.666
Estimator 49 score 1.636
Estimator 50 score 1.613
Estimator 51 score 1.511
Estimator 52 score 1.428
Estimator 53 score 1.441
Estimator 54 score 1.571
Estimator 55 score 1.434
Estimator 56 score 1.247
Estimator 57 score 1.372
Estimator 58 score 1.309
Estimator 59 score 1.262
Estimator 60 score 1.232
Estimator 61 score 1.282
Estimator 62 score 1.232
Estimator 63 score 1.170
Estimator 64 score 1.141
Estimator 65 score 1.113
Estimator 66 score 1.151
Estimator 67 score 0.970
Estimator 68 score 1.016
Estimator 69 score 0.993
Estimator 70 score 0.992
Estimator 71 score 0.958
Estimator 72 score 0.831
Estimator 73 score 0.955
Estimator 74 score 0.769
Estimator 75 score 0.914
Estimator 76 score 0.806
Estimator 77 score 0.740
Estimator 78 score 0.797
Estimator 79 score 0.827
Estimator 80 score 0.822
Estimator 81 score 0.648
Estimator 82 score 0.811
Estimator 83 score 0.698
Estimator 84 score 0.689
Estimator 85 score 0.680
Estimator 86 score 0.636
Estimator 87 score 0.699
Estimator 88 score 0.694
Estimator 89 score 0.624
Estimator 90 score 0.703
Estimator 91 score 0.634
Estimator 92 score 0.579
Estimator 93 score 0.578
Estimator 94 score 0.628
Estimator 95 score 0.566
Estimator 96 score 0.560
Estimator 97 score 0.581
Estimator 98 score 0.568
Estimator 99 score 0.572
Estimator 100 score 0.528
Estimator 101 score 0.461
Estimator 102 score 0.527
Estimator 103 score 0.496
Estimator 104 score 0.483
Estimator 105 score 0.442
Estimator 106 score 0.490
Estimator 107 score 0.452
Estimator 108 score 0.475
Estimator 109 score 0.432
Estimator 110 score 0.430
Estimator 111 score 0.425
Estimator 112 score 0.363
Estimator 113 score 0.395
Estimator 114 score 0.416
Estimator 115 score 0.385
Estimator 116 score 0.388
Estimator 117 score 0.376
Estimator 118 score 0.398
Estimator 119 score 0.378
Estimator 120 score 0.330
Estimator 121 score 0.334
Estimator 122 score 0.345
Estimator 123 score 0.318
Estimator 124 score 0.303
Estimator 125 score 0.306
Estimator 126 score 0.334
Estimator 127 score 0.287
Estimator 128 score 0.290
Estimator 129 score 0.306
Estimator 130 score 0.285
Estimator 131 score 0.286
Estimator 132 score 0.244
Estimator 133 score 0.248
Estimator 134 score 0.262
Estimator 135 score 0.270
Estimator 136 score 0.233
Estimator 137 score 0.226
Estimator 138 score 0.252
Estimator 139 score 0.244
Estimator 140 score 0.256
Estimator 141 score 0.223
Estimator 142 score 0.231
Estimator 143 score 0.223
Estimator 144 score 0.233
Estimator 145 score 0.246
Estimator 146 score 0.204
Estimator 147 score 0.216
Estimator 148 score 0.205
Estimator 149 score 0.207
Estimator 150 score 0.182
Estimator 151 score 0.188
Estimator 152 score 0.175
Estimator 153 score 0.173
Estimator 154 score 0.197
Estimator 155 score 0.195
Estimator 156 score 0.178
Estimator 157 score 0.168
Estimator 158 score 0.173
Estimator 159 score 0.172
Estimator 160 score 0.174
Estimator 161 score 0.159
Estimator 162 score 0.166
Estimator 163 score 0.158
Estimator 164 score 0.149
Estimator 165 score 0.165
Estimator 166 score 0.152
Estimator 167 score 0.141
Estimator 168 score 0.135
Estimator 169 score 0.140
Estimator 170 score 0.138
Estimator 171 score 0.132
Estimator 172 score 0.129
Estimator 173 score 0.135
Estimator 174 score 0.128
Estimator 175 score 0.133
Estimator 176 score 0.123
Estimator 177 score 0.125
Estimator 178 score 0.120
Estima