Problem: linear regression assumes that the data follows an underlying linear relationship, which is often not the case.
Treat x² as one feature and x as another: the data originally has the single feature x, but we now regard it as a data set with two features, the extra one being x². The formula itself is still a linear regression formula in these features, yet viewed as a function of x it is a nonlinear equation. This approach is called polynomial regression.
PCA reduces the dimensionality of the data, whereas polynomial regression increases it.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Synthetic data set: 100 samples of y = 0.5*x^2 + x + 2 plus N(0, 1) noise,
# with x drawn uniformly from [-3, 3].
np.random.seed(666)  # fixed seed so every run draws the same sample
x = np.random.uniform(-3.0, 3.0, size=100)
noise = np.random.normal(0, 1, size=100)
X = x.reshape(-1, 1)  # the same values as a single-feature column, shape (100, 1)
y = 0.5 * x**2 + x + 2 + noise
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialRegression(degree):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline chains three steps: polynomial feature expansion,
    standardization, and ordinary linear regression.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"lin_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
# Degree 2: the same order as the quadratic that generated the data.
poly2_reg = PolynomialRegression(degree=2)
poly2_reg.fit(X, y)
y2_predict = poly2_reg.predict(X)
mean_squared_error(y, y2_predict)  # training MSE; only echoed in an interactive session

# Draw the fitted curve left-to-right by ordering the points by x.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y2_predict[order], color='r')
plt.show()
# A different degree: 10.
poly10_reg = PolynomialRegression(degree=10)
poly10_reg.fit(X, y)
y10_predict = poly10_reg.predict(X)
mean_squared_error(y, y10_predict)  # training MSE; only echoed in an interactive session

# Draw the fitted curve left-to-right by ordering the points by x.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y10_predict[order], color='r')
plt.show()
# degree=100: far more flexible than the quadratic that generated the data.
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X, y)
y100_predict = poly100_reg.predict(X)
mean_squared_error(y, y100_predict)  # training MSE; only echoed in an interactive session

# Draw the fitted curve left-to-right by ordering the points by x.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y100_predict[order], color='r')
plt.show()
The main problem machine learning has to deal with is, in fact, overfitting.
Generalization ability: the ability to carry over from known cases to new ones. (This curve was obtained from the known training data, but it performs very poorly on new data — that is, its generalization ability is poor.)
We want to train this model not to fit these points to the greatest extent, but to obtain a predictable model. When new data is available, our model can give a good answer.
Therefore, it is meaningless for us to measure how well our model fits the training data. What we really need is to be able to measure how good the generalization ability of this model is.
So use training data set and test data set
If the model obtained from the training data also performs well on the test data, we can say that its generalization ability is strong. But if it performs very poorly on the test data set, its generalization ability is weak, and most likely we have run into overfitting.
Model complexity: it means different things for different models
KNN: The smaller the K, the more complex the model; K=1, the most complex
Polynomial regression: the higher the degree (order), the more complex the model
You can also see the over-fitting and under-fitting of the model through the learning curve
Learning curve: how the performance of the model trained by the algorithm changes as the number of training samples gradually increases
import numpy as np
import matplotlib.pyplot as plt
# Same synthetic quadratic as before: y = 0.5*x^2 + x + 2 plus N(0, 1) noise,
# with 100 values of x drawn uniformly from [-3, 3].
np.random.seed(666)  # fixed seed so every run draws the same sample
x = np.random.uniform(-3.0, 3.0, size=100)
noise = np.random.normal(0, 1, size=100)
X = x.reshape(-1, 1)  # the same values as a single-feature column, shape (100, 1)
y = 0.5 * x**2 + x + 2 + noise
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Learning curve for plain linear regression: refit on a growing prefix of
# the training set and record the error on that prefix and on the test set.
# The sizes come from len(X_train), so the curve stays correct if the split
# proportions ever change.
train_sizes = list(range(1, len(X_train) + 1))
train_score = []
test_score = []
for i in train_sizes:
    lin_reg = LinearRegression()
    # Only the training data is sliced to its first i samples;
    # the test set is always evaluated in full.
    lin_reg.fit(X_train[:i], y_train[:i])
    y_train_predict = lin_reg.predict(X_train[:i])
    train_score.append(mean_squared_error(y_train[:i], y_train_predict))
    y_test_predict = lin_reg.predict(X_test)
    test_score.append(mean_squared_error(y_test, y_test_predict))

# Plot the square root of the MSE (RMSE) so the error is in the units of y.
plt.plot(train_sizes, np.sqrt(train_score), label="train")
plt.plot(train_sizes, np.sqrt(test_score), label="test")
plt.legend()
plt.show()
# Wrapped up as a reusable function.
def plot_learning_curve(algo, X_train, X_test, y_train, y_test):
    """Plot the train/test RMSE of ``algo`` as the training set grows.

    For every prefix length ``i`` from 1 to ``len(X_train)``, ``algo`` is
    refit on the first ``i`` training samples, then scored with mean
    squared error on those same samples and on the whole test set. The
    square roots of both score series are plotted against ``i``, with the
    y-axis limited to [0, 4].

    Args:
        algo: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted in place on every iteration.
        X_train: Training features; prefixes ``X_train[:i]`` are used.
        X_test: Test features, always used in full.
        y_train: Training targets aligned with ``X_train``.
        y_test: Test targets aligned with ``X_test``.
    """
    sample_counts = list(range(1, len(X_train) + 1))
    train_score = []
    test_score = []
    for count in sample_counts:
        X_seen, y_seen = X_train[:count], y_train[:count]
        algo.fit(X_seen, y_seen)
        train_score.append(mean_squared_error(y_seen, algo.predict(X_seen)))
        test_score.append(mean_squared_error(y_test, algo.predict(X_test)))

    plt.plot(sample_counts, np.sqrt(train_score), label="train")
    plt.plot(sample_counts, np.sqrt(test_score), label="test")
    plt.legend()
    plt.axis([0, len(X_train) + 1, 0, 4])
    plt.show()
# Learning curve of a fresh plain linear regression on the current split.
plot_learning_curve(LinearRegression(),X_train,X_test,y_train,y_test)
# Using polynomial regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialRegression(degree):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline chains three steps: polynomial feature expansion,
    standardization, and ordinary linear regression.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"lin_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
# Degree 2: learning curve of a quadratic model.
poly2_reg = PolynomialRegression(degree=2)
plot_learning_curve(poly2_reg, X_train, X_test, y_train, y_test)

# Degree 20: overfitting. Bound to its own name so that poly2_reg keeps
# referring to the degree-2 pipeline instead of being silently replaced.
poly20_reg = PolynomialRegression(degree=20)
plot_learning_curve(poly20_reg, X_train, X_test, y_train, y_test)
问题:针对特定的测试数据集调参,得到的模型可能对这个测试数据集过拟合
解决方法:将整个数据分成三部分:训练数据集、验证数据集(validation set)、测试数据集(将验证数据集当成之前的测试数据集)
import numpy as np
from sklearn import datasets
# Handwritten-digit recognition data.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# Try out train_test_split: hold out 40% of the samples as the test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=666
)
from sklearn.neighbors import KNeighborsClassifier
# Exhaustive search over k (neighbours) and p (Minkowski exponent), scored
# directly on the test set.
# NOTE: choosing hyper-parameters by test-set score means the reported best
# score was obtained on data that influenced the choice.
best_score, best_p, best_k = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        knn_clf.fit(X_train, y_train)
        # Evaluate each candidate exactly once.
        score = knn_clf.score(X_test, y_test)
        # Strict '>' keeps the first (k, p) pair that reaches the top score.
        if score > best_score:
            best_score, best_p, best_k = score, p, k
print("Best K=", best_k)
print("Best P=", best_p)
print("Best Score", best_score)
# Using cross-validation
from sklearn.model_selection import cross_val_score

# Per-fold scores of a default KNN classifier; the array is only echoed in
# an interactive session.
knn_clf = KNeighborsClassifier()
cross_val_score(knn_clf, X_train, y_train, cv=3)

# Same (k, p) search as before, but every candidate is ranked by its mean
# 3-fold cross-validation score on the training data; the test set is untouched.
best_score, best_p, best_k = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        scores = cross_val_score(knn_clf, X_train, y_train, cv=3)
        score = np.mean(scores)
        if score <= best_score:
            continue
        best_score, best_p, best_k = score, p, k

print("Best K=", best_k)
print("Best P=", best_p)
print("Best Score", best_score)
# Refit with the best hyper-parameters found by the cross-validated search.
# They are read from best_k / best_p rather than re-typed by hand, so this
# model always matches whatever the search actually selected.
best_knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=best_k, p=best_p)
best_knn_clf.fit(X_train, y_train)
best_knn_clf.score(X_test, y_test)  # accuracy on the test set, which the search never used
# Revisiting grid search
from sklearn.model_selection import GridSearchCV  # the "CV" stands for cross-validation

# Same search space as the manual loops: k in 2..10, p in 1..5, distance weighting.
param_grid = [
    {
        'weights': ['distance'],
        'n_neighbors': list(range(2, 11)),
        'p': list(range(1, 6)),
    }
]

# n_jobs=-1 puts every CPU core to work. cv=3 asks for 3-fold cross-validation
# explicitly (NOTE: the default number of folds depends on the scikit-learn
# version; it is 5 since release 0.22).
grid_search = GridSearchCV(knn_clf, param_grid, verbose=1, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# The three expressions below are only echoed in an interactive session.
grid_search.best_score_
grid_search.best_params_
cross_val_score(knn_clf, X_train, y_train, cv=5)
留一法:训练数据集有m个样本,就分成m份;每次都将m-1份样本用于训练,然后看对剩下的那一个样本预测得准不准,将这些结果综合起来进行平均,作为衡量当前参数下这个模型预测准确度的指标
偏差方差平衡:Bias Variance Trade off
导致较高方差:是模型太过复杂,没有完全的学习到这个问题的实质,而学习到了很多的噪音
高方差:泛化能力差
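解决高方差:模型的正则化
在损失函数中加入限制参数大小的正则化项:$J(\theta) = \mathrm{MSE}(y, \hat{y}; \theta) + \alpha \sum_{i=1}^{n} \theta_i^2$
α:新的超参数,控制正则化项所占的比重
这种正则化的方式又叫做岭回归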
import numpy as np
import matplotlib.pyplot as plt
# Synthetic data set: 100 samples of the straight line y = 0.5*x + 3 plus
# N(0, 1) noise, with x drawn uniformly from [-3, 3].
np.random.seed(42)  # fixed seed so every run draws the same sample
x = np.random.uniform(-3.0, 3.0, size=100)
noise = np.random.normal(0, 1, size=100)
X = x.reshape(-1, 1)  # the same values as a single-feature column, shape (100, 1)
y = 0.5 * x + 3 + noise

# Look at the raw sample.
plt.scatter(x, y)
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
def PolynomialRegression(degree):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline chains three steps: polynomial feature expansion,
    standardization, and ordinary linear regression.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"lin_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
from sklearn.model_selection import train_test_split

# train_test_split gets no random_state here; the global NumPy seed is set
# first, presumably to make the split reproducible.
np.random.seed(666)
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.metrics import mean_squared_error

# Unregularized degree-20 polynomial fit on linear data, as a baseline.
poly_reg = PolynomialRegression(degree=20)
poly_reg.fit(X_train, y_train)
y_predict = poly_reg.predict(X_test)
mean_squared_error(y_test, y_predict)  # test MSE; only echoed in an interactive session

# Prediction curve on 100 evenly spaced points over [-3, 3], drawn over the sample.
X_plot = np.linspace(-3, 3, 100).reshape(-1, 1)
y_plot = poly_reg.predict(X_plot)
plt.scatter(x, y)
plt.plot(X_plot[:, 0], y_plot, color='r')
plt.axis([-3, 3, 0, 6])
plt.show()
def plot_model(model):
    """Plot ``model``'s prediction curve on top of the sample scatter.

    Predictions are taken on 100 evenly spaced points in [-3, 3] and drawn
    in red over a scatter of the module-level ``x`` / ``y`` data. The view
    is fixed to x in [-3, 3] and y in [0, 6].

    Args:
        model: Object with a ``predict`` method that accepts a (100, 1)
            array.
    """
    grid = np.linspace(-3, 3, 100)
    curve = model.predict(grid.reshape(100, 1))
    plt.scatter(x, y)
    plt.plot(grid, curve, color='r')
    plt.axis([-3, 3, 0, 6])
    plt.show()
# Draw the degree-20 polynomial fit through the plot_model helper.
plot_model(poly_reg)
# Using ridge regression
from sklearn.linear_model import Ridge
def RidgeRegression(degree, alpha):
    """Build an unfitted polynomial pipeline that ends in a ridge regressor.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.
        alpha: Regularization strength handed to ``Ridge``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"ridge_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("ridge_reg", Ridge(alpha=alpha)),
    ]
    return Pipeline(steps)
# Degree-20 ridge fits with increasing alpha. Each test MSE expression is
# only echoed in an interactive session.

# alpha = 0.0001
ridge1_reg = RidgeRegression(degree=20, alpha=0.0001)
ridge1_reg.fit(X_train, y_train)
y1_predict = ridge1_reg.predict(X_test)
mean_squared_error(y_test, y1_predict)
plot_model(ridge1_reg)

# alpha = 1
ridge2_reg = RidgeRegression(degree=20, alpha=1)
ridge2_reg.fit(X_train, y_train)
y2_predict = ridge2_reg.predict(X_test)
mean_squared_error(y_test, y2_predict)
plot_model(ridge2_reg)

# alpha = 100
ridge3_reg = RidgeRegression(degree=20, alpha=100)
ridge3_reg.fit(X_train, y_train)
y3_predict = ridge3_reg.predict(X_test)
mean_squared_error(y_test, y3_predict)
plot_model(ridge3_reg)
另外一种模型正则化的方式:LASSO Regularization
import numpy as np
import matplotlib.pyplot as plt
# Synthetic data set: 100 samples of the straight line y = 0.5*x + 3 plus
# N(0, 1) noise, with x drawn uniformly from [-3, 3].
np.random.seed(42)  # fixed seed so every run draws the same sample
x = np.random.uniform(-3.0, 3.0, size=100)
noise = np.random.normal(0, 1, size=100)
X = x.reshape(-1, 1)  # the same values as a single-feature column, shape (100, 1)
y = 0.5 * x + 3 + noise

# Look at the raw sample.
plt.scatter(x, y)
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
def PolynomialRegression(degree):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline chains three steps: polynomial feature expansion,
    standardization, and ordinary linear regression.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"lin_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
from sklearn.model_selection import train_test_split

# train_test_split gets no random_state here; the global NumPy seed is set
# first, presumably to make the split reproducible.
np.random.seed(666)
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.metrics import mean_squared_error

# Unregularized degree-20 polynomial fit on linear data, as a baseline.
poly_reg = PolynomialRegression(degree=20)
poly_reg.fit(X_train, y_train)
y_predict = poly_reg.predict(X_test)
mean_squared_error(y_test, y_predict)  # test MSE; only echoed in an interactive session
def plot_model(model):
    """Plot ``model``'s prediction curve on top of the sample scatter.

    Predictions are taken on 100 evenly spaced points in [-3, 3] and drawn
    in red over a scatter of the module-level ``x`` / ``y`` data. The view
    is fixed to x in [-3, 3] and y in [0, 6].

    Args:
        model: Object with a ``predict`` method that accepts a (100, 1)
            array.
    """
    grid = np.linspace(-3, 3, 100)
    curve = model.predict(grid.reshape(100, 1))
    plt.scatter(x, y)
    plt.plot(grid, curve, color='r')
    plt.axis([-3, 3, 0, 6])
    plt.show()
# Draw the unregularized degree-20 polynomial fit through the plot_model helper.
plot_model(poly_reg)
# LASSO
from sklearn.linear_model import Lasso
def LassoRegression(degree, alpha):
    """Build an unfitted polynomial pipeline that ends in a LASSO regressor.

    Args:
        degree: Polynomial degree handed to ``PolynomialFeatures``.
        alpha: Regularization strength handed to ``Lasso``.

    Returns:
        A ``Pipeline`` whose steps are named ``"poly"``, ``"std_scaler"``
        and ``"lasso_reg"``.
    """
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        # Named after the estimator it actually holds, so lookups by step
        # name (e.g. named_steps) are not mislabelled as ridge.
        ("lasso_reg", Lasso(alpha=alpha)),
    ])
# Degree-20 LASSO fit with alpha = 0.01. Each test MSE expression is only
# echoed in an interactive session.
lasso1_reg = LassoRegression(degree=20, alpha=0.01)
lasso1_reg.fit(X_train, y_train)
y1_predict = lasso1_reg.predict(X_test)
mean_squared_error(y_test, y1_predict)
plot_model(lasso1_reg)

# Increase alpha to 0.1.
lasso2_reg = LassoRegression(degree=20, alpha=0.1)
lasso2_reg.fit(X_train, y_train)
y2_predict = lasso2_reg.predict(X_test)
mean_squared_error(y_test, y2_predict)
plot_model(lasso2_reg)