算法有待我再琢磨琢磨
导入boston房价数据
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the Boston housing dataset and remove censored targets.
# NOTE(review): sklearn.datasets.load_boston was deprecated in scikit-learn
# 1.0 and removed in 1.2 -- running this transcript needs an older version.
boston=datasets.load_boston()
X=boston.data
y=boston.target
# The target was capped at 50.0 during data collection; rows at the cap are
# censored, so drop them instead of fitting to an artificial ceiling.
X=X[y<50.0]
y=y[y<50.0]
X.shape
# REPL output: 490 samples remain, 13 features each.
(490, 13)
导入自己写的LinearRegression.py
# Split the data and fit the hand-written LinearRegression (normal equation).
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=666)
from LinearRegression import LinearRegression
reg=LinearRegression()
reg.fit_normal(X_train,y_train)
# REPL output: the estimator's repr.
LinearRegression()
reg.coef_# coefficients (one weight per feature)
# REPL output: fitted weights.
array([-1.15625837e-01, 3.13179564e-02, -4.35662825e-02, -9.73281610e-02,
-1.09500653e+01, 3.49898935e+00, -1.41780625e-02, -1.06249020e+00,
2.46031503e-01, -1.23291876e-02, -8.79440522e-01, 8.31653623e-03,
-3.98593455e-01])
reg.interception_# intercept
32.59756158867594
# R^2 score on the held-out test set.
reg.score(X_test,y_test)
0.8009390227581046
scikit-learn中的回归问题
# Same regression with scikit-learn's LinearRegression for comparison;
# coefficients, intercept and R^2 match the hand-written version to ~1e-10.
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(X_train,y_train)
# REPL output: fitted estimator repr.
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
lin_reg.coef_
# REPL output: fitted weights.
array([-1.15625837e-01, 3.13179564e-02, -4.35662825e-02, -9.73281610e-02,
-1.09500653e+01, 3.49898935e+00, -1.41780625e-02, -1.06249020e+00,
2.46031503e-01, -1.23291876e-02, -8.79440522e-01, 8.31653623e-03,
-3.98593455e-01])
lin_reg.intercept_
32.59756158869959
# R^2 on the test set -- agrees with the custom implementation.
lin_reg.score(X_test,y_test)
0.8009390227581041
KNN Regressor使用KNN算法解决回归问题（KNN是非线性模型，并非线性回归）
# Baseline KNN regressor with default hyperparameters (k=5, uniform weights).
from sklearn.neighbors import KNeighborsRegressor
knn_reg=KNeighborsRegressor()
knn_reg.fit(X_train,y_train)
knn_reg.score(X_test,y_test)
# R^2 on the test set -- clearly worse than linear regression (0.80).
0.602674505080953
网格搜索,寻找超参数
# Grid search over KNN hyperparameters: uniform weights, and distance
# weights combined with the Minkowski exponent p.
from sklearn.model_selection import GridSearchCV
param_grid=[
{
"weights":["uniform"],
"n_neighbors":[i for i in range(1,11)]
},
{
"weights":["distance"],
"n_neighbors":[i for i in range(1,11)],
# p is the Minkowski metric exponent (1 = Manhattan, 2 = Euclidean).
"p":[i for i in range(1,6)]
}
]
knn_reg=KNeighborsRegressor()
# n_jobs=-1: use all CPU cores; verbose=1: print fitting progress.
grid_search=GridSearchCV(knn_reg,param_grid,n_jobs=-1,verbose=1)
grid_search.fit(X_train,y_train)
# --- captured fitting log and fitted-estimator repr below ---
Fitting 5 folds for each of 60 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 235 tasks | elapsed: 0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 0.6s finished
GridSearchCV(cv=None, error_score=nan,
estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='deprecated', n_jobs=-1,
param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'weights': ['uniform']},
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=1)
grid_search.best_params_
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
# best_score_ is the mean cross-validation score, not the test-set R^2,
# so it is not directly comparable with the scores above.
grid_search.best_score_
0.6243135119018297
grid_search.best_estimator_.score(X_test,y_test)
# Tuned KNN (0.74) beats default KNN (0.60) but not linear regression (0.80).
0.7353138117643773
LinearRegression.py（与上文 from LinearRegression import LinearRegression 的模块名保持一致）
import numpy as np
from metrics import r2_score
class LinearRegression:
    """Multiple linear regression fitted with the closed-form normal equation."""

    def __init__(self):
        """Initialize an unfitted model.

        BUG FIX: the original spelled this ``__int__``, so it was never
        invoked and ``coef_`` / ``interception_`` / ``_theta`` did not exist
        before ``fit_normal`` -- calling ``predict`` on an unfitted model
        raised AttributeError instead of the intended assertion message.
        """
        self.coef_ = None          # feature weights, shape (n_features,)
        self.interception_ = None  # intercept term (original attribute name kept for callers)
        self._theta = None         # full parameter vector [intercept, *coef_]

    def fit_normal(self, X_train, y_train):
        """Fit theta by solving the normal equation (X_b^T X_b) theta = X_b^T y.

        Uses ``np.linalg.solve`` instead of explicitly inverting the matrix:
        same solution, better numerical stability and less work.

        Returns self, so calls can be chained.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "必须相等"
        # Prepend a column of ones so the intercept is learned as theta[0].
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y_train))
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return predictions for X_predict, shape (n_samples,)."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "X_predict的特征数量必须等于X_train"
        # Same augmentation as in fit_normal: ones column for the intercept.
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return the R^2 score of the model on (X_test, y_test)."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"