先插入代码
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

# Lasso on the Advertising data: tune alpha by 5-fold CV on the training split, score on the test split.
if __name__ == "__main__":
    # Load the data set with pandas.
    data = pd.read_csv('8.Advertising.csv')  # columns: TV, Radio, Newspaper, Sales
    x = data[['TV', 'Radio', 'Newspaper']]
    # To fit on two features only, select data[['TV', 'Radio']] instead.
    y = data['Sales']
    print(x)
    print(y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    # Only the training split may be used for fitting; the test split must stay unseen.
    model = Lasso()
    # Ridge() can be substituted here to use an L2 penalty instead of L1.

    alpha_can = np.logspace(-3, 2, 10)  # 10 candidate alphas from 1e-3 to 1e2
    lasso_model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lasso_model.fit(x_train, y_train)  # was fit(x, y), which leaked the test rows into training
    print('验证参数:\n', lasso_model.best_params_)

    y_hat = lasso_model.predict(x_test)  # keep the DataFrame so feature names match the fit
    mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print(mse, rmse)

    t = np.arange(len(x_test))  # sample index used as the x-axis
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.legend(loc='upper right')
    plt.grid()
    plt.show()
代码解析(以行号为基准)
11行:读取csv数据,共n行4列(4列分别为TV、Radio、Newspaper、Sales)。
12行:选取TV、Radio、Newspaper这三列数据作为特征量。
14行:Sales为对应的目标值(公式:$y_{\text{Sales}} = \theta_0 x_0 + \theta_1 x_{\text{TV}} + \theta_2 x_{\text{Radio}} + \theta_3 x_{\text{Newspaper}}$,其中 $x_0 = 1$ 对应截距项)。