使用sklearn包实现广告数据的线性回归模型

code:

import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pprint import pprint

if __name__ == '__main__':
    # Script: fit an ordinary least-squares model of Sales on the three
    # advertising channels, print the hold-out MSE/RMSE and plot the
    # predictions against the true values.
    path = "Advertising.csv"
    # Load the data set with pandas.
    data = pd.read_csv(path)
    x = data[['TV', 'Radio', 'Newspaper']]
    y = data['Sales']
    print("X data:")
    print(x)
    print("Y data:")
    print(y)

    # Plot Sales against each feature, one stacked subplot per channel.
    plt.figure(figsize=(15, 15))
    channel_styles = [('TV', 'ro'), ('Radio', 'g^'), ('Newspaper', 'b*')]
    for position, (column, marker) in enumerate(channel_styles, start=1):
        plt.subplot(3, 1, position)
        plt.plot(data[column], y, marker)
        plt.title(column)
        plt.grid()
    plt.tight_layout()
    plt.show()

    # Hold out 20% of the rows for testing; train on the remaining 80%.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8, random_state=1)
    print("x_train:")
    print(x_train)
    print("y_train")
    print(y_train)

    # Build the linear regression model and fit it on the training split.
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    print('+++++++++++++++++++++')
    print(model)
    print('++++++++++++++++++++')
    print(linreg.coef_)
    print(linreg.intercept_)

    # Predict from the DataFrame itself: the model was fitted on a DataFrame,
    # so converting x_test to a bare ndarray would drop the column names and
    # make scikit-learn warn about missing feature names.
    y_hat = linreg.predict(x_test)
    mse = np.average((y_hat - np.array(y_test)) ** 2)
    rmse = np.sqrt(mse)
    print(mse)
    print(rmse)

    # Compare true and predicted sales, sample by sample, on the test split.
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', lw=2, label="True data")
    plt.plot(t, y_hat, 'g-', lw=2, label='Predicted data')
    plt.legend(loc='upper right')
    plt.title('Regression method to regress sales', fontsize=18)
    plt.grid()
    plt.show()

这里再使用其他的线性回归方法(Lasso和Ridge)来实现该模型,其实大同小异,区别主要在于损失函数中加入的正则项(约束条件)不同:Lasso使用L1正则,Ridge使用L2正则;正则化强度alpha通过GridSearchCV做交叉验证来选取。具体可以搜索lasso和ridge regression方法。
import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from pprint import pprint

if __name__ == '__main__':
    # Script: fit a Lasso model of Sales on the three advertising channels,
    # choosing alpha by 5-fold cross-validated grid search, then print the
    # hold-out score/MSE/RMSE and plot the predictions.
    path = "Advertising.csv"
    # Load the data set with pandas.
    data = pd.read_csv(path)
    x = data[['TV', 'Radio', 'Newspaper']]
    y = data['Sales']
    print("X data:")
    print(x)
    print("Y data:")
    print(y)

    # Plot Sales against each feature, one stacked subplot per channel.
    plt.figure(figsize=(15, 15))
    channel_styles = [('TV', 'ro'), ('Radio', 'g^'), ('Newspaper', 'b*')]
    for position, (column, marker) in enumerate(channel_styles, start=1):
        plt.subplot(3, 1, position)
        plt.plot(data[column], y, marker)
        plt.title(column)
        plt.grid()
    plt.tight_layout()
    plt.show()

    # Hold out 20% of the rows for testing; train on the remaining 80%.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8, random_state=1)
    print("x_train:")
    print(x_train)
    print("y_train")
    print(y_train)

    # Base estimator for the search; swap in Ridge() to search an
    # L2-penalised model instead.
    model = Lasso()
    # Ten candidate alphas, log-spaced between 1e-3 and 1e2.
    alpha_can = np.logspace(-3, 2, 10)
    lass_model1 = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    lass_model1.fit(x_train, y_train)
    print('+++++++++++++++++++++')
    # GridSearchCV fits clones of `model`, so `model` itself stays unfitted;
    # print the refitted best estimator to show the model actually used.
    print(lass_model1.best_estimator_)
    print('++++++++++++++++++++')
    print("hyper-parameter:\n", lass_model1.best_params_)

    # Predict from the DataFrame (as score() below does): the search was
    # fitted on a DataFrame, so a bare ndarray would drop the column names
    # and make scikit-learn warn about missing feature names.
    y_hat = lass_model1.predict(x_test)
    print("lasso_models score:\n", lass_model1.score(x_test, y_test))
    mse = np.average((y_hat - np.array(y_test)) ** 2)
    rmse = np.sqrt(mse)
    print(mse)
    print(rmse)

    # Compare true and predicted sales, sample by sample, on the test split.
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', lw=2, label="True data")
    plt.plot(t, y_hat, 'g-', lw=2, label='Predicted data')
    plt.legend(loc='upper right')
    plt.title('New Regression method to regress sales', fontsize=18)
    plt.grid()
    plt.show()




当然,经典的波士顿房价数据也可以拿来试试:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV
import sklearn.datasets
from pprint import pprint
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import warnings

def not_empty(s):
    """Return whether *s* differs from the empty string.

    Serves as the ``filter`` predicate that discards the empty tokens left
    behind when a record is split on single spaces.
    """
    empty = ''
    return s != empty

if __name__ == "__main__":
    # Script: fit a standardise -> degree-3 polynomial -> ElasticNetCV
    # pipeline on the housing data, print the selected hyper-parameters and
    # hold-out R2/MSE, and plot predicted against real prices.

    # NOTE(review): this silences every warning for the rest of the run.
    warnings.filterwarnings(action='ignore')
    np.set_printoptions(suppress=True)

    # Read the file with pandas. The parsing below takes column 0 of every
    # row as one space-separated record of 14 numbers (presumably the file
    # contains no commas, so each line arrives as a single field).
    file_data = pd.read_csv('housing.data', header=None)
    data = np.empty((len(file_data), 14))
    for i, d in enumerate(file_data.values):
        # Splitting on single spaces yields empty tokens between runs of
        # spaces; drop them before converting to float.
        data[i] = [float(token) for token in filter(not_empty, d[0].split(' '))]
    # First 13 columns are the features, the last column is the target.
    x, y = np.split(data, (13, ), axis=1)
    # Alternative from the original notes: sklearn.datasets.load_boston()
    # (said to need network access).
    # NOTE(review): load_boston is gone from recent scikit-learn releases -
    # confirm before re-enabling.
    print('numbers of samples:%d, numbers of features:%d' % x.shape)
    print(y.shape)

    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=0)
    model = Pipeline([
        ('ss', StandardScaler()),
        # include_bias=True adds the constant column, which is why the linear
        # step is created with fit_intercept=False.
        ('poly', PolynomialFeatures(degree=3, include_bias=True)),
        # max_iter must be an integer: scikit-learn's parameter validation
        # rejects the float 1e3.
        ('linear', ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.99, 1], alphas=np.logspace(-3, 2, 5),
                                fit_intercept=False, max_iter=1000, cv=3))
    ])
    print('modeling...')
    model.fit(x_train, y_train.ravel())
    # Fetch the fitted ElasticNetCV step by name. get_params('linear') only
    # worked because the string was taken as a truthy `deep` flag.
    linear = model.named_steps['linear']
    print('hyper-parameters:', linear.alpha_)
    print('L1 ratio:', linear.l1_ratio_)
    print('parmas:', linear.coef_.ravel())
    y_pred = model.predict(x_test)
    r2 = model.score(x_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    print('R2:', r2)
    print('mean vars:', mse)

    # Compare real and predicted prices, sample by sample, on the test split.
    t = np.arange(len(y_pred))
    plt.figure(facecolor='w')
    plt.plot(t, y_test.ravel(), 'r-', lw=2, label='real values')
    plt.plot(t, y_pred, 'g-', lw=2, label='predicted values')
    plt.legend(loc='best')
    plt.title('the prediction of Boston Huoses Prices', fontsize=18)
    plt.xlabel('number the samples', fontsize=15)
    plt.ylabel('prices of houses', fontsize=15)
    plt.grid()
    plt.show()

(向邹博老师学习ML)

猜你喜欢

转载自blog.csdn.net/oliverkingli/article/details/80581173