Andrew Ng's Machine Learning course: NetEase Cloud Classroom (网易云课堂)
This article is reproduced from Dr. Huang Haiguang ([email protected]). Thanks to Andrew Ng, Dr. Huang Haiguang, and everyone working on the front lines of AI. Study hard, starting from this moment.
Note: this article exists only to aid my own study and will be deleted immediately if it infringes any copyright.
For more authoritative and rigorous course materials and notes, please see: https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes
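For reference, the computeCost and gradientdescent functions in the code below implement the course's cost function and batch gradient descent update:

J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right)^2, \qquad h_\theta(x) = \theta^{T} x

\theta_j := \theta_j - \alpha \, \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}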
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load the data
path = r'F:/yanwucao/test\ExpData\MeachineLearning\AndrewNg\ex1/ex1data1.txt'
data1 = pd.read_csv(path, header=None, names=['population', 'profit'])
# data1.plot(kind='scatter', x='population', y='profit')
# plt.show()


def computeCost(X, y, theta):
    # squared errors of the linear hypothesis X * theta.T against y
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))


cols = data1.shape[1]
X = data1.iloc[:, 0:cols-1]     # X is all rows, without the last column
y = data1.iloc[:, cols-1:cols]  # y is all rows, last column only
w = pd.Series(np.ones(X.shape[0]))
X.insert(0, 'Ones', w)          # prepend a column of ones for the intercept term
# print(X.head())
# print(y.head())

# the cost function expects numpy matrices, so convert X and y before using them
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0, 0]))
# print(computeCost(X, y, theta))


def gradientdescent(X, y, theta, alpha=0.01, iters=1000):
    cost = np.zeros(iters)
    temp = np.matrix(np.zeros(theta.shape))
    for i in range(iters):
        errors = X * (theta.T) - y
        for j in range(theta.shape[1]):
            temp[0, j] = theta[0, j] - alpha * (np.sum(np.multiply(errors, X[:, j])) / len(X))
        theta = temp  # all theta values are updated simultaneously
        cost[i] = computeCost(X, y, theta)
    return theta, cost


g, cost = gradientdescent(X, y, theta)
# print(g)

# plot the fitted line against the training data
x = np.linspace(data1.population.min(), data1.population.max(), 100)
f = g[0, 0] + (g[0, 1] * x)
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='prediction')
ax.scatter(data1.population, data1.profit, marker='x', label='training data')
ax.legend(loc=2)
ax.set_xlabel('population')
ax.set_ylabel('profit')
ax.set_title('predicted profit vs population size')
plt.show()

fig1, ax1 = plt.subplots(figsize=(12, 8))
ax1.plot(np.arange(1000), cost, 'blue', label='error')
ax1.set_xlabel('Iterations')
ax1.set_ylabel('Cost')
ax1.set_title('Error vs. Training Epoch')
plt.show()

# multi variable linear regression
path1 = r'F:\yanwucao/test\ExpData\MeachineLearning\AndrewNg\ex1\ex1data2.txt'
data2 = pd.read_csv(path1, header=None, names=['size', 'bedrooms', 'price'])

# feature normalization (pandas mean/std operate column-wise)
data2 = (data2 - data2.mean()) / data2.std()
# print(data2.head())

col1 = pd.Series(np.ones(data2.shape[0]))
data2.insert(0, 'ones', col1)
cols = data2.shape[1]
X1 = np.matrix(data2.iloc[:, 0:cols-1].values)
y1 = np.matrix(data2.iloc[:, cols-1:cols].values)
theta1 = np.matrix([0, 0, 0])
g1, cost1 = gradientdescent(X1, y1, theta1)
# print(cost1[-1])

fig2, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(1000), cost1, 'r', label='multierror')  # cost1, not the single-variable cost
ax.set_xlabel('iterations')
ax.set_ylabel('cost')
ax.set_title('multi variable linear regression')
plt.show()

######################################################################################
# use scikit-learn's linear regression directly
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(X, y)

x = np.array(X[:, 1].A1)
f = model.predict(X).flatten()  # store predictions in f instead of overwriting y

fig3, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data1.population, data1.profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
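As an optional sanity check (not part of the original notes; `check` is just a throwaway name here), the theta found by gradient descent can be compared with scikit-learn's fitted parameters. The sketch below refits on plain ndarrays, since recent scikit-learn releases reject np.matrix inputs:

# optional comparison: hand-rolled gradient descent vs. scikit-learn's fit
# X still contains the leading column of ones, so coef_[1] is the slope on population
check = linear_model.LinearRegression()
check.fit(np.asarray(X), np.asarray(y).ravel())
print('gradient descent theta:', g)
print('sklearn intercept / slope:', check.intercept_, check.coef_[1])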