foreword

The main purpose of writing this article is to record my learning process of machine learning and Python.

Univariate Linear Regression

In this part of the exercise, you will implement linear regression with one variable to predict the profit of a food truck. Suppose you are the CEO of a restaurant chain and are considering opening a new branch in a different city. The chain already has trucks in various cities, and you have profit and population data from those cities. You want to use this data to help you decide which city to expand to next.

Code

Dataset preparation

import library

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import data and display

# Path to the data file
path = r'E:\Code\ML\ml_learning\ex1\ex1data1.txt'

# Read the data
# names: column names to assign, here population and profit
# header: which row to use as the column names (None: no row is used as a header)
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
# Show the first rows of the data. head() defaults to 5 rows; head(4) shows 4 rows and
# head(9) shows 9 rows. A negative n drops the last |n| rows instead, so head(-1) shows
# everything up to the second-to-last row and head(-11) up to the 12th-from-last row.
print(data.head())
# Summary statistics of the data, such as mean, standard deviation, minimum and maximum
print(data.describe())

data visualization

# Visualize the data
# kind: chart type; 'scatter' draws a scatter plot
# x, y: the columns plotted on the x and y axes
# figsize: size of the figure window
# title: chart title
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12, 8), title='data')
plt.show()

Dataset processing

# Insert a first column filled with ones, so that x0 = 1
data.insert(0, 'ones', 1)
# Number of columns in the data
cols = data.shape[1]
# iloc selects by position, rows first and then columns; every column except the
# last one (here 'ones' and 'Population') forms the input
x = data.iloc[:, 0:cols - 1]
# The last column is the target
y = data.iloc[:, cols - 1:cols]
# print(x.head())
# print(y.head())
# Convert to matrices
X = np.matrix(x.values)
y = np.matrix(y.values)
theta = np.matrix([0, 0])  # initial parameters set to 0

cost function

Formula

$J\left ( \theta \right ) = \frac{1}{2m}\sum_{i=1}^{m}\left (h _{\theta } \left ( x^{\left ( i \right )} \right )-y^{\left ( i \right )}\right )^{2}$

where: $h_{\theta}\left ( x \right )=\theta ^{T}x = \theta_{0}x_{0}+\theta_{1}x_{1}+...+\theta_{n}x_{n}$

def computeCost(X, y, theta):
    '''
    Compute the linear-regression cost J(theta) in vectorized form.

    The cost is the sum of squared differences between the predictions
    ``X * theta.T`` and the targets ``y``, divided by twice the number of
    rows of ``X``.

    :param X: input matrix, one row per sample
    :param y: target values, one row per sample
    :param theta: parameters as a single row
    :return: the scalar cost
    '''
    # Difference between prediction and target for every sample
    residuals = (X * theta.T) - y
    squared_errors = np.power(residuals, 2)
    sample_count = len(X)
    return np.sum(squared_errors) / (2 * sample_count)

When $\theta_{0}$ and $\theta_{1}$ are both 0, the initial value of the cost function should be 32.072733877455676

gradient descent

Formula

$\theta_{j} = \theta_{j} - \alpha\frac{\partial }{\partial\theta_{j} }J(\theta )$

$\theta_{j} = \theta_{j} - \alpha \frac{1}{m}\sum_{i=1}^{m}\left ( h_{\theta}(x^{(i)})-y^{(i)}\right )x^{(i)}_{j}$

Here we use the vectorized form to update θ, which can greatly improve the efficiency

def gradientDescent(X, y, theta, alpha, epoch):
    '''
    Run batch gradient descent and return the parameter and cost history.

    Each iteration applies the vectorized update
    ``theta = theta - (alpha / m) * (X * theta.T - y).T * X``, where ``m`` is
    the number of rows of ``X``. The arguments are expected to be
    ``np.matrix`` objects so that ``*`` is a matrix product.

    :param X: input matrix, one row per sample
    :param y: target column, one row per sample
    :param theta: initial parameters as a single-row matrix
    :param alpha: learning rate
    :param epoch: number of iterations
    :return: tuple ``(counterTheta, theta, cost)``: the parameters after each
        iteration (one row per iteration), the final parameters, and the
        cost after each iteration
    '''
    # Number of parameters; sizing the history from it (instead of a fixed 2)
    # lets the function work for any number of features
    parameters_num = int(theta.size)
    # Number of samples
    m = X.shape[0]
    # Cost after each iteration
    cost = np.zeros(epoch)
    # Parameters after each iteration
    counterTheta = np.zeros((epoch, parameters_num))
    for i in range(epoch):
        # Vectorized update, with n = number of parameters:
        # (m, n) * (n, 1) -> (m, 1); transposed to (1, m); (1, m) * (m, n) -> (1, n)
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        counterTheta[i] = theta  # record this iteration's theta
        cost[i] = computeCost(X, y, theta)  # compute and store J(theta)
    return counterTheta, theta, cost

Run the model and predict

Here the learning rate is set to 0.01, and the number of training iterations is set to 3800

# Learning rate
alpha = 0.01
# Number of training iterations
epoch = 3800
# Call the two functions defined earlier
counterTheta, final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
# NOTE(review): the value returned by this call is neither stored nor printed
computeCost(X, y, final_theta)

# Predict the food-truck profit for cities with populations of 35,000 and 70,000
# (passed as 3.5 and 7; the leading 1 is the x0 term)
predict1 = [1, 3.5] * final_theta.T
print("predict1:", predict1)
predict2 = [1, 7] * final_theta.T
print("predict2:", predict2)

The predictions are as follows:

predict1: [[0.28255134]]

predict2: [[4.45669707]]

Draw linear models and cost function graphs

# np.linspace()
# returns num evenly spaced samples computed over the interval [`start`, `stop`]
x = np.linspace(start=data.Population.min(), stop=data.Population.max(), num=100)  # x-axis values
h = final_theta[0, 0] + final_theta[0, 1] * x  # predicted profit for each x (y-axis values)

# One figure with two side-by-side axes: the fit on the left, the cost curve on the right
figure, ax = plt.subplots(nrows=1, ncols=2)
# Linear regression plot
ax[0].plot(x, h, 'r', label='Prediction')
ax[0].scatter(data.Population, data.Profit, label='Training Data')
ax[0].legend(loc=2)
ax[0].set_xlabel('Population')
ax[0].set_ylabel('Profit')
ax[0].set_title('Predicted Profit vs. Population Size')
# Cost function plot
ax[1].plot(np.arange(epoch), cost, 'r')
ax[1].set_xlabel('Iteration')
ax[1].set_ylabel('Cost')
ax[1].set_title('Error vs. Training Epoch')
plt.show()

The resulting fit plot and cost curve are shown below; they show that the model fits the data reasonably well.

multiple linear regression

The data used here is ex1data2, which is multidimensional data. There are 2 variables (the size of the house, the number of bedrooms) and 1 target (the price of the house). The code here is similar to the code for univariate linear regression. However, the value ranges of the features differ too much, so we need to normalize the feature values during preprocessing in order to make the model converge faster.

The code and the resulting plot are given directly below

Code

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

'''
多变量线性回归
'''
# Path to the data file
path = r'E:\Code\ML\ml_learning\ex1\ex1data2.txt'

# Read the data
# names: column names to assign: size, number of bedrooms, price
# header: which row to use as the column names (None: no row is used as a header)
data = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
# Feature normalization: subtract each column's mean and divide by its standard
# deviation (applied to every column, including Price)
data = (data - data.mean()) / data.std()
# Insert a first column filled with ones, so that x0 = 1
data.insert(0, 'ones', 1)
# Number of columns in the data
cols = data.shape[1]
# iloc selects by position, rows first and then columns; every column except the
# last one (here 'ones', 'Size' and 'Bedrooms') forms the input
x = data.iloc[:, 0:cols - 1]
# The last column is the target
y = data.iloc[:, cols - 1:cols]
# Convert to matrices
X = np.matrix(x.values)
y = np.matrix(y.values)
theta = np.matrix([0, 0, 0])  # initial parameters set to 0

'''
代价函数
'''


def computeCost(X, y, theta):
    '''
    Compute the linear-regression cost J(theta) in vectorized form.

    The cost is the sum of squared differences between the predictions
    ``X * theta.T`` and the targets ``y``, divided by twice the number of
    rows of ``X``.

    :param X: input matrix, one row per sample
    :param y: target values, one row per sample
    :param theta: parameters as a single row
    :return: the scalar cost
    '''
    # Prediction error for every sample
    errors = (X * theta.T) - y
    total_squared_error = np.sum(np.power(errors, 2))
    denominator = 2 * len(X)
    return total_squared_error / denominator


'''
梯度下降
'''


def gradientDescent(X, y, theta, alpha, epoch):
    '''
    Run batch gradient descent and return the parameter and cost history.

    Each iteration applies the vectorized update
    ``theta = theta - (alpha / m) * (X * theta.T - y).T * X``, where ``m`` is
    the number of rows of ``X``. The arguments are expected to be
    ``np.matrix`` objects so that ``*`` is a matrix product.

    :param X: input matrix, one row per sample
    :param y: target column, one row per sample
    :param theta: initial parameters as a single-row matrix
    :param alpha: learning rate
    :param epoch: number of iterations
    :return: tuple ``(counterTheta, theta, cost)``: the parameters after each
        iteration (one row per iteration), the final parameters, and the
        cost after each iteration
    '''
    # Number of parameters; sizing the history from it (instead of a fixed 3)
    # lets the function work for any number of features
    parameters_num = int(theta.size)
    # Number of samples
    m = X.shape[0]
    # Cost after each iteration
    cost = np.zeros(epoch)
    # Parameters after each iteration
    counterTheta = np.zeros((epoch, parameters_num))
    for i in range(epoch):
        # Vectorized update, with n = number of parameters:
        # (m, n) * (n, 1) -> (m, 1); transposed to (1, m); (1, m) * (m, n) -> (1, n)
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        counterTheta[i] = theta  # record this iteration's theta
        cost[i] = computeCost(X, y, theta)  # compute and store J(theta)
    return counterTheta, theta, cost


'''
跑模型，画图
'''
# Learning rate
alpha = 0.01
# Number of training iterations
epoch = 3800
# Call the two functions defined earlier
counterTheta, final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
# NOTE(review): the value returned by this call is neither stored nor printed
computeCost(X, y, final_theta)

# Plot the cost recorded at each iteration
fig2, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(epoch), cost, 'r')
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()