Batch gradient descent, stochastic gradient descent, and mini-batch gradient descent

Pseudocode:

Batch gradient descent (Batch Gradient Descent):
Repeat until convergence { $\theta_{j} := \theta_{j} + \alpha \sum_{i=1}^{m} (y_{i} - \theta^{T} x_{i}) x_{i,j}$ (for every $j$) }

Stochastic gradient descent (Stochastic Gradient Descent):
Loop {
for i = 1 to m {
$\theta_{j} := \theta_{j} + \alpha (y_{i} - \theta^{T} x_{i}) x_{i,j}$ (for every $j$)
}
}
Mini-batch gradient descent (Mini-Batch Gradient Descent):
Repeat {
for i = 1, 11, 21, 31, ..., 991 {
$\theta_{j} := \theta_{j} + \alpha \frac{1}{10} \sum_{k=i}^{i+9} (y_{k} - \theta^{T} x_{k}) x_{k,j}$ (for every $j$)
}
}

python code:

Stochastic gradient descent:

import numpy as np

def stochastic_gradient_descent(input_data, y_train, theta,
                                alpha=0.000001, epsilon=1e-6,
                                loop_max=10000000):
    """Fit linear-regression weights with one update per training sample.

    Each pass visits the samples in their stored order and, for every
    sample, moves the weights against the gradient of that sample's squared
    error.

    Args:
        input_data: (m, n) design matrix, one sample per row.  Include a
            column of ones if an intercept should be fitted.
        y_train: m target values (any shape that flattens to length m).
        theta: n initial weights.  The caller's array is not modified.
        alpha: learning rate (step size).
        epsilon: stop once a full pass moves the weights by less than this
            Euclidean distance.
        loop_max: maximum number of passes over the data.

    Returns:
        1-D array of length n holding the fitted weights.

    Raises:
        ValueError: if the number of targets differs from the number of rows.
    """
    input_data = np.asarray(input_data, dtype=float)
    y_train = np.asarray(y_train, dtype=float).ravel()
    theta = np.array(theta, dtype=float).ravel()  # np.array copies the input

    if input_data.shape[0] != y_train.shape[0]:
        raise ValueError("input_data and y_train must have the same number of rows")

    for _ in range(loop_max):
        # Remember the weights at the start of the pass.  Comparing against
        # these (rather than against a zero vector on the first pass) makes
        # the stopping test measure the actual change in theta every time.
        previous = theta

        for x_i, y_i in zip(input_data, y_train):
            # Prediction error for this single sample.
            residual = x_i.dot(theta) - y_i
            # Rebinding (not in-place) keeps `previous` untouched.
            theta = theta - alpha * residual * x_i

        if np.linalg.norm(theta - previous) < epsilon:
            break

    return theta


if __name__ == "__main__":
    # Build the training features: five samples, three features each.
    x_train = np.array([[2, 0., 3], [3, 1., 3], [0, 2., 3], [4, 3., 2], [1, 4., 4]])

    # Ground-truth weights used only to generate the labels, so the fitted
    # theta can be compared against them afterwards.
    theta1 = np.array([[2, 3, 4]]).T

    # Labels: y = t1*x1 + t2*x2 + t3*x3 + b with b = 2, flattened to 1-D.
    y_train = (x_train.dot(theta1) + 2).ravel()

    # Prepend a column of ones (x0 = 1) so the bias becomes the first weight:
    # y = b*x0 + t1*x1 + t2*x2 + t3*x3.  The weights the fit should approach
    # are therefore [2, 2, 3, 4].
    x0 = np.ones((x_train.shape[0], 1))
    input_data = np.hstack([x0, x_train])
    m, n = input_data.shape

    # Two stopping conditions: a cap on passes and a minimum weight change.
    loop_max = 10000000
    epsilon = 1e-6

    # Initial weights: n values drawn uniformly from [0, 1), seeded so the
    # run is reproducible.
    np.random.seed(0)
    theta = np.random.rand(n)

    # Step size / learning rate.
    alpha = 0.000001

    theta = stochastic_gradient_descent(
        input_data, y_train, theta,
        alpha=alpha, epsilon=epsilon, loop_max=loop_max,
    )
    print(theta)

Batch gradient descent:

import numpy as np

def batch_gradient_descent(input_data, y_train, theta,
                           alpha=0.00001, epsilon=1e-5, loop_max=1000000):
    """Fit linear-regression weights using the full data set for every step.

    Each iteration sums the squared-error gradient over all samples and
    takes one step against it.

    Args:
        input_data: (m, n) design matrix, one sample per row.  Include a
            column of ones if an intercept should be fitted.
        y_train: m target values (any shape that reshapes to (m, 1)).
        theta: n initial weights (any shape that reshapes to (n, 1)).  The
            caller's array is not modified.
        alpha: learning rate (step size).
        epsilon: stop once one step moves the weights by less than this
            Euclidean distance.
        loop_max: maximum number of iterations.

    Returns:
        (n, 1) column vector holding the fitted weights.

    Raises:
        ValueError: if the number of targets differs from the number of rows.
    """
    input_data = np.asarray(input_data, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    theta = np.array(theta, dtype=float).reshape(-1, 1)  # np.array copies

    if input_data.shape[0] != y_train.shape[0]:
        raise ValueError("input_data and y_train must have the same number of rows")

    for _ in range(loop_max):
        # Residual for every sample at once, computed a single time per
        # iteration instead of once per (sample, feature) pair.
        residual = input_data.dot(theta) - y_train      # shape (m, 1)
        # Summed gradient over all samples for every weight.
        gradient = input_data.T.dot(residual)           # shape (n, 1)

        updated = theta - alpha * gradient
        # Measure the step against the previous weights (never against a
        # zero vector), then commit the update.
        step_size = np.linalg.norm(updated - theta)
        theta = updated

        if step_size < epsilon:
            break

    return theta


if __name__ == "__main__":
    # Build the training features: five samples, three features each.
    x_train = np.array([[2, 0., 3], [3, 1., 3], [0, 2., 3], [4, 3., 2], [1, 4., 4]])
    m = len(x_train)

    # Prepend a constant feature of 1 to every sample so the bias is learned
    # as the first weight.
    x0 = np.ones((m, 1))
    input_data = np.hstack([x0, x_train])
    m, n = input_data.shape

    # Ground-truth weights used to generate the labels (no noise is added):
    # y = x_train . theta1 + 2, kept as an (m, 1) column vector.
    theta1 = np.array([[2, 3, 4]]).T
    y_train = x_train.dot(theta1) + 2

    # Two stopping conditions: a cap on iterations and a minimum step size.
    loop_max = 1000000
    epsilon = 1e-5

    # Initial weights: an (n, 1) column drawn from a standard normal
    # distribution, seeded so the run is reproducible.
    np.random.seed(0)
    theta = np.random.randn(n, 1)

    # Step size / learning rate.
    alpha = 0.00001

    theta = batch_gradient_descent(
        input_data, y_train, theta,
        alpha=alpha, epsilon=epsilon, loop_max=loop_max,
    )

    print(theta)

Mini-batch gradient descent:

import numpy as np

def minibatch_gradient_descent(input_data, y_train, theta,
                               alpha=0.00001, epsilon=1e-5,
                               loop_max=1000000, minibatch_size=2):
    """Fit linear-regression weights with one update per mini-batch.

    Each pass walks the samples in stored order in consecutive chunks of
    ``minibatch_size`` rows.  For every chunk the squared-error gradient is
    averaged over the rows in that chunk and the weights are updated
    immediately, so every sample contributes exactly once per pass.  The
    final chunk may be shorter; its gradient is averaged over its actual
    number of rows.

    Args:
        input_data: (m, n) design matrix, one sample per row.  Include a
            column of ones if an intercept should be fitted.
        y_train: m target values (any shape that reshapes to (m, 1)).
        theta: n initial weights (any shape that reshapes to (n, 1)).  The
            caller's array is not modified.
        alpha: learning rate (step size).
        epsilon: stop once a full pass moves the weights by less than this
            Euclidean distance.
        loop_max: maximum number of passes over the data.
        minibatch_size: number of samples per mini-batch (at least 1).

    Returns:
        (n, 1) column vector holding the fitted weights.

    Raises:
        ValueError: if ``minibatch_size`` is less than 1, or the number of
            targets differs from the number of rows.
    """
    if minibatch_size < 1:
        raise ValueError("minibatch_size must be at least 1")

    input_data = np.asarray(input_data, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1, 1)
    theta = np.array(theta, dtype=float).reshape(-1, 1)  # np.array copies

    sample_count = input_data.shape[0]
    if sample_count != y_train.shape[0]:
        raise ValueError("input_data and y_train must have the same number of rows")

    for _ in range(loop_max):
        # Weights at the start of the pass, for the stopping test below.
        previous = theta

        for start in range(0, sample_count, minibatch_size):
            # Slice rows (samples), not columns (features), for this batch.
            batch_x = input_data[start:start + minibatch_size]
            batch_y = y_train[start:start + minibatch_size]

            residual = batch_x.dot(theta) - batch_y      # (batch, 1)
            gradient = batch_x.T.dot(residual)           # (n, 1)

            # Update right after each batch; rebinding (not in-place) keeps
            # `previous` untouched.
            theta = theta - alpha * gradient / len(batch_x)

        if np.linalg.norm(theta - previous) < epsilon:
            break

    return theta


if __name__ == "__main__":
    # Build the training features: five samples, three features each.
    x_train = np.array([[2, 0., 3], [3, 1., 3], [0, 2., 3], [4, 3., 2], [1, 4., 4]])
    m = len(x_train)

    # Prepend a constant feature of 1 to every sample so the bias is learned
    # as the first weight.
    x0 = np.ones((m, 1))
    input_data = np.hstack([x0, x_train])
    m, n = input_data.shape

    # Ground-truth weights used to generate the labels (no noise is added):
    # y = x_train . theta1 + 2, kept as an (m, 1) column vector.
    theta1 = np.array([[2, 3, 4]]).T
    y_train = x_train.dot(theta1) + 2

    # Two stopping conditions: a cap on passes and a minimum weight change.
    loop_max = 1000000
    epsilon = 1e-5

    # Initial weights: an (n, 1) column drawn from a standard normal
    # distribution, seeded so the run is reproducible.
    np.random.seed(0)
    theta = np.random.randn(n, 1)

    # Step size / learning rate.
    alpha = 0.00001

    # Number of samples per mini-batch.
    minibatch_size = 2

    theta = minibatch_gradient_descent(
        input_data, y_train, theta,
        alpha=alpha, epsilon=epsilon, loop_max=loop_max,
        minibatch_size=minibatch_size,
    )

    print(theta)

-------------------------------------------------- ------------------
Original: https://www.jianshu.com/p/a20e11416a25