Xiaobai Learns PyTorch series – torch.optim API: Algorithms (2)
method | note |
---|---|
Adadelta | Implement the Adadelta algorithm. |
Adagrad | Implement the Adagrad algorithm. |
Adam | Implement the Adam algorithm. |
AdamW | Implement the AdamW algorithm. |
SparseAdam | Implements a lazy version of the Adam algorithm suitable for sparse tensors. |
Adamax | Implements the Adamax algorithm (a variant of Adam based on the infinity norm). |
ASGD | Implements averaged stochastic gradient descent. |
LBFGS | Implements the L-BFGS algorithm, largely inspired by minFunc. |
NAdam | Implements the NAdam algorithm. |
RAdam | Implements the RAdam algorithm. |
RMSprop | Implements the RMSprop algorithm. |
Rprop | Implements the resilient backpropagation (Rprop) algorithm. |
SGD | Implements stochastic gradient descent (optionally with momentum). |
Adadelta
Adadelta is an adaptive-learning-rate optimization algorithm similar to RMSprop. In addition to an exponentially decaying average of the squared historical gradients, it keeps a decaying average of the squared parameter updates (both controlled by the decay coefficient rho) and uses the ratio of the two to scale each step. In PyTorch, this optimizer can be used by calling optim.Adadelta().
# One Adadelta update for a single parameter tensor. All names used here
# (param, grad, square_avg, acc_delta, lr, rho, eps, weight_decay) are assumed
# to be set up by the surrounding optimizer code, which this excerpt omits.
# Weight decay: grad + weight_decay * param, computed out-of-place and rebound to `grad`.
grad = grad.add(param, alpha=weight_decay)
# In-place running average: square_avg <- rho * square_avg + (1 - rho) * grad * grad
square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
# std = sqrt(square_avg + eps); .add() returns a new tensor, so the in-place
# sqrt_() acts on that temporary and leaves square_avg unchanged.
std = square_avg.add(eps).sqrt_()
# delta = sqrt(acc_delta + eps) / std * grad; again the in-place ops act on
# the temporary returned by .add(), not on acc_delta itself.
delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad)
# In-place running average: acc_delta <- rho * acc_delta + (1 - rho) * delta * delta
acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho)
# Apply the step in place: param <- param - lr * delta
param.add_(delta, alpha=-lr)
Adagrad
AdaGrad is an adaptive learning rate optimization algorithm that dynamically adjusts the learning rate of each parameter based on historical gradient information when updating parameters.
source code
# One Adagrad update for a single parameter tensor. All names used here
# (param, grad, state_sum, step, lr, lr_decay, eps, weight_decay) are assumed
# to be set up by the surrounding optimizer code, which this excerpt omits.
# Weight decay: grad + weight_decay * param, computed out-of-place and rebound to `grad`.
grad = grad.add(param, alpha=weight_decay)
# Decayed learning rate; equals lr when step == 1 or lr_decay == 0.
clr = lr / (1 + (step - 1) * lr_decay)
# In-place accumulation of squared gradients: state_sum <- state_sum + grad * grad
state_sum.addcmul_(grad, grad, value=1)
# std = sqrt(state_sum) + eps; .sqrt() is out-of-place, so state_sum is not
# modified by the in-place add_ that follows.
std = state_sum.sqrt().add_(eps)
# Apply the step in place: param <- param - clr * grad / std
param.addcdiv_(grad, std, value=-clr)
Adam
Adam is an optimization algorithm that combines momentum gradient descent and adaptive learning rate. It considers both the weighted average of the historical gradient and the weighted average of the square of the historical gradient when updating parameters.
source code
# One Adam update for a single parameter tensor (index i in the per-parameter
# state lists). Names such as param, grad, exp_avg, exp_avg_sq,
# max_exp_avg_sqs, step, lr, beta1, beta2, eps, weight_decay and amsgrad are
# assumed to be set up by the surrounding optimizer code, not shown here.

# Bias-correction terms for the two running averages. step is assumed to be
# >= 1; at step == 0 both terms would be 0 and the divisions below would fail.
bias_correction1 = 1 - beta1 ** step
bias_correction2 = 1 - beta2 ** step

# Weight decay: grad + weight_decay * param, computed out-of-place.
grad = grad.add(param, alpha=weight_decay)

# Decay the first and second moment running average coefficient
# exp_avg    <- beta1 * exp_avg    + (1 - beta1) * grad
# exp_avg_sq <- beta2 * exp_avg_sq + (1 - beta2) * grad * conj(grad)
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
if amsgrad:
    # Maintains the maximum of all 2nd moment running avg. till now
    torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
    # Use the max. for normalizing running avg. of gradient
    denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps)
else:
    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

# Bias-corrected step size, then the in-place update:
# param <- param - step_size * exp_avg / denom
step_size = lr / bias_correction1
param.addcdiv_(exp_avg, denom, value=-step_size)
AdamW
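AdamW is a variant of the Adam optimization algorithm that decouples weight decay from the gradient-based update: instead of adding a weight-decay term to the gradient (as Adam does), it shrinks the parameters directly by a factor of (1 - lr * weight_decay) at each step, which addresses the possible parameter overfitting problem of Adam.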
source code
# One AdamW update for a single parameter tensor (index i in the per-parameter
# state lists). Names such as param, grad, exp_avg, exp_avg_sq,
# max_exp_avg_sqs, step, lr, beta1, beta2, eps, weight_decay and amsgrad are
# assumed to be set up by the surrounding optimizer code, not shown here.

# Perform step weight decay: the parameter itself is shrunk in place, so the
# decay never enters the gradient or the moment estimates below.
param.mul_(1 - lr * weight_decay)

# Bias-correction terms for the two running averages.
bias_correction1 = 1 - beta1 ** step
bias_correction2 = 1 - beta2 ** step

# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
if amsgrad:
    # Maintains the maximum of all 2nd moment running avg. till now
    torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
    # Use the max. for normalizing running avg. of gradient
    denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps)
else:
    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

# Bias-corrected step size, then the in-place update:
# param <- param - step_size * exp_avg / denom
step_size = lr / bias_correction1
param.addcdiv_(exp_avg, denom, value=-step_size)
SparseAdam
Adamax
ASGD
LBFGS
NAdam
RAdam
RMSprop
RMSprop is an adaptive learning rate optimization algorithm that dynamically adjusts the learning rate of each parameter according to the weighted average of the squares of historical gradients when updating parameters.
source code
# One RMSprop update for a single parameter tensor (index i in the
# per-parameter state lists). Names such as param, grad, square_avg,
# grad_avgs, momentum_buffer_list, lr, alpha, eps, weight_decay, momentum and
# centered are assumed to be set up by the surrounding optimizer code.

# Weight decay: grad + weight_decay * param, computed out-of-place.
grad = grad.add(param, alpha=weight_decay)

# In-place running average: square_avg <- alpha * square_avg + (1 - alpha) * grad * grad
square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

if centered:
    # Centered variant: also track the running mean of the gradient and
    # normalize by sqrt(square_avg - grad_avg * grad_avg) + eps.
    # addcmul (no trailing underscore) is out-of-place, so square_avg is kept.
    grad_avg = grad_avgs[i]
    grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(eps)
else:
    avg = square_avg.sqrt().add_(eps)

if momentum > 0:
    # buf <- momentum * buf + grad / avg, then param <- param - lr * buf
    buf = momentum_buffer_list[i]
    buf.mul_(momentum).addcdiv_(grad, avg)
    param.add_(buf, alpha=-lr)
else:
    # param <- param - lr * grad / avg
    param.addcdiv_(grad, avg, value=-lr)
Rprop
SGD
Gradient descent includes: Batch Gradient Descent, Stochastic Gradient Descent, Mini-batch Gradient Descent, Momentum Gradient Descent
Momentum gradient descent (optional in PyTorch's SGD) is an optimization algorithm that adds a momentum term to the gradient descent update in order to speed up convergence and reduce oscillations. In PyTorch, momentum gradient descent can be enabled by setting the momentum parameter of optim.SGD.
source code
# One SGD update for the parameter at index i. The enclosing loop over the
# parameters is not part of this excerpt; param, d_p_list,
# momentum_buffer_list, lr, weight_decay, momentum, dampening, nesterov and
# maximize are assumed to be provided by the surrounding optimizer code.
d_p = d_p_list[i]
if weight_decay != 0:
    # Weight decay: d_p + weight_decay * param, computed out-of-place.
    d_p = d_p.add(param, alpha=weight_decay)

if momentum != 0:
    buf = momentum_buffer_list[i]

    if buf is None:
        # First update for this parameter: seed the buffer with a detached
        # copy of the gradient and store it for later steps.
        buf = torch.clone(d_p).detach()
        momentum_buffer_list[i] = buf
    else:
        # buf <- momentum * buf + (1 - dampening) * d_p  (in place)
        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)

    if nesterov:
        # Nesterov variant: step along d_p + momentum * buf
        d_p = d_p.add(buf, alpha=momentum)
    else:
        d_p = buf

# Step along +lr * d_p when maximizing, otherwise along -lr * d_p.
alpha = lr if maximize else -lr
param.add_(d_p, alpha=alpha)
batch gradient descent
import torch
import matplotlib.pyplot as plt
# Training data: four points lying exactly on the line y = 2x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)


def fit_batch_gd(x, y, epochs=100, lr=0.01):
    """Fit ``y = w * x + b`` with full-batch gradient descent.

    Every epoch computes the MSE loss over the whole data set and takes a
    single ``torch.optim.SGD`` step.

    Args:
        x: 1-D float tensor of inputs.
        y: 1-D float tensor of targets, same shape as ``x``.
        epochs: Number of full passes (one parameter update per pass).
        lr: Learning rate passed to ``torch.optim.SGD``.

    Returns:
        Tuple ``(w, b)`` of scalar tensors holding the fitted slope and
        intercept.
    """
    # Initialise the model parameters
    w = torch.tensor(0.0, requires_grad=True)
    b = torch.tensor(0.0, requires_grad=True)
    # Loss function and optimizer
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD([w, b], lr=lr)
    for _ in range(epochs):
        # Forward pass over the whole batch
        y_pred = w * x + b
        # Compute the loss
        loss = loss_fn(y_pred, y)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next epoch
        optimizer.zero_grad()
    return w, b


if __name__ == "__main__":
    w, b = fit_batch_gd(X, Y)
    # Print the result
    print(f"w = {w.item()}, b = {b.item()}")
    # Plot the data points and the fitted line
    plt.scatter(X.numpy(), Y.numpy())
    plt.plot(X.numpy(), (w * X + b).detach().numpy(), 'r')
    plt.show()
stochastic gradient descent
import torch
import matplotlib.pyplot as plt
# Training data: four points lying exactly on the line y = 2x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)


def fit_sgd(x, y, batch_size=1, epochs=100, lr=0.01):
    """Fit ``y = w * x + b`` with stochastic gradient descent.

    With the default ``batch_size=1`` every parameter update uses a single
    randomly ordered sample.

    Args:
        x: 1-D float tensor of inputs.
        y: 1-D float tensor of targets, same shape as ``x``.
        batch_size: Number of samples per update (1 = pure SGD).
        epochs: Number of passes over the data set.
        lr: Learning rate passed to ``torch.optim.SGD``.

    Returns:
        Tuple ``(w, b)`` of scalar tensors holding the fitted slope and
        intercept.
    """
    # Initialise the model parameters
    w = torch.tensor(0.0, requires_grad=True)
    b = torch.tensor(0.0, requires_grad=True)
    # Loss function and optimizer
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD([w, b], lr=lr)
    # Build the DataLoader once; with shuffle=True it draws a new random
    # order every time it is iterated, so it need not be rebuilt per epoch.
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(x, y),
        batch_size=batch_size,
        shuffle=True,
    )
    for _ in range(epochs):
        for x_batch, y_batch in loader:
            # Forward pass
            y_pred = w * x_batch + b
            # Compute the loss
            loss = loss_fn(y_pred, y_batch)
            # Backward pass
            loss.backward()
            # Update the parameters
            optimizer.step()
            # Clear the gradients so they do not accumulate into the next step
            optimizer.zero_grad()
    return w, b


if __name__ == "__main__":
    w, b = fit_sgd(X, Y)
    # Print the result
    print(f"w = {w.item()}, b = {b.item()}")
    # Plot the data points and the fitted line
    plt.scatter(X.numpy(), Y.numpy())
    plt.plot(X.numpy(), (w * X + b).detach().numpy(), 'r')
    plt.show()
Mini-batch gradient descent
import torch
import matplotlib.pyplot as plt
# Training data: four points lying exactly on the line y = 2x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)


def fit_minibatch_gd(x, y, batch_size=2, epochs=100, lr=0.01):
    """Fit ``y = w * x + b`` with mini-batch gradient descent.

    Every parameter update uses the MSE loss over one randomly drawn
    mini-batch of ``batch_size`` samples.

    Args:
        x: 1-D float tensor of inputs.
        y: 1-D float tensor of targets, same shape as ``x``.
        batch_size: Number of samples per mini-batch.
        epochs: Number of passes over the data set.
        lr: Learning rate passed to ``torch.optim.SGD``.

    Returns:
        Tuple ``(w, b)`` of scalar tensors holding the fitted slope and
        intercept.
    """
    # Initialise the model parameters
    w = torch.tensor(0.0, requires_grad=True)
    b = torch.tensor(0.0, requires_grad=True)
    # Loss function and optimizer
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD([w, b], lr=lr)
    # Build the DataLoader once; with shuffle=True it draws a new random
    # order every time it is iterated, so it need not be rebuilt per epoch.
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(x, y),
        batch_size=batch_size,
        shuffle=True,
    )
    for _ in range(epochs):
        for x_batch, y_batch in loader:
            # Forward pass
            y_pred = w * x_batch + b
            # Compute the loss
            loss = loss_fn(y_pred, y_batch)
            # Backward pass
            loss.backward()
            # Update the parameters
            optimizer.step()
            # Clear the gradients so they do not accumulate into the next step
            optimizer.zero_grad()
    return w, b


if __name__ == "__main__":
    w, b = fit_minibatch_gd(X, Y)
    # Print the result
    print(f"w = {w.item()}, b = {b.item()}")
    # Plot the data points and the fitted line
    plt.scatter(X.numpy(), Y.numpy())
    plt.plot(X.numpy(), (w * X + b).detach().numpy(), 'r')
    plt.show()
Momentum Gradient Descent
import torch
import matplotlib.pyplot as plt
# Training data: four points lying exactly on the line y = 2x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)
# Momentum coefficient used by the demo run below
momentum = 0.9


def fit_momentum_gd(x, y, momentum=0.9, epochs=100, lr=0.01):
    """Fit ``y = w * x + b`` with full-batch gradient descent plus momentum.

    Every epoch computes the MSE loss over the whole data set and takes a
    single ``torch.optim.SGD`` step with the given momentum coefficient.

    Args:
        x: 1-D float tensor of inputs.
        y: 1-D float tensor of targets, same shape as ``x``.
        momentum: Momentum coefficient passed to ``torch.optim.SGD``
            (0 gives plain gradient descent).
        epochs: Number of full passes (one parameter update per pass).
        lr: Learning rate passed to ``torch.optim.SGD``.

    Returns:
        Tuple ``(w, b)`` of scalar tensors holding the fitted slope and
        intercept.
    """
    # Initialise the model parameters
    w = torch.tensor(0.0, requires_grad=True)
    b = torch.tensor(0.0, requires_grad=True)
    # Loss function and optimizer
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD([w, b], lr=lr, momentum=momentum)
    for _ in range(epochs):
        # Forward pass over the whole batch
        y_pred = w * x + b
        # Compute the loss
        loss = loss_fn(y_pred, y)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next epoch
        optimizer.zero_grad()
    return w, b


if __name__ == "__main__":
    w, b = fit_momentum_gd(X, Y, momentum=momentum)
    # Print the result
    print(f"w = {w.item()}, b = {b.item()}")
    # Plot the data points and the fitted line
    plt.scatter(X.numpy(), Y.numpy())
    plt.plot(X.numpy(), (w * X + b).detach().numpy(), 'r')
    plt.show()