本章主要内容:
1、理解算法如何从数据中学习
2、使用微分与梯度下降法,将学习重构为参数估计
3、了解一个简单学习算法
4、了解pytorch如何支持自动求导
1.温度计模型
1.1 收集数据并进行可视化
###温度计实例
###1、获取数据集
import torch
import matplotlib.pyplot as plt
t_c = [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
t_u = [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
t_c = torch.tensor(t_c)
t_u = torch.tensor(t_u)
###可视化数据
%matplotlib inline
plt.scatter(t_u,t_c)
输出:
1.2 选择线性模型首试
我们选择线性模型 t_c = w * t_u + b
### Create the model
def model(t_u, w, b):
    """Linear model: predict Celsius from the raw reading as w * t_u + b."""
    return b + t_u * w
### Define the loss function (mean squared error)
def loss_fn(t_p, t_c):
    """Mean of squared differences between predictions t_p and targets t_c."""
    return ((t_p - t_c) ** 2).mean()
### Initialize parameters
w = torch.ones(())
b = torch.zeros(())
t_p = model(t_u, w, b)
print(t_p)
loss = loss_fn(t_p, t_c)
print(loss)

### Reduce the loss using finite-difference gradient estimates
delta = 0.1
learning_rate = 1e-2
# Central-difference approximation of d(loss)/dw (as defined in the book).
loss_rate_of_change_w = (
    loss_fn(model(t_u, w + delta, b), t_c)
    - loss_fn(model(t_u, w - delta, b), t_c)
) / (2.0 * delta)
# BUG FIX: the original perturbed w here as well; the rate of change
# with respect to b must perturb b.
loss_rate_of_change_b = (
    loss_fn(model(t_u, w, b + delta), t_c)
    - loss_fn(model(t_u, w, b - delta), t_c)
) / (2.0 * delta)
w = w - learning_rate * loss_rate_of_change_w
b = b - learning_rate * loss_rate_of_change_b
输出:
计算导数:
### Analysis: hand-computed derivatives
## 1. Derivative of the loss
def dloss_fn(t_p, t_c):
    """Derivative of the MSE loss with respect to the predictions t_p."""
    # d/dt_p of mean((t_p - t_c)^2) = 2 * (t_p - t_c) / N
    return 2 * (t_p - t_c) / t_p.size(0)
def dmodel_dw(t_u, w, b):
    """Derivative of the linear model with respect to w: just t_u."""
    return t_u
def dmodel_db(t_u, w, b):
    """Derivative of the linear model with respect to b: a constant 1."""
    return 1.0
### Define the gradient function
def grad_fn(t_u, t_c, t_p, w, b):
    """Return the tensor [dL/dw, dL/db] via the chain rule, summed over samples."""
    dloss_dtp = dloss_fn(t_p, t_c)
    dw = (dloss_dtp * dmodel_dw(t_u, w, b)).sum()
    db = (dloss_dtp * dmodel_db(t_u, w, b)).sum()
    return torch.stack([dw, db])
进行循环训练:
### Training loop (manual gradients)
loss_list = []
epoch_list = []

def training_loop(n_epoches, learning_rate, params, t_u, t_c):
    """Run plain gradient descent for n_epoches; plot the loss curve and return params."""
    for epoch in range(1, n_epoches + 1):
        w, b = params
        t_p = model(t_u, w, b)
        loss = loss_fn(t_p, t_c)
        grad = grad_fn(t_u, t_c, t_p, w, b)
        params = params - learning_rate * grad
        print('Epoch %d, Loss %f' % (epoch, float(loss)))
        epoch_list.append(epoch)
        loss_list.append(float(loss))
        print(params)
        print(grad)
    plt.plot(epoch_list, loss_list)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    return params
调用循环训练:
### Invoke the training loop (this learning rate turns out to be too large)
training_loop(
    n_epoches=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)
输出:
可见训练过程崩溃了,损失也越来越大。究其原因,是因为lr设置过大了,无法找到极值点
调小学习率:
training_loop(
    n_epoches=100,
    learning_rate=1e-4,  # smaller step size keeps the updates stable
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)
输出:
可见优化过程稳定了。
归一化输入:
### Normalize the input
## Scale so the input lies roughly in [-1, 1]
t_un = 0.1 * t_u

### Invoke the training loop on the normalized input
training_loop(
    n_epoches=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_un,
    t_c=t_c,
)
输出:
我们可以发现:学习率调回到 1e-2 之后,参数也不会在迭代更新中爆炸。接下来加大循环迭代次数:
# With stable optimization, run many more epochs and keep the result.
params = training_loop(
    n_epoches=5000,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_un,
    t_c=t_c,
)
print(params)
输出:
再次可视化数据
###再次可视化数据 jupyter notebook才可以使用魔术命令,Pycharm里不支持
%matplotlib inline
from matplotlib import pyplot as plt
t_p = model(t_un, *params)
fig = plt.figure(dpi=500)
plt.xlabel("Temperature (°Fahrenheit)")
plt.ylabel("Temperature (°Celsius)")
plt.plot(t_u.numpy(), t_p.detach().numpy())##.detach()
'''
实际上,detach()就是返回一个新的tensor,并且这个tensor是从当前的计算图中分离出来的。但是返回的tensor和原来的tensor是共享内存空间的。
如果A网络的输出被喂给B网络作为输入, 如果我们希望在梯度反传的时候只更新B中参数的值,而不更新A中的参数值,这时候就可以使用detach()
这里体现的不是很明显
'''
plt.plot(t_u.numpy(), t_c.numpy(),'o')
注:实际上,detach()就是返回一个新的tensor,并且这个tensor是从当前的计算图中分离出来的。但是返回的tensor和原来的tensor是共享内存空间的。
如果A网络的输出被喂给B网络作为输入, 如果我们希望在梯度反传的时候只更新B中参数的值,而不更新A中的参数值,这时候就可以使用detach()。这里体现的不是很明显。
输出:
2.pytorch自动求导
2.1 使用自动求导机制改进之前的代码:
## PyTorch autograd
# Rebuild the same model, now to be driven by automatic differentiation.
def model(t_u, w, b):
    """Linear model: predict Celsius from the raw reading as w * t_u + b."""
    return b + t_u * w
def loss_fn(t_p, t_c):
    """Mean of squared differences between predictions t_p and targets t_c."""
    return ((t_p - t_c) ** 2).mean()
# requires_grad=True: any tensor with params as an ancestor can trace the
# chain of functions from params to itself; if they are all differentiable,
# backward() fills the derivative into params.grad automatically.
params = torch.tensor([1.0, 0.0], requires_grad=True)

# Every PyTorch tensor has a .grad attribute; before any backward() it is None.
params.grad is None
输出:
计算损失:
loss = loss_fn(model(t_u, *params), t_c)
loss.backward()
# params.grad now holds the derivative of the loss w.r.t. each element of params.
print(params.grad)
输出:
注:backward()将导致导数在叶节点上累加,因此需要在参数更新后显式的将梯度归零
## backward() accumulates derivatives at the leaf nodes, so the gradient
## must be explicitly zeroed after each parameter update.
if params.grad is not None:
    params.grad.zero_()
自动求导训练代码:
### Training loop driven by autograd
loss_list = []
epoch_list = []

def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Gradient descent via autograd; plots the loss curve and returns params."""
    for epoch in range(1, n_epochs + 1):
        # Clear gradients accumulated by the previous backward().
        if params.grad is not None:
            params.grad.zero_()
        loss = loss_fn(model(t_u, *params), t_c)
        loss.backward()
        loss_list.append(loss.item())
        # Update in place without recording the operation in the graph.
        with torch.no_grad():
            params -= learning_rate * params.grad
        if epoch % 50 == 0:
            epoch_list.append(epoch)
            print('Epoch %d, Loss %f' % (epoch, float(loss)))
    plt.plot(loss_list)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    return params
training_loop(
    n_epochs=5000,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0], requires_grad=True),
    t_u=t_un,
    t_c=t_c,
)
输出:
调用优化器:
## Optimizers
import torch.optim as optim

dir(optim)  # list the optimizer classes torch.optim provides
输出:
使用一个梯度下降优化器:
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-5
optimizer = optim.SGD([params], lr=learning_rate)  # the momentum parameter defaults to 0

# One manual optimization step: forward, loss, backward, update.
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
loss.backward()
optimizer.step()
print(params)
输出:
在适当的位置(即每次调用backward()之前)调用zero_grad(),以免梯度在多次迭代之间累加:
调用优化器之后的整个训练过程:
## The full training process rewritten around an optimizer
loss_list = []
epoch_list = []

def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Train with the supplied optimizer; plots the loss curve and returns params."""
    for epoch in range(1, n_epochs + 1):
        loss = loss_fn(model(t_u, *params), t_c)
        optimizer.zero_grad()  # zero gradients before backward()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        if epoch % 50 == 0:
            epoch_list.append(epoch)
            print('Epoch %d, Loss %f' % (epoch, float(loss)))
    plt.plot(loss_list)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    return params
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c,
)
输出:
测试其他优化器:
## Test another optimizer (same loop as above, repeated in the original notes)
loss_list = []
epoch_list = []

def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Train with the supplied optimizer; plots the loss curve and returns params."""
    for epoch in range(1, n_epochs + 1):
        loss = loss_fn(model(t_u, *params), t_c)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        if epoch % 50 == 0:
            epoch_list.append(epoch)
            print('Epoch %d, Loss %f' % (epoch, float(loss)))
    plt.plot(loss_list)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    return params
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
# Adam adapts per-parameter step sizes, which is why the raw input works here.
optimizer = optim.Adam([params], lr=learning_rate)
training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,  # note: raw input, not the normalized t_un
    t_c=t_c,
)
输出:(对比时应该迭代相同的次数,这里没注意)
2.2 训练、验证与过拟合
分割数据集并乱序:
## Training, validation and overfitting
# Build shuffled index tensors and hold out 20% of the samples for validation.
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)
shuffled_indices = torch.randperm(n_samples)  # random permutation of 0..n-1
train_indices = shuffled_indices[:-n_val]
val_indices = shuffled_indices[-n_val:]
print(train_indices, val_indices)

# Use the index tensors to select the training and validation splits.
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u
输出:
训练模型:
### Train the model while monitoring validation loss
train_loss_list = []
val_loss_list = []
epoch_list = []

def training_loop(n_epochs, optimizer, params, train_t_u, train_t_c, val_t_u, val_t_c):
    """Train on the training split only; track (but never optimize) the validation loss."""
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)
        val_t_p = model(val_t_u, *params)
        # BUG FIX: the original computed loss_fn(train_t_p, train_t_c) here,
        # so the "validation" curve was just the training loss again.
        val_loss = loss_fn(val_t_p, val_t_c)
        val_loss_list.append(val_loss.item())
        optimizer.zero_grad()
        train_loss.backward()  # only the training loss drives the update
        optimizer.step()
        train_loss_list.append(train_loss.item())
        if epoch < 3 or epoch % 500 == 0:
            epoch_list.append(epoch)
            print(f"Epoch {epoch},Training loss {train_loss.item():.4f},"
                  f"Validation loss {val_loss.item():.4f}")
    plt.plot(train_loss_list)
    plt.xlabel('epoch')
    plt.ylabel('train loss')
    plt.title("Training loss")
    plt.show()
    plt.plot(val_loss_list)
    plt.xlabel('epoch')
    plt.ylabel('val loss')
    plt.title("Validation loss")
    plt.show()
    return params
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)
training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_un,
    train_t_c=train_t_c,
    val_t_u=val_t_un,
    val_t_c=val_t_c,
)
输出:
自动求导的更新与关闭
## Enabling and disabling autograd
# The torch.no_grad() context manager stops graph building for the validation pass.
def training_loop(n_epochs, optimizer, params, train_t_u, train_t_c, val_t_u, val_t_c):
    """Same train/val loop, but the validation forward pass builds no graph."""
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            # BUG FIX: the original reused train_t_p/train_t_c here, so
            # val_loss was the training loss again.
            val_loss = loss_fn(val_t_p, val_t_c)
            val_loss_list.append(val_loss.item())
            assert val_loss.requires_grad == False  # built under no_grad
        optimizer.zero_grad()
        train_loss.backward()  # train only on the training split
        optimizer.step()
        train_loss_list.append(train_loss.item())
也可定义新方法,接受数据作为输入,根据一个布尔类型的参数决定model()与loss_fn()是否进行自动求导。
## Using set_grad_enabled(): a boolean decides whether autograd records the pass
def calc_forward(t_u, t_c, is_train):
    """Forward pass + loss; builds a computation graph only when is_train is True."""
    with torch.set_grad_enabled(is_train):
        t_p = model(t_u, *params)
        # BUG FIX: the original wrote loss = loss(t_p, t_c), calling the
        # yet-unbound local name and raising UnboundLocalError; the loss
        # function is loss_fn.
        loss = loss_fn(t_p, t_c)
    return loss
下一部分是书中的练习题。