作业三:使用minibatch的方式进行梯度下降
项目 | 内容 |
---|---|
这个作业属于的课程 | 人工智能实战2019(北京航空航天大学) |
这个作业的要求 | 第三次作业:使用minibatch的方式进行梯度下降 |
我在这个课程的目标是 | 学习算法,积累项目经验,锻炼coding能力 |
这个作业在哪个具体方面帮助我实现目标 | 了解batch, iteration,epoch的概念;学习使用批处理操作 |
作业正文 | 见下文 |
其他参考文献 | 微软示例代码 |
1. 作业要求
- 使用minibatch的方式进行梯度下降
- 复习讲过的课程(链接),并回答关于损失函数的 2D 示意图的问题
- 为什么是椭圆而不是圆?如何把这个图变成一个圆?
- 为什么中心是个椭圆区域而不是一个点?
2. 实现随机选取数据的方式进行minibatch梯度下降
示例代码位置:/B-教学案例与实践/B6-神经网络基本原理简明教程/微软-方案1/NeuralNetwork/ch04/level4-BatchGradientDescent.py
我的代码:
```
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from pathlib import Path
x_data_name = "TemperatureControlXData.dat"
y_data_name = "TemperatureControlYData.dat"
class CData:
    """Snapshot of the training state stored alongside a loss value.

    The constructor arguments are stored unchanged as attributes of the
    same names: ``loss``, ``w``, ``b``, ``epoch`` and ``iteration``.
    """

    def __init__(self, loss, w, b, epoch, iteration):
        # Where in the training schedule the snapshot was taken.
        self.epoch = epoch
        self.iteration = iteration
        # The parameters and the loss recorded with them.
        self.w = w
        self.b = b
        self.loss = loss
def ReadData():
    """Load the X and Y sample files named by the module-level constants.

    Returns:
        ``(X, Y)`` with each array reshaped to a single row (``(1, n)``),
        or ``(None, None)`` when either file does not exist.
    """
    x_path = Path(x_data_name)
    y_path = Path(y_data_name)
    # Guard clause: nothing is loaded unless both files are present.
    if not (x_path.exists() and y_path.exists()):
        return None, None
    samples = np.load(x_path)
    labels = np.load(y_path)
    return samples.reshape(1, -1), labels.reshape(1, -1)
def ForwardCalculationBatch(W, B, batch_x):
    """Compute the linear forward pass ``W . batch_x + B`` for one batch.

    Args:
        W: weight matrix.
        B: bias, added to the product (NumPy broadcasting applies).
        batch_x: batch of inputs; presumably one sample per column -- confirm
            against the caller.

    Returns:
        The pre-activation output ``np.dot(W, batch_x) + B``.
    """
    weighted = np.dot(W, batch_x)
    return weighted + B
def BackPropagationBatch(batch_x, batch_y, batch_z):
    """Compute batch-averaged gradients for the weights and the bias.

    Args:
        batch_x: batch inputs; its number of columns is used as the
            sample count.
        batch_y: target values for the batch.
        batch_z: predictions for the batch.

    Returns:
        ``(dW, dB)``: ``dW`` is ``(batch_z - batch_y) . batch_x.T`` divided
        by the sample count; ``dB`` is the row-wise sum of
        ``batch_z - batch_y`` (dimensions kept) divided by the sample count.
    """
    sample_count = batch_x.shape[1]
    residual = batch_z - batch_y
    grad_w = np.dot(residual, batch_x.T) / sample_count
    grad_b = residual.sum(axis=1, keepdims=True) / sample_count
    return grad_w, grad_b
def UpdateWeights(w, b, dW, dB, eta):
    """Apply one gradient-descent step to the weight and the bias.

    Args:
        w: current weight(s).
        b: current bias.
        dW: gradient with respect to ``w``.
        dB: gradient with respect to ``b``.
        eta: learning rate (step size).

    Returns:
        ``(w - eta * dW, b - eta * dB)``.
    """
    stepped_w = w - eta * dW
    stepped_b = b - eta * dB
    return stepped_w, stepped_b
def InitialWeights(num_input, num_output, flag):
    """Create the initial weight matrix and bias vector for a linear layer.

    Args:
        num_input: number of input features (columns of ``W``).
        num_output: number of output units (rows of ``W`` and of ``B``).
        flag: initialisation scheme for ``W``:
            0 -- all zeros;
            1 -- samples drawn with ``np.random.normal`` (default parameters);
            2 -- Xavier: uniform in ``[-limit, limit]`` with
                 ``limit = sqrt(6 / (num_input + num_output))``.

    Returns:
        ``(W, B)`` with shapes ``(num_output, num_input)`` and
        ``(num_output, 1)``; ``B`` is always all zeros.

    Raises:
        ValueError: if ``flag`` is not 0, 1 or 2.
    """
    shape = (num_output, num_input)
    if flag == 0:
        W = np.zeros(shape)
    elif flag == 1:
        W = np.random.normal(size=shape)
    elif flag == 2:
        limit = np.sqrt(6 / (num_input + num_output))
        W = np.random.uniform(-limit, limit, size=shape)
    else:
        # Without this branch W would be unbound and the return statement
        # would fail with a confusing UnboundLocalError.
        raise ValueError("flag must be 0 (zero), 1 (normal) or 2 (xavier), got %r" % (flag,))
    B = np.zeros((num_output, 1))
    return W, B
def CheckLoss(W, B, X, Y):
    """Return the halved mean squared error of the linear model on ``(X, Y)``.

    The value is ``sum((W . X + B - Y) ** 2) / m / 2`` where ``m`` is the
    number of columns of ``X``.
    """
    sample_count = X.shape[1]
    residual = np.dot(W, X) + B - Y
    squared = residual ** 2
    return squared.sum() / sample_count / 2
def shuffle(X, Y):
    """Return copies of ``X`` and ``Y`` with their columns permuted together.

    One random permutation of the column indices (drawn with
    ``np.random.shuffle``) is applied to both arrays, so column ``i`` of the
    shuffled ``X`` still pairs with column ``i`` of the shuffled ``Y``.
    The inputs are not modified.

    Args:
        X: 2-D array; the number of columns sets the permutation length.
        Y: 2-D array with the same number of columns as ``X``.

    Returns:
        ``(X_shuffle, Y_shuffle)`` with the same shapes as the inputs.
    """
    num_example = X.shape[1]
    rank = np.arange(num_example)
    np.random.shuffle(rank)
    # Index-array selection reorders all columns in one vectorised step and
    # keeps the 2-D shape even when there are zero columns.
    return X[:, rank], Y[:, rank]
def GetBatchSamples(X, Y, batch_size, iteration):
    """Slice the ``iteration``-th mini-batch out of ``X`` and ``Y``.

    Args:
        X: 2-D input array; columns ``iteration * batch_size`` up to (not
            including) ``(iteration + 1) * batch_size`` are selected.
        Y: 2-D target array; only its first row is read.
        batch_size: nominal number of columns per batch.
        iteration: zero-based batch index.

    Returns:
        ``(batch_x, batch_y)`` with shapes ``(X.shape[0], k)`` and ``(1, k)``.
        ``k`` equals ``batch_size`` except for a trailing partial batch,
        which is returned at its actual (smaller) width instead of raising.
    """
    start = iteration * batch_size
    end = start + batch_size
    # Plain 2-D slicing keeps both results two-dimensional. The previous
    # reshape to a fixed width failed when fewer than batch_size columns
    # remained.
    batch_x = X[:, start:end]
    batch_y = Y[0:1, start:end]
    return batch_x, batch_y
def GetMinimalLossData(dict_loss):
    """Return the record stored under the smallest key of ``dict_loss``.

    Args:
        dict_loss: mapping whose keys are sortable (the loss values) and
            whose values expose ``w`` and ``b`` attributes.

    Returns:
        ``(w, b, record)`` where ``record`` is the value stored under the
        smallest key and ``w``/``b`` are its attributes.
    """
    ordered_losses = sorted(dict_loss)
    best = dict_loss[ordered_losses[0]]
    return best.w, best.b, best
def ShowIterLossHistory(dict_loss, batch_size):
    """Plot part of the per-iteration loss history, save it and show it.

    Args:
        dict_loss: mapping whose keys are the recorded loss values; they are
            plotted in the mapping's iteration order.
        batch_size: used in the plot title and in the output file name.
    """
    # The dict keys themselves are the losses.
    history = list(dict_loss)
    plt.title("batch size :" + str(batch_size))
    plt.xlabel("iteration")
    plt.ylabel("loss")
    # Only entries 30..799 of the history are drawn.
    plt.plot(history[30:800])
    # NOTE(review): the output directory is a hard-coded absolute path.
    savefig("/Users/souchiguu/Desktop/" + str(batch_size) + ".png")
    plt.show()
def ShowEpochLossHistory(list_epoch, Batchsize):
    """Plot one per-epoch loss curve per batch size, save and show the figure.

    Args:
        list_epoch: sequence of mappings, one per batch size, whose keys are
            the recorded loss values (plotted in iteration order).
        Batchsize: sequence of batch sizes; entry ``i`` labels
            ``list_epoch[i]``. Only three line colours are defined.
    """
    palette = ['b', 'g', 'y']
    for idx, size in enumerate(Batchsize):
        # The dict keys themselves are the losses.
        curve = list(list_epoch[idx])
        plt.plot(curve, palette[idx], label='batchsize=' + str(size))
    # NOTE(review): the title says 0.01 while the file name below says 0.1;
    # both are hard-coded -- confirm which learning rate is intended.
    plt.title("learning rate = 0.01")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    savefig("/Users/souchiguu/Desktop/" + "0.1" + ".png")
    plt.show()
# Script entry point: train a one-weight linear model with mini-batch
# gradient descent for several batch sizes and compare the loss curves.
if __name__ == '__main__':
    eta, max_epoch = 0.01, 50
    Batchsize = [5, 10, 15]
    # One {epoch_loss: CData} dict per batch size, in Batchsize order.
    list_epoch = []

    X_origin, Y_origin = ReadData()
    # ReadData returns (None, None) when a data file is missing.
    if X_origin is None or Y_origin is None:
        raise SystemExit("training data files not found")

    num_example = X_origin.shape[1]
    num_feature = X_origin.shape[0]

    for batch_size in Batchsize:
        W, B = InitialWeights(num_feature, 1, 0)
        dict_epoch_loss = {}
        dict_iter_loss = {}
        # e.g. num_example=200, batch_size=10 -> 20 iterations per epoch;
        # samples that do not fill a whole batch are skipped.
        max_iteration = num_example // batch_size

        for epoch in range(max_epoch):
            # Re-shuffle the samples at the start of every epoch.
            X, Y = shuffle(X_origin, Y_origin)
            sum_loss = 0
            for iteration in range(max_iteration):
                batch_x, batch_y = GetBatchSamples(X, Y, batch_size, iteration)
                batch_z = ForwardCalculationBatch(W, B, batch_x)
                dW, dB = BackPropagationBatch(batch_x, batch_y, batch_z)
                W, B = UpdateWeights(W, B, dW, dB, eta)
                # Loss over the whole training set (not just this batch)
                # after the update.
                loss = CheckLoss(W, B, X, Y)
                dict_iter_loss[loss] = CData(loss, W, B, epoch, iteration)
                sum_loss += loss
            # NOTE: the epoch entry stores the SUM of the per-iteration
            # losses, so its magnitude grows with the iteration count.
            dict_epoch_loss[sum_loss] = CData(sum_loss, W, B, epoch, max_iteration)

        list_epoch.append(dict_epoch_loss)
        ShowIterLossHistory(dict_iter_loss, batch_size)
        w, b, cdata = GetMinimalLossData(dict_epoch_loss)
        print("w:", cdata.w, "b:", cdata.b)
        print("batchsize=%d, epoch=%d, iteration=%d, loss=%f" % (batch_size, cdata.epoch, cdata.iteration, cdata.loss))

    ShowEpochLossHistory(list_epoch, Batchsize)
```

learning rate 为 0.1 时,loss 达到最小时的结果,以及 loss 随 epoch、iteration、batchsize 的变化趋势:

```
0.1 lr
w: [[1.99854936]] b: [[3.00973446]]
batchsize=5, epoch=27, iteration=40, loss=0.196999
w: [[1.99322693]] b: [[3.00426234]]
batchsize=10, epoch=37, iteration=20, loss=0.098183
w: [[1.99197605]] b: [[3.0102479]]
batchsize=15, epoch=46, iteration=13, loss=0.063790
```

![](https://ws2.sinaimg.cn/large/006tKfTcly1g1e77ba9arj30hs0dcaac.jpg)
![](https://ws1.sinaimg.cn/large/006tKfTcly1g1e78989h1j30hs0dcaai.jpg)
![](https://ws2.sinaimg.cn/large/006tKfTcly1g1e7895wl2j30hs0dc74p.jpg)
![](https://ws1.sinaimg.cn/large/006tKfTcly1g1e788xlefj30hs0dc0t3.jpg)

learning rate 为 0.01 时,loss 达到最小时的结果,以及 loss 随 epoch、iteration、batchsize 的变化趋势:

```
0.01 lr
w: [[1.90823943]] b: [[3.05213545]]
batchsize=5, epoch=49, iteration=40, loss=0.209842
w: [[1.82458827]] b: [[3.09514729]]
batchsize=10, epoch=49, iteration=20, loss=0.123555
w: [[1.78029477]] b: [[3.11700018]]
batchsize=15, epoch=49, iteration=13, loss=0.089676
```
3. 思考题
loss_2d:
为什么是椭圆而不是圆?如何把这个图变成一个圆?
直观上是因为\(w\)与\(b\)在损失函数中的系数不同。本质原因是它们在正向传播中的地位不同,类似两个正交的特征向量,对应着不同的特征值。损失函数 \(J(w,b)=\frac{1}{2m}\sum_{i=1}^m(wx_i+b-y_i)^2\)。展开后\(w^2\)的系数是\(b^2\)的系数的\(\frac{1}{m}\sum_{i=1}^mx_i^2\)倍,当\(\frac{1}{m}\sum_{i=1}^mx_i^2\)不等于1时,等高线满足椭圆型方程\(\frac{x^2}{a^2}+\frac{y^2}{b^2}=1\)(\(a\neq b\))。强制\(\frac{1}{m}\sum_{i=1}^mx_i^2=1\),并消去交叉项\(wb\)(其系数正比于\(x\)的均值),可以把这个图变成一个圆。如,令\(x\)的均值为0,模长为\(\sqrt{m}\)。

为什么中心是个椭圆区域而不是一个点?
理论上loss的最小值点是唯一的。中心显示为一个椭圆区域而不是一个点,是因为绘图时无法对w,b连续取值,只能在离散的网格上逼近;中心点附近loss取值相近的网格点共同构成了这个椭圆区域。