作业三：使用minibatch的方式进行梯度下降

项目	内容
这个作业属于的课程	人工智能实战2019（北京航空航天大学）
这个作业的要求	第三次作业：使用minibatch的方式进行梯度下降
我在这个课程的目标是	学习算法，积累项目经验，锻炼coding能力
这个作业在哪个具体方面帮助我实现目标	了解batch, iteration,epoch的概念；学习使用批处理操作
作业正文	见下文
其他参考文献	微软示例代码

1. 作业要求

使用minibatch的方式进行梯度下降
复习讲过的课程（链接），并回答关于损失函数的 2D 示意图的问题
- 为什么是椭圆而不是圆？如何把这个图变成一个圆？
- 为什么中心是个椭圆区域而不是一个点？

2. 实现随机选取数据的方式进行minibatch梯度下降

示例代码位置：/B-教学案例与实践/B6-神经网络基本原理简明教程/微软-方案1/NeuralNetwork/ch04/level4-BatchGradientDescent.py
我的代码：
```
import numpy as np

import matplotlib.pyplot as plt

from matplotlib.pyplot import savefig

from pathlib import Path

# File name of the sample inputs (X); ReadData() loads it with np.load.
x_data_name = "TemperatureControlXData.dat"

# File name of the sample targets (Y); ReadData() loads it with np.load.
y_data_name = "TemperatureControlYData.dat"

class CData(object):
    """Snapshot of the training state recorded together with a loss value.

    All constructor arguments are stored unchanged as attributes of the
    same name:

    Attributes:
        loss: loss value associated with this snapshot.
        w: weight value(s) at the time of the snapshot.
        b: bias value(s) at the time of the snapshot.
        epoch: epoch index of the snapshot.
        iteration: iteration index (or count) of the snapshot.
    """

    def __init__(self, loss, w, b, epoch, iteration):
        self.loss = loss
        self.w = w
        self.b = b
        self.epoch = epoch
        self.iteration = iteration

def ReadData():
    """Load the sample inputs and targets from the two data files.

    Returns:
        ``(X, Y)`` with each array reshaped to a single row ``(1, n)`` when
        both ``x_data_name`` and ``y_data_name`` exist on disk; otherwise
        ``(None, None)``.
    """
    Xfile = Path(x_data_name)
    Yfile = Path(y_data_name)

    # Logical ``and`` (short-circuiting) rather than bitwise ``&`` on bools.
    if not (Xfile.exists() and Yfile.exists()):
        return None, None

    X = np.load(Xfile)
    Y = np.load(Yfile)
    return X.reshape(1, -1), Y.reshape(1, -1)

def ForwardCalculationBatch(W, B, batch_x):
    """Return the linear forward pass ``np.dot(W, batch_x) + B`` for a batch.

    Args:
        W: weight matrix.
        B: bias, broadcast against the product ``W . batch_x``.
        batch_x: batch of inputs, multiplied from the left by ``W``.

    Returns:
        The pre-activation output ``Z``.
    """
    return np.dot(W, batch_x) + B

def BackPropagationBatch(batch_x, batch_y, batch_z):
    """Compute the batch-averaged gradients for the weights and the bias.

    Args:
        batch_x: batch inputs; its second dimension is used as the batch
            size ``m``.
        batch_y: batch targets.
        batch_z: batch predictions from the forward pass.

    Returns:
        ``(dW, dB)`` where ``dW = (batch_z - batch_y) . batch_x^T / m`` and
        ``dB`` is the row-wise sum of ``batch_z - batch_y`` divided by ``m``
        (kept as a column via ``keepdims=True``).
    """
    m = batch_x.shape[1]
    dZ = batch_z - batch_y
    dB = dZ.sum(axis=1, keepdims=True) / m
    dW = np.dot(dZ, batch_x.T) / m
    return dW, dB

def UpdateWeights(w, b, dW, dB, eta):
    """Apply one gradient-descent step and return the new parameters.

    Args:
        w: current weights.
        b: current bias.
        dW: gradient with respect to the weights.
        dB: gradient with respect to the bias.
        eta: learning rate (step size).

    Returns:
        ``(w - eta * dW, b - eta * dB)``; the inputs are not modified.
    """
    w = w - eta * dW
    b = b - eta * dB
    return w, b

def InitialWeights(num_input, num_output, flag):
    """Create the initial weight matrix and a zero bias column.

    Args:
        num_input: number of input features (columns of ``W``).
        num_output: number of outputs (rows of ``W`` and of ``B``).
        flag: initialisation method for ``W``:
            0 - all zeros;
            1 - samples from ``np.random.normal`` (standard normal);
            2 - Xavier-style uniform samples in
                ``[-sqrt(6 / (num_input + num_output)), +sqrt(...)]``.

    Returns:
        ``(W, B)`` with ``W`` of shape ``(num_output, num_input)`` and ``B``
        a zero array of shape ``(num_output, 1)``.

    Raises:
        ValueError: if ``flag`` is not 0, 1 or 2 (previously this surfaced
            as an UnboundLocalError on ``W``).
    """
    shape = (num_output, num_input)

    if flag == 0:
        # zero
        W = np.zeros(shape)
    elif flag == 1:
        # normal distribution
        W = np.random.normal(size=shape)
    elif flag == 2:
        # xavier
        limit = np.sqrt(6 / (num_input + num_output))
        W = np.random.uniform(-limit, limit, size=shape)
    else:
        raise ValueError("flag must be 0, 1 or 2, got %r" % (flag,))

    B = np.zeros((num_output, 1))
    return W, B

def CheckLoss(W, B, X, Y):
    """Return the half mean squared error of the linear model on ``(X, Y)``.

    Computes ``sum((W . X + B - Y) ** 2) / m / 2`` where ``m`` is the size
    of the second dimension of ``X``.

    Args:
        W: weight matrix.
        B: bias, broadcast against ``W . X``.
        X: inputs; ``X.shape[1]`` is used as the sample count ``m``.
        Y: targets, compared element-wise with the predictions.

    Returns:
        The scalar loss value.
    """
    m = X.shape[1]
    Z = np.dot(W, X) + B
    squared_error = (Z - Y) ** 2
    return squared_error.sum() / m / 2

def shuffle(X, Y):
    """Return copies of ``X`` and ``Y`` with their columns randomly permuted.

    The same random column order is applied to both arrays, so column ``i``
    of the shuffled ``X`` still corresponds to column ``i`` of the shuffled
    ``Y``. The inputs are left untouched.

    Args:
        X: 2-D array; ``X.shape[1]`` is the number of columns to permute.
        Y: 2-D array with at least as many columns as ``X``.

    Returns:
        ``(X_shuffle, Y_shuffle)``.
    """
    order = np.arange(0, X.shape[1])
    np.random.shuffle(order)
    # One index-array selection replaces the per-column Python loop and the
    # final transpose; the result keeps its 2-D shape even with zero columns.
    return X[:, order], Y[:, order]

def GetBatchSamples(X, Y, batch_size, iteration):
    """Slice the ``iteration``-th mini-batch out of ``X`` and ``Y``.

    Args:
        X: 2-D input array; batches are taken along its second dimension.
        Y: 2-D target array; only its first row is used.
        batch_size: number of columns per batch.
        iteration: zero-based batch index within the epoch.

    Returns:
        ``(batch_x, batch_y)`` covering columns
        ``[iteration * batch_size, (iteration + 1) * batch_size)``.
        ``batch_x`` keeps all rows of ``X``; ``batch_y`` has one row. If
        fewer than ``batch_size`` columns remain, the batch is simply
        shorter (the previous fixed-size ``reshape`` raised ValueError).
    """
    start = iteration * batch_size
    end = start + batch_size
    batch_x = X[:, start:end]
    # 0:1 keeps the row dimension, matching the former reshape(1, ...).
    batch_y = Y[0:1, start:end]
    return batch_x, batch_y

def GetMinimalLossData(dict_loss):
    """Return the record stored under the smallest key of ``dict_loss``.

    Args:
        dict_loss: mapping whose keys are comparable loss values and whose
            values expose ``w`` and ``b`` attributes.

    Returns:
        ``(w, b, record)`` where ``record`` is the value stored under the
        minimal key and ``w`` / ``b`` are its attributes.

    Raises:
        ValueError: if ``dict_loss`` is empty.
    """
    if not dict_loss:
        raise ValueError("dict_loss is empty; no minimal loss to return")
    # min() finds the smallest key in one pass; no full sort is needed.
    best = dict_loss[min(dict_loss)]
    return best.w, best.b, best

def ShowIterLossHistory(dict_loss, batch_size,
                        save_dir="/Users/souchiguu/Desktop/"):
    """Plot, save and show the per-iteration loss curve for one batch size.

    The keys of ``dict_loss`` are taken, in dictionary iteration order, as
    the loss history; only entries 30..799 are plotted.

    Args:
        dict_loss: mapping keyed by loss value.
        batch_size: batch size, used in the title and the file name.
        save_dir: directory that receives ``<batch_size>.png``. Defaults to
            the directory that was previously hard-coded.
    """
    loss = list(dict_loss)

    plt.title("batch size :" + str(batch_size))
    plt.xlabel("iteration")
    plt.plot(loss[30:800])
    plt.ylabel("loss")
    # Save before show(): keeps the original order of the two calls.
    savefig(Path(save_dir) / (str(batch_size) + ".png"))
    plt.show()

def ShowEpochLossHistory(list_epoch, Batchsize,
                         save_dir="/Users/souchiguu/Desktop/"):
    """Plot, save and show the per-epoch loss curves of all batch sizes.

    Args:
        list_epoch: one mapping per batch size, keyed by loss value; the
            keys, in dictionary iteration order, form the plotted curve.
        Batchsize: batch sizes, in the same order as ``list_epoch``.
        save_dir: directory that receives ``0.1.png``. Defaults to the
            directory that was previously hard-coded.
    """
    color = ['b', 'g', 'y']

    for num_batch, batch_size in enumerate(Batchsize):
        loss = list(list_epoch[num_batch])
        # Cycle through the palette so more than three batch sizes no
        # longer raise IndexError.
        plt.plot(loss, color[num_batch % len(color)],
                 label='batchsize=' + str(batch_size))

    plt.title("learning rate = 0.01")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    savefig(Path(save_dir) / ("0.1" ".png"))
    plt.show()

# Script entry point: train with several mini-batch sizes and plot the loss.
if __name__ == '__main__':
    # method = "MiniBatch"
    eta, max_epoch = 0.01, 50
    Batchsize = [5, 10, 15]
    list_epoch = []

    # read data
    X_origin, Y_origin = ReadData()
    if X_origin is None or Y_origin is None:
        # ReadData() returns (None, None) when a data file is missing.
        raise SystemExit("data files not found: %s, %s"
                         % (x_data_name, y_data_name))

    # count of samples
    num_example = X_origin.shape[1]

    for batch_size in Batchsize:
        W, B = InitialWeights(1, 1, 0)
        # Both histories are keyed by loss value, so two identical loss
        # values overwrite each other.
        dict_epoch_loss = {}
        dict_iter_loss = {}
        # if num_example=200, batch_size=10, then iteration=200/10=20;
        # samples beyond the last full batch are not used in an epoch.
        max_iteration = num_example // batch_size

        for epoch in range(max_epoch):
            # random shuffle
            X, Y = shuffle(X_origin, Y_origin)
            sum_loss = 0

            for iteration in range(max_iteration):
                # get x and y values for one mini-batch
                batch_x, batch_y = GetBatchSamples(X, Y, batch_size, iteration)
                # get z from x, w, b
                batch_z = ForwardCalculationBatch(W, B, batch_x)
                # calculate gradient of w and b
                dW, dB = BackPropagationBatch(batch_x, batch_y, batch_z)
                # update w,b
                W, B = UpdateWeights(W, B, dW, dB, eta)
                # loss over the whole (shuffled) data set after this update
                loss = CheckLoss(W, B, X, Y)
                # print("batchsize=%d, epoch=%d, iteration=%d, loss=%f" %(batch_size, epoch, iteration, loss))
                dict_iter_loss[loss] = CData(loss, W, B, epoch, iteration)
                sum_loss += loss
            # end for

            # per-epoch record: the sum of the losses of all its iterations
            dict_epoch_loss[sum_loss] = CData(sum_loss, W, B, epoch, max_iteration)
        # end for

        list_epoch.append(dict_epoch_loss)
        ShowIterLossHistory(dict_iter_loss, batch_size)

        w, b, cdata = GetMinimalLossData(dict_epoch_loss)
        print("w:", cdata.w, "b:", cdata.b)
        print("batchsize=%d, epoch=%d, iteration=%d, loss=%f" % (batch_size, cdata.epoch, cdata.iteration, cdata.loss))

    ShowEpochLossHistory(list_epoch, Batchsize)


```

learning rate 为0.1，loss达到最小时的结果、loss随epoch, iteration, batchsize的变化趋势：

```
0.1 lr
w: [[1.99854936]] b: [[3.00973446]]
batchsize=5, epoch=27, iteration=40, loss=0.196999
w: [[1.99322693]] b: [[3.00426234]]
batchsize=10, epoch=37, iteration=20, loss=0.098183
w: [[1.99197605]] b: [[3.0102479]]
batchsize=15, epoch=46, iteration=13, loss=0.063790
```

![](https://ws2.sinaimg.cn/large/006tKfTcly1g1e77ba9arj30hs0dcaac.jpg)
![](https://ws1.sinaimg.cn/large/006tKfTcly1g1e78989h1j30hs0dcaai.jpg)
![](https://ws2.sinaimg.cn/large/006tKfTcly1g1e7895wl2j30hs0dc74p.jpg)
![](https://ws1.sinaimg.cn/large/006tKfTcly1g1e788xlefj30hs0dc0t3.jpg)

learning rate 为0.01，loss达到最小时的结果、loss随epoch, iteration, batchsize的变化趋势：

```
0.01 lr
w: [[1.90823943]] b: [[3.05213545]]
batchsize=5, epoch=49, iteration=40, loss=0.209842
w: [[1.82458827]] b: [[3.09514729]]
batchsize=10, epoch=49, iteration=20, loss=0.123555
w: [[1.78029477]] b: [[3.11700018]]
batchsize=15, epoch=49, iteration=13, loss=0.089676
```

3. 思考题

loss_2d:

为什么是椭圆而不是圆？如何把这个图变成一个圆？
直观上是因为\(w\)与\(b\)在损失函数中的系数不同。本质原因是它们在正向传播中的地位不同，类似两个正交的特征向量，对应着不同的特征值。损失函数 \(J(w,b)=\frac{1}{m}\sum_{i=1}^m(wx_i+b-y_i)^2\)。展开后，\(w^2\)的系数是\(b^2\)的系数的\(\frac{1}{m}\sum_{i=1}^mx_i^2\)倍，当\(\frac{1}{m}\sum_{i=1}^mx_i^2\)不等于1时，等高线满足椭圆型方程\(\frac{(w-w^*)^2}{a^2}+\frac{(b-b^*)^2}{c^2}=1\)（\(a\neq c\)，\((w^*,b^*)\)为最小值点）。强制\(\frac{1}{m}\sum_{i=1}^mx_i^2=1\)可以把这个图变成一个圆。如，令\(x\)的均值为0（消去\(w\)与\(b\)的交叉项），模长为\(\sqrt{m}\)（即对\(x\)做均值为0、方差为1的标准化）。
为什么中心是个椭圆区域而不是一个点？
loss的最小值点应该是唯一的。中心不是一个点而是一个椭圆区域，是因为绘图时无法对w、b连续取值，只能在离散的取值上逼近，中心点附近loss取值相近的点构成了这个椭圆区域。