CNN Code

1. The functions we need to implement

The computation for one convolution step on a single slice; the filter is our W:

import numpy as np


def conv_single_step(a_slice_prev, W, b):
    """
    Apply one filter defined by parameters W on a single slice (a_slice_prev) of the output activation 
    of the previous layer.

    Arguments:
    a_slice_prev -- slice of input data of shape (f, f, n_C_prev)
    W -- Weight parameters contained in a window - matrix of shape (f, f, n_C_prev)
    b -- Bias parameters contained in a window - matrix of shape (1, 1, 1)

    Returns:
    Z -- a scalar value, result of convolving the sliding window (W, b) on a slice x of the input data
    """

    ### START CODE HERE ### (≈ 2 lines of code)
    # Element-wise product between a_slice and W. Do not add the bias yet.
    s = a_slice_prev * W
    # Sum over all entries of the volume s.
    Z = np.sum(s)
    # Add bias b to Z. Cast b to a float() so that Z results in a scalar value.
    Z = Z + float(b)
    ### END CODE HERE ###

    return Z

The shape of W also tells you how many units the previous and next layers have, i.e., what the shape will be after the convolution.
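A quick sanity check of conv_single_step; the shapes and random inputs below are hypothetical, chosen only for illustration:

np.random.seed(1)
# A hypothetical 3x3 slice with 4 input channels, one matching filter W and a scalar bias b
a_slice_prev = np.random.randn(3, 3, 4)
W = np.random.randn(3, 3, 4)
b = np.random.randn(1, 1, 1)

Z = conv_single_step(a_slice_prev, W, b)
print(Z)  # a single float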

The full convolution forward pass: use for loops to apply the step above to every slice.

def conv_forward(A_prev, W, b, hparameters):
    """
    Implements the forward propagation for a convolution function

    Arguments:
    A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
    W -- Weights, numpy array of shape (f, f, n_C_prev, n_C)
    b -- Biases, numpy array of shape (1, 1, 1, n_C)
    hparameters -- python dictionary containing "stride" and "pad"

    Returns:
    Z -- conv output, numpy array of shape (m, n_H, n_W, n_C)
    cache -- cache of values needed for the conv_backward() function
    """
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    (f, f, n_C_prev, n_C) = W.shape  # n_C is the number of filters

    stride = hparameters["stride"]
    pad = hparameters["pad"]
    # n_H and n_W of the next layer come from the formula below; n_C comes from W.shape, i.e. the number of filters
    n_H = int((n_H_prev - f + 2 * pad) / stride + 1)
    n_W = int((n_W_prev - f + 2 * pad) / stride + 1)

    # Initialize the output volume Z with zeros. (≈1 line)
    Z = np.zeros((m, n_H, n_W, n_C))

    # Create A_prev_pad by padding A_prev
    A_prev_pad = zero_pad(A_prev, pad)

    for i in range(m):
        a_prev_pad = A_prev_pad[i, :, :, :]
        for h in range(n_H):
            for w in range(n_W):
                for c in range(n_C):
                    vert_start = stride * h
                    vert_end = vert_start + f
                    horiz_start = stride * w
                    horiz_end = horiz_start + f

                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[:, :, :, c], b[:, :, :, c])

    cache = (A_prev, W, b, hparameters)

    return Z, cache
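conv_forward relies on a helper zero_pad that is not shown in this post; a minimal sketch of what it presumably does (pad only the height and width dimensions with zeros):

def zero_pad(X, pad):
    """
    Pad all images of the dataset X with zeros around the height and width dimensions.
    X has shape (m, n_H, n_W, n_C); the result has shape (m, n_H + 2*pad, n_W + 2*pad, n_C).
    """
    return np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), mode='constant', constant_values=0)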

Pooling: max pooling and average pooling.

def pool_forward(A_prev, hparameters, mode="max"):
    """
    Implements the forward pass of the pooling layer

    Arguments:
    A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
    hparameters -- python dictionary containing "f" and "stride"
    mode -- the pooling mode you would like to use, defined as a string ("max" or "average")

    Returns:
    A -- output of the pool layer, a numpy array of shape (m, n_H, n_W, n_C)
    cache -- cache used in the backward pass of the pooling layer, contains the input and hparameters
    """
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    f = hparameters['f']
    stride = hparameters['stride']
    n_H = int((n_H_prev - f) / stride + 1)
    n_W = int((n_W_prev - f) / stride + 1)
    n_C = n_C_prev

    A = np.zeros((m, n_H, n_W, n_C))

    for i in range(m):
        a_prev_i = A_prev[i, :, :, :]  # renamed to avoid shadowing the built-in name "slice"
        for h in range(n_H):
            for w in range(n_W):
                for c in range(n_C):
                    vert_start = stride * h
                    vert_end = vert_start + f
                    horiz_start = stride * w
                    horiz_end = horiz_start + f

                    a_slice_prev = a_prev_i[vert_start:vert_end, horiz_start:horiz_end, c]

                    if mode == 'max':
                        A[i, h, w, c] = np.max(a_slice_prev)
                    elif mode == 'average':
                        A[i, h, w, c] = np.average(a_slice_prev)

    cache = (A_prev, hparameters)

    return A, cache
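A small end-to-end check of the two forward functions; all shapes here are hypothetical, picked only to make the output-size arithmetic easy to follow:

np.random.seed(1)
# Hypothetical batch: 10 images of 4x4 pixels with 3 channels, and 8 filters of size 2x2
A_prev = np.random.randn(10, 4, 4, 3)
W = np.random.randn(2, 2, 3, 8)
b = np.random.randn(1, 1, 1, 8)

Z, cache_conv = conv_forward(A_prev, W, b, {"pad": 2, "stride": 2})
print(Z.shape)  # (10, 4, 4, 8): n_H = (4 - 2 + 2*2)/2 + 1 = 4

A, cache_pool = pool_forward(Z, {"f": 2, "stride": 2}, mode="max")
print(A.shape)  # (10, 2, 2, 8): n_H = (4 - 2)/2 + 1 = 2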

Backpropagation: backward propagation for the convolution layer.

This is the formula for computing dA with respect to the cost for a certain filter W_c and a given training example:

Each filter-slice convolution produces one value of Z, so Z and dZ both have shape (m, n_H, n_W, n_C).

dA += \sum_{h=0}^{n_H} \sum_{w=0}^{n_W} W_c \times dZ_{hw}

Where W_c is a filter and dZ_{hw} is a scalar corresponding to the gradient of the cost with respect to the output of the conv layer Z at the hth row and wth column (corresponding to the dot product taken at the ith stride left and jth stride down). Note that at each step we multiply the same filter W_c by a different dZ when updating dA. We do so mainly because when computing the forward propagation, each filter is dotted and summed with a different a_slice. Therefore, when computing the backprop for dA, we are just adding the gradients of all the a_slices.

Each output channel has its own bias b; within a channel, the same b is shared everywhere.

A and dA, Z and dZ, W and dW, b and db each have identical shapes.

During forward propagation, an overlapping region of the input contributes to two different values of Z; therefore, during backward propagation, when mapping from the lower-dimensional output back to the higher-dimensional input, the gradients are accumulated step by step. dW and db are accumulated because, within a single filter, W and b are shared across all positions. Computing dA_prev (or dA_prev_pad) amounts to rebuilding an array with the same shape as A_prev; the shape information recorded in the dimensions of W constrains those dimensions, and the formula itself stays the same.

Understanding the dimension changes (together with the code): for one output channel, one filter W of shape (f, f, n_C_prev) and one slice a_slice of the same shape produce a single value of Z. Going backward, one dZ value times the filter contributes to da_prev at that position, and one dZ value times the a_slice contributes to dW for that channel; because the windows overlap, dW keeps accumulating as the filter slides. (It is easiest to reason from the shape requirements, e.g. dA must have the same shape as A.)
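Written in the same style as the dA formula above, the accumulations for dW and db (they correspond to the += lines in the code below) are:

dW_c += \sum_{h=0}^{n_H} \sum_{w=0}^{n_W} a_{slice} \times dZ_{hw}

db += \sum_{h=0}^{n_H} \sum_{w=0}^{n_W} dZ_{hw}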

Just for reference:

def conv_backward(dZ, cache):
    # Retrieve information from the cache of conv_forward()
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape

    # Initialize dA_prev, dW, db with the same shapes as A_prev, W, b
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Pad A_prev and dA_prev
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)

    for i in range(m):
        a_prev_pad = A_prev_pad[i, :, :, :]
        da_prev_pad = dA_prev_pad[i, :, :, :]
        for h in range(n_H):
            for w in range(n_W):
                for c in range(n_C):
                    vert_start = stride * h
                    vert_end = vert_start + f
                    horiz_start = stride * w
                    horiz_end = horiz_start + f

                    a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    # forward_propagation:
                    #     W(f, f, n_C_prev, n_C) and A_prev(m, n_H_prev, n_W_prev, n_C_prev) --> Z(m, n_H, n_W, n_C)
                    # backward_propagation:
                    #     dZ(m, n_H, n_W, n_C) and W(f, f, n_C_prev, n_C) --> dA(m, n_H_prev, n_W_prev, n_C_prev)

                    # da_prev_pad accumulates, at each slice position, W times dZ over every output channel:
                    # (f, f, n_C_prev) += (f, f, n_C_prev) * scalar
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:, :, :, c] * dZ[i, h, w, c]
                    # W and b have one entry per output channel (one per filter),
                    # so dW and db likewise accumulate one gradient per channel c
                    dW[:, :, :, c] += a_slice * dZ[i, h, w, c]
                    db[:, :, :, c] += dZ[i, h, w, c]

        # Strip the padding to recover the ith example's dA_prev (assumes pad > 0)
        dA_prev[i, :, :, :] = da_prev_pad[pad:-pad, pad:-pad, :]

    return dA_prev, dW, db
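Continuing the hypothetical example from the forward pass above, and simply reusing Z as a stand-in for dZ, the gradients come back with the expected shapes:

dA_prev, dW, db = conv_backward(Z, cache_conv)
print(dA_prev.shape)  # (10, 4, 4, 3), same as A_prev
print(dW.shape)       # (2, 2, 3, 8), same as W
print(db.shape)       # (1, 1, 1, 8), same as b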

Backward pass for the pooling layer:

The pooling layer can also be viewed as a convolution that keeps the number of channels unchanged; its effective kernel is:

Max-pooling: a mask that is 1 at the position of the window's maximum and 0 everywhere else (this is what create_mask_from_window computes below).

Average-pooling: an f x f matrix whose entries are all 1/(f*f) (this is what distribute_value computes below).

def pool_backward(dA, cache, mode='max'):
    """
    Implements the backward pass of the pooling layer

    Arguments:
    dA -- gradient of cost with respect to the output of the pooling layer, same shape as A
    cache -- cache output from the forward pass of the pooling layer, contains the layer's input and hparameters
    mode -- the pooling mode you would like to use, defined as a string ("max" or "average")

    Returns:
    dA_prev -- gradient of cost with respect to the input of the pooling layer, same shape as A_prev
    """
    (A_prev, hparameters) = cache
    stride = hparameters['stride']
    f = hparameters['f']

    m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape
    m, n_H, n_W, n_C = dA.shape

    dA_prev = np.zeros(np.shape(A_prev))

    for i in range(m):
        a_prev = A_prev[i, :, :, :]
        for h in range(n_H):  # loop on the vertical axis
            for w in range(n_W):  # loop on the horizontal axis
                for c in range(n_C):
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f

                    if mode == 'max':
                        a_prev_slice = a_prev[vert_start:vert_end, horiz_start:horiz_end, c]
                        mask = create_mask_from_window(a_prev_slice)
                        dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += np.multiply(mask, dA[i, h, w, c])
                    elif mode == 'average':
                        # Get the value a from dA (≈1 line)
                        da = dA[i, h, w, c]
                        # Define the shape of the filter as fxf (≈1 line)
                        shape = (f, f)
                        # Distribute it to get the correct slice of dA_prev. i.e. Add the distributed value of da. (≈1 line)
                        dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += distribute_value(da, shape)

    return dA_prev
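pool_backward uses two helpers, create_mask_from_window and distribute_value, that are not shown in this post. A minimal sketch of what they presumably look like, judging from how they are called:

def create_mask_from_window(x):
    # True (1) at the position of the maximum of x, False (0) elsewhere
    return x == np.max(x)


def distribute_value(dz, shape):
    # Spread the scalar gradient dz evenly over an f x f window
    (n_H, n_W) = shape
    return np.full(shape, dz / (n_H * n_W))

Continuing the hypothetical example once more, reusing A as a stand-in for the incoming gradient dA:

dZ_pool = pool_backward(A, cache_pool, mode='max')
print(dZ_pool.shape)  # (10, 4, 4, 8), same as the pooling layer's input Z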

The rest of the assignment is mostly TensorFlow and Keras.
Reposted from blog.csdn.net/github_37973614/article/details/81088113