CS231n Neural Network Basics: Forward Propagation and Backpropagation



The computational graph of the neural network:
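In equations, the forward pass, the loss, and the key backpropagation step that the code below implements can be summarized as follows (X is a batch of N samples, D the input dimension, H the hidden size, C the number of classes):

\begin{align*}
s_1 &= X W_1 + b_1, \qquad a_1 = \max(0, s_1) \quad \text{(ReLU)} \\
\text{scores} &= a_1 W_2 + b_2, \qquad
p_{i,c} = \frac{e^{\text{scores}_{i,c}}}{\sum_{c'} e^{\text{scores}_{i,c'}}} \\
L &= -\frac{1}{N}\sum_{i=1}^{N} \log p_{i,\,y_i}
   \;+\; \text{reg}\left(\textstyle\sum W_1^2 + \sum W_2^2\right)
\end{align*}

For backpropagation, the starting gradient is

\begin{align*}
\frac{\partial L}{\partial \text{scores}_{i,c}} = \frac{1}{N}\left(p_{i,c} - \mathbb{1}[c = y_i]\right),
\end{align*}

and the remaining gradients follow from the chain rule through the two affine layers and the ReLU, plus the 2 \cdot \text{reg} \cdot W terms from the regularizer.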

from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt

class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network. The net has an input dimension of
  N, a hidden layer dimension of H, and performs classification over C classes.
  We train the network with a softmax loss function and L2 regularization on the
  weight matrices. The network uses a ReLU nonlinearity after the first fully
  connected layer.

  In other words, the network has the following architecture:

  input - fully connected layer - ReLU - fully connected layer - softmax

  The outputs of the second fully-connected layer are the scores for each class.
  """

  def __init__(self, input_size, hidden_size, output_size, std=1e-4):
    """
    Initialize the model. Weights are initialized to small random values and
    biases are initialized to zero. Weights and biases are stored in the
    variable self.params, which is a dictionary with the following keys:

    W1: First layer weights; has shape (D, H)
    b1: First layer biases; has shape (H,)
    W2: Second layer weights; has shape (H, C)
    b2: Second layer biases; has shape (C,)

    Inputs:
    - input_size: The dimension D of the input data.
    - hidden_size: The number of neurons H in the hidden layer.
    - output_size: The number of classes C.
    """
    self.params = {}
    self.params['W1'] = std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)

  def loss(self, X, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
      an integer in the range 0 <= y[i] < C. This parameter is optional; if it
      is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
    the score for class c on input X[i].

    If y is not None, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of training
      samples.
    - grads: Dictionary mapping parameter names to gradients of those parameters
      with respect to the loss function; has the same keys as self.params.
    """
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    scores = None
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                             #
    #############################################################################
    # First affine layer: (N, D) x (D, H) + (H,) -> (N, H)
    s1 = np.dot(X, W1) + b1
    # ReLU nonlinearity
    s1_act = np.maximum(s1, 0)
    # Second affine layer: (N, H) x (H, C) + (C,) -> (N, C)
    scores = np.dot(s1_act, W2) + b2
    
    
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################
    
    # If the targets are not given then jump out, we're done
    if y is None:
      return scores

    # Compute the loss
    loss = None
    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include  #
    # both the data loss and L2 regularization for W1 and W2. Store the result  #
    # in the variable loss, which should be a scalar. Use the Softmax           #
    # classifier loss.                                                          #
    #############################################################################
    # Softmax: shift by the row max for numerical stability, exponentiate, normalize
    scores -= np.max(scores, axis=1, keepdims=True)
    scores = np.exp(scores)
    scores /= np.sum(scores, axis=1, keepdims=True)

    # Average cross-entropy loss over the batch
    correct_probs = scores[np.arange(X.shape[0]), y]
    loss = -np.log(correct_probs).sum()
    loss /= X.shape[0]
    # L2 regularization on W1 and W2
    loss += reg * np.sum(W1 * W1)
    loss += reg * np.sum(W2 * W2)
    
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # Backward pass: compute gradients
    grads = {}
    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################
    
    
    # Forward graph: X -> X W1 + b1 -> s1 -> ReLU -> s1_act -> s1_act W2 + b2 -> scores -> softmax -> loss
    # (W1 and W2 also reach the loss through the regularization term.)

    # Backprop through softmax + cross-entropy: d(loss)/d(scores) = probs - one_hot(y);
    # the division by N is applied together with the downstream gradients below.
    ds2 = np.copy(scores)               # (N, C) softmax probabilities
    ds2[np.arange(X.shape[0]), y] -= 1  # subtract 1 at the correct class
    
    # Backprop through the second affine layer z = x W2 + b2:
    #   dL/dW2 = x^T dL/dz,  dL/db2 = column sums of dL/dz
    dW2 = np.dot(s1_act.T, ds2) / X.shape[0]  # (H, N) x (N, C) -> (H, C)
    # Second path into W2: gradient of the regularization term
    dW2 += 2 * reg * W2
    db2 = np.sum(ds2, axis=0) / X.shape[0]
    grads['W2'] = dW2
    grads['b2'] = db2
 
    
    # Backprop into the hidden activations: dL/dx = (dL/dz) W2^T
    ds1_act = np.dot(ds2, W2.T)  # (N, C) x (C, H) -> (N, H)
    # Backprop through the ReLU: pass gradients only where s1 > 0
    ds1 = (s1 > 0) * ds1_act

    # Backprop through the first affine layer
    dW1 = np.dot(X.T, ds1) / X.shape[0]  # (D, N) x (N, H) -> (D, H)
    db1 = np.sum(ds1, axis=0) / X.shape[0]
    # Second path into W1: gradient of the regularization term
    dW1 += 2 * reg * W1

    grads['W1'] = dW1
    grads['b1'] = db1
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    return loss, grads
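As a quick sanity check, the class above can be exercised on random toy data and its analytic gradients compared against centered-difference numerical gradients. This is only a minimal sketch: the toy sizes, labels, and the numerical_grad helper are illustrative assumptions, not part of the original post.

import numpy as np

# Toy sizes for illustration only (assumed)
N, D, H, C = 5, 4, 10, 3
np.random.seed(0)
X = 10 * np.random.randn(N, D)
y = np.array([0, 1, 2, 2, 1])

net = TwoLayerNet(D, H, C, std=1e-1)

# Forward pass only: y is None, so loss() returns the (N, C) score matrix
scores = net.loss(X)
print(scores.shape)  # (5, 3)

# Forward + backward: scalar loss and a gradient dictionary
loss, grads = net.loss(X, y, reg=0.05)
print(loss)

# Hypothetical helper: centered-difference numerical gradient of f with respect to w (modified in place)
def numerical_grad(f, w, h=1e-5):
  grad = np.zeros_like(w)
  it = np.nditer(w, flags=['multi_index'])
  while not it.finished:
    ix = it.multi_index
    old = w[ix]
    w[ix] = old + h
    fplus = f()
    w[ix] = old - h
    fminus = f()
    w[ix] = old
    grad[ix] = (fplus - fminus) / (2 * h)
    it.iternext()
  return grad

num_dW1 = numerical_grad(lambda: net.loss(X, y, reg=0.05)[0], net.params['W1'])
print(np.max(np.abs(num_dW1 - grads['W1'])))  # should be tiny (around 1e-8) if backprop is correct

The same check can be repeated for W2, b1, and b2 by swapping the parameter passed to numerical_grad.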

https://github.com/duanzhihua


Reposted from blog.csdn.net/duan_zhihua/article/details/82942709