A neural network purely by hand (using only NumPy to recognize the MNIST data set, full code)

Full code

A hand-written 2-layer DNN that recognizes the MNIST handwritten digit data set. The low-level NumPy code for all the helper functions is given, and the code can be run directly as a single file; no other files are required.
If you do not have TensorFlow or matplotlib installed, run pip install tensorflow and pip install matplotlib first.

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf  # TensorFlow is imported only to load the MNIST data set

# The following block defines all the helper functions
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_grad(x):
    return (1.0 - sigmoid(x)) * sigmoid(x)


def relu(x):
    return np.maximum(0, x)


def relu_grad(x):
    # derivative of ReLU: 1 where x >= 0, 0 elsewhere
    return np.where(x >= 0, 1, 0)


def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)  # guard against overflow
    return np.exp(x) / np.sum(np.exp(x))


def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t) ** 2)


def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # if the labels are one-hot vectors, convert them to the indices of the correct classes
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size


def softmax_loss(X, t):
    y = softmax(X)
    return cross_entropy_error(y, t)


def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val  # restore the original value
        it.iternext()

    return grad



class TwoLayerNet:

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # initialize the weights and biases
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        #z1 = sigmoid(a1)
        z1 = relu(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return y

    # x: input data, t: labels (supervision data)
    def loss(self, x, t):
        y = self.predict(x)

        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: labels (supervision data)
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}

        batch_num = x.shape[0]

        # forward
        a1 = np.dot(x, W1) + b1
        #z1 = sigmoid(a1)
        z1 = relu(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        # backward
        dy = (y - t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        da1 = np.dot(dy, W2.T)
        #dz1 = sigmoid_grad(a1) * da1
        dz1 = relu_grad(a1) * da1
        grads['W1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads


def _change_one_hot_label(X):
    T = np.zeros((X.size, 10))
    for idx, row in enumerate(T):
        row[X[idx]] = 1

    return T

# Let's get started
# load the data
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # normalize to [0, 1]
x_train = x_train.reshape(-1, 784)  # flatten: (60000, 28, 28) -> (60000, 784)
x_test = x_test.reshape(-1, 784)  # flatten: (10000, 28, 28) -> (10000, 784)
y_train = _change_one_hot_label(y_train)  # one-hot labels, so their shape matches the softmax output and the error can be computed
y_test = _change_one_hot_label(y_test)  # one-hot labels

# Two-layer DNN (50 hidden neurons, 784*50*10). Activation is ReLU (can be swapped for sigmoid), loss is cross-entropy, the output layer is softmax, the optimizer is SGD
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 512
learning_rate = 0.05

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = max(train_size // batch_size, 1)  # integer iterations per epoch, so the modulo check below fires once per epoch

# training loop
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    y_batch = y_train[batch_mask]

    # compute the gradients by backprop (the numerical version is far slower; kept only for reference)
    # grad = network.numerical_gradient(x_batch, y_batch)
    grad = network.gradient(x_batch, y_batch)

    # SGD update of the parameters
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, y_batch)
    train_loss_list.append(loss)

    # compute and record the train/test accuracy once per epoch
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, y_train)
        test_acc = network.accuracy(x_test, y_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

# plot the loss curve
plt.subplot(1,2,1)
plt.title('Loss Function Curve')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.plot(train_loss_list, label="$Loss$")  # loss value at every training step
plt.legend()

# plot the accuracy curves
plt.subplot(1,2,2)
plt.title('Acc Curve')
plt.xlabel('Epoch')
plt.ylabel('Acc')
plt.plot(train_acc_list, label="$train_{acc}$")  # training accuracy per epoch
plt.plot(test_acc_list, label="$test_{acc}$")  # test accuracy per epoch
plt.legend()
plt.show()

(Figure: the loss curve and the train/test accuracy curves produced by the script above.)

 

Summary

Even this simple two-layer network (784×50 + 50×10 = 39,700 weights and 50 + 10 = 60 biases) reaches about 95% accuracy without overfitting.
If batch_size is increased a bit, the loss fluctuates less; training for more iterations raises the accuracy; a larger learning rate trains faster but diverges if it is too large, so feel free to tune these and experiment.
The activation function above is ReLU, but you can swap it for sigmoid yourself: in the code, the ReLU calls can simply be replaced with the sigmoid ones (the alternatives are already there as comments). In practice ReLU works better here.
The optimizer above is SGD (stochastic gradient descent); there are also Momentum, AdaGrad, Adam, and others, and Adam generally gives better results.
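To try a different optimizer, only the parameter-update loop has to change. As a rough sketch (not part of the original code; the velocity buffer and the 0.9 coefficient are assumptions of mine), a Momentum update could replace the plain SGD step like this:

# before the training loop: one velocity buffer per parameter (assumed names)
momentum = 0.9
velocity = {key: np.zeros_like(val) for key, val in network.params.items()}

# inside the training loop, in place of the SGD update:
for key in ('W1', 'b1', 'W2', 'b2'):
    velocity[key] = momentum * velocity[key] - learning_rate * grad[key]
    network.params[key] += velocity[key]

Adam builds on the same idea by adding a per-parameter adaptive step size on top of the momentum term.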
With that, we can summarize the overall picture of how a neural network learns:

Premise

A neural network contains weights and biases, and the process of adjusting these weights and biases to fit the training data is called "learning". Neural network learning proceeds in the following steps.

Step 1 (mini-batch)

Randomly select a portion of the training data; this portion is called a mini-batch. Our goal is to reduce the value of the loss function on the mini-batch.

Step 2 (compute the gradient)

To reduce the value of the loss function on the mini-batch, we need the gradient of the loss with respect to each weight parameter. The gradient tells us the direction in which the loss changes fastest, so stepping against it decreases the loss the most (a gradient-check sketch is given after this list of steps).

Step 3 (update the parameters)

Update the weight parameters by a small step in that descent direction.

Step 4 (compute the loss and accuracy)

Record the loss at every iteration, and compute the accuracy once a full epoch has passed.

Step 5 (repeat)

Repeat steps 1 through 4.
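For Step 2, the TwoLayerNet class above exposes both the analytic (backprop) gradient and the slow numerical one, so the two can be compared as a sanity check. A minimal sketch, where the tiny 3-sample batch and the variable names are arbitrary choices of mine:

# gradient check: compare backprop gradients with numerical gradients on a tiny batch
x_small = x_train[:3]
t_small = y_train[:3]

grad_backprop = network.gradient(x_small, t_small)
grad_numerical = network.numerical_gradient(x_small, t_small)  # slow: two forward passes per parameter

for key in ('W1', 'b1', 'W2', 'b2'):
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key, diff)  # the differences should be tiny if backprop is implemented correctly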

 
For more introductory material on deep learning, you can read the article "Understanding Deep Learning in One Article".


Source: blog.csdn.net/weixin_45116099/article/details/127704825