Purpose
After finishing the derivations of the BP neural network and the convolutional neural network, I hand-wrote a LeNet convolutional neural network in Python, without any deep learning framework! Feel free to point out problems; learning and discussion are welcome.
Due to space limitations, only the core code is posted here (it is guaranteed to run correctly!).
A quick plug: https://github.com/Site1997/LeNet-python
(The repository contains all of the code; after downloading you can run LeNet.py directly ~)
Design ideas
For this design I used the numpy, scipy and skimage libraries. To keep things straightforward, everything is encapsulated in a class, and I wrote a LeNet-style network structured as follows (the structure diagram in the original post was silently borrowed from the web and will be removed on request).
My structure and configuration are similar to the ones in the picture; the only difference is that I reduced the dimensions of each layer. For example, the input image is 28 * 28, the number of neurons in the fully connected layers is smaller, and so on (it runs faster that way). The detailed network structure can be found in the heavily commented code below.
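Not from the original post: a quick sanity check of the reduced layer sizes for a 28 * 28 input, matching the shape comments in forward_prop() in the listing below (conv_out and pool_out are throwaway helpers used only for this calculation).

def conv_out(size, kernel=5):
    return size - kernel + 1      # 'valid' convolution with a 5 * 5 kernel

def pool_out(size, stride=2):
    return size // stride         # 2 * 2 mean pooling with stride 2

s = 28
s = conv_out(s)    # conv1: 28 -> 24
s = pool_out(s)    # pool1: 24 -> 12
s = conv_out(s)    # conv2: 12 -> 8
s = pool_out(s)    # pool2:  8 -> 4
print(16 * s * s)  # 16 channels * 4 * 4 = 256 inputs to fc1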
First, I defined a LeNet class, which contains the initialization function, the forward propagation function, the backward propagation function, the convolution layer function, the pooling layer function, the fully connected layer function, the ReLU activation function and the softmax function. The main function calls fetch_MNIST.py to obtain the digit images and labels.
Second, I implemented the functions above (correctly!). While writing this code I ran into several tricks worth noting:
- The softmax must be written carefully (elegantly) to avoid floating-point overflow; see the snippet after this list.
- The backpropagation of the fully connected layer and the convolutional layer must be written carefully (whether the kernel needs to be rotated by 180 degrees or not, whether the matrix needs to be transposed or not).
- The weight initialization and the learning rate must be tuned down carefully, bit by bit! (Otherwise the weight updates make the parameters blow up and cause floating-point overflow.) (I use Xavier initialization here.)
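As a standalone illustration of the first trick (the class method in the listing below does the same thing): subtracting the row maximum before exponentiating keeps np.exp from overflowing without changing the softmax result. stable_softmax is just a throwaway name for this sketch.

import numpy as np

def stable_softmax(t):
    e_t = np.exp(t - np.max(t))   # shift so the largest exponent is 0
    return e_t / e_t.sum()

# without the shift, np.exp(1000) would already overflow to inf
print(stable_softmax(np.array([1000.0, 1001.0, 1002.0])))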
Finally, define the batch size, the learning rate and the maximum number of iterations, and it is ready to run!
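Not part of the original post: a minimal smoke test on random data, assuming the LeNet class from the listing below and Python 2 (the listing uses print statements and Python 2 integer division). It only checks that one forward/backward pass runs and that the shapes line up.

import numpy as np

net = LeNet(lr=0.01)
# four fake 28 * 28 "images" and one-hot labels for classes 3, 1, 4, 1
fake_imgs = np.random.randint(0, 256, size=(4, 28, 28)).astype(float)
fake_labels = np.zeros((4, 10))
fake_labels[np.arange(4), [3, 1, 4, 1]] = 1

probs = net.forward_prop(fake_imgs)     # softmax outputs, shape (4, 10)
net.backward_prop(probs, fake_labels)   # one weight update
print(probs.shape)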
Core code and results
Code
# -*- coding: utf-8 -*-
'''
Author: Site Li
Website: http://blog.csdn.net/site1997
'''
import numpy as np
from scipy.signal import convolve2d
from skimage.measure import block_reduce
import fetch_MNIST
class LeNet(object):
    # The network is like:
    #    conv1 -> pool1 -> conv2 -> pool2 -> fc1 -> relu -> fc2 -> relu -> softmax
    # l0       l1       l2       l3       l4     l5      l6     l7      l8        l9
    def __init__(self, lr=0.1):
        self.lr = lr
        # 6 convolution kernal, each has 1 * 5 * 5 size
        self.conv1 = xavier_init(6, 1, 5, 5)
        # the size for mean pool is 2 * 2, stride = 2
        self.pool1 = [2, 2]
        # 16 convolution kernal, each has 6 * 5 * 5 size
        self.conv2 = xavier_init(16, 6, 5, 5)
        # the size for mean pool is 2 * 2, stride = 2
        self.pool2 = [2, 2]
        # fully connected layer 256 -> 200
        self.fc1 = xavier_init(256, 200, fc=True)
        # fully connected layer 200 -> 10
        self.fc2 = xavier_init(200, 10, fc=True)

    def forward_prop(self, input_data):
        self.l0 = np.expand_dims(input_data, axis=1) / 255   # (batch_sz, 1, 28, 28)
        self.l1 = self.convolution(self.l0, self.conv1)      # (batch_sz, 6, 24, 24)
        self.l2 = self.mean_pool(self.l1, self.pool1)        # (batch_sz, 6, 12, 12)
        self.l3 = self.convolution(self.l2, self.conv2)      # (batch_sz, 16, 8, 8)
        self.l4 = self.mean_pool(self.l3, self.pool2)        # (batch_sz, 16, 4, 4)
        self.l5 = self.fully_connect(self.l4, self.fc1)      # (batch_sz, 200)
        self.l6 = self.relu(self.l5)                         # (batch_sz, 200)
        self.l7 = self.fully_connect(self.l6, self.fc2)      # (batch_sz, 10)
        self.l8 = self.relu(self.l7)                         # (batch_sz, 10)
        self.l9 = self.softmax(self.l8)                      # (batch_sz, 10)
        return self.l9

    def backward_prop(self, softmax_output, output_label):
        l8_delta = (output_label - softmax_output) / softmax_output.shape[0]
        l7_delta = self.relu(self.l8, l8_delta, deriv=True)                                 # (batch_sz, 10)
        l6_delta, self.fc2 = self.fully_connect(self.l6, self.fc2, l7_delta, deriv=True)    # (batch_sz, 200)
        l5_delta = self.relu(self.l6, l6_delta, deriv=True)                                 # (batch_sz, 200)
        l4_delta, self.fc1 = self.fully_connect(self.l4, self.fc1, l5_delta, deriv=True)    # (batch_sz, 16, 4, 4)
        l3_delta = self.mean_pool(self.l3, self.pool2, l4_delta, deriv=True)                # (batch_sz, 16, 8, 8)
        l2_delta, self.conv2 = self.convolution(self.l2, self.conv2, l3_delta, deriv=True)  # (batch_sz, 6, 12, 12)
        l1_delta = self.mean_pool(self.l1, self.pool1, l2_delta, deriv=True)                # (batch_sz, 6, 24, 24)
        l0_delta, self.conv1 = self.convolution(self.l0, self.conv1, l1_delta, deriv=True)  # (batch_sz, 1, 28, 28)
    def convolution(self, input_map, kernal, front_delta=None, deriv=False):
        N, C, W, H = input_map.shape
        K_NUM, K_C, K_W, K_H = kernal.shape
        if deriv == False:
            feature_map = np.zeros((N, K_NUM, W-K_W+1, H-K_H+1))
            for imgId in range(N):
                for kId in range(K_NUM):
                    for cId in range(C):
                        feature_map[imgId][kId] += \
                            convolve2d(input_map[imgId][cId], kernal[kId,cId,:,:], mode='valid')
            return feature_map
        else:
            # front->back (propagate loss)
            back_delta = np.zeros((N, C, W, H))
            kernal_gradient = np.zeros((K_NUM, K_C, K_W, K_H))
            # pad the width axis by K_W-1 and the height axis by K_H-1 on both sides,
            # so the 'full' convolution can be done with a 'valid' convolve2d
            padded_front_delta = \
                np.pad(front_delta, [(0,0), (0,0), (K_W-1, K_W-1), (K_H-1, K_H-1)], mode='constant', constant_values=0)
            for imgId in range(N):
                for cId in range(C):
                    for kId in range(K_NUM):
                        back_delta[imgId][cId] += \
                            convolve2d(padded_front_delta[imgId][kId], kernal[kId,cId,::-1,::-1], mode='valid')
                        kernal_gradient[kId][cId] += \
                            convolve2d(front_delta[imgId][kId], input_map[imgId,cId,::-1,::-1], mode='valid')
            # update weights (the propagated delta already carries the minus sign, so '+=' is gradient descent)
            kernal += self.lr * kernal_gradient
            return back_delta, kernal
    def mean_pool(self, input_map, pool, front_delta=None, deriv=False):
        N, C, W, H = input_map.shape
        P_W, P_H = tuple(pool)
        if deriv == False:
            feature_map = np.zeros((N, C, W/P_W, H/P_H))
            feature_map = block_reduce(input_map, tuple((1, 1, P_W, P_H)), func=np.mean)
            return feature_map
        else:
            # front->back (propagate loss)
            back_delta = np.zeros((N, C, W, H))
            back_delta = front_delta.repeat(P_W, axis=2).repeat(P_H, axis=3)
            back_delta /= (P_W * P_H)
            return back_delta

    def fully_connect(self, input_data, fc, front_delta=None, deriv=False):
        N = input_data.shape[0]
        if deriv == False:
            output_data = np.dot(input_data.reshape(N, -1), fc)
            return output_data
        else:
            # front->back (propagate loss)
            back_delta = np.dot(front_delta, fc.T).reshape(input_data.shape)
            # update weights
            fc += self.lr * np.dot(input_data.reshape(N, -1).T, front_delta)
            return back_delta, fc

    def relu(self, x, front_delta=None, deriv=False):
        if deriv == False:
            return x * (x > 0)
        else:
            # propagate loss
            back_delta = front_delta * 1. * (x > 0)
            return back_delta

    def softmax(self, x):
        y = list()
        for t in x:
            e_t = np.exp(t - np.max(t))
            y.append(e_t / e_t.sum())
        return np.array(y)
def xavier_init(c1, c2, w=1, h=1, fc=False):
    fan_1 = c2 * w * h
    fan_2 = c1 * w * h
    ratio = np.sqrt(6.0 / (fan_1 + fan_2))
    params = ratio * (2*np.random.random((c1, c2, w, h)) - 1)
    if fc == True:
        params = params.reshape(c1, c2)
    return params

def convertToOneHot(labels):
    oneHotLabels = np.zeros((labels.size, labels.max()+1))
    oneHotLabels[np.arange(labels.size), labels] = 1
    return oneHotLabels

def shuffle_dataset(data, label):
    N = data.shape[0]
    index = np.random.permutation(N)
    x = data[index, :, :]; y = label[index, :]
    return x, y
if __name__ == '__main__':
    train_imgs = fetch_MNIST.load_train_images()
    train_labs = fetch_MNIST.load_train_labels().astype(int)
    # size of data; batch size
    data_size = train_imgs.shape[0]; batch_sz = 64;
    # learning rate; max iteration; iter % mod (avoid index out of range)
    lr = 0.01; max_iter = 50000; iter_mod = int(data_size/batch_sz)
    train_labs = convertToOneHot(train_labs)
    my_CNN = LeNet(lr)
    for iters in range(max_iter):
        # starting index and ending index for input data
        st_idx = (iters % iter_mod) * batch_sz
        # shuffle the dataset
        if st_idx == 0:
            train_imgs, train_labs = shuffle_dataset(train_imgs, train_labs)
        input_data = train_imgs[st_idx : st_idx + batch_sz]
        output_label = train_labs[st_idx : st_idx + batch_sz]
        softmax_output = my_CNN.forward_prop(input_data)
        if iters % 50 == 0:
            # calculate accuracy
            correct_list = [ int(np.argmax(softmax_output[i])==np.argmax(output_label[i])) for i in range(batch_sz) ]
            accuracy = float(np.array(correct_list).sum()) / batch_sz
            # calculate loss
            correct_prob = [ softmax_output[i][np.argmax(output_label[i])] for i in range(batch_sz) ]
            correct_prob = filter(lambda x: x > 0, correct_prob)
            loss = -1.0 * np.sum(np.log(correct_prob))
            print "The %d iters result:" % iters
            print "The accuracy is %f The loss is %f " % (accuracy, loss)
        my_CNN.backward_prop(softmax_output, output_label)
Results
The 0 iters result:
The accuracy is 0.203125 The loss is 146.051851
The 50 iters result:
The accuracy is 0.203125 The loss is 145.385184
The 100 iters result:
The accuracy is 0.390625 The loss is 142.522602
...
...
The 10500 iters result:
The accuracy is 0.953125 The loss is 13.559059
The 10550 iters result:
The accuracy is 0.953125 The loss is 11.696816
The 10600 iters result:
The accuracy is 0.968750 The loss is 5.641389
The 10650 iters result:
The accuracy is 0.968750 The loss is 13.733489
Well, the raw numbers are not very intuitive, so let's draw a plot to visualize them, da da ~
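The figure from the original post is not reproduced here, but below is a sketch of how the curves could be drawn with matplotlib. The lists are hypothetical logging variables (not in the code above); here they are simply filled with the values printed in the log excerpt.

import matplotlib.pyplot as plt

# hypothetical logging lists, filled with the values printed above
iters_logged = [0, 50, 100, 10500, 10550, 10600, 10650]
loss_history = [146.05, 145.39, 142.52, 13.56, 11.70, 5.64, 13.73]
acc_history  = [0.203, 0.203, 0.391, 0.953, 0.953, 0.969, 0.969]

plt.plot(iters_logged, loss_history, label='training loss')
plt.plot(iters_logged, [100 * a for a in acc_history], label='batch accuracy (%)')
plt.xlabel('iteration')
plt.legend()
plt.show()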
During training the loss keeps decreasing while the training accuracy keeps rising, which suggests my code should be correct, haha ~
(In the next post I will implement a recurrent neural network by hand in Python, so stay tuned!)