1数据读取函数及可视化
- 该部分与前面手写数字识别代码一样:
# 导入数据及画图函数
def load_data(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y') # (5000,1)
y = y.reshape(y.shape[0]) # make it back to column vector
X = data.get('X') # (5000,400)
if transpose:
# for this dataset, you need a transpose to get the orientation right
X = np.array([im.reshape((20, 20)).T for im in X])
# and I flat the image again to preserve the vector presentation
X = np.array([im.reshape(400) for im in X])
return X, y
X, _ = load_data('ex4data1.mat')
def plot_100_image(X):
""" sample 100 image and show them
assume the image is square
X : (5000, 400)
"""
size = int(np.sqrt(X.shape[1]))
# sample 100 image, reshape, reorg it
sample_idx = np.random.choice(np.arange(X.shape[0]), 100) # 100*400
sample_images = X[sample_idx, :]
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
for r in range(10):
for c in range(10):
ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plot_100_image(X)
plt.show()
2数据读取与数据处理
# 数据读取与数据处理
X_raw, y_raw = load_data('ex4data1.mat', transpose=False)
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1) # 增加全部为1的一列
print('X.shape:', X.shape) # X.shape: (5000, 401)
def expand_y(y):
# 相当于是实现了独热编码的功能
# """expand 5000*1 into 5000*10
# where y=10 -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]: ndarray
# """
res = []
for i in y:
y_array = np.zeros(10)
y_array[i - 1] = 1
res.append(y_array)
return np.array(res)
y = expand_y(y_raw)
print('y.shape:', y.shape) # y.shape: (5000, 10)
3序列化与反序列化
- 所谓序列化:将神经网络的参数合并成一个列向量
- 反序列化:由列向量还原回最初的参数模样
np.ravel
:对于[[1,2,3],[4,5,6]]
,会变成[1,2,3,4,5,6]
# 序列化和反序列化
# 读取权重
def load_weight(path):
data = sio.loadmat(path)
return data['Theta1'], data['Theta2']
t1, t2 = load_weight('ex4weights.mat')
print('t1.shape:', t1.shape)
print('t2.shape:', t2.shape)
# 定义序列化函数
# 在这个nn架构中,我们有theta1(25,401),theta2(10,26),它们的梯度是delta1,delta2
def serialize(a, b):
# 将权重序列化,变成一个列向量
return np.concatenate((np.ravel(a), np.ravel(b)))
theta = serialize(t1, t2) # 扁平化参数,25*401+10*26=10285
print('theta.shape:', theta.shape)
# 定义反序列化函数
def deserialize(seq):
# """into ndarray of (25, 401), (10, 26)"""
return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
4前向传播
- 定义前向传播过程,因为反向传播需要首先通过前向传播计算相关量的值。
# 前向传播
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def feed_forward(theta, X):
"""apply to architecture 400+1 * 25+1 *10
X: 5000 * 401
"""
t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26)
m = X.shape[0] # 样本数
a1 = X # 5000 * 401
z2 = a1 @ t1.T # 5000 * 25
a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1) # 5000*26
z3 = a2 @ t2.T # 5000 * 10
h = sigmoid(z3) # 5000*10, this is h_theta(X)
return a1, z2, a2, z3, h # you need all those for backprop
_, _, _, _, h = feed_forward(theta, X)
print('h.shape:', h.shape)
5定义正则化代价函数
- 虽然这里使用的y是独热编码之后的y,但是还是像最初的y那样,将每一个样本的标签序列(即一个独热编码的结果)的每一个元素对应进行相应的运算。
# 代价函数
def cost(theta, X, y):
# """calculate cost
# y: (m, k) ndarray
# """
m = X.shape[0] # get the data size m
_, _, _, _, h = feed_forward(theta, X)
# np.multiply is pairwise operation
pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h)) # (5000,10)
pair_computation_sum = pair_computation.sum()
return pair_computation_sum / m
print('cost(theta, X, y):', cost(theta, X, y))
# 正则化代价函数
def regularized_cost(theta, X, y, l=1):
"""the first column of t1 and t2 is intercept theta, ignore them when you do regularization"""
t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26)
m = X.shape[0]
reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum() # this is how you ignore first col
reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum()
return cost(theta, X, y) + reg_t1 + reg_t2
print('regularized_cost(theta, X, y):', regularized_cost(theta, X, y))
6反向传播
1、反向传播过程的阐述详见:5-反向传播算法过程梳理_骑着蜗牛环游深度学习世界的博客-CSDN博客
2、借助李宏毅老师的讲解加深对反向传播过程的理解,然后从代价反向一层层的通过权重传递的思想(即吴恩达老师给出的算法)看懂该部分的代码实现
- 因为涉及到求激活函数(这里具体说就是sigmoid函数)的导数,所以先构建下面的函数:
# 1、计算激活函数的导数
def sigmoid_gradient(z):
# 下面的计算结果等同于对激活函数求导的结果
return np.multiply(sigmoid(z), (1 - sigmoid(z)))
- 反向传播过程,如图1所示。
- 首先进行前向传播,获得各层的各个量的结果
- 然后用样本数作为循环,计算每个样本对神经网络的误差的贡献
- 先计算第三层(输出层)的误差项
- 然后通过权重以及激活函数的偏导数计算传递到第二层之后的误差项。根据反向传播算法的思路,误差项只需要计算到正数第二层即可。
- 最后就可以计算两组参数的偏导数了。
# 2、计算theta的梯度
def gradient(theta, X, y):
# initialize
t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26)
m = X.shape[0]
# 初始化梯度,即每个参数的偏导数
delta1 = np.zeros(t1.shape) # (25, 401)
delta2 = np.zeros(t2.shape) # (10, 26)
# 进行前向传播
a1, z2, a2, z3, h = feed_forward(theta, X)
'''
a1.shape: (5000, 401)
z2.shape: (5000, 25)
a2.shape: (5000, 26)
z3.shape: (5000, 10)
h.shape: (5000, 10)
'''
for i in range(m):
a1i = a1[i, :] # (1, 401)
z2i = z2[i, :] # (1, 25)
a2i = a2[i, :] # (1, 26)
hi = h[i, :] # (1, 10)
yi = y[i, :] # (1, 10)
d3i = hi - yi # (1, 10),实际值与预测值之间的误差,也就是代价
z2i = np.insert(z2i, 0, np.ones(1)) # make it (1, 26) to compute d2i
d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i)) # (1, 26)
# careful with np vector transpose
delta2 += np.matrix(d3i).T @ np.matrix(a2i) # (1, 10).T @ (1, 26) -> (10, 26)
delta1 += np.matrix(d2i[1:]).T @ np.matrix(a1i) # (1, 25).T @ (1, 401) -> (25, 401)
delta1 = delta1 / m
delta2 = delta2 / m
return serialize(delta1, delta2)
- 在此基础上,计算考虑正则化项的梯度下降。正则化项不作用在偏置节点上,因此需要将两组参数的第一列设置为0,这样惩罚系数对偏置项的影响就无效了。
def regularized_gradient(theta, X, y, l=1):
"""don't regularize theta of bias terms"""
m = X.shape[0]
delta1, delta2 = deserialize(gradient(theta, X, y))
t1, t2 = deserialize(theta)
t1[:, 0] = 0 # 这一步就是为了不把正则化作用到偏置上
reg_term_d1 = (l / m) * t1
delta1 = delta1 + reg_term_d1
t2[:, 0] = 0
reg_term_d2 = (l / m) * t2
delta2 = delta2 + reg_term_d2
return serialize(delta1, delta2)
7梯度检验
1、回顾梯度检验的概念:在代价函数上沿着切线的方向选择离两个非常近的点然后计算两个点的平均值用以估计梯度,然后与反向传播得出的梯度进行比较
2、当 θ \theta θ是一个向量时,需要分别对其中的每一个分量进行检验,因此代码中使用expand_array
方法对参数进行了维度的扩展,为的就是在梯度检验时,一次只检验一个分量。
梯度下降检验的思想如图2所示。
# 梯度检验
def expand_array(arr):
"""replicate array into matrix
[1, 2, 3]
[[1, 2, 3],
[1, 2, 3],
[1, 2, 3]]
"""
# turn matrix back to ndarray
return np.array(np.matrix(np.ones(arr.shape[0])).T @ np.matrix(arr))
def gradient_checking(theta, X, y, epsilon, regularized=False):
def a_numeric_grad(plus, minus, regularized=False):
"""calculate a partial gradient with respect to 1 theta"""
if regularized:
return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2)
else:
return (cost(plus, X, y) - cost(minus, X, y)) / (epsilon * 2)
theta_matrix = expand_array(theta) # expand to (10285, 10285)
epsilon_matrix = np.identity(len(theta)) * epsilon
plus_matrix = theta_matrix + epsilon_matrix
minus_matrix = theta_matrix - epsilon_matrix
# calculate numerical gradient with respect to each theta
# 得到每一个theta的梯度检验值
numeric_grad = np.array([a_numeric_grad(plus_matrix[i], minus_matrix[i], regularized)
for i in range(len(theta))])
# analytical grad will depend on if you want it to be regularized or not
# 得到经由梯度下降算法计算之后的偏导数
analytic_grad = regularized_gradient(theta, X, y) if regularized else gradient(theta, X, y)
# If you have a correct implementation, and assuming you used EPSILON = 0.0001
# the diff below should be less than 1e-9
# this is how original matlab code do gradient checking
diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)
print(
'If your backpropagation implementation is correct,\nthe relative difference will be smaller than 10e-9 (assume epsilon=0.0001).\nRelative Difference: {}\n'.format(
diff))
# 开始进行梯度检验
gradient_checking(theta, X, y, epsilon=0.0001) # 这个运行很慢,谨慎运行
8训练
- 接下来就可以进行训练优化了:
# 训练
# 定义参数初始化函数
def random_init(size):
return np.random.uniform(-0.12, 0.12, size)
def nn_training(X, y):
"""regularized version
the architecture is hard coded here... won't generalize
"""
init_theta = random_init(10285) # 25*401 + 10*26
res = opt.minimize(fun=regularized_cost,
x0=init_theta,
args=(X, y, 1),
method='TNC',
jac=regularized_gradient,
options={
'maxiter': 400})
return res
res = nn_training(X, y) # 慢
print('res:\n', res)
- 训练的结果如图3所示。
- 疑惑的地方:为什么
success
的结果是false
- 疑惑的地方:为什么