说明
由于此次代码升级的更改全部在 Google Colab 上完成,而 Google Colab 可能与大多数传统 Jupyter 环境不同:Google Colab 以两个空格作为标准缩进
一、网络加入 dropout,防止过拟合
需要修改的函数 inference
def inference(images, batch_size, n_classes, keep_prob=0.8):
  """Build the CNN forward pass and return the unnormalized class scores.

  Layer order: conv1 -> max-pool + LRN -> conv2 -> LRN + max-pool ->
  local3 (128 units) -> dropout -> local4 (128 units) -> dropout ->
  linear output layer.

  Args:
    images: 4-D image batch whose last dimension has 3 channels (the first
      convolution kernel is 3x3x3x16). Its first dimension must equal
      `batch_size`, because the feature maps are flattened with
      `tf.reshape(..., [batch_size, -1])`.
    batch_size: Number of images per batch.
    n_classes: Number of output classes (width of the final layer).
    keep_prob: Probability of keeping a unit in the two dropout layers.
      Defaults to 0.8, the value that used to be hard-coded; pass 1.0 to
      disable dropout, e.g. when evaluating a trained model.

  Returns:
    The logits tensor of shape [batch_size, n_classes]. No softmax is
    applied here.
  """
  # conv1: 3x3 convolution, 3 input channels -> 16 feature maps.
  with tf.variable_scope('conv1') as scope:
    weights = tf.get_variable('weights',
                              shape=[3, 3, 3, 16],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32))
    biases = tf.get_variable('biases',
                             shape=[16],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.1))
    conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
    pre_activation = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(pre_activation, name=scope.name)

  # pool1 and norm1: stride-2 max pooling, then local response normalization.
  with tf.variable_scope('pooling1_lrn') as scope:
    pool1 = tf.nn.max_pool(conv1,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME', name='pooling1')
    norm1 = tf.nn.lrn(pool1,
                      depth_radius=4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75, name='norm1')

  # conv2: 3x3 convolution, 16 -> 16 feature maps.
  with tf.variable_scope('conv2') as scope:
    weights = tf.get_variable('weights',
                              shape=[3, 3, 16, 16],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32))
    biases = tf.get_variable('biases',
                             shape=[16],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.1))
    conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
    pre_activation = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(pre_activation, name='conv2')

  # pool2 and norm2: normalization first, then stride-1 max pooling.
  with tf.variable_scope('pooling2_lrn') as scope:
    norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                      beta=0.75, name='norm2')
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                           padding='SAME', name='pooling2')

  # local3: flatten the feature maps and apply the first dense layer.
  with tf.variable_scope('local3') as scope:
    reshape = tf.reshape(pool2, shape=[batch_size, -1])
    dim = reshape.get_shape()[1].value
    weights = tf.get_variable('weights',
                              shape=[dim, 128],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
    biases = tf.get_variable('biases',
                             shape=[128],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.1))
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    local3_dropout = tf.nn.dropout(local3, keep_prob=keep_prob)

  # local4: second dense layer.
  with tf.variable_scope('local4') as scope:
    weights = tf.get_variable('weights',
                              shape=[128, 128],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
    biases = tf.get_variable('biases',
                             shape=[128],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.1))
    local4 = tf.nn.relu(tf.matmul(local3_dropout, weights) + biases, name='local4')
    local4_dropout = tf.nn.dropout(local4, keep_prob=keep_prob)

  # softmax: linear output layer producing the logits.
  with tf.variable_scope('softmax_linear') as scope:
    # The weight variable is named 'softmax_linear' rather than 'weights';
    # the name is kept so that existing checkpoints still restore.
    weights = tf.get_variable('softmax_linear',
                              shape=[128, n_classes],
                              dtype=tf.float32,
                              initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
    biases = tf.get_variable('biases',
                             shape=[n_classes],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.1))
    softmax_linear = tf.add(tf.matmul(local4_dropout, weights), biases, name='softmax_linear')

  return softmax_linear
def losses(logits, labels):
  """Return the mean sparse softmax cross-entropy of a batch.

  Args:
    logits: Unnormalized class scores, as produced by `inference`.
    labels: Integer class labels, one per example.

  Returns:
    A scalar tensor holding the batch-averaged loss. The value is also
    registered as a scalar summary tagged '<scope name>/loss'.
  """
  with tf.variable_scope('loss') as loss_scope:
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='xentropy_per_example')
    mean_loss = tf.reduce_mean(per_example, name='loss')
    summary_tag = loss_scope.name + '/loss'
    tf.summary.scalar(summary_tag, mean_loss)
  return mean_loss
def trainning(loss, learning_rate):
  """Create the Adam training op that minimizes `loss`.

  Args:
    loss: Scalar loss tensor to minimize.
    learning_rate: Learning rate passed to the Adam optimizer.

  Returns:
    The op that applies one optimization step; running it also increments
    the non-trainable 'global_step' variable created here.
  """
  with tf.name_scope('optimizer'):
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    minimize_op = adam.minimize(loss, global_step=step_counter)
  return minimize_op
def evaluation(logits, labels):
  """Return the top-1 accuracy of a batch.

  Args:
    logits: Unnormalized class scores, as produced by `inference`.
    labels: Integer class labels, one per example.

  Returns:
    A scalar float16 tensor: the fraction of examples whose label is the
    top-1 prediction. It is also registered as a scalar summary tagged
    '<scope name>/accuracy'.
  """
  with tf.variable_scope('accuracy') as acc_scope:
    hits = tf.nn.in_top_k(logits, labels, 1)
    # NOTE(review): float16 carries only about three significant decimal
    # digits, so the mean is coarse for large batches -- confirm that this
    # precision is intended.
    hit_ratio = tf.reduce_mean(tf.cast(hits, tf.float16))
    summary_tag = acc_scope.name + '/accuracy'
    tf.summary.scalar(summary_tag, hit_ratio)
  return hit_ratio
二、为训练加入断点续训
这部分我没有封装成函数,使用时将原代码中的 run_training 注释掉,改用下面的代码即可
# Training script with checkpoint-based resume.
#
# Builds the training graph, restores the newest checkpoint from the training
# directory when one exists, and trains for the remaining epochs. The
# non-trainable `epoch` variable is stored in every checkpoint, so it records
# how many epochs have been completed.

# ---- Hyper-parameters and paths ----
N_CLASSES = 2
CAPACITY = 2000
learning_rate = 0.0001
tf.reset_default_graph()
train_epochs = 50
BATCH_SIZE = 50
# Batches per epoch; derived from BATCH_SIZE instead of repeating the literal.
total_batch = 25000 // BATCH_SIZE
tfrecords_file = './data/dog_vs_cat.tfrecords'
logs_train_dir = './training'

# ---- Graph construction ----
train_batch, train_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)
train_logits = inference(train_batch, BATCH_SIZE, N_CLASSES)
train_loss = losses(train_logits, train_label_batch)
train_op = trainning(train_loss, learning_rate)
train__acc = evaluation(train_logits, train_label_batch)
epoch = tf.Variable(0, name='epoch', trainable=False)
# Created once, so the graph does not gain a new assign op on every epoch.
increment_epoch = tf.assign_add(epoch, 1)

# ---- Session setup ----
startTime = time()  # start timestamp; not read anywhere in this section
sess = tf.Session()
train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
summary_op = tf.summary.merge_all()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

# ---- Resume from the latest checkpoint, if any ----
ckpt_dir = 'training/'
os.makedirs(ckpt_dir, exist_ok=True)
saver = tf.train.Saver(max_to_keep=1)
ckpt = tf.train.latest_checkpoint(ckpt_dir)
if ckpt is not None:
  saver.restore(sess, ckpt)
else:
  print("Training from scratch.")
start = sess.run(epoch)
print("Training starts from {} epoch.".format(start + 1))
checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')

# ---- Training loop ----
try:
  for ep in range(start, train_epochs):
    stopped = False
    for i in range(total_batch):
      if coord.should_stop():
        stopped = True
        break
      if i % 50 == 0:
        # Fetch the summary in the same run as the training step. A second
        # sess.run would dequeue another batch that is never trained on.
        _, tra_loss, tra_acc, summary_str = sess.run(
            [train_op, train_loss, train__acc, summary_op])
        print('Step %d, train loss = %.2f, train accuracy = %.2f%%' %(i, tra_loss, tra_acc*100.0))
        # Use a step that grows within the epoch so the points do not all
        # land on the same x position in TensorBoard.
        train_writer.add_summary(summary_str, ep * total_batch + i)
      else:
        _, tra_loss, tra_acc = sess.run([train_op, train_loss, train__acc])
    if stopped:
      # The coordinator asked to stop mid-epoch: leave both loops and do not
      # record a partial epoch as completed.
      break
    print("Train epoch:", '%02d' % (ep + 1),
          "Loss=", "{:.6f}".format(tra_loss), "Accuracy=", tra_acc)
    # Advance the counter BEFORE saving, so the checkpoint records this epoch
    # as finished and a resumed run starts with the next one.
    sess.run(increment_epoch)
    saver.save(sess, checkpoint_path, global_step=ep)
except tf.errors.OutOfRangeError:
  print('Done training -- epoch limit reached')
finally:
  coord.request_stop()
  coord.join(threads)
  train_writer.close()
  sess.close()
先训练一段时间然后中断
再次运行继续训练,可以看到训练是从上次中断的位置开始的
三、评价模型在整个 tfrecord 正确率
注意事项
1.BATCH_SIZE 自行设置,根据自己的想法与电脑配置设置
2.CAPACITY 自己设置,根据自己的想法与电脑配置设置
3.Turns 的计算方式:用验证所使用的数据集的图片数量除以 BATCH_SIZE 后取整
4.每次验证得到的总体错误数可能略有偏差:受 CAPACITY、BATCH_SIZE 设置的影响,处理轮数取整后可能无法完整遍历数据集,队列中也可能还残留一部分未处理的图片。CAPACITY、BATCH_SIZE 的设置请自行摸索,实在不会可以自行百度
为了方便,我用训练时的 tfrecord 进行评估,根据前面两篇大家可以自行将 验证集图像转化成 tfrecord 形式进行评估
# Settings for evaluating the whole TFRecord file.
IMG_W = IMG_H = 224
BATCH_SIZE = 1000
CAPACITY = 64
# Number of batches to evaluate: 25000 images split into BATCH_SIZE chunks,
# rounded down.
Turns = 25000 // BATCH_SIZE
tfrecords_file = './data/dog_vs_cat.tfrecords'
def evaluate_tfrecord_all_image():
  """Evaluate the latest checkpoint on `Turns` batches read from the TFRecord file.

  Reads the module-level settings `tfrecords_file`, `BATCH_SIZE`, `CAPACITY`,
  `N_CLASSES` and `Turns`, restores the newest checkpoint found under
  './training/', and prints the per-batch loss and accuracy followed by the
  overall error count and accuracy.

  If no checkpoint is found, a message is printed and the function returns
  without evaluating, because the model variables would be uninitialized.

  Returns:
    None. All results are printed.
  """
  tf.reset_default_graph()
  test_batch, test_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)
  # NOTE(review): inference() as called here still applies dropout (keep
  # probability 0.8), so the accuracy is measured with dropout active and can
  # vary between runs.
  logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
  testloss = losses(logits, test_label_batch)
  testacc = evaluation(logits, test_label_batch)
  logs_train_dir = './training/'
  saver = tf.train.Saver()

  print("Reading checkpoints...")
  ckpt = tf.train.get_checkpoint_state(logs_train_dir)
  if not (ckpt and ckpt.model_checkpoint_path):
    print('No checkpoint file found')
    return

  # The context manager closes the session even if evaluation raises.
  with tf.Session() as sess:
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      wrong_sum = 0
      for i in range(Turns):
        print('INFO:The {} batch has been dealt'.format(i + 1))
        test_loss, test_acc = sess.run([testloss, testacc])
        print('The model\'s loss is %.2f' %test_loss)
        # Round instead of truncating: the accuracy is a rounded float, and
        # int() would drop one correct image whenever it rounded downwards.
        correct = int(round(BATCH_SIZE * float(test_acc)))
        print('Correct : %d' % correct)
        wrong = BATCH_SIZE - correct
        wrong_sum += wrong
        print('Wrong : %d' % wrong)
        print('The accuracy in the batch is %.2f%%' %(test_acc*100.0))
      # Divide by the number of images actually evaluated, so images that
      # were never processed are not counted as correct.
      total_images = Turns * BATCH_SIZE
      total_accuracy = (total_images - wrong_sum) / float(total_images) if total_images else 0.0
      print('------------------*******------------------')
      print('INFO:{} images are detected wrong'.format(wrong_sum))
      print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
    finally:
      coord.request_stop()
      coord.join(threads)
实际效果如下:在全部 25000 张图中,错误 318 张,整体正确率约为 98.73%
当然,有的读者喜欢使用原图,或者已经使用原图进行了训练,那么可以按如下方式用原图进行验证。不过运行起来确实慢了不少:tfrecord 模式全部验证完时,原图模式可能连一个 batch 都还没跑完。最终显示有 592 张判断错误,至于结果为何与上面不一样,请参照上述注意事项
# Settings for evaluating directly on the original image files.
IMG_W = IMG_H = 224
BATCH_SIZE = 1000
CAPACITY = 64
# Number of batches to evaluate: 25000 images split into BATCH_SIZE chunks,
# rounded down.
Turns = 25000 // BATCH_SIZE
def evaluate_all_image():
  """Evaluate the latest checkpoint on `Turns` batches of original image files.

  Lists the images under './data/train1/' with `get_files`, batches them with
  `get_batch` using the module-level `IMG_W`, `IMG_H`, `BATCH_SIZE` and
  `CAPACITY`, restores the newest checkpoint found under './training/', and
  prints the per-batch loss and accuracy followed by the overall error count
  and accuracy.

  If no checkpoint is found, a message is printed and the function returns
  without evaluating, because the model variables would be uninitialized.

  Returns:
    None. All results are printed.
  """
  tf.reset_default_graph()
  test_dir = './data/train1/'
  N_CLASSES = 2
  test, test_label = get_files(test_dir)
  IMAGES_NUM = len(test_label)
  print('There are %d test images totally..' % IMAGES_NUM)
  test_batch, test_label_batch = get_batch(test,
                                           test_label,
                                           IMG_W,
                                           IMG_H,
                                           BATCH_SIZE,
                                           CAPACITY)
  # NOTE(review): inference() as called here still applies dropout (keep
  # probability 0.8), so the accuracy is measured with dropout active and can
  # vary between runs.
  logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
  testloss = losses(logits, test_label_batch)
  testacc = evaluation(logits, test_label_batch)
  logs_train_dir = './training/'
  saver = tf.train.Saver()

  print("Reading checkpoints...")
  ckpt = tf.train.get_checkpoint_state(logs_train_dir)
  if not (ckpt and ckpt.model_checkpoint_path):
    print('No checkpoint file found')
    return

  # The context manager closes the session even if evaluation raises.
  with tf.Session() as sess:
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      wrong_sum = 0
      for i in range(Turns):
        print('INFO:The {} batch has been dealt'.format(i + 1))
        test_loss, test_acc = sess.run([testloss, testacc])
        print('The model\'s loss is %.2f' %test_loss)
        # Round instead of truncating: the accuracy is a rounded float, and
        # int() would drop one correct image whenever it rounded downwards.
        correct = int(round(BATCH_SIZE * float(test_acc)))
        print('Correct : %d' % correct)
        wrong = BATCH_SIZE - correct
        wrong_sum += wrong
        print('Wrong : %d' % wrong)
        print('The accuracy in the batch is %.2f%%' %(test_acc*100.0))
      # Divide by the number of images actually evaluated, so images that
      # were never processed are not counted as correct.
      total_images = Turns * BATCH_SIZE
      total_accuracy = (total_images - wrong_sum) / float(total_images) if total_images else 0.0
      print('------------------*******------------------')
      print('INFO:{} images are detected wrong'.format(wrong_sum))
      print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
    finally:
      coord.request_stop()
      coord.join(threads)
四、查看 TensorBoard
升级后的 TensorBoard 面板显示 accuracy 与 loss 如下
计算图如下
五、最后的注意事项
可能我水平有限,但这确实是我踩过的坑:训练时使用的是 TensorFlow 的标准化函数,如果验证单张图片时使用 feed_dict 方式喂入,容易造成验证结果与预期差距很大。原因是 feed_dict 方式喂入的是图像矩阵,而训练时喂入的是张量;使用其他库对图像进行标准化得到的矩阵,与使用 TensorFlow 内置函数标准化得到的张量(这个张量可以可视化为矩阵)在数值上并不相同。例如,使用 PIL 的 Image 进行标准化得到的图像,就与 TensorFlow 的结果相差很多
完结撒花,准备 GitHub 去了