TensorFlow实现kaggle数据集图像分类Ⅲ——程序改进升级篇

文章目录

说明
一、网络加入drop out，防止过拟合
二、为训练加入断点续训
三、评价模型在整个 tfrecord 正确率

注意事项

三、查看tensorboard
四、最后的注意事项

说明

由于此次代码升级的更改全部在 Google Colab 中完成，而 Google Colab 与大多数传统 Jupyter 环境不同，默认以两个空格作为标准缩进，因此下文代码均使用两个空格缩进

一、网络加入drop out，防止过拟合

需要修改的函数 inference

def _weights_and_biases(shape, stddev, weights_name='weights'):
  """Create one layer's weight and bias variables in the current variable scope.

  Args:
    shape: Shape of the weight variable; its last entry is used as the bias size.
    stddev: Standard deviation of the truncated-normal weight initializer.
    weights_name: Name of the weight variable ('weights' unless overridden).

  Returns:
    A (weights, biases) tuple of float32 variables; biases start at 0.1.
  """
  weights = tf.get_variable(weights_name,
                shape=shape,
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=stddev, dtype=tf.float32))
  biases = tf.get_variable('biases',
                shape=[shape[-1]],
                dtype=tf.float32,
                initializer=tf.constant_initializer(0.1))
  return weights, biases


def inference(images, batch_size, n_classes, keep_prob=0.8):
  """Build the CNN graph and return the un-normalized class scores (logits).

  The network is conv1 -> pool1/lrn -> conv2 -> lrn/pool2 -> local3 -> local4
  -> softmax_linear, with dropout applied to the outputs of local3 and local4.

  Args:
    images: Image batch fed to tf.nn.conv2d; the first filter is 3x3x3x16, so
      the input must have 3 channels.
    batch_size: Number of images per batch; used to flatten the feature maps
      to [batch_size, -1], so it must match the actual batch dimension.
    n_classes: Number of output classes.
    keep_prob: Probability of keeping each activation in the two dropout
      layers. Defaults to 0.8 (the value used for training). Pass 1.0 when
      building an evaluation graph so that no activations are dropped.

  Returns:
    The logits tensor of the final linear layer, shape [batch_size, n_classes].
    No softmax is applied here.
  """
  # conv1: 3x3 convolution, 3 -> 16 channels
  with tf.variable_scope('conv1') as scope:
    weights, biases = _weights_and_biases([3, 3, 3, 16], stddev=0.1)
    conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
    pre_activation = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(pre_activation, name=scope.name)

  # pool1 and norm1: 3x3 max-pool with stride 2, then local response normalization
  with tf.variable_scope('pooling1_lrn') as scope:
    pool1 = tf.nn.max_pool(conv1,
                ksize=[1, 3, 3, 1],
                strides=[1, 2, 2, 1],
                padding='SAME', name='pooling1')
    norm1 = tf.nn.lrn(pool1,
              depth_radius=4,
              bias=1.0,
              alpha=0.001/9.0,
              beta=0.75, name='norm1')

  # conv2: 3x3 convolution, 16 -> 16 channels
  with tf.variable_scope('conv2') as scope:
    weights, biases = _weights_and_biases([3, 3, 16, 16], stddev=0.1)
    conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
    pre_activation = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(pre_activation, name='conv2')

  # norm2 and pool2: normalization comes first here, and the pool uses
  # stride 1, so the spatial size is not reduced by this stage
  with tf.variable_scope('pooling2_lrn') as scope:
    norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001/9.0,
                      beta=0.75, name='norm2')
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                           padding='SAME', name='pooling2')

  # local3: flatten the feature maps and apply a 128-unit fully connected layer
  with tf.variable_scope('local3') as scope:
    reshape = tf.reshape(pool2, shape=[batch_size, -1])
    dim = reshape.get_shape()[1].value
    weights, biases = _weights_and_biases([dim, 128], stddev=0.005)
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    local3_dropout = tf.nn.dropout(local3, keep_prob=keep_prob)

  # local4: second 128-unit fully connected layer
  with tf.variable_scope('local4') as scope:
    weights, biases = _weights_and_biases([128, 128], stddev=0.005)
    local4 = tf.nn.relu(tf.matmul(local3_dropout, weights) + biases, name='local4')
    local4_dropout = tf.nn.dropout(local4, keep_prob=keep_prob)

  # softmax_linear: final linear layer producing the logits. The weight
  # variable keeps its original name 'softmax_linear' so that existing
  # checkpoints can still be restored.
  with tf.variable_scope('softmax_linear') as scope:
    weights, biases = _weights_and_biases([128, n_classes], stddev=0.005,
                                          weights_name='softmax_linear')
    softmax_linear = tf.add(tf.matmul(local4_dropout, weights), biases, name='softmax_linear')

  return softmax_linear


def losses(logits, labels):
  """Return the mean sparse softmax cross-entropy of a batch.

  Args:
    logits: Un-normalized class scores, as returned by inference().
    labels: Integer class labels passed to
      tf.nn.sparse_softmax_cross_entropy_with_logits.

  Returns:
    A scalar loss tensor. A scalar summary tagged '<scope>/loss' is also
    registered for TensorBoard.
  """
  with tf.variable_scope('loss') as scope:
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='xentropy_per_example')
    loss = tf.reduce_mean(per_example, name='loss')
    summary_tag = scope.name+'/loss'
    tf.summary.scalar(summary_tag, loss)
  return loss


def trainning(loss, learning_rate):
  """Create the Adam training op that minimizes `loss`.

  Args:
    loss: Scalar loss tensor to minimize.
    learning_rate: Learning rate handed to tf.train.AdamOptimizer.

  Returns:
    The op returned by optimizer.minimize(); running it performs one update
    and increments the non-trainable 'global_step' variable created here.
  """
  with tf.name_scope('optimizer'):
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    return adam.minimize(loss, global_step=step_counter)


def evaluation(logits, labels):
  """Return the top-1 accuracy of a batch.

  Args:
    logits: Un-normalized class scores, as returned by inference().
    labels: Integer class labels, compared against the top-1 prediction with
      tf.nn.in_top_k.

  Returns:
    A scalar float32 tensor holding the fraction of correctly classified
    examples. A scalar summary tagged '<scope>/accuracy' is also registered
    for TensorBoard.
  """
  with tf.variable_scope('accuracy') as scope:
    correct = tf.nn.in_top_k(logits, labels, 1)
    # float32 rather than float16: half precision has only about 3 significant
    # decimal digits and cannot count more than 2048 hits exactly, which makes
    # the accuracy (and any count derived from it) unreliable for large batches.
    correct = tf.cast(correct, tf.float32)
    accuracy = tf.reduce_mean(correct)
    tf.summary.scalar(scope.name+'/accuracy', accuracy)
  return accuracy

二、为训练加入断点续训

我没有包装成函数的形式，将原代码中的 run_training 注释即可

# Training script with checkpoint-based resume.
# Builds the training graph, restores the latest checkpoint from 'training/'
# if one exists, and trains from the stored epoch up to `train_epochs`.

N_CLASSES = 2
CAPACITY = 2000
learning_rate = 0.0001

tf.reset_default_graph()

train_epochs = 50
BATCH_SIZE = 50
# Batches per epoch: the 25000-image dataset split into BATCH_SIZE chunks.
total_batch = 25000 // BATCH_SIZE


tfrecords_file = './data/dog_vs_cat.tfrecords'
logs_train_dir = './training'
checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')


train_batch, train_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)
train_logits = inference(train_batch, BATCH_SIZE, N_CLASSES)
train_loss = losses(train_logits, train_label_batch)
train_op = trainning(train_loss, learning_rate)
train__acc = evaluation(train_logits, train_label_batch)

# Number of completed epochs; stored in the checkpoint so training can resume.
epoch = tf.Variable(0, name='epoch', trainable=False)
# Build the update op once. Calling epoch.assign() inside the training loop
# would add a new op to the graph on every epoch.
next_epoch = tf.placeholder(tf.int32, shape=[], name='next_epoch')
update_epoch = epoch.assign(next_epoch)

startTime = time()

sess = tf.Session()

train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
summary_op = tf.summary.merge_all()

sess.run(tf.global_variables_initializer())

coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

ckpt_dir = 'training/'
if not os.path.exists(ckpt_dir):
  os.makedirs(ckpt_dir)

saver = tf.train.Saver(max_to_keep = 1)

# Restore overrides the freshly initialized values, including `epoch`.
ckpt = tf.train.latest_checkpoint(ckpt_dir)
if ckpt is not None:
  saver.restore(sess, ckpt)
else:
  print("Training from scratch.")


start = sess.run(epoch)
print("Training starts from {} epoch.".format(start + 1))


try:
  for ep in range(start, train_epochs):
    for i in range(total_batch):
      if coord.should_stop():
        break
      _, tra_loss, tra_acc = sess.run([train_op, train_loss, train__acc])
      if i % 50 == 0:
        print('Step %d, train loss = %.2f, train accuracy = %.2f%%' %(i, tra_loss, tra_acc*100.0))

    # Do not log or checkpoint an epoch that was cut short by a stop request.
    if coord.should_stop():
      break

    summary_str = sess.run(summary_op)
    train_writer.add_summary(summary_str, ep)

    # Record the finished epoch BEFORE saving, so that the checkpoint stores
    # ep + 1 and a resumed run starts with the next epoch instead of
    # repeating the one that has just been completed.
    sess.run(update_epoch, feed_dict={next_epoch: ep + 1})

    print("Train epoch:", '%02d' % (ep + 1),
        "Loss=", "{:.6f}".format(tra_loss), "Accuracy=", tra_acc)
    saver.save(sess, checkpoint_path, global_step=ep)
except tf.errors.OutOfRangeError:
  print('Done training -- epoch limit reached')
finally:
  # Always release the queue threads, the summary writer and the session,
  # including when training is interrupted manually.
  coord.request_stop()
  try:
    coord.join(threads)
  finally:
    train_writer.close()
    sess.close()

先训练一段时间然后中断

再次运行训练代码，可以看到训练从上一次中断的位置继续进行

三、评价模型在整个 tfrecord 正确率

注意事项

1.BATCH_SIZE 自行设置，根据自己的想法与电脑配置设置
2.CAPACITY 自己设置，根据自己的想法与电脑配置设置
3.Turns 的计算方法：用验证所使用的数据集图片数量除以 BATCH_SIZE 后取整
4.每次验证得到的总体错误数可能略有偏差：受 CAPACITY、BATCH_SIZE 设置的影响，处理的轮数取整后可能无法完整遍历整个数据集，队列中也可能仍残留一部分未被读取的图片。CAPACITY、BATCH_SIZE 的具体取值请自行摸索，实在不会可以自行百度

为了方便，我用训练时的 tfrecord 进行评估，根据前面两篇大家可以自行将验证集图像转化成 tfrecord 形式进行评估

# Settings for evaluating the model on the whole tfrecord file.
IMG_W, IMG_H = 224, 224
BATCH_SIZE = 1000
CAPACITY = 64
# Number of batches to evaluate: 25000 images split into BATCH_SIZE chunks.
Turns = 25000 // BATCH_SIZE

tfrecords_file = './data/dog_vs_cat.tfrecords'
def evaluate_tfrecord_all_image():
  """Evaluate the latest checkpoint on `Turns` batches read from the tfrecord file.

  Rebuilds the graph, restores the newest checkpoint found in './training/',
  runs `Turns` batches of `BATCH_SIZE` images and prints the per-batch loss,
  the per-batch correct/wrong counts and the overall accuracy. If no
  checkpoint is found, a message is printed and nothing is evaluated.

  Reads the module-level names tfrecords_file, BATCH_SIZE, CAPACITY,
  N_CLASSES and Turns.
  """
  tf.reset_default_graph()
  test_batch, test_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)

  # NOTE(review): inference() applies dropout to its fully connected layers,
  # so results can vary slightly between runs.
  logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
  testloss = losses(logits, test_label_batch)
  testacc = evaluation(logits, test_label_batch)

  logs_train_dir = './training/'
  saver = tf.train.Saver()
  sess = tf.Session()
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)

  try:
    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(logs_train_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      # Without restored weights the variables are uninitialized, so running
      # the graph would only fail; stop here instead.
      print('No checkpoint file found')
      return
    # Checkpoint files are named '<prefix>-<step>'; the suffix is the step.
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    wrong_sum = 0
    for i in range(Turns):
      print('INFO:The {} batch has been dealt'.format(i + 1))
      test_loss,test_acc = sess.run([testloss,testacc])
      print('The model\'s loss is %.2f' %test_loss)
      # Round instead of truncating: the accuracy is a float, so
      # BATCH_SIZE * accuracy can land slightly below the true integer count.
      correct = int(round(BATCH_SIZE*test_acc))
      print('Correct : %d' % correct)
      wrong = BATCH_SIZE - correct
      wrong_sum += wrong
      print('Wrong : %d' % wrong)
      print('The accuracy in the batch is %.2f%%' %(test_acc*100.0))

    # Use the number of images actually evaluated as the denominator.
    total_images = Turns * BATCH_SIZE
    total_accuracy = (total_images - wrong_sum) / float(total_images) if total_images else 0.0
    print('------------------*******------------------')
    print('INFO:{} images are detected wrong'.format(wrong_sum))
    print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
  finally:
    # Always stop the queue threads and release the session.
    coord.request_stop()
    try:
      coord.join(threads)
    finally:
      sess.close()

实际效果如下：在全部 25000 张图中，判断错误的有 318 张，整体正确率大约为 98.73%

当然，有的读者喜欢使用原图，或者已经使用原图进行了训练，那么使用原图进行验证的代码如下。不过运行起来确实慢了不少：tfrecord 模式全部验证完时，原图模式可能连一个 batch 都还没跑完。最终显示有 592 张判断错误；至于为何两次结果不一样，请参照上述注意事项

# Settings for evaluating the model on the raw image files.
IMG_W, IMG_H = 224, 224
BATCH_SIZE = 1000
CAPACITY = 64
# Number of batches to evaluate: 25000 images split into BATCH_SIZE chunks.
Turns = 25000 // BATCH_SIZE


def evaluate_all_image():
  """Evaluate the latest checkpoint on `Turns` batches of raw image files.

  Rebuilds the graph, reads the images listed by get_files('./data/train1/'),
  restores the newest checkpoint found in './training/', runs `Turns` batches
  of `BATCH_SIZE` images and prints the per-batch loss, the per-batch
  correct/wrong counts and the overall accuracy. If no checkpoint is found,
  a message is printed and nothing is evaluated.

  Reads the module-level names IMG_W, IMG_H, BATCH_SIZE, CAPACITY and Turns.
  """
  tf.reset_default_graph()

  test_dir = './data/train1/'
  N_CLASSES = 2
  test, test_label = get_files(test_dir)
  IMAGES_NUM = len(test_label)
  print('There are %d test images totally..' % IMAGES_NUM)
  test_batch, test_label_batch = get_batch(test,
                        test_label,
                        IMG_W,
                        IMG_H,
                        BATCH_SIZE,
                        CAPACITY)

  # NOTE(review): inference() applies dropout to its fully connected layers,
  # so results can vary slightly between runs.
  logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
  testloss = losses(logits, test_label_batch)
  testacc = evaluation(logits, test_label_batch)

  logs_train_dir = './training/'
  saver = tf.train.Saver()
  sess = tf.Session()
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)

  try:
    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(logs_train_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      # Without restored weights the variables are uninitialized, so running
      # the graph would only fail; stop here instead.
      print('No checkpoint file found')
      return
    # Checkpoint files are named '<prefix>-<step>'; the suffix is the step.
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    wrong_sum = 0
    for i in range(Turns):
      print('INFO:The {} batch has been dealt'.format(i + 1))
      test_loss,test_acc = sess.run([testloss,testacc])
      print('The model\'s loss is %.2f' %test_loss)
      # Round instead of truncating: the accuracy is a float, so
      # BATCH_SIZE * accuracy can land slightly below the true integer count.
      correct = int(round(BATCH_SIZE*test_acc))
      print('Correct : %d' % correct)
      wrong = BATCH_SIZE - correct
      wrong_sum += wrong
      print('Wrong : %d' % wrong)
      print('The accuracy in the batch is %.2f%%' %(test_acc*100.0))

    # Use the number of images actually evaluated as the denominator.
    total_images = Turns * BATCH_SIZE
    total_accuracy = (total_images - wrong_sum) / float(total_images) if total_images else 0.0
    print('------------------*******------------------')
    print('INFO:{} images are detected wrong'.format(wrong_sum))
    print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
  finally:
    # Always stop the queue threads and release the session, also on errors.
    coord.request_stop()
    try:
      coord.join(threads)
    finally:
      sess.close()

三、查看tensorboard

升级后的 Tensorboard 面板显示 accuracy 与 loss 如下

计算图如下

四、最后的注意事项

可能我水平有限，但下面确实是我踩过的坑：训练时使用的是 TensorFlow 内置的标准化函数，如果验证单张图片时使用 feed_dict 方式喂入，则容易造成验证结果与预期差距很大。这是因为 feed_dict 方式喂入的是图像矩阵，而训练时喂入的是张量；使用其他库对图像进行标准化得到的矩阵，与使用 TensorFlow 内置函数标准化得到的张量（这个张量可以可视化为矩阵）在数值上并不相同。例如，使用 PIL 的 Image 进行标准化得到的图像，就与 TensorFlow 的结果相差很大