An outline of the main steps for designing, training, and evaluating a convolutional neural network with TensorFlow (using the CIFAR-10 dataset)


Contents

I. Preparation

II. Training the model

1. Loading the training data

2. Building the convolutional network model

1) Layer 1: convolutional layer (convolution -> pooling -> normalization)

2) Layer 2: convolutional layer (convolution -> normalization -> pooling)

3) Layer 3: fully connected layer

4) Layer 4: fully connected layer

5) Layer 5: output layer

3. Computing the loss

4. Training the model

5. Launching training

III. Model evaluation

1. Loading the data to evaluate

2. Restoring the trained model parameters

3. Using the constructed model

4. Making predictions

5. Launching evaluation


I. Preparation

Download the CIFAR-10 dataset.

Download page: https://www.cs.toronto.edu/~kriz/cifar.html

cifar10.py contains a function that downloads and extracts the dataset:

# Excerpt from cifar10.py; it assumes os, sys, tarfile and urllib.request are
# available at module level, and that FLAGS.data_dir and DATA_URL are defined there.
def maybe_download_and_extract():
  """Download and extract the tarball from Alex's website."""
  dest_directory = FLAGS.data_dir
  if not os.path.exists(dest_directory):
    os.makedirs(dest_directory)
  filename = DATA_URL.split('/')[-1]
  filepath = os.path.join(dest_directory, filename)
  if not os.path.exists(filepath):
    def _progress(count, block_size, total_size):
      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
          float(count * block_size) / float(total_size) * 100.0))
      sys.stdout.flush()
    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
    print()
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
  extracted_dir_path = os.path.join(dest_directory, 'cifar-10-batches-bin')
  if not os.path.exists(extracted_dir_path):
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
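
In the tutorial scripts this function is typically called once at startup, before any training or evaluation begins; a minimal usage sketch (the main() wrapper here is illustrative, not part of the excerpt above):

import tensorflow as tf
import cifar10

def main(argv=None):
    # Fetch and unpack CIFAR-10 into FLAGS.data_dir if it is not already there.
    cifar10.maybe_download_and_extract()

if __name__ == '__main__':
    tf.app.run()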

II. Training the model

1. Loading the training data

def distorted_inputs():
  """Construct distorted input for CIFAR training using the Reader ops.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir
  """
  if not FLAGS.data_dir:
    raise ValueError('Please supply a data_dir')
  data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
  # Apply random distortions to the training images; implemented in cifar10_input.py.
  images, labels = cifar10_input.distorted_inputs(data_dir=data_dir,
                                                  batch_size=FLAGS.batch_size)
  return images, labels

Before the images are fed to the model they go through some preprocessing, such as restructuring the raw data and whitening; the details live in cifar10_input.py.

This step outputs the samples (images) and the labels (labels) separately, for use in the following steps.
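
As an illustration of the kind of per-image distortion cifar10_input.py applies, here is a minimal sketch, not the exact tutorial code; the 24x24 crop size and the jitter parameters are typical values and should be treated as assumptions:

import tensorflow as tf

def distort_image(image):
    # image: a [32, 32, 3] float tensor decoded from the CIFAR-10 binaries.
    distorted = tf.random_crop(image, [24, 24, 3])           # random 24x24 crop
    distorted = tf.image.random_flip_left_right(distorted)   # random horizontal flip
    distorted = tf.image.random_brightness(distorted, max_delta=63)
    distorted = tf.image.random_contrast(distorted, lower=0.2, upper=1.8)
    # "Whitening": subtract the mean and divide by the standard deviation per image.
    return tf.image.per_image_standardization(distorted)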

2. Building the convolutional network model

The input to this step is the images tensor produced by the previous step.

The overall structure of the model is as follows: two convolutional layers (each with max pooling and local response normalization), followed by two fully connected layers and a linear output layer:

1) Layer 1: convolutional layer (convolution -> pooling -> normalization)

with tf.variable_scope('conv1') as scope:  # give the layer's variables their own name scope
    kernel = _variable_with_weight_decay('weights',
                                         shape=[5, 5, 3, 64],
                                         stddev=5e-2,
                                         wd=0.0)
    conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
    # tf.nn.bias_add(value, bias, name=None): bias must be a 1-D tensor whose size
    # matches the last dimension of value; here the last dimension of conv is the
    # number of kernels, 64.
    pre_activation = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(pre_activation, name=scope.name)  # ReLU activation
    _activation_summary(conv1)  # summary helper defined in cifar10.py

# pool1
# tf.nn.max_pool(value, ksize, strides, padding, name=None): ksize is the pooling
# window over [batch, height, width, channels]; it is usually [1, height, width, 1]
# because we do not want to pool over the batch or channel dimensions, so those
# two are set to 1.
pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                       padding='SAME', name='pool1')
# norm1: local response normalization (tf.nn.local_response_normalization);
# see https://blog.csdn.net/sinat_21585785/article/details/75087768
norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                  name='norm1')

Max pooling operates within each channel (the feature map produced by a single kernel), whereas local response normalization operates across neighbouring channels.
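
To see the effect on tensor shapes, here is a standalone sketch; the 24x24 crop size and batch size 128 are the tutorial's usual defaults and are assumed here:

import tensorflow as tf

images = tf.zeros([128, 24, 24, 3])   # a dummy batch of distorted crops
kernel = tf.zeros([5, 5, 3, 64])      # 64 kernels of size 5x5x3, as above
conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
pool = tf.nn.max_pool(conv, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
norm = tf.nn.lrn(pool, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)

print(conv.shape)   # (128, 24, 24, 64): SAME padding with stride 1 keeps the spatial size
print(pool.shape)   # (128, 12, 12, 64): pooling halves height and width within each channel
print(norm.shape)   # (128, 12, 12, 64): LRN normalizes across neighbouring channels, shape unchanged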

2) Layer 2: convolutional layer (convolution -> normalization -> pooling)

with tf.variable_scope('conv2') as scope:
    kernel = _variable_with_weight_decay('weights',
                                         shape=[5, 5, 64, 64],
                                         stddev=5e-2,
                                         wd=0.0)
    conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
    pre_activation = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(pre_activation, name=scope.name)
    _activation_summary(conv2)

# norm2
norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                  name='norm2')
# pool2
pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                       strides=[1, 2, 2, 1], padding='SAME', name='pool2')

3) Layer 3: fully connected layer

with tf.variable_scope('local3') as scope:
    # Move everything into depth so we can perform a single matrix multiply.
    reshape = tf.reshape(pool2, [FLAGS.batch_size, -1])
    dim = reshape.get_shape()[1].value
    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    _activation_summary(local3)
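
With the usual 24x24 crops (an assumption carried over from the input pipeline), pool2 has shape [batch_size, 6, 6, 64] after the two stride-2 poolings, so dim comes out to 6 * 6 * 64 = 2304; a quick standalone check:

height = width = 24
for _ in range(2):            # two max-pool layers, stride 2, SAME padding
    height = (height + 1) // 2
    width = (width + 1) // 2
dim = height * width * 64     # number of inputs to the first fully connected layer
print(dim)                    # 2304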

4) Layer 4: fully connected layer

with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
    local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
    _activation_summary(local4)

5) Layer 5: output layer

# linear layer (WX + b)
# We don't apply softmax here because
# tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
# and performs the softmax internally for efficiency.
with tf.variable_scope('softmax_linear') as scope:
    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                          stddev=1/192.0, wd=0.0)
    biases = _variable_on_cpu('biases', [NUM_CLASSES],
                              tf.constant_initializer(0.0))
    softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
    _activation_summary(softmax_linear)

The output of this step is softmax_linear, the output of the last layer of the network.

3. Computing the loss

The inputs to this step are the softmax_linear output from step 2 and the labels from step 1.

def loss(logits, labels):
  """Add L2Loss to all the trainable variables.

  Add summary for "Loss" and "Loss/avg".
  Args:
    logits: Logits from inference().
    labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

  Returns:
    Loss tensor of type float.
  """
  # Calculate the average cross entropy loss across the batch.
  labels = tf.cast(labels, tf.int64)  # cast labels to the integer type expected by sparse_softmax_cross_entropy_with_logits
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits, name='cross_entropy_per_example')
  cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
  tf.add_to_collection('losses', cross_entropy_mean)  # add the mean cross entropy to the 'losses' collection

  # The total loss is defined as the cross entropy loss plus all of the weight
  # decay terms (L2 loss).
  return tf.add_n(tf.get_collection('losses'), name='total_loss')  # sum everything in the 'losses' collection: cross entropy plus the L2 weight-decay terms

The output of this step is the total loss.
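
The weight-decay terms summed above are placed in the 'losses' collection when the layer weights are created with _variable_with_weight_decay; here is a minimal sketch of how such a helper can do this (the real implementation lives in cifar10.py and may differ in its details):

def _variable_with_weight_decay(name, shape, stddev, wd):
    # Create the variable; if a weight-decay coefficient wd is given, also add
    # wd * l2_loss(var) to the same 'losses' collection that loss() sums over.
    var = tf.get_variable(name, shape,
                          initializer=tf.truncated_normal_initializer(stddev=stddev))
    if wd is not None and wd > 0:
        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var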

4. Training the model

The inputs to this step are the total loss from the previous step and a global step counter.

def train(total_loss, global_step):
  """Train CIFAR-10 model.

  Create an optimizer and apply to all trainable variables. Add moving
  average for all trainable variables.

  Args:
    total_loss: Total loss from loss().
    global_step: Integer Variable counting the number of training steps
      processed.
  Returns:
    train_op: op for training.
  """
  # Variables that affect learning rate.
  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)  # the learning rate decays once every decay_steps training steps

  # Decay the learning rate exponentially based on the number of steps; lr is the decayed rate.
  # global_step is a global counter incremented once per training step; together with
  # decay_steps it drives the schedule.
  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,         # 0.1
                                  global_step,
                                  decay_steps,
                                  LEARNING_RATE_DECAY_FACTOR,    # 0.1
                                  staircase=True)  # after every decay_steps steps the rate is multiplied by the decay factor
  tf.summary.scalar('learning_rate', lr)

  # Generate moving averages of all losses and associated summaries.
  loss_averages_op = _add_loss_summaries(total_loss)

  # Compute gradients. tf.control_dependencies is a context manager that controls
  # execution order: the ops created inside run only after loss_averages_op.
  with tf.control_dependencies([loss_averages_op]):
    opt = tf.train.GradientDescentOptimizer(lr)  # gradient-descent optimizer used to update the variables
    grads = opt.compute_gradients(total_loss)    # gradients of the total loss w.r.t. the trainable variables

  # Apply gradients: returns the op that performs one gradient-update step.
  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)


  # Track the moving averages of all trainable variables.
  # 1 - MOVING_AVERAGE_DECAY acts as the update rate of the averages.
  variable_averages = tf.train.ExponentialMovingAverage(
      MOVING_AVERAGE_DECAY, global_step)
  # apply() creates a shadow copy of each trainable variable and returns the op that
  # updates its moving average; evaluating with these averages usually works better
  # than using the raw final parameters.
  variables_averages_op = variable_averages.apply(tf.trainable_variables())

  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    # train_op runs only after the gradient update and the moving-average update have
    # finished; tf.no_op does nothing itself and simply groups the dependencies.
    train_op = tf.no_op(name='train')

  return train_op

The output of this step is train_op, a single op that performs one complete training step of the model.
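
To make the decay schedule concrete, assuming the tutorial's usual defaults (50,000 training images per epoch, batch size 128, NUM_EPOCHS_PER_DECAY = 350, initial rate 0.1, decay factor 0.1), here is a plain-Python sketch of the staircase schedule:

num_batches_per_epoch = 50000 / 128              # about 390.6 steps per epoch
decay_steps = int(num_batches_per_epoch * 350)   # about 136,718 steps between decays

def decayed_lr(global_step):
    # staircase=True: the exponent is the integer number of completed decay periods.
    return 0.1 * (0.1 ** (global_step // decay_steps))

print(decayed_lr(0))        # 0.1
print(decayed_lr(200000))   # roughly 0.01, after the first decay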

5. Launching training

The input to this step is the train_op returned by the previous step.
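
Before the monitored session below is started, train_op has to be wired to the earlier steps; a minimal sketch of that glue, based on the tutorial's training script (cifar10_train.py); the exact structure there may differ slightly:

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()

    images, labels = cifar10.distorted_inputs()   # step 1: load distorted training data
    logits = cifar10.inference(images)            # step 2: build the model
    loss = cifar10.loss(logits, labels)           # step 3: compute the loss
    train_op = cifar10.train(loss, global_step)   # step 4: create the training op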

class _LoggerHook(tf.train.SessionRunHook):  # subclass of tf.train.SessionRunHook
  """Logs loss and runtime."""

  def begin(self):
    self._step = -1
    self._start_time = time.time()

  def before_run(self, run_context):
    self._step += 1
    return tf.train.SessionRunArgs(loss)  # ask the session to also return the loss value

  def after_run(self, run_context, run_values):
    if self._step % FLAGS.log_frequency == 0:
      current_time = time.time()
      duration = current_time - self._start_time
      self._start_time = current_time

      loss_value = run_values.results
      examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
      sec_per_batch = float(duration / FLAGS.log_frequency)

      format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
      print(format_str % (datetime.now(), self._step, loss_value,
                          examples_per_sec, sec_per_batch))

# MonitoredTrainingSession returns a MonitoredSession object.
with tf.train.MonitoredTrainingSession(
    checkpoint_dir=FLAGS.train_dir,  # directory where checkpoints are saved and restored
    hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),  # stop at the requested step
           tf.train.NanTensorHook(loss),                        # stop (or fail) if the loss becomes NaN
           _LoggerHook()],                                      # the logging hook defined above
    config=tf.ConfigProto(  # session configuration, as for the config argument of tf.Session
        log_device_placement=FLAGS.log_device_placement)) as mon_sess:
  while not mon_sess.should_stop():
    mon_sess.run(train_op)

The output of this step is the trained model parameters, saved as checkpoints in the checkpoint_dir directory. With that, training of the model is complete.

III. Model evaluation

1. Loading the data to evaluate

eval_data = FLAGS.eval_data == 'test'
images, labels = cifar10.inputs(eval_data=eval_data)  # read the evaluation images and labels

2. Restoring the trained model parameters

# Restore the moving-average versions of the learned variables for evaluation.
variable_averages = tf.train.ExponentialMovingAverage(
    cifar10.MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
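
The saver is then pointed at the checkpoints written during training; a minimal sketch of the restore step (the checkpoint_dir flag name is assumed to match the evaluation script's configuration):

with tf.Session() as sess:
    # Find the latest checkpoint written by training and restore the moving-average
    # (shadow) values of the variables into the evaluation graph.
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)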

3. Using the constructed model

logits = cifar10.inference(images)

4. Making predictions

# Calculate predictions.
# tf.nn.in_top_k(predictions, targets, k, name=None) returns a boolean tensor saying
# whether each target label is among the top-k predictions.
top_k_op = tf.nn.in_top_k(logits, labels, 1)
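
A toy illustration (made-up logits and labels) of what tf.nn.in_top_k with k=1 returns:

import tensorflow as tf

# Three examples, four classes; the true labels are 1, 0 and 2.
logits = tf.constant([[0.1, 0.8, 0.05, 0.05],
                      [0.3, 0.2, 0.4, 0.1],
                      [0.6, 0.1, 0.2, 0.1]])
labels = tf.constant([1, 0, 2])
correct = tf.nn.in_top_k(logits, labels, 1)

with tf.Session() as sess:
    print(sess.run(correct))  # [ True False False]: only the first prediction is right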

5. Launching evaluation

num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
true_count = 0  # counts correct predictions
total_sample_count = num_iter * FLAGS.batch_size
step = 0
while step < num_iter and not coord.should_stop():
    predictions = sess.run([top_k_op])
    true_count += np.sum(predictions)  # accumulate the number of correct top-1 predictions
    step += 1
precision = true_count / total_sample_count

That completes the basic end-to-end workflow, from building the model to training it to making predictions.
