TensorFlow multi-GPU parallelism

TensorFlow's open-source multi-GPU training example for the CIFAR-10 dataset: cifar10_multi_gpu_train.py

TensorFlow's open-source CIFAR-10 network definition: cifar10.py
Parallelism in TensorFlow falls into two categories: model parallelism and data parallelism. Model parallelism requires a parallelization scheme tailored to each specific model; the main idea is to place different computation nodes of the model on different hardware resources. A more general and easier way to achieve large-scale parallelism is data parallelism, which uses multiple hardware resources to compute the gradients of different batches of data and then aggregates those gradients into a single global update.
Data parallelism applies to almost all deep learning models: multiple GPUs can always train on multiple batches of data at the same time. Each GPU runs the same neural network with the same structure, and the model parameters are shared across GPUs.
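As a minimal, framework-free sketch of one data-parallel update (the names params, batches, and grad_fn are hypothetical placeholders; the full TensorFlow implementation follows below):

import numpy as np

def data_parallel_step(params, batches, grad_fn, lr=0.1):
    # grad_fn(params, batch) returns the gradient of the loss on one batch
    grads = [grad_fn(params, batch) for batch in batches]  # conceptually, one batch per GPU
    avg_grad = np.mean(grads, axis=0)                       # aggregate the per-GPU gradients
    return params - lr * avg_grad                           # one global parameter update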

import os
import re
import time
import numpy as np
import tensorflow as tf
import cifar10_input
import cifar10

batch_size = 128
max_steps = 1000
num_gpus = 1  # number of GPUs


# Build the network inside the given scope and return the total loss for that scope
def tower_loss(scope):
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)  # build the inference graph
    _ = cifar10.loss(logits, labels)  # the loss is added to the 'losses' collection instead of being returned directly
    losses = tf.get_collection('losses', scope)  # gather the losses of the current GPU (scope restricts the collection to this tower)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss


'''
The outer list indexes the towers (GPUs); each inner list holds the (gradient, variable)
pairs computed on one GPU:
tower_grads =
[[(grad0_gpu0, var0_gpu0), (grad1_gpu0, var1_gpu0), ...],
 [(grad0_gpu1, var0_gpu1), (grad1_gpu1, var1_gpu1), ...]]
zip(*tower_grads) effectively transposes this structure, grouping the pairs by variable:
[[(grad0_gpu0, var0_gpu0), (grad0_gpu1, var0_gpu1), ...],
 [(grad1_gpu0, var1_gpu0), (grad1_gpu1, var1_gpu1), ...]]
'''
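# For illustration with hypothetical placeholder names: if
#   tower_grads = [[(g00, v0), (g01, v1)], [(g10, v0), (g11, v1)]]
# then list(zip(*tower_grads)) is
#   [((g00, v0), (g10, v0)), ((g01, v1), (g11, v1))]
# i.e. each inner group now holds all GPUs' gradients for one variable.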


def average_gradients(tower_grads):
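    # Average the gradients variable-by-variable across all towers.
    # Note: this assumes every tower produced a dense gradient for every variable;
    # None gradients or tf.IndexedSlices would need extra handling here.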
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Give each per-GPU gradient a new leading axis, stack them, and average over that axis
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # the variable is shared across towers, so take it from the first tower
        grad_and_var = (grad, grad_and_vars[0][1])
        # [(grad0, var0),(grad1, var1),...]
        average_grads.append(grad_and_var)
    return average_grads


def train():
    # The default compute device for this graph is the CPU
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # [] means a scalar (no dimensions)
        # trainable=False: not added to GraphKeys.TRAINABLE_VARIABLES, so it is not trained
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        num_batches_per_epoch = cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
        # https://tensorflow.google.cn/api_docs/python/tf/train/exponential_decay
        # decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
        # if staircase is True, global_step / decay_steps is an integer division
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        tower_grads = []
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    loss = tower_loss(scope)
                    # Reuse the variables so that all GPUs share exactly the same parameters
                    # (the next tower will reuse them instead of creating new ones)
                    tf.get_variable_scope().reuse_variables()
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)
        grads = average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        # allow_soft_placement=True lets TensorFlow pick an existing, supported device automatically
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        for step in range(max_steps):
            start_time = time.time()
            _, loss_value = sess.run([apply_gradient_op, loss])
            duration = time.time() - start_time

            if step % 10 == 0:
                num_examples_per_step = batch_size * num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / num_gpus

                print('step %d, loss=%.2f (%.1f examples/sec; %.3f sec/batch)'
                      % (step, loss_value, examples_per_sec, sec_per_batch))

            if step % 1000 == 0 or (step + 1) == max_steps:
                saver.save(sess, 'model.ckpt', global_step=step)


if __name__ == '__main__':
    train()
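The saver writes checkpoints named model.ckpt-<step> to the working directory. As a minimal sketch of loading the latest checkpoint back, e.g. for evaluation (assuming the same graph-building code has been run again in the new process, and that '.' is the directory used above):

ckpt = tf.train.latest_checkpoint('.')  # finds the newest model.ckpt-* file
if ckpt:
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, ckpt)
        # run evaluation ops here, e.g. a cifar10_eval-style accuracy computation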
