1、多GPU原理
单GPU时,思路很简单,前向、后向都在一个GPU上进行,模型参数更新时只涉及一个GPU。多GPU时,有模型并行和数据并行两种情况。模型并行指模型的不同部分在不同GPU上运行。数据并行指不同GPU上训练数据不同,但模型是同一个(相当于是同一个模型的副本)。TensorFlow支持的是数据并行。数据并行的原理:CPU负责梯度平均和参数更新,在GPU上训练模型的副本。多GPU并行计算的过程:
1)模型副本定义在指定的GPU/CPU上;
2)对于每一个GPU, 都是从CPU获得数据,前向传播进行计算,得到loss,并计算出梯度;
3)CPU接到GPU的梯度,取平均值,然后进行梯度更新。
这个在tf的实现思路如下:
模型参数保存在一个指定gpu/cpu上,模型参数的副本在不同gpu上,每次训练,提供batch_size*gpu_num数据,并等量拆分成多个batch,分别送入不同GPU。前向在不同gpu上进行,模型参数更新时,将多个GPU后向计算得到的梯度数据进行平均,并在指定GPU/CPU上利用梯度数据更新模型参数。假设有两个GPU(gpu0,gpu1),模型参数实际存放在cpu0上,实际一次训练过程如下图所示:
2、model_deploy.py文件及其用法
为了能让一个Slim模型在多个GPU上训练更加容易,这个模块提供了一系列帮助函数,比如create_clones()、optimize_clones()、deploy()、gather_clone_loss()、_add_gradients_summaries()、_sum_clones_gradients()等,该模块位于:https://github.com/tensorflow/models/blob/master/research/slim/deployment/model_deploy.py
详细步骤:
(1)创建DeploymentConfig对象:config = model_deploy.DeploymentConfig()
Deployment类定义的源码如下:
class DeploymentConfig(object):
    """Configuration describing how to deploy a model on multiple machines/GPUs.

    On each machine the model will be replicated `num_clones` times (one clone
    per GPU, or on the CPU if `clone_on_cpu` is True).
    """

    def __init__(self,
                 num_clones=1,
                 clone_on_cpu=False,
                 replica_id=0,
                 num_replicas=1,
                 num_ps_tasks=0,
                 worker_job_name='worker',
                 ps_job_name='ps'):
        """Creates a DeploymentConfig.

        Args:
          num_clones: Number of model clones to deploy per machine (i.e. how
            many GPUs to use on each worker).
          clone_on_cpu: If True, each clone is placed on the CPU instead of a
            GPU.
          replica_id: Integer index of the machine (replica) this config is
            deployed on; usually 0 for single-machine training.
          num_replicas: Number of machines to use; 1 means single-machine
            deployment, in which case `worker_device`, `num_ps_tasks` and
            `ps_device` are effectively ignored.
          num_ps_tasks: Number of tasks in the 'ps' (parameter server) job;
            0 means no parameter server is used.
          worker_job_name: Name of the worker job, default 'worker'.
          ps_job_name: Name of the parameter server job, default 'ps'.

        Raises:
          ValueError: If the arguments are inconsistent (replicas without a
            parameter server, missing job names, or replica_id out of range).
        """
        if num_replicas > 1:
            if num_ps_tasks < 1:
                raise ValueError('When using replicas num_ps_tasks must be positive')
        if num_replicas > 1 or num_ps_tasks > 0:
            if not worker_job_name:
                raise ValueError('Must specify worker_job_name when using replicas')
            if not ps_job_name:
                raise ValueError('Must specify ps_job_name when using parameter server')
        if replica_id >= num_replicas:
            raise ValueError('replica_id must be less than num_replicas')
        self._num_clones = num_clones
        self._clone_on_cpu = clone_on_cpu
        self._replica_id = replica_id
        self._num_replicas = num_replicas
        self._num_ps_tasks = num_ps_tasks
        # Job-device prefixes are only meaningful in the distributed (PS) case;
        # with no parameter server both prefixes stay empty.
        self._ps_device = '/job:' + ps_job_name if num_ps_tasks > 0 else ''
        self._worker_device = '/job:' + worker_job_name if num_ps_tasks > 0 else ''

    @property
    def num_clones(self):
        return self._num_clones

    @property
    def clone_on_cpu(self):
        return self._clone_on_cpu

    @property
    def replica_id(self):
        return self._replica_id

    @property
    def num_replicas(self):
        return self._num_replicas

    @property
    def num_ps_tasks(self):
        return self._num_ps_tasks

    @property
    def ps_device(self):
        return self._ps_device

    @property
    def worker_device(self):
        return self._worker_device

    def caching_device(self):
        """Device policy for caching variables.

        Returns:
          None if variables do not need to be cached, otherwise a function
          mapping an op to the device where its variable value is cached.
        """
        if self._num_ps_tasks > 0:
            # With a parameter server, cache the variable on the device of the
            # op that reads it to avoid repeated remote fetches.
            return lambda op: op.device
        else:
            return None

    def clone_device(self, clone_index):
        """Returns the device on which to build the clone with index `clone_index`.

        Args:
          clone_index: Integer index of the clone.

        Returns:
          A value suitable for `tf.device()`.

        Raises:
          ValueError: If `clone_index` >= `num_clones`.
        """
        if clone_index >= self._num_clones:
            raise ValueError('clone_index must be less than num_clones')
        device = ''
        if self._num_ps_tasks > 0:
            device += self._worker_device
        if self._clone_on_cpu:
            device += '/device:CPU:0'
        else:
            device += '/device:GPU:%d' % clone_index
        return device

    def clone_scope(self, clone_index):
        """Returns the name scope for the clone with index `clone_index`.

        Args:
          clone_index: Integer index of the clone.

        Returns:
          A value suitable for `tf.name_scope()` ('' for a single clone).

        Raises:
          ValueError: If `clone_index` >= `num_clones`.
        """
        if clone_index >= self._num_clones:
            raise ValueError('clone_index must be less than num_clones')
        scope = ''
        if self._num_clones > 1:
            scope = 'clone_%d' % clone_index
        return scope

    def optimizer_device(self):
        """Device on which parameter updates run; the CPU on a single machine.

        Returns:
          A value suitable for `tf.device()`.
        """
        if self._num_ps_tasks > 0 or self._num_clones > 0:
            return self._worker_device + '/device:CPU:0'
        else:
            return ''

    def inputs_device(self):
        """Device on which to build the inputs (data reading); the CPU on a single machine.

        Returns:
          A value suitable for `tf.device()`.
        """
        device = ''
        if self._num_ps_tasks > 0:
            device += self._worker_device
        device += '/device:CPU:0'
        return device

    def variables_device(self):
        """Device on which to create the model's variables; the CPU on a single machine.

        Returns:
          A value suitable for `tf.device()`: a fixed device string when no
          parameter server is used, otherwise a chooser function that
          round-robins Variable ops across the ps tasks.
        """
        device = ''
        if self._num_ps_tasks > 0:
            device += self._ps_device
        device += '/device:CPU:0'

        class _PSDeviceChooser(object):
            """Slim device chooser for variables when using PS."""

            def __init__(self, device, tasks):
                self._device = device
                self._tasks = tasks
                self._task = 0  # next ps task to assign, round-robin

            def choose(self, op):
                # Respect an explicit placement the op already has.
                if op.device:
                    return op.device
                node_def = op if isinstance(op, tf.NodeDef) else op.node_def
                if node_def.op.startswith('Variable'):
                    t = self._task
                    self._task = (self._task + 1) % self._tasks
                    d = '%s/task:%d' % (self._device, t)
                    return d
                else:
                    return op.device

        if not self._num_ps_tasks:
            return device
        else:
            chooser = _PSDeviceChooser(device, self._num_ps_tasks)
            return chooser.choose
(2) 获取数据集的输入数据
# 定义输入
with tf.device(config.inputs_device()):
images, labels = LoadData(...)
inputs_queue = slim.data.prefetch_queue((images, labels))
(3) 创建模型并clone到多个GPU
with tf.device(config.variables_device()):
global_step = tf.train.get_or_create_global_step()
# 定义模型并创建多GPU克隆
model_fn = build_model
model_args = ....
# 创建多个克隆值,config对象、创造模型的函数以及函数的参数
clones = model_deploy.create_clones(config, model_fn, args=model_args)
其中 create_clones()的源码如下:
# A single deployed model replica: the model_fn() outputs together with the
# name scope and device it was built under.
Clone = collections.namedtuple(
    'Clone',
    [
        'outputs',  # Whatever model_fn() returned.
        'scope',    # The name scope used to create the clone.
        'device',   # The device used to create the clone.
    ])
def create_clones(config, model_fn, args=None, kwargs=None):
    """Creates `config.num_clones` clones of `model_fn(*args, **kwargs)`.

    The returned value is a list of `Clone` namedtuples wrapping the return
    value of `model_fn`, plus the scope and device used to create it.

    Note: it is assumed that any loss created by `model_fn` is collected in
    the tf.GraphKeys.LOSSES collection. To recover losses, summaries and
    update_ops, use:
      losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
      summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, clone.scope)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)

    `model_fn` is called `config.num_clones` times, once per clone device.

    Args:
      config: A DeploymentConfig object.
      model_fn: A callable that builds the model, invoked as
        model_fn(*args, **kwargs).
      args: Optional list of positional arguments passed to `model_fn`.
      kwargs: Optional dict of keyword arguments passed to `model_fn`.

    Returns:
      A list of namedtuple `Clone`.
    """
    clones = []
    args = args or []
    kwargs = kwargs or {}
    # Model variables are always created on the variables device, no matter
    # which clone device is active while the graph is being built.
    with slim.arg_scope([slim.model_variable, slim.variable],
                        device=config.variables_device()):
        # Create clones.
        for i in range(0, config.num_clones):
            with tf.name_scope(config.clone_scope(i)) as clone_scope:
                clone_device = config.clone_device(i)
                with tf.device(clone_device):
                    # Reuse variables for every clone after the first, so all
                    # clones share one set of weights.
                    with tf.variable_scope(tf.get_variable_scope(),
                                           reuse=True if i > 0 else None):
                        outputs = model_fn(*args, **kwargs)
                    clones.append(Clone(outputs, clone_scope, clone_device))
    return clones
(4) 设置学习率和创建优化器
with tf.device(config.optimizer_device()):
learning_rate = ...
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
(5) 计算总的损失计算梯度并进行权重更新
with tf.device(config.variables_device()):
# 计算总共的损失和每个变量的梯度
total_loss, grads_and_vars = model_deploy.optimize_clones(clones, optimizer)
# 创建梯度更新操作
grad_updates = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
optimize_clones()方法的源码定义如下:
def _gather_clone_loss(clone, num_clones, regularization_losses):
    """Gathers the total loss of a single clone and adds loss summaries.

    Args:
      clone: A Clone namedtuple.
      num_clones: Number of deployed clones; used to scale the clone loss so
        that summing across clones yields the average.
      regularization_losses: Possibly-empty list of regularization loss
        tensors to add on top of the clone loss, or None/falsy to skip them.

    Returns:
      A Tensor with the clone's total loss, or None if no losses were found.
    """
    sum_loss = None
    # Individual components of the loss that will need summaries.
    clone_loss = None
    regularization_loss = None
    # Compute and aggregate losses on the clone device.
    with tf.device(clone.device):
        all_losses = []
        clone_losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
        if clone_losses:
            clone_loss = tf.add_n(clone_losses, name='clone_loss')
            if num_clones > 1:
                # Divide by num_clones so the sum over clones is an average.
                clone_loss = tf.div(clone_loss, 1.0 * num_clones,
                                    name='scaled_clone_loss')
            all_losses.append(clone_loss)
        if regularization_losses:
            regularization_loss = tf.add_n(regularization_losses,
                                           name='regularization_loss')
            all_losses.append(regularization_loss)
        if all_losses:
            sum_loss = tf.add_n(all_losses)
    # Add the summaries out of the clone device block.
    if clone_loss is not None:
        tf.summary.scalar('/'.join(filter(None,
                                          ['Losses', clone.scope, 'clone_loss'])),
                          clone_loss)
    if regularization_loss is not None:
        tf.summary.scalar('Losses/regularization_loss', regularization_loss)
    return sum_loss
def _optimize_clone(optimizer, clone, num_clones, regularization_losses,
                    **kwargs):
    """Computes the loss and gradients for one clone.

    Args:
      optimizer: An `Optimizer` object.
      clone: A Clone namedtuple.
      num_clones: Number of deployed clones.
      regularization_losses: Optional list of regularization losses folded
        into this clone's loss, or None.
      **kwargs: Forwarded to `optimizer.compute_gradients()`.

    Returns:
      A (loss, grads_and_vars) pair; both are None when the clone has no loss.
    """
    total = _gather_clone_loss(clone, num_clones, regularization_losses)
    if total is None:
        return None, None
    # Gradients are computed on the clone's own device.
    with tf.device(clone.device):
        grads = optimizer.compute_gradients(total, **kwargs)
    return total, grads
def optimize_clones(clones, optimizer, regularization_losses=None,
                    **kwargs):
    """Computes the total loss and summed gradients for a list of `Clones`.

    Note: the regularization losses are only added to the first clone's loss,
    so they are counted once in the total.

    Args:
      clones: List of `Clone` namedtuples created by create_clones().
      optimizer: An `Optimizer` object.
      regularization_losses: Optional list of regularization losses. If None,
        they are fetched from tf.GraphKeys.REGULARIZATION_LOSSES. Pass [] to
        exclude regularization losses entirely.
      **kwargs: Optional keyword arguments forwarded to
        `optimizer.compute_gradients()`.

    Returns:
      A tuple (total_loss, grads_and_vars).
        - total_loss: A Tensor containing the average of the clone losses
          (including the regularization loss).
        - grads_and_vars: A list of (gradient, variable) tuples with the
          gradients summed over all clones for each variable.
    """
    grads_and_vars = []
    clones_losses = []
    num_clones = len(clones)
    if regularization_losses is None:
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
    for clone in clones:
        with tf.name_scope(clone.scope):
            clone_loss, clone_grad = _optimize_clone(
                optimizer, clone, num_clones, regularization_losses, **kwargs)
            if clone_loss is not None:
                clones_losses.append(clone_loss)
                grads_and_vars.append(clone_grad)
            # Only use regularization_losses for the first clone.
            regularization_losses = None
    # Compute the total_loss summing all the clones_losses.
    total_loss = tf.add_n(clones_losses, name='total_loss')
    # Sum the gradients across clones.
    grads_and_vars = _sum_clones_gradients(grads_and_vars)
    return total_loss, grads_and_vars