import byteps.torch as bps

# Step 1 — initialize the BytePS library (must happen before any other bps call):
#   bps.init()
#
# Step 2 — pin this worker process to the GPU matching its local rank, so each
# process on a multi-GPU node drives a distinct device:
#   torch.cuda.set_device(bps.local_rank())
#
# Step 3 — wrap the existing optimizer so gradient updates are averaged/pushed
# through BytePS (optionally with gradient compression):
#   optimizer = bps.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression)
# Reference links:
# https://pytorch.org/docs/stable/distributed.html