torch distributed.init out of memory
通过环境变量设置可见的 GPU：
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 2, 3"
local_rank=0
torch.cuda.set_device(local_rank)
cuda(0) 默认对应物理编号为 0 的显卡，
但是设置 CUDA_VISIBLE_DEVICES 之后，设备编号会按该变量中列出的顺序重新映射：
cuda(0) 就是 CUDA_VISIBLE_DEVICES 里面的第一个 GPU（上例中即物理编号为 1 的显卡）。
distributed.init 报错 out of memory，相关代码如下：
import argparse
import logging
import os
import time
import torch
import torch.distributed as dist
import torch.nn.functional as F
import torch.utils.data.distributed
def main(args):
try:
world_size = int(os.environ['WORLD_SIZE'])
rank = int(os.environ['RANK'])
dist_url = "tcp://{}:{}".format(os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"])
except KeyError:
world_size = 1
rank = 0
dist_url = "tcp://127.0.0.1