When using torch.cuda.max_memory_allocated, only the memory occupied by torch tensors is reported. The footprint of the model itself must be added on top, and the loading footprint of a model is not simply the size of its folder on disk. Without a better tool, one is left running `watch -n 0.1 nvidia-smi` and monitoring by eye.
In addition, runtimes such as TensorRT do not support the torch.cuda measurement methods at all.
Instead, I spawn a background thread that polls GPU memory usage via NVML.
import nvidia_smi
from threading import Thread
from multiprocessing import Process
import time
class Monitor(Thread):
    """Background thread that samples GPU memory usage via NVML.

    On construction it records the memory already in use on GPU ``index``
    as a baseline, then polls every ``delay`` seconds and tracks the peak.
    ``stop()`` ends the polling and returns the peak increase over the
    baseline, in GiB.
    """

    def __init__(self, delay, index):
        """
        Args:
            delay: seconds between NVML polls.
            index: GPU device index to watch.
        """
        super(Monitor, self).__init__()
        self.stopped = False
        self.index = index
        self.delay = delay  # time between NVML polls
        # Initialize NVML once for the lifetime of the monitor. The original
        # called nvmlInit/nvmlShutdown on every polling iteration, which is
        # wasteful; one init here and one shutdown in stop() suffice.
        nvidia_smi.nvmlInit()
        self._handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(self._handle)
        # Baseline: memory already in use before the workload starts.
        self.st_mem = info.used
        print("start used memory is {} GiB".format(info.used * 1.0 / 1024**3))
        self.max_mem = 0
        # Daemon thread: if the caller forgets stop(), the polling loop
        # must not keep the interpreter alive at exit.
        self.daemon = True
        # NOTE: the original slept for `delay` here before starting, which
        # only blocked the caller for no benefit; start immediately instead.
        self.start()

    def run(self):
        # Poll until stop() flips the flag, tracking the peak usage seen.
        while not self.stopped:
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(self._handle)
            self.max_mem = max(self.max_mem, info.used)
            time.sleep(self.delay)

    def stop(self):
        """Stop polling and return the peak memory increase in GiB."""
        self.stopped = True
        # Wait for the polling loop to exit so max_mem is final before we
        # compute the result (the original raced with the last sample and
        # never shut NVML down on this path).
        self.join()
        nvidia_smi.nvmlShutdown()
        res = (self.max_mem - self.st_mem) * 1.0 / 1024**3
        print("total used memory is {} GiB".format(res))
        return res
if __name__=="__main__":
# Instantiate monitor with a 10-second delay between updates
monitor = Monitor(0.1,0)
# Train, etc.
time.sleep(1)
# Close monitor
monitor.stop()