Article directory
Ubuntu uses Python code to monitor GPU and CPU usage and temperature
question:
In a project, one program could not be started. Investigation showed that running the command nvidia-smi did not print the NVIDIA graphics card driver information.
Solution 1:
After rebooting, run nvidia-smi -l 1 again; the output is printed normally.
Solution 2:
The suspected cause is that an automatic Ubuntu kernel upgrade broke communication with the NVIDIA driver. Following https://zhuanlan.zhihu.com/p/611276995?utm_id=0, automatic kernel updates can be disabled as follows.
Open the following two files with the commands below and change the parameter values in them to 0:
vi /etc/apt/apt.conf.d/10periodic
vi /etc/apt/apt.conf.d/20auto-upgrades
Other,
Guess the reason:
It may be due to heat dissipation issues that cause the driver to crash frequently.
Therefore, I wrote a Python script that saves the usage and temperature of the GPU and CPU to a txt file every 5 minutes. The source code is as follows:
Python libraries that need to be installed:
# pip3 install nvidia-ml-py3 -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
# pip3 install psutil
def print_gpu_info():
    """Return a one-line summary of memory use and temperature for GPU 0.

    NVML is initialised on entry and shut down again before returning, so
    calling this repeatedly from a polling loop does not leave NVML
    initialisations unbalanced.

    Returns:
        str: ``'gpu: Memory: Used/Total = <used>/<total> Temperature:<t>°C '``
        where both memory figures are the NVML byte counts divided by
        1024 twice (i.e. MiB).

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or queried, for
            example when the NVIDIA driver is not reachable.
    """
    # ref: https://blog.csdn.net/zxc120389574/article/details/106220612
    # Assumes an NVIDIA card; requires the nvidia-ml-py package (pynvml).
    print("---------GPU usage and T---------")
    import pynvml

    pynvml.nvmlInit()
    try:
        # One handle for device index 0 serves every query below.
        gpu_device = pynvml.nvmlDeviceGetHandleByIndex(0)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_device)
        temperature = pynvml.nvmlDeviceGetTemperature(
            gpu_device, pynvml.NVML_TEMPERATURE_GPU)
    finally:
        # Pair every nvmlInit() with an nvmlShutdown(), even on failure.
        pynvml.nvmlShutdown()

    memoInfo = "Memory: Used/Total = {}/{}".format(
        memory_info.used / 1024 / 1024, memory_info.total / 1024 / 1024)
    tempInfo = "Temperature:{0}°C".format(temperature)
    return 'gpu: ' + memoInfo + ' ' + tempInfo + ' '
def print_cpu_info():
    """Return a one-line summary of CPU utilisation and temperature.

    Returns:
        str: ``' |||| cpu: CPU utilization: <p>% CPU temperature: <t>°C '``.
        The temperature part reads ``CPU temperature: N/A`` when no known
        CPU temperature sensor is exposed on this machine.
    """
    print("---------CPU usage and T---------")
    import psutil

    # NOTE: called without an interval, psutil reports utilisation since the
    # previous call; the very first call in a process returns 0.0.
    cpu_percent = psutil.cpu_percent()

    # sensors_temperatures() is not provided on every platform (the original
    # note says the psutil docs list it for Linux only), so look it up
    # defensively instead of assuming the attribute exists.
    read_temperatures = getattr(psutil, 'sensors_temperatures', None)
    sensors = read_temperatures() if read_temperatures else {}

    # 'coretemp' is the Intel sensor this script was validated with on
    # Ubuntu; the other names are common CPU sensor drivers tried as
    # fallbacks (presumably AMD and ARM SoCs -- verify on the target host).
    cpu_temperature = None
    for sensor_name in ('coretemp', 'k10temp', 'cpu_thermal'):
        readings = sensors.get(sensor_name)
        if readings:
            cpu_temperature = readings[0].current
            break

    memoInfo = f"CPU utilization: {cpu_percent}%"
    if cpu_temperature is None:
        tempInfo = "CPU temperature: N/A"
    else:
        tempInfo = f"CPU temperature: {cpu_temperature}°C"
    return ' |||| cpu: ' + memoInfo + ' ' + tempInfo + ' '
def write2Info(info, txtName=None):
    """Print *info* and append it, timestamped, to the usage log file.

    Args:
        info: Text to record. A local timestamp
            (``YYYY-mm-dd HH:MM:SS.ffffff``) and a space are prepended and
            a newline is appended before writing.
        txtName: Path of the log file to append to. When omitted, the
            original fixed path ``./GPUCPU_usage1.txt`` is used.
    """
    from datetime import datetime

    if txtName is None:
        txtName = './GPUCPU_usage{}.txt'.format(1)

    t = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print('info : ', info)
    # Append mode keeps earlier samples; UTF-8 is needed for the '°' sign.
    with open(txtName, 'a', encoding='utf-8') as wf:
        wf.write(t + ' ' + info + '\n')
if __name__ == '__main__':
    import time

    # Seconds between samples: 5 minutes, the logging period this monitor
    # is meant to use.
    interval_seconds = 300

    while True:
        try:
            info = print_gpu_info() + print_cpu_info()
        except Exception as exc:
            # Top-level boundary: a failed reading (e.g. the GPU driver has
            # gone away) is the event this monitor exists to capture, so
            # record it in the log and keep sampling instead of exiting.
            info = f'ERROR: could not read sensors: {exc!r}'
        write2Info(info)
        time.sleep(interval_seconds)
Or (the code above and the code below may be the same, but the version below has been test-run without problems):
import psutil
import time
def print_gpu_info():
    """Build the GPU part of a log line: memory use and temperature of GPU 0.

    Each call initialises NVML, reads the values, and shuts NVML down again
    so that repeated polling does not leave initialisations unbalanced.

    Returns:
        str: ``'gpu: Memory: Used/Total = <used>/<total> Temperature:<t>°C '``
        with both memory figures being the NVML byte counts divided by
        1024 twice (i.e. MiB).

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or queried, for
            example when the NVIDIA driver is not reachable.
    """
    import pynvml

    pynvml.nvmlInit()
    try:
        # Look the device up once by index and reuse the handle.
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
    finally:
        # Always release NVML, including when a query above raised.
        pynvml.nvmlShutdown()

    memoInfo = "Memory: Used/Total = {}/{}".format(
        memory_info.used / 1024 / 1024, memory_info.total / 1024 / 1024)
    tempInfo = "Temperature:{0}°C".format(temperature)
    return 'gpu: ' + memoInfo + ' ' + tempInfo + ' '
def print_cpu_info():
    """Build the CPU part of a log line: utilisation and temperature.

    Returns:
        str: ``' |||| cpu: CPU utilization: <p>% CPU temperature: <t>°C '``.
        When no known CPU temperature sensor is available the temperature
        part is ``CPU temperature: N/A`` instead of raising.
    """
    import psutil

    # NOTE: without an interval argument psutil measures since the previous
    # call, so the first call in a process reports 0.0.
    cpu_percent = psutil.cpu_percent()

    # The original note says the psutil docs list sensors_temperatures() for
    # Linux only (validated on Ubuntu), so tolerate the attribute being
    # absent on other platforms.
    read_temperatures = getattr(psutil, 'sensors_temperatures', None)
    sensors = read_temperatures() if read_temperatures else {}

    # Prefer 'coretemp' (the sensor the script was validated with); the
    # remaining names are common CPU sensor drivers used as fallbacks
    # (presumably AMD and ARM SoCs -- verify on the target host).
    cpu_temperature = None
    for sensor_name in ('coretemp', 'k10temp', 'cpu_thermal'):
        readings = sensors.get(sensor_name)
        if readings:
            cpu_temperature = readings[0].current
            break

    memoInfo = f"CPU utilization: {cpu_percent}%"
    if cpu_temperature is None:
        tempInfo = "CPU temperature: N/A"
    else:
        tempInfo = f"CPU temperature: {cpu_temperature}°C"
    return ' |||| cpu: ' + memoInfo + ' ' + tempInfo + ' '
def write2Info(info, txtName=None):
    """Echo *info* to stdout and append it to the log with a timestamp.

    Args:
        info: Text to record. The written line is
            ``<YYYY-mm-dd HH:MM:SS.ffffff> <info>`` followed by a newline.
        txtName: Log file path. Defaults to ``./GPUCPU_usage1.txt``, the
            path that was previously fixed inside the function.
    """
    from datetime import datetime

    if txtName is None:
        txtName = './GPUCPU_usage{}.txt'.format(1)

    t = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print('info : ', info)
    # Opened in append mode so every sample is kept; UTF-8 covers '°'.
    with open(txtName, 'a', encoding='utf-8') as wf:
        wf.write(t + ' ' + info + '\n')
if __name__ == '__main__':
    import time

    # Seconds between samples (5 minutes).
    interval_seconds = 300

    while True:
        try:
            info = print_gpu_info() + print_cpu_info()
        except Exception as exc:
            # Top-level boundary: if a reading fails (e.g. the GPU driver
            # has stopped responding), write the error to the log and keep
            # running so the failure time is recorded rather than lost.
            info = f'ERROR: could not read sensors: {exc!r}'
        write2Info(info)
        time.sleep(interval_seconds)
Link
Link: https://pan.baidu.com/s/1ZoPp9mSeydqNuRobkChWsw