"""
Ported from netharn.device, previously called gpu_infos
"""
import ubelt as ub
import os
import warnings
class NvidiaSMIError(Exception):
    """
    Raised when ``nvidia-smi`` was invoked but could not be queried.

    This covers a non-zero exit status of the ``nvidia-smi`` command as well
    as unexpected failures while running it. A missing ``nvidia-smi``
    executable is not reported with this error by :func:`nvidia_smi`; that
    case only emits a warning.
    """
def nvidia_smi(ignore_environ=False):
    """
    Run nvidia-smi and parse its output.

    Args:
        ignore_environ (bool): if False (the default), the result is
            remapped so that it respects the CUDA_VISIBLE_DEVICES
            environment variable. If True, the environment is ignored and
            the returned data is keyed by the physical GPU indexes reported
            by nvidia-smi.

    Returns:
        dict: info about each nvidia GPU indexed by gpu number. An empty
        dict is returned when the nvidia-smi executable cannot be found.

    Raises:
        NvidiaSMIError: if nvidia-smi was found but querying it failed.

    Note:
        Not guaranteed to work if CUDA is not installed.

        Integer entries of CUDA_VISIBLE_DEVICES are interpreted as
        nvidia-smi indexes. NOTE(review): these presumably only agree with
        CUDA's own numbering when CUDA enumerates devices in PCI bus order
        (``CUDA_DEVICE_ORDER=PCI_BUS_ID``) - confirm for your setup.

    Warnings:
        if nvidia-smi is not found

    Example:
        >>> # xdoctest: +REQUIRES(env:HAS_CUDA)
        >>> from geowatch.utils.util_nvidia import *  # NOQA
        >>> gpus = nvidia_smi()
        >>> # xdoctest: +IGNORE_WANT
        >>> import torch
        >>> print('gpus = {}'.format(ub.repr2(gpus, nl=4)))
        >>> assert len(gpus) == torch.cuda.device_count()
        gpus = {
            0: {
                'gpu_uuid': 'GPU-348ebe36-252b-46fa-8a97-477ae331f6f4',
                'index': '0',
                'mem_avail': 10013.0,
                'mem_total': 11170.0,
                'mem_used': 1157.0,
                'memory.free': '10013 MiB',
                'memory.total': '11170 MiB',
                'memory.used': '1157 MiB',
                'name': 'GeForce GTX 1080 Ti',
                'num': 0,
                'num_compute_procs': 1,
                'procs': [
                    {
                        'gpu_num': 0,
                        'gpu_uuid': 'GPU-348ebe36-252b-46fa-8a97-477ae331f6f4',
                        'name': '/usr/bin/python',
                        'pid': '19912',
                        'type': 'C',
                        'used_memory': '567 MiB',
                    },
                ],
            },
        }
    """
    # Note: the old netharn implementation had an xml mode and an "old" mode.
    # Only the "new" mode was kept here; the xml variant may be worth
    # revisiting and is preserved in the notes at the bottom of this file.
    # This mode is slightly more robust than the old one, but it makes more
    # than one call to nvidia-smi and cannot return information about
    # graphics processes.
    fields = ['index', 'memory.total', 'memory.used', 'memory.free',
              'name', 'gpu_uuid']
    mode = 'query-gpu'
    try:
        gpu_rows = _query_nvidia_smi(mode, fields)
    except FileNotFoundError:
        # Likely no GPUs
        warnings.warn('nvidia-smi not found. There are likely no nvidia gpus')
        return {}
    except Exception as ex:
        message = 'Problem running nvidia-smi: {!r}'.format(ex)
        warnings.warn(message)
        # Keep the message and the original exception attached so the
        # failure can be diagnosed from the traceback.
        raise NvidiaSMIError(message) from ex

    fields = ['pid', 'name', 'gpu_uuid', 'used_memory']
    mode = 'query-compute-apps'
    proc_rows = _query_nvidia_smi(mode, fields)

    # Coerce into the old-style format for backwards compatibility
    gpus = {}
    for row in gpu_rows:
        gpu = row.copy()
        num = int(gpu['index'])
        gpu['num'] = num
        gpu['mem_used'] = float(gpu['memory.used'].strip().replace('MiB', ''))
        gpu['mem_total'] = float(gpu['memory.total'].strip().replace('MiB', ''))
        gpu['mem_avail'] = gpu['mem_total'] - gpu['mem_used']
        gpu['procs'] = []
        gpus[num] = gpu

    gpu_uuid_to_num = {gpu['gpu_uuid']: gpu['num'] for gpu in gpus.values()}

    for row in proc_rows:
        # Give each GPU info on which processes are using it
        proc = row.copy()
        proc['type'] = 'C'
        num = gpu_uuid_to_num.get(proc['gpu_uuid'])
        if num is None:
            # The process refers to a GPU that was not part of the GPU
            # listing (the two queries are separate nvidia-smi calls).
            # Skip it instead of failing the whole query.
            continue
        proc['gpu_num'] = num
        gpus[num]['procs'].append(proc)

    WITH_GPU_PROCS = False
    if WITH_GPU_PROCS:
        # Hacks in gpu-procs if enabled
        import re
        header = [
            'gpu_num', 'pid', 'type', 'sm', 'mem', 'enc',
            'dec', 'name']
        info = ub.cmd('nvidia-smi pmon -c 1')
        for line in info['out'].split('\n'):
            line = line.strip()
            if line and not line.startswith("#"):
                parts = re.split(r'\s+', line, maxsplit=7)
                if parts[1] != '-':
                    proc = ub.dzip(header, parts)
                    proc['gpu_num'] = int(proc['gpu_num'])
                    if proc['type'] == 'G':
                        gpu = gpus[proc['gpu_num']]
                        gpu['procs'].append(proc)
                        proc['gpu_uuid'] = gpu['gpu_uuid']

    for gpu in gpus.values():
        # Let each GPU know how many processes are currently using it
        num_compute_procs = 0
        num_graphics_procs = 0
        for proc in gpu['procs']:
            if proc['type'] == 'C':
                num_compute_procs += 1
            elif proc['type'] == 'G':
                num_graphics_procs += 1
            else:
                raise NotImplementedError(proc['type'])
        # NOTE calling nvidia-smi in query mode does not seem to have
        # support for getting info about graphics procs.
        gpu['num_compute_procs'] = num_compute_procs
        if WITH_GPU_PROCS:
            gpu['num_graphics_procs'] = num_graphics_procs

    if not ignore_environ:
        # Respect CUDA_VISIBLE_DEVICES, nvidia-smi does not respect this by
        # default so remap to gain the appropriate effect.
        visible_spec = os.environ.get('CUDA_VISIBLE_DEVICES', '')
        gpus = _remap_visible_gpus(gpus, visible_spec)
    return gpus


def _remap_visible_gpus(gpus, visible_spec):
    """
    Re-key ``gpus`` according to a CUDA_VISIBLE_DEVICES style specification.

    Args:
        gpus (dict): maps the physical GPU number to its info dict. The
            ``gpu_uuid`` item is used to resolve UUID entries.
        visible_spec (str): comma separated list of physical GPU indexes
            and / or GPU UUIDs (``GPU-...``, possibly abbreviated to a
            unique prefix).

    Returns:
        dict: maps the visible index (position in ``visible_spec``) to a
        copy of the corresponding GPU info with ``index`` / ``num`` set to
        the visible index and ``real_num`` set to the physical number. If
        ``visible_spec`` has no entries, ``gpus`` is returned unchanged.
    """
    tokens = [tok.strip() for tok in visible_spec.split(',')]
    tokens = [tok for tok in tokens if tok]
    if not tokens:
        # Unset or blank: keep the historical behavior of exposing all GPUs
        return gpus

    remapped = {}
    for token in tokens:
        try:
            real_idx = int(token)
        except ValueError:
            real_idx = None
            if token.startswith('GPU-'):
                # UUID entries may be abbreviated, but must identify
                # exactly one GPU.
                matches = [
                    num for num, gpu in gpus.items()
                    if str(gpu.get('gpu_uuid', '')).startswith(token)
                ]
                if len(matches) == 1:
                    real_idx = matches[0]
        if real_idx not in gpus:
            # CUDA only exposes the devices listed before the first invalid
            # entry, so stop here instead of raising.
            break
        visible_idx = len(remapped)
        # Copy so a GPU that is listed twice does not alias a single dict
        gpu = dict(gpus[real_idx])
        gpu['index'] = str(visible_idx)
        gpu['num'] = visible_idx
        gpu['real_num'] = real_idx
        remapped[visible_idx] = gpu
    return remapped
def _query_nvidia_smi(mode, fields):
    """
    Invoke nvidia-smi with one of its csv query flags and parse the rows.

    Args:
        mode (str): name of the query cli flag (without the leading
            dashes) to pass to nvidia-smi
        fields (List[str]): names of the csv columns to request

    Returns:
        List[Dict[str, str]]: one dict per non-empty output line, mapping
        each requested field to its stripped text value

    Raises:
        NvidiaSMIError: if nvidia-smi exits with a non-zero return code
    """
    query_flag = '--{}={}'.format(mode, ','.join(fields))
    result = ub.cmd(['nvidia-smi', query_flag, '--format=csv,noheader'])

    retcode = result['ret']
    if retcode != 0:
        # Dump the captured streams before failing to help debugging
        print(result['out'])
        print(result['err'])
        raise NvidiaSMIError(
            'unable to call nvidia-smi: ret={}'.format(retcode))

    stripped_lines = (raw.strip() for raw in result['out'].split('\n'))
    return [
        ub.dzip(fields, [cell.strip() for cell in text.split(',')])
        for text in stripped_lines
        if text
    ]
# Developer scratch notes stored in a module-level string so that nothing in
# them is executed: candidate nvidia-smi query fields and commands, timing
# snippets, and a sketch of parsing ``nvidia-smi --query --xml-format`` output.
__notes__ = """
Ignore:
# official nvidia-smi python bindings
pip install nvidia-ml-py
import pynvml
# TODO: make more efficient calls to nvidia-smi
utilization.gpu
utilization.memory
compute_mode
memory.total
memory.used
memory.free
index
name
count
nvidia-smi pmon --count 1
nvidia-smi -h
nvidia-smi --help-query-compute-apps
nvidia-smi --help-query-gpu
nvidia-smi --help-query-accounted-apps
nvidia-smi --help-query-supported-clocks
nvidia-smi --help-query-retired-pages
nvidia-smi --query-accounted-apps="pid" --format=csv
nvidia-smi --query-gpu="index,memory.total,memory.used,memory.free,count,name,gpu_uuid" --format=csv
nvidia-smi --query-compute-apps="pid,name,gpu_uuid,used_memory" --format=csv
nvidia-smi --query-accounted-apps="gpu_name,pid" --format=csv
import timerit
ti = timerit.Timerit(40, bestof=5, verbose=2)
for timer in ti.reset('new1'):
with timer:
gpu_info(True)
for timer in ti.reset('old'):
with timer:
gpu_info(False)
for timer in ti.reset('xml'):
with timer:
gpu_info('xml')
xdev.profile_now(gpu_info)('xml')
for timer in ti.reset('cmd'):
with timer:
ub.cmd(['nvidia-smi', '--query', '--xml-format'])
for timer in ti.reset('check_output'):
with timer:
import subprocess
subprocess.check_output(['nvidia-smi', '--query', '--xml-format'])
if new_mode == 'xml':
# Parse info out of the nvidia xml query
# note, that even though this has less calls to nvidia-smi, there
# is a lot more output, which makes it the slowest method especially
# for multi-gpu systems
import xml.etree.ElementTree as ET
info = ub.cmd(['nvidia-smi', '--query', '--xml-format'])
if info['ret'] != 0:
print(info['out'])
print(info['err'])
warnings.warn('Problem running nvidia-smi: ret={}'.format(info['ret']))
raise NvidiaSMIError
xml_string = info['out']
root = ET.fromstring(xml_string)
gpus = {}
for gpu_elem in root.findall('gpu'):
gpu = {}
gpu['uuid'] = gpu_elem.find('uuid').text
gpu['name'] = gpu_elem.find('product_name').text
gpu['num'] = int(gpu_elem.find('minor_number').text)
gpu['procs'] = [
{item.tag: item.text for item in proc_elem}
for proc_elem in gpu_elem.find('processes')
]
for item in gpu_elem.find('fb_memory_usage'):
gpu['memory.' + item.tag] = item.text
gpu['mem_used'] = float(gpu['memory.used'].strip().replace('MiB', ''))
gpu['mem_total'] = float(gpu['memory.total'].strip().replace('MiB', ''))
gpu['mem_avail'] = gpu['mem_total'] - gpu['mem_used']
gpus[gpu['num']] = gpu
# Let each GPU know how many processes are currently using it
num_compute_procs = 0
num_graphics_procs = 0
for proc in gpu['procs']:
if proc['type'] == 'C':
num_compute_procs += 1
elif proc['type'] == 'G':
num_graphics_procs += 1
else:
raise NotImplementedError(proc['type'])
gpu['num_compute_procs'] = num_compute_procs
gpu['num_graphics_procs'] = num_graphics_procs
"""