Source code for geowatch.utils.util_nvidia

"""
Ported from netharn.device, previously called gpu_infos
"""
import ubelt as ub
import os
import warnings



[docs]
class NvidiaSMIError(Exception):
    """
    Raised when nvidia-smi can be executed but does not produce usable output
    """




[docs]
def nvidia_smi(ignore_environ=False):
    """
    Run nvidia-smi and parse output

    Args:
        ignore_environ (bool): if False, respects the
            CUDA_VISIBLE_DEVICES environment variable: only the listed GPUs
            are returned and they are renumbered in the listed order (the
            physical index is kept under the ``real_num`` key). If True,
            returns data corresponding to physical GPU indexes.
            Defaults to False.

    Returns:
        dict: info about each nvidia GPU indexed by gpu number. Empty if
            nvidia-smi is not found, or if CUDA_VISIBLE_DEVICES is respected
            and none of its entries resolve to a reported GPU.

    Raises:
        NvidiaSMIError: if nvidia-smi is found but querying it fails

    Note:
        Not guaranteed to work if CUDA is not installed.

    Warnings:
        if nvidia-smi is not found

    Example:
        >>> # xdoctest: +REQUIRES(env:HAS_CUDA)
        >>> from geowatch.utils.util_nvidia import *  # NOQA
        >>> gpus = nvidia_smi()
        >>> # xdoctest: +IGNORE_WANT
        >>> import torch
        >>> print('gpus = {}'.format(ub.repr2(gpus, nl=4)))
        >>> assert len(gpus) == torch.cuda.device_count()
        gpus = {
            0: {
                'gpu_uuid': 'GPU-348ebe36-252b-46fa-8a97-477ae331f6f4',
                'index': '0',
                'mem_avail': 10013.0,
                'mem_total': 11170.0,
                'mem_used': 1157.0,
                'memory.free': '10013 MiB',
                'memory.total': '11170 MiB',
                'memory.used': '1157 MiB',
                'name': 'GeForce GTX 1080 Ti',
                'num': 0,
                'num_compute_procs': 1,
                'procs': [
                    {
                        'gpu_num': 0,
                        'gpu_uuid': 'GPU-348ebe36-252b-46fa-8a97-477ae331f6f4',
                        'name': '/usr/bin/python',
                        'pid': '19912',
                        'type': 'C',
                        'used_memory': '567 MiB',
                    },
                ],
            },
        }
    """
    # Note: the old netharn implementation has an xml mode and an "old" mode.
    # We just kept the "new" mode here, but the xml mode might be worth
    # revisiting; it is in the notes at the bottom of this file.

    # This is slightly more robust than the old mode, but it also makes
    # more than one call to nvidia-smi and cannot return information about
    # graphics processes.
    gpu_fields = ['index', 'memory.total', 'memory.used', 'memory.free',
                  'name', 'gpu_uuid']
    try:
        gpu_rows = _query_nvidia_smi('query-gpu', gpu_fields)
    except FileNotFoundError:
        warnings.warn('nvidia-smi not found. There are likely no nvidia gpus')
        # Likely no GPUs
        return {}
    except Exception as ex:
        msg = 'Problem running nvidia-smi: {!r}'.format(ex)
        warnings.warn(msg)
        # Keep the message and the original cause attached to the error
        raise NvidiaSMIError(msg) from ex

    proc_fields = ['pid', 'name', 'gpu_uuid', 'used_memory']
    proc_rows = _query_nvidia_smi('query-compute-apps', proc_fields)

    # Coerce into the old-style format for backwards compatibility
    gpus = {}
    for row in gpu_rows:
        gpu = row.copy()
        num = int(gpu['index'])
        gpu['num'] = num
        gpu['mem_used'] = float(gpu['memory.used'].strip().replace('MiB', ''))
        gpu['mem_total'] = float(gpu['memory.total'].strip().replace('MiB', ''))
        gpu['mem_avail'] = gpu['mem_total'] - gpu['mem_used']
        gpu['procs'] = []
        gpus[num] = gpu

    gpu_uuid_to_num = {gpu['gpu_uuid']: gpu['num'] for gpu in gpus.values()}

    for row in proc_rows:
        # Give each GPU info on which processes are using it
        proc = row.copy()
        # The compute-apps query only reports compute processes
        proc['type'] = 'C'
        proc['gpu_num'] = gpu_uuid_to_num[proc['gpu_uuid']]
        gpus[proc['gpu_num']]['procs'].append(proc)

    WITH_GPU_PROCS = False
    if WITH_GPU_PROCS:
        # Hacks in gpu-procs if enabled
        import re
        info = ub.cmd('nvidia-smi pmon -c 1')
        header = [
            'gpu_num', 'pid', 'type', 'sm', 'mem', 'enc',
            'dec', 'name']
        for line in info['out'].split('\n'):
            line = line.strip()
            if line and not line.startswith("#"):
                parts = re.split(r'\s+', line, maxsplit=7)
                if parts[1] != '-':
                    proc = ub.dzip(header, parts)
                    proc['gpu_num'] = int(proc['gpu_num'])
                    if proc['type'] == 'G':
                        gpu = gpus[proc['gpu_num']]
                        gpu['procs'].append(proc)
                        proc['gpu_uuid'] = gpu['gpu_uuid']

    for gpu in gpus.values():
        # Let each GPU know how many processes are currently using it
        num_compute_procs = 0
        num_graphics_procs = 0
        for proc in gpu['procs']:
            if proc['type'] == 'C':
                num_compute_procs += 1
            elif proc['type'] == 'G':
                num_graphics_procs += 1
            else:
                raise NotImplementedError(proc['type'])

        # NOTE calling nvidia-smi in query mode does not seem to have
        # support for getting info about graphics procs.
        gpu['num_compute_procs'] = num_compute_procs
        if WITH_GPU_PROCS:
            gpu['num_graphics_procs'] = num_graphics_procs

    if not ignore_environ:
        # Respect CUDA_VISIBLE_DEVICES, nvidia-smi does not respect this by
        # default so remap to gain the appropriate effect.
        val = os.environ.get('CUDA_VISIBLE_DEVICES', '')
        entries = [p for p in (p.strip() for p in val.split(',')) if p]

        # An unset or blank variable keeps the historical behavior of
        # reporting every physical GPU.
        if entries:
            remapped = {}
            visible_devices = _resolve_visible_devices(entries, gpus)
            for visible_idx, real_idx in enumerate(visible_devices):
                gpu = remapped[visible_idx] = gpus[real_idx]
                gpu['index'] = str(visible_idx)
                gpu['num'] = visible_idx
                gpu['real_num'] = real_idx
            gpus = remapped

    return gpus


def _resolve_visible_devices(entries, gpus):
    """
    Translate CUDA_VISIBLE_DEVICES entries into physical GPU numbers

    Args:
        entries (List[str]): stripped, non-empty entries of the
            CUDA_VISIBLE_DEVICES value, in the order they were listed
        gpus (Dict[int, dict]): gpu info keyed by physical gpu number; each
            value must contain a ``gpu_uuid`` key

    Returns:
        List[int]: physical gpu numbers in listed order. Resolution stops at
            the first entry that does not identify a GPU in ``gpus``, so
            only the entries preceding an invalid one are kept.
    """
    resolved = []
    for entry in entries:
        try:
            real_idx = int(entry)
        except ValueError:
            # Not an integer index: treat it as a (possibly abbreviated)
            # UUID, which must identify exactly one reported GPU.
            matches = [num for num, gpu in gpus.items()
                       if gpu['gpu_uuid'].startswith(entry)]
            real_idx = matches[0] if len(matches) == 1 else None
            if real_idx is None:
                warnings.warn(
                    'Unable to resolve CUDA_VISIBLE_DEVICES entry {!r} to a '
                    'GPU reported by nvidia-smi'.format(entry))
        if real_idx not in gpus:
            # Covers out-of-range indexes such as -1 and unresolved UUIDs
            break
        resolved.append(real_idx)
    return resolved



def _query_nvidia_smi(mode, fields):
    """
    Call nvidia-smi with a single csv query flag and parse what it prints

    Args:
        mode (str): name of the query flag, without the leading dashes
            (e.g. ``query-gpu``)
        fields (List[str]): names of the csv columns to request

    Returns:
        List[Dict[str, str]]: one dictionary per non-blank output line,
            mapping each requested field to its whitespace-stripped value

    Raises:
        NvidiaSMIError: if nvidia-smi exits with a non-zero return code
    """
    query_flag = '--{}={}'.format(mode, ','.join(fields))
    result = ub.cmd(['nvidia-smi', query_flag, '--format=csv,noheader'])

    retcode = result['ret']
    if retcode != 0:
        # Show whatever nvidia-smi printed before reporting the failure
        print(result['out'])
        print(result['err'])
        raise NvidiaSMIError(
            'unable to call nvidia-smi: ret={}'.format(retcode))

    # Blank lines carry no record; every other line is one csv row
    stripped_lines = (raw.strip() for raw in result['out'].split('\n'))
    return [
        ub.dzip(fields, [cell.strip() for cell in text.split(',')])
        for text in stripped_lines
        if text
    ]


# Developer scratch notes stored as a module-level string: candidate
# nvidia-smi query fields and help commands, timing snippets, and the retired
# xml-mode implementation that the comment in nvidia_smi refers to. Nothing
# in the code shown in this file reads this value.
__notes__ = """
Ignore:

    # official nvidia-smi python bindings
    pip install nvidia-ml-py

    import pynvml

    # TODO: make more efficient calls to nvidia-smi

    utilization.gpu
    utilization.memory
    compute_mode
    memory.total
    memory.used
    memory.free
    index
    name
    count

    nvidia-smi pmon --count 1

    nvidia-smi  -h
    nvidia-smi  --help-query-compute-apps
    nvidia-smi  --help-query-gpu

    nvidia-smi --help-query-accounted-apps
    nvidia-smi --help-query-supported-clocks
    nvidia-smi --help-query-retired-pages
    nvidia-smi --query-accounted-apps="pid" --format=csv

    nvidia-smi  --query-gpu="index,memory.total,memory.used,memory.free,count,name,gpu_uuid" --format=csv
    nvidia-smi  --query-compute-apps="pid,name,gpu_uuid,used_memory" --format=csv
    nvidia-smi  --query-accounted-apps="gpu_name,pid" --format=csv

    import timerit
    ti = timerit.Timerit(40, bestof=5, verbose=2)
    for timer in ti.reset('new1'):
        with timer:
            gpu_info(True)
    for timer in ti.reset('old'):
        with timer:
            gpu_info(False)
    for timer in ti.reset('xml'):
        with timer:
            gpu_info('xml')

    xdev.profile_now(gpu_info)('xml')

    for timer in ti.reset('cmd'):
        with timer:
            ub.cmd(['nvidia-smi', '--query', '--xml-format'])

    for timer in ti.reset('check_output'):
        with timer:
            import subprocess
            subprocess.check_output(['nvidia-smi', '--query', '--xml-format'])


if new_mode == 'xml':
    # Parse info out of the nvidia xml query
    # note, that even though this has less calls to nvidia-smi, there
    # is a lot more output, which makes it the slowest method especially
    # for multi-gpu systems
    import xml.etree.ElementTree as ET

    info = ub.cmd(['nvidia-smi', '--query', '--xml-format'])
    if info['ret'] != 0:
        print(info['out'])
        print(info['err'])
        warnings.warn('Problem running nvidia-smi: ret={}'.format(info['ret']))
        raise NvidiaSMIError
    xml_string = info['out']
    root = ET.fromstring(xml_string)

    gpus = {}
    for gpu_elem in root.findall('gpu'):
        gpu = {}
        gpu['uuid'] = gpu_elem.find('uuid').text
        gpu['name'] = gpu_elem.find('product_name').text
        gpu['num'] = int(gpu_elem.find('minor_number').text)
        gpu['procs'] = [
            {item.tag: item.text for item in proc_elem}
            for proc_elem in gpu_elem.find('processes')
        ]

        for item in gpu_elem.find('fb_memory_usage'):
            gpu['memory.' + item.tag] = item.text

        gpu['mem_used'] = float(gpu['memory.used'].strip().replace('MiB', ''))
        gpu['mem_total'] = float(gpu['memory.total'].strip().replace('MiB', ''))
        gpu['mem_avail'] = gpu['mem_total'] - gpu['mem_used']
        gpus[gpu['num']] = gpu

        # Let each GPU know how many processes are currently using it
        num_compute_procs = 0
        num_graphics_procs = 0
        for proc in gpu['procs']:
            if proc['type'] == 'C':
                num_compute_procs += 1
            elif proc['type'] == 'G':
                num_graphics_procs += 1
            else:
                raise NotImplementedError(proc['type'])
        gpu['num_compute_procs'] = num_compute_procs
        gpu['num_graphics_procs'] = num_graphics_procs

"""