gpuinfo

#!/usr/bin/env python3

# gpuinfo

# Display GPU information for Slurm cluster
# Based on output from sinfo, squeue, and scontrol
# Modified from slurm_gpustat program found at https://github.com/albanie/slurm_gpustat

import re
import argparse
import functools
import subprocess
from typing import Optional
from collections import defaultdict

INACCESSIBLE = {"drain*", "down*", "drng", "drain", "down"}
INTERACTIVE_CMDS = {"bash", "zsh", "sh"}

def split_node_str(node_str):
    node_str = node_str.strip()
    breakpoints, stack = [0], []
    for ii, char in enumerate(node_str):
        if char == "[":
            stack.append(char)
        elif char == "]":
            stack.pop()
        elif not stack and char == ",":
            breakpoints.append(ii + 1)
    end = len(node_str) + 1
    return [node_str[i: j - 1] for i, j in zip(breakpoints, breakpoints[1:] + [end])]

def parse_node_names(node_str):
    names = []
    node_specs = split_node_str(node_str)
    for node_spec in node_specs:
        if "[" not in node_spec:
            names.append(node_spec)
        else:
            head, tail = node_spec.index("["), node_spec.index("]")
            prefix = node_spec[:head]
            subspecs = node_spec[head + 1:tail].split(",")
            for subspec in subspecs:
                if "-" not in subspec:
                    subnames = [f"{prefix}{subspec}"]
                else:
                    start, end = subspec.split("-")
                    num_digits = len(start)
                    subnames = [f"{prefix}{str(x).zfill(num_digits)}"
                                for x in range(int(start), int(end) + 1)]
                names.extend(subnames)
    return names

def parse_cmd(cmd, split=True):
    output = subprocess.check_output(cmd, shell=True).decode("utf-8")
    if split:
        output = [x for x in output.split("\n") if x]
    return output

def parse_all_gpus(partition: Optional[str] = None,
                   default_gpus: int = 4,
                   default_gpu_name: str = "NONAME_GPU") -> dict:
    cmd = "sinfo -o '%1000N|%1000G' --noheader"
    if partition:
        cmd += f" --partition={partition}"
    rows = parse_cmd(cmd)
    resources = defaultdict(list)

    p = re.compile(r'gpu:(?:(\w*):)?(\d*)(?:\(\S*\))?\s*')

    for row in rows:
        node_str, resource_strs = row.split("|")
        for resource_str in resource_strs.split(","):
            if not resource_str.startswith("gpu"):
                continue
            match = p.search(resource_str)
            gpu_type = match.group(1) if match.group(1) is not None else default_gpu_name
            gpu_count = int(match.group(2)) if match.group(2) != "" else default_gpus
            node_names = parse_node_names(node_str)
            for name in node_names:
                resources[name].append({"type": gpu_type, "count": gpu_count})
    return resources

def node_states(partition: Optional[str] = None) -> dict:
    cmd = "sinfo --noheader"
    if partition:
        cmd += f" --partition={partition}"
    rows = parse_cmd(cmd)
    states = {}
    for row in rows:
        tokens = row.split()
        state, names = tokens[4], tokens[5]
        node_names = parse_node_names(names)
        states.update({name: state for name in node_names})
    return states

def resource_by_type(resources: dict) -> dict:
    by_type = defaultdict(list)
    for node, specs in resources.items():
        for spec in specs:
            by_type[spec["type"]].append({"node": node, "count": spec["count"]})
    return by_type

def summary_by_type(resources: dict, tag: str):
    by_type = resource_by_type(resources)
    total = sum(x["count"] for sublist in by_type.values() for x in sublist)
    agg_str = []
    for key, val in sorted(by_type.items(), key=lambda x: sum(y["count"] for y in x[1])):
        gpu_count = sum(x["count"] for x in val)
        agg_str.append(f"{gpu_count} {key} gpus")
    print(f"There are a total of {total} GPUs [{tag}]")
    print("\n".join(agg_str))

def summary(mode: str, resources: dict = None, states: dict = None):
    if not resources:
        resources = parse_all_gpus()
    if not states:
        states = node_states()
    if mode == "accessible":
        res = {key: val for key, val in resources.items()
               if states.get(key, "down") not in INACCESSIBLE}
    elif mode == "up":
        res = resources
    else:
        raise ValueError(f"Unknown mode: {mode}")
    summary_by_type(res, tag=mode)

def gpu_usage(resources: dict, partition: Optional[str] = None) -> dict:
    version_cmd = "sinfo -V"
    slurm_version = parse_cmd(version_cmd, split=False).split(" ")[1]
    if slurm_version.startswith("17"):
       resource_flag = "gres"
    else:
       resource_flag = "tres-per-node"
    cmd = f"squeue -O {resource_flag}:100,nodelist:100,username:100,jobid:100 --noheader"
    if partition:
        cmd += f" --partition={partition}"
    detailed_job_cmd = "scontrol show jobid -dd %s"
    rows = parse_cmd(cmd)
    usage = defaultdict(dict)
    for row in rows:
        tokens = row.split()
        if tokens[0].startswith("gres:"):
            tokens[0] = tokens[0][5:]

        if len(tokens) < 4 or not tokens[0].startswith("gpu"):
            continue
        gpu_count_str, node_str, user, jobid = tokens
        gpu_count_tokens = gpu_count_str.split(":")

        if not gpu_count_tokens[-1].isdigit():
            gpu_count_tokens.append("1")
        num_gpus = int(gpu_count_tokens[-1])
        detailed_output = parse_cmd(detailed_job_cmd % jobid, split=False)
        is_bash = any([f'Command={x}\n' in detailed_output for x in INTERACTIVE_CMDS])
        num_bash_gpus = num_gpus * is_bash
        node_names = parse_node_names(node_str)
        for node_name in node_names:
            if node_name not in resources:
                continue
            node_gpu_types = [x["type"] for x in resources[node_name]]
            if len(gpu_count_tokens) == 2:
                gpu_type = None
            elif len(gpu_count_tokens) == 3:
                gpu_type = gpu_count_tokens[1]
            if gpu_type is None:
                if len(node_gpu_types) != 1:
                    gpu_type = sorted(
                        resources[node_name],
                        key=lambda k: k['count'],
                        reverse=True
                    )[0]['type']
                    msg = (f"cannot determine node gpu type for {user} on {node_name}"
                           f" (guessing {gpu_type})")
                    print(f"WARNING >>> {msg}")
                else:
                    gpu_type = node_gpu_types[0]
            if gpu_type in usage[user]:
                usage[user][gpu_type][node_name]['n_gpu'] += num_gpus
                usage[user][gpu_type][node_name]['bash_gpu'] += num_bash_gpus

            else:
                usage[user][gpu_type] = defaultdict(lambda: {'n_gpu': 0, 'bash_gpu': 0})
                usage[user][gpu_type][node_name]['n_gpu'] += num_gpus
                usage[user][gpu_type][node_name]['bash_gpu'] += num_bash_gpus
    return usage

def in_use(resources: dict = None, partition: Optional[str] = None):
    if not resources:
        resources = parse_all_gpus()
    usage = gpu_usage(resources, partition=partition)
    aggregates = {}
    for user, subdict in usage.items():
        aggregates[user] = {}
        aggregates[user]['n_gpu'] = {key: sum([x['n_gpu'] for x in val.values()])
                                     for key, val in subdict.items()}
        aggregates[user]['bash_gpu'] = {key: sum([x['bash_gpu'] for x in val.values()])
                                        for key, val in subdict.items()}
    print("Usage by user:")
    if not aggregates:
        print("None")
    else:
        for user, subdict in sorted(aggregates.items(),
                                    key=lambda x: sum(x[1]['n_gpu'].values())):
            total = (f"total: {str(sum(subdict['n_gpu'].values())):2s} "
                     f"(interactive: {str(sum(subdict['bash_gpu'].values())):2s})")
            summary_str = ", ".join([f"{key}: {val}" for key, val in subdict['n_gpu'].items()])
            print(f"{user:10s} [{total}] {summary_str}")

@functools.lru_cache(maxsize=64, typed=True)
def occupancy_stats_for_node(node: str) -> dict:
    cmd = f"scontrol show node {node}"
    rows = [x.strip() for x in parse_cmd(cmd)]
    keys = ("AllocTRES", "CfgTRES")
    metrics = {}
    for row in rows:
        for key in keys:
            if row.startswith(key):
                row = row.replace(f"{key}=", "")
                tokens = row.split(",")
                if tokens == [""]:
                    metrics[key] = {}
                else:
                    metrics[key] = {x.split("=")[0]: x.split("=")[1] for x in tokens}
    occupancy = {}
    for metric, alloc_val in metrics["AllocTRES"].items():
        cfg_val = metrics["CfgTRES"][metric]
        occupancy[metric] = f"{alloc_val}/{cfg_val}"
    return occupancy

def available(
        resources: dict = None,
        states: dict = None,
        verbose: bool = False,
):
    if not resources:
        resources = parse_all_gpus()
    if not states:
        states = node_states()
    res = {key: val for key, val in resources.items()
           if states.get(key, "down") not in INACCESSIBLE}
    usage = gpu_usage(resources=res)
    for subdict in usage.values():
        for gpu_type, node_dicts in subdict.items():
            for node_name, user_gpu_count in node_dicts.items():
                resource_idx = [x["type"] for x in res[node_name]].index(gpu_type)
                count = res[node_name][resource_idx]["count"]
                count = max(count - user_gpu_count['n_gpu'], 0)
                res[node_name][resource_idx]["count"] = count
    by_type = resource_by_type(res)
    total = sum(x["count"] for sublist in by_type.values() for x in sublist)
    print(f"There are a total of {total} GPUs [available]")
    for key, counts_for_gpu_type in by_type.items():
        gpu_count = sum(x["count"] for x in counts_for_gpu_type)
        tail = ""
        if verbose:
            summary_strs = []
            for x in counts_for_gpu_type:
                node, count = x["node"], x["count"]
                if count:
                    occupancy = occupancy_stats_for_node(node)
                    users = [user for user in usage if node in usage[user].get(key, [])]
                    details = [f"{key}: {val}" for key, val in sorted(occupancy.items())]
                    details = f"[{', '.join(details)}] [{','.join(users)}]"
                    summary_strs.append(f"\n -> {node}: {count} {key} {details}")
            tail = " ".join(summary_strs)
        print(f"{key}: {gpu_count} available {tail}")

def all_info(verbose: bool, partition: Optional[str] = None):
    divider = "------------------------------------------------"
    print(divider)
    print(f"Slurm GPU information")
    print(divider)
    resources = parse_all_gpus(partition=partition)
    states = node_states(partition=partition)
    for mode in ("up", "accessible"):
        summary(mode=mode, resources=resources, states=states)
        print(divider)
    in_use(resources, partition=partition)
    print(divider)
    available(resources=resources, states=states, verbose=verbose)
    print(divider)

def main():
    parser = argparse.ArgumentParser(description="Display GPU information for cluster")
    parser.add_argument("-p", "--partition", default=None,
                        help=("the partition/queue (or multiple with comma separation) of"
                              " interest. By default set to all available partitions"))
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="provide a more detailed breakdown of GPU resources")
    args = parser.parse_args()

    all_info(verbose=args.verbose, partition=args.partition)

if __name__ == "__main__":
    main()