-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpuinfo
executable file
·295 lines (270 loc) · 11.7 KB
/
gpuinfo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python3
# gpuinfo
# Display GPU information for Slurm cluster
# Based on output from sinfo, squeue, and scontrol
# Modified from slurm_gpustat program found at https://github.com/albanie/slurm_gpustat
import re
import argparse
import functools
import subprocess
from typing import Optional
from collections import defaultdict
# Node state strings treated as unusable: nodes whose reported state is in this
# set are left out of the "accessible" and "available" GPU counts.
INACCESSIBLE = {"drain*", "down*", "drng", "drain", "down"}
# Command names that mark a job as interactive: a job whose detailed scontrol
# output contains "Command=<name>" for one of these is counted as "bash_gpu".
INTERACTIVE_CMDS = {"bash", "zsh", "sh"}
def split_node_str(node_str):
    """Split a node-list string on the commas that sit outside square brackets.

    Args:
        node_str: text such as ``"node[1-3,5],gpu01"``; surrounding whitespace
            is stripped before splitting.

    Returns:
        The top-level comma-separated pieces, in order. Commas inside
        ``[...]`` do not split. An empty input yields ``[""]``.

    Raises:
        IndexError: if a ``]`` appears without a preceding unmatched ``[``.
    """
    text = node_str.strip()
    open_brackets = []  # one entry per currently unclosed "["
    pieces = []
    piece_start = 0
    for pos, ch in enumerate(text):
        if ch == "[":
            open_brackets.append(ch)
        elif ch == "]":
            open_brackets.pop()
        elif ch == "," and not open_brackets:
            # Top-level comma: close the current piece, start the next after it.
            pieces.append(text[piece_start:pos])
            piece_start = pos + 1
    pieces.append(text[piece_start:])
    return pieces
def parse_node_names(node_str):
    """Expand a compact node-list string into individual node names.

    Each top-level piece returned by ``split_node_str`` is either a plain name
    (kept as is) or ``prefix[items]`` where ``items`` is a comma-separated list
    of single values and ``start-end`` ranges. Range members are zero-padded
    to the width of ``start``. Any text after the first ``]`` is ignored.

    Args:
        node_str: e.g. ``"node[01-03,7],gpu1"``.

    Returns:
        A list of expanded names, e.g.
        ``["node01", "node02", "node03", "node7", "gpu1"]``.
    """
    expanded = []
    for spec in split_node_str(node_str):
        if "[" not in spec:
            expanded.append(spec)
            continue
        open_idx = spec.index("[")
        close_idx = spec.index("]")
        prefix = spec[:open_idx]
        for item in spec[open_idx + 1:close_idx].split(","):
            if "-" in item:
                first, last = item.split("-")
                width = len(first)  # keep the zero padding of the range start
                for number in range(int(first), int(last) + 1):
                    expanded.append(f"{prefix}{str(number).zfill(width)}")
            else:
                expanded.append(f"{prefix}{item}")
    return expanded
def parse_cmd(cmd, split=True):
    """Run ``cmd`` through the shell and return its standard output.

    Args:
        cmd: command line, executed with ``shell=True``.
        split: when true, return the non-empty output lines as a list;
            otherwise return the decoded output as one string.

    Returns:
        ``list[str]`` of non-empty lines, or the raw UTF-8 decoded ``str``.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status (raised by ``subprocess.check_output``).
    """
    raw = subprocess.check_output(cmd, shell=True)
    text = raw.decode("utf-8")
    if not split:
        return text
    return [line for line in text.split("\n") if line]
def parse_all_gpus(partition: Optional[str] = None,
                   default_gpus: int = 4,
                   default_gpu_name: str = "NONAME_GPU") -> dict:
    """Query ``sinfo`` for the GPU resources configured on each node.

    Args:
        partition: if given, passed to ``sinfo`` as ``--partition``.
        default_gpus: count used when a GPU entry carries no number.
        default_gpu_name: type used when a GPU entry carries no type name.

    Returns:
        A ``defaultdict(list)`` mapping node name to a list of
        ``{"type": str, "count": int}`` dicts, one per GPU entry on that node.
    """
    cmd = "sinfo -o '%1000N|%1000G' --noheader"
    if partition:
        cmd += f" --partition={partition}"
    rows = parse_cmd(cmd)
    resources = defaultdict(list)
    # gpu:[<type>:]<count>[(...)]. The type is "anything up to the next colon"
    # rather than \w* so that a hyphenated type name is captured instead of
    # silently falling back to the default name and count.
    p = re.compile(r'gpu:(?:([^:\s(]*):)?(\d*)(?:\(\S*\))?\s*')
    for row in rows:
        node_str, resource_strs = row.split("|")
        node_names = None  # expanded lazily, at most once per row
        for resource_str in resource_strs.split(","):
            if not resource_str.startswith("gpu"):
                continue
            match = p.search(resource_str)
            if match is None:
                # Starts with "gpu" but is not a "gpu:" entry (some other
                # resource whose name shares the prefix): not a GPU count.
                continue
            gpu_type = match.group(1) if match.group(1) is not None else default_gpu_name
            gpu_count = int(match.group(2)) if match.group(2) != "" else default_gpus
            if node_names is None:
                node_names = parse_node_names(node_str)
            for name in node_names:
                resources[name].append({"type": gpu_type, "count": gpu_count})
    return resources
def node_states(partition: Optional[str] = None) -> dict:
    """Map every node name to the state reported by ``sinfo``.

    The state and node list are read from the fifth and sixth
    whitespace-separated columns of each ``sinfo --noheader`` row. When a node
    appears in several rows, the last row wins.

    Args:
        partition: if given, passed to ``sinfo`` as ``--partition``.

    Returns:
        ``{node_name: state}``.
    """
    cmd = "sinfo --noheader"
    if partition:
        cmd += f" --partition={partition}"
    states = {}
    for row in parse_cmd(cmd):
        tokens = row.split()
        if len(tokens) < 6:
            # No node-list column on this row (e.g. a partition that currently
            # has no nodes), so there is nothing to record.
            continue
        state, names = tokens[4], tokens[5]
        for name in parse_node_names(names):
            states[name] = state
    return states
def resource_by_type(resources: dict) -> dict:
    """Regroup per-node GPU specs by GPU type.

    Args:
        resources: ``{node: [{"type": ..., "count": ...}, ...]}``.

    Returns:
        A ``defaultdict(list)`` mapping each GPU type to a list of
        ``{"node": node, "count": count}`` dicts, in input iteration order.
    """
    grouped = defaultdict(list)
    for node_name, gpu_specs in resources.items():
        for gpu_spec in gpu_specs:
            bucket = grouped[gpu_spec["type"]]
            bucket.append({"node": node_name, "count": gpu_spec["count"]})
    return grouped
def summary_by_type(resources: dict, tag: str):
    """Print the total GPU count followed by one line per GPU type.

    Types are listed in ascending order of their GPU count.

    Args:
        resources: ``{node: [{"type": ..., "count": ...}, ...]}``.
        tag: label printed in square brackets after the total.
    """
    grouped = resource_by_type(resources)
    per_type = {gpu_type: sum(entry["count"] for entry in entries)
                for gpu_type, entries in grouped.items()}
    total = sum(per_type.values())
    # sorted() is stable, so types with equal counts keep their original order.
    lines = [f"{gpu_count} {gpu_type} gpus"
             for gpu_type, gpu_count in sorted(per_type.items(), key=lambda kv: kv[1])]
    print(f"There are a total of {total} GPUs [{tag}]")
    print("\n".join(lines))
def summary(mode: str, resources: Optional[dict] = None, states: Optional[dict] = None):
    """Print a per-type GPU summary for the requested view of the cluster.

    Args:
        mode: ``"up"`` summarises every node in ``resources``;
            ``"accessible"`` drops nodes whose state is in ``INACCESSIBLE``
            (nodes with no known state are treated as ``"down"``).
        resources: output of ``parse_all_gpus``; queried when ``None``.
        states: output of ``node_states``; queried when ``None`` and needed.

    Raises:
        ValueError: if ``mode`` is neither ``"up"`` nor ``"accessible"``.
    """
    # Validate first so that a bad mode fails before any Slurm command runs.
    if mode not in ("accessible", "up"):
        raise ValueError(f"Unknown mode: {mode}")
    # "is None" rather than truthiness: an empty dict is a valid answer (e.g. a
    # partition without GPUs) and must not trigger a cluster-wide re-query.
    if resources is None:
        resources = parse_all_gpus()
    if mode == "accessible":
        if states is None:
            states = node_states()
        res = {key: val for key, val in resources.items()
               if states.get(key, "down") not in INACCESSIBLE}
    else:
        res = resources
    summary_by_type(res, tag=mode)
def gpu_usage(resources: dict, partition: Optional[str] = None) -> dict:
    """Collect the GPUs currently requested by jobs, per user, type and node.

    Runs ``squeue`` to list jobs with their per-node GPU request, node list,
    user and job id, then ``scontrol show jobid -dd`` for each GPU job to see
    whether its command is one of ``INTERACTIVE_CMDS``.

    Args:
        resources: ``{node: [{"type": ..., "count": ...}, ...]}``; jobs on
            nodes missing from this mapping are ignored.
        partition: if given, passed to ``squeue`` as ``--partition``.

    Returns:
        ``usage[user][gpu_type][node] == {"n_gpu": int, "bash_gpu": int}``
        where ``bash_gpu`` counts GPUs held by interactive jobs.
    """
    version_cmd = "sinfo -V"
    slurm_version = parse_cmd(version_cmd, split=False).split(" ")[1]
    # Version 17 exposes the request under "gres"; otherwise "tres-per-node".
    resource_flag = "gres" if slurm_version.startswith("17") else "tres-per-node"
    cmd = f"squeue -O {resource_flag}:100,nodelist:100,username:100,jobid:100 --noheader"
    if partition:
        cmd += f" --partition={partition}"
    detailed_job_cmd = "scontrol show jobid -dd %s"

    usage = defaultdict(dict)
    for row in parse_cmd(cmd):
        tokens = row.split()
        # Check the length before indexing; rows with a missing column (for
        # instance no node list) carry fewer than four tokens.
        if len(tokens) < 4:
            continue
        # Drop the leading "gres:" marker. "gres/" is accepted as well.
        # NOTE(review): some Slurm releases appear to print the request as
        # "gres/gpu:..." - confirm against the deployed version.
        if tokens[0].startswith(("gres:", "gres/")):
            tokens[0] = tokens[0][5:]
        if not tokens[0].startswith("gpu"):
            continue
        gpu_count_str, node_str, user, jobid = tokens

        gpu_count_tokens = gpu_count_str.split(":")
        if not gpu_count_tokens[-1].isdigit():
            # No explicit count in the request: treat it as a single GPU.
            gpu_count_tokens.append("1")
        num_gpus = int(gpu_count_tokens[-1])
        # "gpu:<type>:<n>" names a type, "gpu:<n>" does not. Deciding this
        # once per job (and for any field count) keeps gpu_type from being
        # unbound or left over from a previous job.
        requested_type = gpu_count_tokens[1] if len(gpu_count_tokens) >= 3 else None

        detailed_output = parse_cmd(detailed_job_cmd % jobid, split=False)
        is_interactive = any(f'Command={x}\n' in detailed_output
                             for x in INTERACTIVE_CMDS)
        num_bash_gpus = num_gpus if is_interactive else 0

        for node_name in parse_node_names(node_str):
            if node_name not in resources:
                continue
            gpu_type = requested_type
            if gpu_type is None:
                node_specs = resources[node_name]
                if len(node_specs) != 1:
                    # Several GPU types on the node and none requested: guess
                    # the type with the largest count (first one on ties).
                    gpu_type = max(node_specs, key=lambda k: k['count'])['type']
                    msg = (f"cannot determine node gpu type for {user} on {node_name}"
                           f" (guessing {gpu_type})")
                    print(f"WARNING >>> {msg}")
                else:
                    gpu_type = node_specs[0]["type"]
            if gpu_type not in usage[user]:
                usage[user][gpu_type] = defaultdict(lambda: {'n_gpu': 0, 'bash_gpu': 0})
            node_entry = usage[user][gpu_type][node_name]
            node_entry['n_gpu'] += num_gpus
            node_entry['bash_gpu'] += num_bash_gpus
    return usage
def in_use(resources: Optional[dict] = None, partition: Optional[str] = None):
    """Print, per user, how many GPUs their jobs currently hold.

    Users are listed in ascending order of total GPUs. Each line shows the
    total, the share held by interactive jobs, and a per-type breakdown.

    Args:
        resources: output of ``parse_all_gpus``; queried when ``None``.
        partition: restricts both the resource query (when it is needed) and
            the job query to this partition.
    """
    # "is None" so that an empty mapping is respected, and the fallback query
    # honours the same partition as the job query below.
    if resources is None:
        resources = parse_all_gpus(partition=partition)
    usage = gpu_usage(resources, partition=partition)

    aggregates = {}
    for user, by_type in usage.items():
        aggregates[user] = {
            'n_gpu': {gpu_type: sum(x['n_gpu'] for x in nodes.values())
                      for gpu_type, nodes in by_type.items()},
            'bash_gpu': {gpu_type: sum(x['bash_gpu'] for x in nodes.values())
                         for gpu_type, nodes in by_type.items()},
        }

    print("Usage by user:")
    if not aggregates:
        print("None")
        return
    ordered = sorted(aggregates.items(), key=lambda x: sum(x[1]['n_gpu'].values()))
    for user, subdict in ordered:
        n_total = sum(subdict['n_gpu'].values())
        n_interactive = sum(subdict['bash_gpu'].values())
        total = (f"total: {str(n_total):2s} "
                 f"(interactive: {str(n_interactive):2s})")
        summary_str = ", ".join(f"{key}: {val}" for key, val in subdict['n_gpu'].items())
        print(f"{user:10s} [{total}] {summary_str}")
@functools.lru_cache(maxsize=64, typed=True)
def occupancy_stats_for_node(node: str) -> dict:
    """Return ``allocated/configured`` strings for each allocated resource on a node.

    Parses the ``AllocTRES=`` and ``CfgTRES=`` lines of
    ``scontrol show node <node>``. Results are memoised per node name.

    Args:
        node: node name passed to ``scontrol``.

    Returns:
        ``{metric: "alloc/cfg"}`` for every metric listed in ``AllocTRES``.

    Raises:
        KeyError: if either line is missing from the output, or an allocated
            metric has no configured counterpart.
    """
    lines = [line.strip() for line in parse_cmd(f"scontrol show node {node}")]
    tres = {}
    for line in lines:
        for field in ("AllocTRES", "CfgTRES"):
            if not line.startswith(field):
                continue
            # The stripped value is also what the remaining field is tested against.
            line = line.replace(f"{field}=", "")
            pairs = line.split(",")
            if pairs == [""]:
                tres[field] = {}
            else:
                tres[field] = {pair.split("=")[0]: pair.split("=")[1] for pair in pairs}
    occupancy = {}
    for metric, allocated in tres["AllocTRES"].items():
        configured = tres["CfgTRES"][metric]
        occupancy[metric] = f"{allocated}/{configured}"
    return occupancy
def available(
    resources: Optional[dict] = None,
    states: Optional[dict] = None,
    verbose: bool = False,
):
    """Print how many GPUs are currently free, per GPU type.

    Starts from the GPUs on nodes whose state is not in ``INACCESSIBLE`` and
    subtracts what ``gpu_usage`` reports as held by jobs (never going below
    zero). Job usage is queried without a partition filter.

    Args:
        resources: output of ``parse_all_gpus``; queried when ``None``.
            The mapping passed in is not modified.
        states: output of ``node_states``; queried when ``None``.
        verbose: also list each node that still has free GPUs, with its
            allocated/configured resources and the users running on it.
    """
    # "is None" so that an explicitly empty mapping is not replaced by a
    # cluster-wide query.
    if resources is None:
        resources = parse_all_gpus()
    if states is None:
        states = node_states()
    # Copy every spec dict: the counts are decremented below and the caller's
    # resources must keep their configured totals.
    res = {node: [dict(spec) for spec in specs]
           for node, specs in resources.items()
           if states.get(node, "down") not in INACCESSIBLE}
    usage = gpu_usage(resources=res)
    for subdict in usage.values():
        for gpu_type, node_dicts in subdict.items():
            for node_name, user_gpu_count in node_dicts.items():
                resource_idx = [x["type"] for x in res[node_name]].index(gpu_type)
                spec = res[node_name][resource_idx]
                spec["count"] = max(spec["count"] - user_gpu_count['n_gpu'], 0)
    by_type = resource_by_type(res)
    total = sum(x["count"] for sublist in by_type.values() for x in sublist)
    print(f"There are a total of {total} GPUs [available]")
    for key, counts_for_gpu_type in by_type.items():
        gpu_count = sum(x["count"] for x in counts_for_gpu_type)
        tail = ""
        if verbose:
            summary_strs = []
            for x in counts_for_gpu_type:
                node, count = x["node"], x["count"]
                if not count:
                    continue  # only nodes with free GPUs are listed
                occupancy = occupancy_stats_for_node(node)
                users = [user for user in usage if node in usage[user].get(key, [])]
                details = [f"{metric}: {val}" for metric, val in sorted(occupancy.items())]
                details = f"[{', '.join(details)}] [{','.join(users)}]"
                summary_strs.append(f"\n -> {node}: {count} {key} {details}")
            tail = " ".join(summary_strs)
        print(f"{key}: {gpu_count} available {tail}")
def all_info(verbose: bool, partition: Optional[str] = None):
    """Print the full report: totals, per-user usage and free GPUs.

    Args:
        verbose: forwarded to ``available`` for the per-node breakdown.
        partition: forwarded to the resource, state and usage queries.
    """
    divider = "------------------------------------------------"
    print(divider, "Slurm GPU information", divider, sep="\n")
    resources = parse_all_gpus(partition=partition)
    states = node_states(partition=partition)
    summary(mode="up", resources=resources, states=states)
    summary(mode="accessible", resources=resources, states=states)
    print(divider)
    in_use(resources, partition=partition)
    print(divider)
    available(resources=resources, states=states, verbose=verbose)
    print(divider)
def main():
    """Parse the command-line options and print the GPU report."""
    partition_help = ("the partition/queue (or multiple with comma separation) of"
                      " interest. By default set to all available partitions")
    verbose_help = "provide a more detailed breakdown of GPU resources"

    parser = argparse.ArgumentParser(description="Display GPU information for cluster")
    parser.add_argument("-p", "--partition", default=None, help=partition_help)
    parser.add_argument("-v", "--verbose", action="store_true", help=verbose_help)
    options = parser.parse_args()
    all_info(verbose=options.verbose, partition=options.partition)


if __name__ == "__main__":
    main()