From e3054167d596927f94c5a62fba773d4d144b796a Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 13:59:45 -0800
Subject: [PATCH 1/6] Switch to `pynvml_utils.smi` for PyNVML 12

---
 .../standalone/bulk_sampling/bench_cugraph_training.py    | 4 ++--
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 4 ++--
 python/utils/gpu_metric_poller.py                         | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
index 2604642b748..60794654e44 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -36,7 +36,7 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
     import cupy
     import rmm

-    from pynvml.smi import nvidia_smi
+    from pynvml_utils.smi import nvidia_smi

     smi = nvidia_smi.getInstance()
     pool_size = 16e9  # FIXME calculate this
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index d6205901b68..88ee2ce2f5a 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -201,7 +201,7 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-            # from pynvml.smi import nvidia_smi
+            # from pynvml_utils.smi import nvidia_smi
             # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
             # logger.info(f"rank {self.rank} memory: {mem_info}")

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index 854552fb34f..f8a5978a590 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -31,7 +31,7 @@
 import os
 import sys
 import threading
-from pynvml import smi
+from pynvml_utils import smi


 class GPUMetricPoller(threading.Thread):
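Background for the patch above: PyNVML 12 moved the `smi` helpers out of the `pynvml` package into the separate `pynvml_utils` package, which is why every `pynvml.smi` import becomes a `pynvml_utils.smi` import. A minimal sketch of the usage after this patch — the try/except fallback for pre-12 PyNVML is an illustrative assumption, not part of the patch:

    # Sketch: nvidia_smi usage after PATCH 1. The ImportError fallback is
    # hypothetical (not in the patch) and keeps pre-12 PyNVML working too.
    try:
        from pynvml_utils.smi import nvidia_smi  # PyNVML >= 12
    except ImportError:
        from pynvml.smi import nvidia_smi  # PyNVML < 12

    smi = nvidia_smi.getInstance()
    print(smi.DeviceQuery("memory.free, memory.total"))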
From 7b710ee537a2a5e714ef53218cb755d3a1c9d8ea Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 14:31:35 -0800
Subject: [PATCH 2/6] Drop unused code

This code is either quoted out in a string literal or commented out,
which makes it unused. Given this, go ahead and drop it. If the code is
needed again, it is simple to grab it from history and re-add it.
---
 python/utils/gpu_metric_poller.py | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index f8a5978a590..a1c3a748ef3 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -147,34 +147,3 @@ def startGpuMetricPolling():
 def stopGpuMetricPolling(gpuPollObj):
     gpuPollObj.stop()
     gpuPollObj.join()  # consider using timeout and reporting errors
-
-
-"""
-smi.nvmlInit()
-# hack - get actual device ID somehow
-devObj = smi.nvmlDeviceGetHandleByIndex(0)
-memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-initialMemUsed = memObj.used
-initialGpuUtil = utilObj.gpu
-
-while not self.__stop:
-    time.sleep(0.01)
-
-    memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-    utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-
-    memUsed = memObj.used - initialMemUsed
-    gpuUtil = utilObj.gpu - initialGpuUtil
-    if memUsed > self.maxGpuMemUsed:
-        self.maxGpuMemUsed = memUsed
-    if gpuUtil > self.maxGpuUtil:
-        self.maxGpuUtil = gpuUtil
-
-smi.nvmlShutdown()
-"""
-
-
-# if __name__ == "__main__":
-#     sto=stopGpuMetricPolling
-#     po = startGpuMetricPolling()

From 72892553824689edde02e319b1533d1169c531b3 Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 14:32:45 -0800
Subject: [PATCH 3/6] Use `pynvml` directly (instead of through `smi`)

---
 python/utils/gpu_metric_poller.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index a1c3a748ef3..dffbd259b0e 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -31,7 +31,7 @@
 import os
 import sys
 import threading
-from pynvml_utils import smi
+import pynvml


 class GPUMetricPoller(threading.Thread):
@@ -91,18 +91,18 @@ def __runChildLoop(self, readFileNo, writeFileNo):
         childReadPipe = os.fdopen(readFileNo)
         childWritePipe = os.fdopen(writeFileNo, "w")

-        smi.nvmlInit()
+        pynvml.nvmlInit()
         # hack - get actual device ID somehow
-        devObj = smi.nvmlDeviceGetHandleByIndex(0)
-        memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-        utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+        devObj = pynvml.nvmlDeviceGetHandleByIndex(0)
+        memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+        utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
         initialMemUsed = memObj.used
         initialGpuUtil = utilObj.gpu

         controlStr = self.__waitForInput(childReadPipe)
         while True:
-            memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-            utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+            memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+            utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)

             memUsed = memObj.used - initialMemUsed
             gpuUtil = utilObj.gpu - initialGpuUtil
@@ -113,7 +113,7 @@ def __runChildLoop(self, readFileNo, writeFileNo):
                 break
             controlStr = self.__waitForInput(childReadPipe)

-        smi.nvmlShutdown()
+        pynvml.nvmlShutdown()

         childReadPipe.close()
         childWritePipe.close()
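The direct `pynvml` calls introduced above follow the standard NVML lifecycle: initialize once, get a device handle, poll memory and utilization, then shut down. A minimal self-contained sketch of that pattern, assuming device index 0 just as the poller's own "hack" does:

    import pynvml

    # Sketch of the NVML polling lifecycle used by GPUMetricPoller.
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # device 0, as in the poller
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)  # .total/.free/.used in bytes
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu/.memory in percent
        print(f"memory used: {mem.used}/{mem.total} B, GPU util: {util.gpu}%")
    finally:
        pynvml.nvmlShutdown()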
From 9085446563b54d1798561076665f59e4e5594221 Mon Sep 17 00:00:00 2001
From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com>
Date: Wed, 15 Jan 2025 10:18:50 -0500
Subject: [PATCH 4/6] Update
 benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py

Co-authored-by: jakirkham
---
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index 88ee2ce2f5a..53abfba0cda 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -201,9 +201,6 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-            # from pynvml_utils.smi import nvidia_smi
-            # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
-            # logger.info(f"rank {self.rank} memory: {mem_info}")

             y_true = data.y
             y_true = y_true.reshape((y_true.shape[0],))

From 124eefecfe926fb37da06d007731012069190c4b Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Wed, 15 Jan 2025 09:29:37 -0800
Subject: [PATCH 5/6] Remove unneeded `smi` code

---
 .../cugraph/standalone/bulk_sampling/bench_cugraph_training.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
index 60794654e44..66c34cc1276 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -36,9 +36,7 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
     import cupy
     import rmm

-    from pynvml_utils.smi import nvidia_smi

-    smi = nvidia_smi.getInstance()
     pool_size = 16e9  # FIXME calculate this

     rmm.reinitialize(

From 2f4f1f263b85e0ad6269a973b140c3a2bf38f2d6 Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Wed, 15 Jan 2025 09:31:39 -0800
Subject: [PATCH 6/6] Remove extra blank line

---
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index 53abfba0cda..72a530fc6fe 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -201,7 +201,6 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-
             y_true = data.y
             y_true = y_true.reshape((y_true.shape[0],))
             x = data.x.to(torch.float32)
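If the per-rank memory logging removed in patches 4-6 is ever wanted again, it no longer needs the `smi` wrapper at all. A hedged equivalent of the deleted `DeviceQuery('memory.free, memory.total')` lookup using plain `pynvml` — the `fb_memory_usage` helper name and the 8-GPUs-per-node assumption (mirroring the deleted `self.rank % 8`) are illustrative, not anything the patches add:

    import pynvml

    def fb_memory_usage(rank: int) -> dict:
        # Hypothetical helper: free/total framebuffer memory (MiB) for this
        # rank's GPU, assuming 8 GPUs per node as the deleted comment did.
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(rank % 8)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            return {"free": mem.free // 2**20, "total": mem.total // 2**20}
        finally:
            pynvml.nvmlShutdown()

    print(fb_memory_usage(0))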