From e3054167d596927f94c5a62fba773d4d144b796a Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 13:59:45 -0800
Subject: [PATCH 1/6] Switch to `pynvml_utils.smi` for PyNVML 12

---
 .../standalone/bulk_sampling/bench_cugraph_training.py    | 4 ++--
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 4 ++--
 python/utils/gpu_metric_poller.py                         | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
index 2604642b748..60794654e44 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -36,7 +36,7 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
     import cupy
     import rmm

-    from pynvml.smi import nvidia_smi
+    from pynvml_utils.smi import nvidia_smi

     smi = nvidia_smi.getInstance()
     pool_size = 16e9  # FIXME calculate this
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index d6205901b68..88ee2ce2f5a 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -201,7 +201,7 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-            # from pynvml.smi import nvidia_smi
+            # from pynvml_utils.smi import nvidia_smi
             # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
             # logger.info(f"rank {self.rank} memory: {mem_info}")

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index 854552fb34f..f8a5978a590 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -31,7 +31,7 @@
 import os
 import sys
 import threading
-from pynvml import smi
+from pynvml_utils import smi


 class GPUMetricPoller(threading.Thread):
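Background for the patch above: PyNVML 12 moved the `smi` helpers out of the `pynvml` package into the separate `pynvml_utils` package, which is why every `pynvml.smi` import becomes a `pynvml_utils.smi` import. A minimal sketch of the usage after this patch — the try/except fallback for pre-12 PyNVML is an illustrative assumption, not part of the patch:

    # Sketch: nvidia_smi usage after PATCH 1. The ImportError fallback is
    # hypothetical (not in the patch) and keeps pre-12 PyNVML working too.
    try:
        from pynvml_utils.smi import nvidia_smi  # PyNVML >= 12
    except ImportError:
        from pynvml.smi import nvidia_smi  # PyNVML < 12

    smi = nvidia_smi.getInstance()
    print(smi.DeviceQuery("memory.free, memory.total"))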
From 7b710ee537a2a5e714ef53218cb755d3a1c9d8ea Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 14:31:35 -0800
Subject: [PATCH 2/6] Drop unused code

This code is either quoted out in a string literal or commented out,
which makes it unused. Given this, go ahead and drop it. If the code is
needed again, it is simple to grab it from history and re-add it.
---
 python/utils/gpu_metric_poller.py | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index f8a5978a590..a1c3a748ef3 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -147,34 +147,3 @@ def startGpuMetricPolling():
 def stopGpuMetricPolling(gpuPollObj):
     gpuPollObj.stop()
     gpuPollObj.join()  # consider using timeout and reporting errors
-
-
-"""
-smi.nvmlInit()
-# hack - get actual device ID somehow
-devObj = smi.nvmlDeviceGetHandleByIndex(0)
-memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-initialMemUsed = memObj.used
-initialGpuUtil = utilObj.gpu
-
-while not self.__stop:
-    time.sleep(0.01)
-
-    memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-    utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-
-    memUsed = memObj.used - initialMemUsed
-    gpuUtil = utilObj.gpu - initialGpuUtil
-    if memUsed > self.maxGpuMemUsed:
-        self.maxGpuMemUsed = memUsed
-    if gpuUtil > self.maxGpuUtil:
-        self.maxGpuUtil = gpuUtil
-
-smi.nvmlShutdown()
-"""
-
-
-# if __name__ == "__main__":
-#     sto=stopGpuMetricPolling
-#     po = startGpuMetricPolling()

From 72892553824689edde02e319b1533d1169c531b3 Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Mon, 13 Jan 2025 14:32:45 -0800
Subject: [PATCH 3/6] Use `pynvml` directly (instead of through `smi`)

---
 python/utils/gpu_metric_poller.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index a1c3a748ef3..dffbd259b0e 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -31,7 +31,7 @@
 import os
 import sys
 import threading
-from pynvml_utils import smi
+import pynvml


 class GPUMetricPoller(threading.Thread):
@@ -91,18 +91,18 @@ def __runChildLoop(self, readFileNo, writeFileNo):
         childReadPipe = os.fdopen(readFileNo)
         childWritePipe = os.fdopen(writeFileNo, "w")

-        smi.nvmlInit()
+        pynvml.nvmlInit()
         # hack - get actual device ID somehow
-        devObj = smi.nvmlDeviceGetHandleByIndex(0)
-        memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-        utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+        devObj = pynvml.nvmlDeviceGetHandleByIndex(0)
+        memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+        utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
         initialMemUsed = memObj.used
         initialGpuUtil = utilObj.gpu

         controlStr = self.__waitForInput(childReadPipe)
         while True:
-            memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-            utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+            memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+            utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)

             memUsed = memObj.used - initialMemUsed
             gpuUtil = utilObj.gpu - initialGpuUtil
@@ -113,7 +113,7 @@ def __runChildLoop(self, readFileNo, writeFileNo):
                 break
             controlStr = self.__waitForInput(childReadPipe)

-        smi.nvmlShutdown()
+        pynvml.nvmlShutdown()

         childReadPipe.close()
         childWritePipe.close()
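The direct `pynvml` calls introduced above follow the standard NVML lifecycle: initialize once, get a device handle, poll memory and utilization, then shut down. A minimal self-contained sketch of that pattern, assuming device index 0 just as the poller's own "hack" does:

    import pynvml

    # Sketch of the NVML polling lifecycle used by GPUMetricPoller.
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # device 0, as in the poller
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)  # .total/.free/.used in bytes
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu/.memory in percent
        print(f"memory used: {mem.used}/{mem.total} B, GPU util: {util.gpu}%")
    finally:
        pynvml.nvmlShutdown()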
From 9085446563b54d1798561076665f59e4e5594221 Mon Sep 17 00:00:00 2001
From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com>
Date: Wed, 15 Jan 2025 10:18:50 -0500
Subject: [PATCH 4/6] Update
 benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py

Co-authored-by: jakirkham
---
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index 88ee2ce2f5a..53abfba0cda 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -201,9 +201,6 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-            # from pynvml_utils.smi import nvidia_smi
-            # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
-            # logger.info(f"rank {self.rank} memory: {mem_info}")

             y_true = data.y
             y_true = y_true.reshape((y_true.shape[0],))

From 124eefecfe926fb37da06d007731012069190c4b Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Wed, 15 Jan 2025 09:29:37 -0800
Subject: [PATCH 5/6] Remove unneeded `smi` code

---
 .../cugraph/standalone/bulk_sampling/bench_cugraph_training.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
index 60794654e44..66c34cc1276 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py
@@ -36,9 +36,7 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
     import cupy
     import rmm

-    from pynvml_utils.smi import nvidia_smi

-    smi = nvidia_smi.getInstance()
     pool_size = 16e9  # FIXME calculate this

     rmm.reinitialize(

From 2f4f1f263b85e0ad6269a973b140c3a2bf38f2d6 Mon Sep 17 00:00:00 2001
From: jakirkham
Date: Wed, 15 Jan 2025 09:31:39 -0800
Subject: [PATCH 6/6] Remove extra blank line

---
 .../standalone/bulk_sampling/trainers/pyg/trainers_pyg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
index 53abfba0cda..72a530fc6fe 100644
--- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -201,7 +201,6 @@ def train(self):
             )
             logger.info(f"total time: {total_time_iter}")

-
             y_true = data.y
             y_true = y_true.reshape((y_true.shape[0],))
             x = data.x.to(torch.float32)
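If the per-rank memory logging removed in patches 4-6 is ever wanted again, it no longer needs the `smi` wrapper at all. A hedged equivalent of the deleted `DeviceQuery('memory.free, memory.total')` lookup using plain `pynvml` — the `fb_memory_usage` helper name and the 8-GPUs-per-node assumption (mirroring the deleted `self.rank % 8`) are illustrative, not anything the patches add:

    import pynvml

    def fb_memory_usage(rank: int) -> dict:
        # Hypothetical helper: free/total framebuffer memory (MiB) for this
        # rank's GPU, assuming 8 GPUs per node as the deleted comment did.
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(rank % 8)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            return {"free": mem.free // 2**20, "total": mem.total // 2**20}
        finally:
            pynvml.nvmlShutdown()

    print(fb_memory_usage(0))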