From 01630c62fac8e5230c7cc0f8ff338e8fb3da8a5a Mon Sep 17 00:00:00 2001 From: Anna Pendleton Date: Tue, 18 Jun 2024 14:57:42 -0700 Subject: [PATCH] add missing dcgm metrics (#710) --- .../modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml b/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml index 17a24ef6f..e4972a3a0 100644 --- a/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml +++ b/benchmarks/infra/stage-2/modules/gke-setup/modules/nvidia-dcgm/manifest-templates/03-cm-dcgm.yaml @@ -23,6 +23,11 @@ data: DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device in degrees C. + DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. + # Utilization of IP blocks,, DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor