From 784931ee16129e0278ed1d1f602aa17350d50c51 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Wed, 19 Jun 2024 16:11:17 -0500 Subject: [PATCH] GPU Health API improvements Signed-off-by: Vadym Fedorov --- Makefile | 3 +- pkg/dcgm/const.go | 1081 ++++++++++++++++++++---------------- pkg/dcgm/gpu_group.go | 32 ++ pkg/dcgm/gpu_group_test.go | 28 + pkg/dcgm/health.go | 121 +++- pkg/dcgm/health_test.go | 121 ++++ pkg/dcgm/internal.go | 1 + pkg/dcgm/policy_test.go | 34 +- pkg/dcgm/test_utils.go | 19 + 9 files changed, 938 insertions(+), 502 deletions(-) create mode 100644 pkg/dcgm/health_test.go diff --git a/Makefile b/Makefile index f6c3424..e74ddcc 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ # limitations under the License. GOLANG_VERSION := 1.14.2 +GOLANGCILINT_TIMEOUT ?= 10m .PHONY: all binary install check-format all: binary test-main check-format @@ -45,4 +46,4 @@ clean: rm -f samples/topology/topology lint: - golangci-lint run ./... \ No newline at end of file + golangci-lint run ./... 
--timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix \ No newline at end of file diff --git a/pkg/dcgm/const.go b/pkg/dcgm/const.go index 96b63f5..9365ea3 100644 --- a/pkg/dcgm/const.go +++ b/pkg/dcgm/const.go @@ -524,481 +524,620 @@ const ( DCGM_ST_ALREADY_INITIALIZED = -55 ) -var ( - DCGM_FI = map[string]Short{ - "DCGM_FT_BINARY": Short('b'), - "DCGM_FT_DOUBLE": Short('d'), - "DCGM_FT_INT64": Short('i'), - "DCGM_FT_STRING": Short('s'), - "DCGM_FT_TIMESTAMP": Short('t'), +var DCGM_FI = map[string]Short{ + "DCGM_FT_BINARY": Short('b'), + "DCGM_FT_DOUBLE": Short('d'), + "DCGM_FT_INT64": Short('i'), + "DCGM_FT_STRING": Short('s'), + "DCGM_FT_TIMESTAMP": Short('t'), - "DCGM_FI_UNKNOWN": 0, - "DCGM_FI_DRIVER_VERSION": 1, - "DCGM_FI_NVML_VERSION": 2, - "DCGM_FI_PROCESS_NAME": 3, - "DCGM_FI_DEV_COUNT": 4, - "DCGM_FI_CUDA_DRIVER_VERSION": 5, - "DCGM_FI_DEV_NAME": 50, - "DCGM_FI_DEV_BRAND": 51, - "DCGM_FI_DEV_NVML_INDEX": 52, - "DCGM_FI_DEV_SERIAL": 53, - "DCGM_FI_DEV_UUID": 54, - "DCGM_FI_DEV_MINOR_NUMBER": 55, - "DCGM_FI_DEV_OEM_INFOROM_VER": 56, - "DCGM_FI_DEV_PCI_BUSID": 57, - "DCGM_FI_DEV_PCI_COMBINED_ID": 58, - "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, - "DCGM_FI_GPU_TOPOLOGY_PCI": 60, - "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, - "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, - "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, - "DCGM_FI_DEV_COMPUTE_MODE": 65, - "DCGM_FI_DEV_PERSISTENCE_MODE": 66, - "DCGM_FI_DEV_MIG_MODE": 67, - "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, - "DCGM_FI_DEV_MIG_MAX_SLICES": 69, - "DCGM_FI_DEV_CPU_AFFINITY_0": 70, - "DCGM_FI_DEV_CPU_AFFINITY_1": 71, - "DCGM_FI_DEV_CPU_AFFINITY_2": 72, - "DCGM_FI_DEV_CPU_AFFINITY_3": 73, - "DCGM_FI_DEV_CC_MODE": 74, - "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, - "DCGM_FI_DEV_MIG_GI_INFO": 76, - "DCGM_FI_DEV_MIG_CI_INFO": 77, - "DCGM_FI_DEV_ECC_INFOROM_VER": 80, - "DCGM_FI_DEV_POWER_INFOROM_VER": 81, - "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, - "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, - "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, - 
"DCGM_FI_DEV_VBIOS_VERSION": 85, - "DCGM_FI_DEV_MEM_AFFINITY_0": 86, - "DCGM_FI_DEV_MEM_AFFINITY_1": 87, - "DCGM_FI_DEV_MEM_AFFINITY_2": 88, - "DCGM_FI_DEV_MEM_AFFINITY_3": 89, - "DCGM_FI_DEV_BAR1_TOTAL": 90, - "DCGM_FI_SYNC_BOOST": 91, - "DCGM_FI_DEV_BAR1_USED": 92, - "DCGM_FI_DEV_BAR1_FREE": 93, - "DCGM_FI_DEV_SM_CLOCK": 100, - "DCGM_FI_DEV_MEM_CLOCK": 101, - "DCGM_FI_DEV_VIDEO_CLOCK": 102, - "DCGM_FI_DEV_APP_SM_CLOCK": 110, - "DCGM_FI_DEV_APP_MEM_CLOCK": 111, - "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, - "DCGM_FI_DEV_MAX_SM_CLOCK": 113, - "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, - "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, - "DCGM_FI_DEV_AUTOBOOST": 120, - "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, - "DCGM_FI_DEV_MEMORY_TEMP": 140, - "DCGM_FI_DEV_GPU_TEMP": 150, - "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, - "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, - "DCGM_FI_DEV_POWER_USAGE": 155, - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, - "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, - "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, - "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, - "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, - "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, - "DCGM_FI_DEV_PSTATE": 190, - "DCGM_FI_DEV_FAN_SPEED": 191, - "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, - "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, - "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, - "DCGM_FI_DEV_GPU_UTIL": 203, - "DCGM_FI_DEV_MEM_COPY_UTIL": 204, - "DCGM_FI_DEV_ACCOUNTING_DATA": 205, - "DCGM_FI_DEV_ENC_UTIL": 206, - "DCGM_FI_DEV_DEC_UTIL": 207, - "DCGM_FI_DEV_XID_ERRORS": 230, - "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, - "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, - "DCGM_FI_DEV_PCIE_LINK_GEN": 237, - "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, - "DCGM_FI_DEV_POWER_VIOLATION": 240, - "DCGM_FI_DEV_THERMAL_VIOLATION": 241, - "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, - "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, - "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, - 
"DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, - "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, - "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, - "DCGM_FI_DEV_FB_TOTAL": 250, - "DCGM_FI_DEV_FB_FREE": 251, - "DCGM_FI_DEV_FB_USED": 252, - "DCGM_FI_DEV_FB_RESERVED": 253, - "DCGM_FI_DEV_FB_USED_PERCENT": 254, - "DCGM_FI_DEV_C2C_LINK_COUNT": 285, - "DCGM_FI_DEV_C2C_LINK_STATUS": 286, - "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, - "DCGM_FI_DEV_ECC_CURRENT": 300, - "DCGM_FI_DEV_ECC_PENDING": 301, - "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, - "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, - "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, - "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, - "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, - "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, - "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, - "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, - "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, - "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, - "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, - "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, - "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, - "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, - "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, - "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, - "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, - "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, - "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, - "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, - "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, - "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, - "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, - "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, - "DCGM_FI_DEV_RETIRED_SBE": 390, - "DCGM_FI_DEV_RETIRED_DBE": 391, - "DCGM_FI_DEV_RETIRED_PENDING": 392, - "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, - "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, - "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, - "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, - 
"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, - "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, - 
"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, - 
"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, - "DCGM_FI_DEV_VIRTUAL_MODE": 500, - "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, - "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, - "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, - "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, - "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, - "DCGM_FI_DEV_ENC_STATS": 506, - "DCGM_FI_DEV_FBC_STATS": 507, - "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, - "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, - "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, - "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, - "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, - "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, - "DCGM_FI_DEV_VGPU_VM_ID": 520, - "DCGM_FI_DEV_VGPU_VM_NAME": 521, - "DCGM_FI_DEV_VGPU_TYPE": 522, - "DCGM_FI_DEV_VGPU_UUID": 523, - "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, - "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, - "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, - "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, - "DCGM_FI_DEV_VGPU_ENC_STATS": 528, - "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, - "DCGM_FI_DEV_VGPU_FBC_STATS": 530, - "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, - "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, - "DCGM_FI_DEV_VGPU_PCI_ID": 533, - 
"DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, - "DCGM_FI_INTERNAL_FIELDS_0_START": 600, - "DCGM_FI_INTERNAL_FIELDS_0_END": 699, - "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, - "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, - "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, - "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, - "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, - "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, - "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, - "DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, - "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, - "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, - "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, - 
"DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, - "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, - "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, - "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, - "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, - "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, - "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, - "DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, - "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, - "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, - "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, - "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, - "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, - "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, - "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, - "DCGM_FI_PROF_SM_ACTIVE": 1002, - "DCGM_FI_PROF_SM_OCCUPANCY": 1003, - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, - "DCGM_FI_PROF_DRAM_ACTIVE": 1005, - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, - "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, - "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, - "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, - "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, - 
"DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, - "DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, - "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, - "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, - "DCGM_FI_PROF_NVDEC0_ACTIVE": 1017, - "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, - "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, - "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, - "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, - "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, - "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, - "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, - "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, - "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, - "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, - "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, - "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, - "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, - "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, - "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, - "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, - "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 1040, - "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, - "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, - "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, - "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, - "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, - "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, - "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, - "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, - "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, - "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, - "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, - "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, - "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, - "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, - "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, - "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, - "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, - "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, - "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, - "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, - "DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, - "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, - "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, - "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, - "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, - "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, - 
"DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, - "DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, - "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, - "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, - "DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, - "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, - "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, - "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, - "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, - "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, - "DCGM_FI_DEV_CPU_UTIL_USER": 1101, - "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, - "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, - "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, - "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, - "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, - "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, - "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, - "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, - "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, - "DCGM_FI_DEV_CPU_VENDOR": 1140, - "DCGM_FI_DEV_CPU_MODEL": 1141, - "DCGM_FI_MAX_FIELDS": 1142, - } + "DCGM_FI_UNKNOWN": 0, + "DCGM_FI_DRIVER_VERSION": 1, + "DCGM_FI_NVML_VERSION": 2, + "DCGM_FI_PROCESS_NAME": 3, + "DCGM_FI_DEV_COUNT": 4, + "DCGM_FI_CUDA_DRIVER_VERSION": 5, + "DCGM_FI_DEV_NAME": 50, + "DCGM_FI_DEV_BRAND": 51, + "DCGM_FI_DEV_NVML_INDEX": 52, + "DCGM_FI_DEV_SERIAL": 53, + "DCGM_FI_DEV_UUID": 54, + "DCGM_FI_DEV_MINOR_NUMBER": 55, + "DCGM_FI_DEV_OEM_INFOROM_VER": 56, + "DCGM_FI_DEV_PCI_BUSID": 57, + "DCGM_FI_DEV_PCI_COMBINED_ID": 58, + "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, + "DCGM_FI_GPU_TOPOLOGY_PCI": 60, + "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, + "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, + "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, + "DCGM_FI_DEV_COMPUTE_MODE": 65, + "DCGM_FI_DEV_PERSISTENCE_MODE": 66, + "DCGM_FI_DEV_MIG_MODE": 67, + "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, + "DCGM_FI_DEV_MIG_MAX_SLICES": 69, + "DCGM_FI_DEV_CPU_AFFINITY_0": 70, + "DCGM_FI_DEV_CPU_AFFINITY_1": 71, + "DCGM_FI_DEV_CPU_AFFINITY_2": 72, + "DCGM_FI_DEV_CPU_AFFINITY_3": 73, + "DCGM_FI_DEV_CC_MODE": 74, + "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, + "DCGM_FI_DEV_MIG_GI_INFO": 76, + 
"DCGM_FI_DEV_MIG_CI_INFO": 77, + "DCGM_FI_DEV_ECC_INFOROM_VER": 80, + "DCGM_FI_DEV_POWER_INFOROM_VER": 81, + "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, + "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, + "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, + "DCGM_FI_DEV_VBIOS_VERSION": 85, + "DCGM_FI_DEV_MEM_AFFINITY_0": 86, + "DCGM_FI_DEV_MEM_AFFINITY_1": 87, + "DCGM_FI_DEV_MEM_AFFINITY_2": 88, + "DCGM_FI_DEV_MEM_AFFINITY_3": 89, + "DCGM_FI_DEV_BAR1_TOTAL": 90, + "DCGM_FI_SYNC_BOOST": 91, + "DCGM_FI_DEV_BAR1_USED": 92, + "DCGM_FI_DEV_BAR1_FREE": 93, + "DCGM_FI_DEV_SM_CLOCK": 100, + "DCGM_FI_DEV_MEM_CLOCK": 101, + "DCGM_FI_DEV_VIDEO_CLOCK": 102, + "DCGM_FI_DEV_APP_SM_CLOCK": 110, + "DCGM_FI_DEV_APP_MEM_CLOCK": 111, + "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, + "DCGM_FI_DEV_MAX_SM_CLOCK": 113, + "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, + "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, + "DCGM_FI_DEV_AUTOBOOST": 120, + "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, + "DCGM_FI_DEV_MEMORY_TEMP": 140, + "DCGM_FI_DEV_GPU_TEMP": 150, + "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, + "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, + "DCGM_FI_DEV_POWER_USAGE": 155, + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, + "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, + "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, + "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, + "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, + "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, + "DCGM_FI_DEV_PSTATE": 190, + "DCGM_FI_DEV_FAN_SPEED": 191, + "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, + "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, + "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, + "DCGM_FI_DEV_GPU_UTIL": 203, + "DCGM_FI_DEV_MEM_COPY_UTIL": 204, + "DCGM_FI_DEV_ACCOUNTING_DATA": 205, + "DCGM_FI_DEV_ENC_UTIL": 206, + "DCGM_FI_DEV_DEC_UTIL": 207, + "DCGM_FI_DEV_XID_ERRORS": 230, + "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, + "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, + "DCGM_FI_DEV_PCIE_LINK_GEN": 237, + 
"DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, + "DCGM_FI_DEV_POWER_VIOLATION": 240, + "DCGM_FI_DEV_THERMAL_VIOLATION": 241, + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, + "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, + "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, + "DCGM_FI_DEV_FB_TOTAL": 250, + "DCGM_FI_DEV_FB_FREE": 251, + "DCGM_FI_DEV_FB_USED": 252, + "DCGM_FI_DEV_FB_RESERVED": 253, + "DCGM_FI_DEV_FB_USED_PERCENT": 254, + "DCGM_FI_DEV_C2C_LINK_COUNT": 285, + "DCGM_FI_DEV_C2C_LINK_STATUS": 286, + "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, + "DCGM_FI_DEV_ECC_CURRENT": 300, + "DCGM_FI_DEV_ECC_PENDING": 301, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, + "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, + "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, + "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, + "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, + "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, + "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, + "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, + "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, + "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, + "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, + "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, + "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, + "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, + "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, + "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, + "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, + "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, + "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, + "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, + "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, + "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, + "DCGM_FI_DEV_RETIRED_SBE": 390, + "DCGM_FI_DEV_RETIRED_DBE": 391, + "DCGM_FI_DEV_RETIRED_PENDING": 392, 
+ "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, + "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, + "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, + "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, + "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, + 
"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, + 
"DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, + "DCGM_FI_DEV_VIRTUAL_MODE": 500, + "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, + "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, + "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, + "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, + "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, + "DCGM_FI_DEV_ENC_STATS": 506, + "DCGM_FI_DEV_FBC_STATS": 507, + "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, + "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, + "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, + "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, + "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, + "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, + "DCGM_FI_DEV_VGPU_VM_ID": 520, + "DCGM_FI_DEV_VGPU_VM_NAME": 521, + "DCGM_FI_DEV_VGPU_TYPE": 522, + "DCGM_FI_DEV_VGPU_UUID": 523, + "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, + "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, + "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, + "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, + "DCGM_FI_DEV_VGPU_ENC_STATS": 
528, + "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, + "DCGM_FI_DEV_VGPU_FBC_STATS": 530, + "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, + "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, + "DCGM_FI_DEV_VGPU_PCI_ID": 533, + "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, + "DCGM_FI_INTERNAL_FIELDS_0_START": 600, + "DCGM_FI_INTERNAL_FIELDS_0_END": 699, + "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, + "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, + "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, + "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, + "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, + "DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, + "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, + "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, + "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, + 
"DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, + "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, + "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, + "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, + "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, + "DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, + "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, + "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, + "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, + "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, + "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, + "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, + "DCGM_FI_PROF_SM_ACTIVE": 1002, + "DCGM_FI_PROF_SM_OCCUPANCY": 1003, + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, + "DCGM_FI_PROF_DRAM_ACTIVE": 1005, + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, + 
"DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, + "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, + "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, + "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, + "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, + "DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, + "DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, + "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, + "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, + "DCGM_FI_PROF_NVDEC0_ACTIVE": 1017, + "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, + "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, + "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, + "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, + "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, + "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, + "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, + "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, + "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, + "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, + "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, + "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, + "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, + "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, + "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, + "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, + "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 1040, + "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, + "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, + "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, + "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, + "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, + "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, + "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, + "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, + "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, + "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, + "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, + "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, + "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, + "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, + "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, + "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, + "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, + "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, + "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, + "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, + 
"DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, + "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, + "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, + "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, + "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, + "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, + "DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, + "DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, + "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, + "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, + "DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, + "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, + "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, + "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, + "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, + "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, + "DCGM_FI_DEV_CPU_UTIL_USER": 1101, + "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, + "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, + "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, + "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, + "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, + "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, + "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, + "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, + "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, + "DCGM_FI_DEV_CPU_VENDOR": 1140, + "DCGM_FI_DEV_CPU_MODEL": 1141, + "DCGM_FI_MAX_FIELDS": 1142, +} + +var OLD_DCGM_FI = map[string]Short{ + "dcgm_sm_clock": 100, + "dcgm_memory_clock": 101, + "dcgm_memory_temp": 140, + "dcgm_gpu_temp": 150, + "dcgm_power_usage": 155, + "dcgm_total_energy_consumption": 156, + "dcgm_pcie_tx_throughput": 200, + "dcgm_pcie_rx_throughput": 201, + "dcgm_pcie_replay_counter": 202, + "dcgm_gpu_utilization": 203, + "dcgm_mem_copy_utilization": 204, + "dcgm_enc_utilization": 206, + "dcgm_dec_utilization": 207, + "dcgm_xid_errors": 230, + "dcgm_power_violation": 240, + "dcgm_thermal_violation": 241, + "dcgm_sync_boost_violation": 242, + "dcgm_board_limit_violation": 243, + "dcgm_low_util_violation": 244, + "dcgm_reliability_violation": 245, + "dcgm_fb_free": 251, + "dcgm_fb_used": 252, + "dcgm_ecc_sbe_volatile_total": 310, + "dcgm_ecc_dbe_volatile_total": 311, + 
"dcgm_ecc_sbe_aggregate_total": 312, + "dcgm_ecc_dbe_aggregate_total": 313, + "dcgm_retired_pages_sbe": 390, + "dcgm_retired_pages_dbe": 391, + "dcgm_retired_pages_pending": 392, + "dcgm_nvlink_flit_crc_error_count_total": 409, + "dcgm_nvlink_data_crc_error_count_total": 419, + "dcgm_nvlink_replay_error_count_total": 429, + "dcgm_nvlink_recovery_error_count_total": 439, + "dcgm_nvlink_bandwidth_total": 449, + "dcgm_fi_prof_gr_engine_active": 1001, + "dcgm_fi_prof_sm_active": 1002, + "dcgm_fi_prof_sm_occupancy": 1003, + "dcgm_fi_prof_pipe_tensor_active": 1004, + "dcgm_fi_prof_dram_active": 1005, + "dcgm_fi_prof_pcie_tx_bytes": 1009, + "dcgm_fi_prof_pcie_rx_bytes": 1010, +} + +const ( + DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) +) + +type HealthSystem uint + +const ( + DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 + DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2 + DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4 + DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8 + DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10 + DCGM_HEALTH_WATCH_SM HealthSystem = 0x20 + DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40 + DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 + DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 + DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200 + DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400 + DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800 + DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF ) -var ( - OLD_DCGM_FI = map[string]Short{ - "dcgm_sm_clock": 100, - "dcgm_memory_clock": 101, - "dcgm_memory_temp": 140, - "dcgm_gpu_temp": 150, - "dcgm_power_usage": 155, - "dcgm_total_energy_consumption": 156, - "dcgm_pcie_tx_throughput": 200, - "dcgm_pcie_rx_throughput": 201, - "dcgm_pcie_replay_counter": 202, - "dcgm_gpu_utilization": 203, - "dcgm_mem_copy_utilization": 204, - "dcgm_enc_utilization": 206, - "dcgm_dec_utilization": 207, - "dcgm_xid_errors": 230, - "dcgm_power_violation": 240, - "dcgm_thermal_violation": 241, - "dcgm_sync_boost_violation": 242, - "dcgm_board_limit_violation": 
243, - "dcgm_low_util_violation": 244, - "dcgm_reliability_violation": 245, - "dcgm_fb_free": 251, - "dcgm_fb_used": 252, - "dcgm_ecc_sbe_volatile_total": 310, - "dcgm_ecc_dbe_volatile_total": 311, - "dcgm_ecc_sbe_aggregate_total": 312, - "dcgm_ecc_dbe_aggregate_total": 313, - "dcgm_retired_pages_sbe": 390, - "dcgm_retired_pages_dbe": 391, - "dcgm_retired_pages_pending": 392, - "dcgm_nvlink_flit_crc_error_count_total": 409, - "dcgm_nvlink_data_crc_error_count_total": 419, - "dcgm_nvlink_replay_error_count_total": 429, - "dcgm_nvlink_recovery_error_count_total": 439, - "dcgm_nvlink_bandwidth_total": 449, - "dcgm_fi_prof_gr_engine_active": 1001, - "dcgm_fi_prof_sm_active": 1002, - "dcgm_fi_prof_sm_occupancy": 1003, - "dcgm_fi_prof_pipe_tensor_active": 1004, - "dcgm_fi_prof_dram_active": 1005, - "dcgm_fi_prof_pcie_tx_bytes": 1009, - "dcgm_fi_prof_pcie_rx_bytes": 1010, - } +type HealthResult uint + +const ( + DCGM_HEALTH_RESULT_PASS HealthResult = 0 // All results within this system are reporting normal + DCGM_HEALTH_RESULT_WARN HealthResult = 10 // A warning has been issued, refer to the response for more information + DCGM_HEALTH_RESULT_FAIL HealthResult = 20 // A failure has been issued, refer to the response for more information ) +// HealthCheckErrorCode error codes for passive and active health checks. 
+type HealthCheckErrorCode uint + const ( - DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) + DCGM_FR_OK HealthCheckErrorCode = 0 // 0 No error + DCGM_FR_UNKNOWN HealthCheckErrorCode = 1 // 1 Unknown error code + DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2 // 2 Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3 // 3 Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4 // 4 Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5 // 5 Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6 // 6 Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7 // 7 Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8 // 8 Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9 // 9 Corrupt inforom found + DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10 // 10 Clocks being throttled due to overheating + DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11 // 11 Cannot get a reading for power from NVML + DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12 // 12 Clock being throttled due to power restrictions + DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13 // 13 Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14 // 14 NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode = 15 // 15 Fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16 // 16 Non-fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17 // 17 NVSwitch is down - NOT USED: DEPRECATED + DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18 // 18 Cannot access a file + DCGM_FR_NVML_API HealthCheckErrorCode = 19 // 19 Error occurred on an NVML API - NOT USED: DEPRECATED + DCGM_FR_DEVICE_COUNT_MISMATCH 
HealthCheckErrorCode = 20 // 20 Disagreement in GPU count between /dev and NVML + DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21 // 21 Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22 // 22 Cannot open a library that must be accessed + DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23 // 23 A driver on the denylist (nouveau) is active + DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24 // 24 NVML library is missing expected functions - NOT USED: DEPRECATED + DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 // 25 Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26 // 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED + DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27 // 27 Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28 // 28 The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29 // 29 Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30 // 30 The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31 // 31 Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32 // 32 Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33 // 33 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34 // 34 The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35 // 35 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36 // 36 The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37 // 37 Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38 // 38 The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39 // 39 The value for the 
specified field is above the threshold + DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40 // 40 Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41 // 41 Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42 // 42 Temperature is too high + DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43 // 43 Non-benign clock throttling is occurring + DCGM_FR_INTERNAL HealthCheckErrorCode = 44 // 44 An internal error was detected + DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45 // 45 PCIe generation is too low + DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46 // 46 PCIe width is too low + DCGM_FR_ABORTED HealthCheckErrorCode = 47 // 47 Test was aborted by a user signal + DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48 // 48 This test is disabled for this GPU + DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49 // 49 Cannot get telemetry for a needed value + DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50 // 50 Stress level is too low (bad performance) + DCGM_FR_CUDA_API HealthCheckErrorCode = 51 // 51 Error calling the specified CUDA API + DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52 // 52 Faulty memory detected on this GPU + DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53 // 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED + DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54 // 54 CUDA context is no longer bound + DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55 // 55 ECC memory is disabled right now + DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56 // 56 Cannot allocate memory on the GPU + DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57 // 57 CUDA detected unrecoverable double-bit error + DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58 // 58 Memory error detected + DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59 // 59 No CUDA device discoverable for existing GPU + DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60 // 60 ECC memory is unsupported by this SKU + DCGM_FR_ECC_PENDING 
HealthCheckErrorCode = 61 // 61 ECC memory is in a pending state - NOT USED: DEPRECATED + DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62 // 62 Memory bandwidth is too low + DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63 // 63 Cannot hit the target power draw + DCGM_FR_API_FAIL HealthCheckErrorCode = 64 // 64 The specified API call failed + DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65 // 65 The specified API call failed for the specified GPU + DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66 // 66 Cannot create a CUDA context on this GPU + DCGM_FR_DCGM_API HealthCheckErrorCode = 67 // 67 DCGM API failure + DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68 // 68 Need multiple GPUs to run this test + DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69 // 69 More errors than fit in the return struct - NOT USED: DEPRECATED + DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70 // 70 More than 100 CRC errors are happening per second + DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71 // 71 NVLink error for a field that should always be 0 + DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72 // 72 The enforced power limit is too low to hit the target + DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73 // 73 Cannot allocate memory on the host + DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74 // 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED + DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75 // 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76 // 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77 // 77 Note that we had to restore a GPU's state + DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78 // 78 L1TAG test is unsupported by this SKU + DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79 // 79 L1TAG test failed on a miscompare + DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80 
// 80 Row remapping failed (Ampere or newer GPUs) + DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81 // 81 Uncontained error - XID 95 + DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82 // 82 No GPU information given to plugin + DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83 // 83 Pending page retirements due to a DBE + DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84 // 84 Uncorrectable row remapping + DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85 // 85 Row remapping is pending + DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86 // 86 P2P copy test detected an error writing to this GPU + DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87 // 87 P2P copy test detected an error writing from this GPU + DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88 // 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED + DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89 // 89 EUD binary permissions are incorrect + DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90 // 90 EUD plugin is not running as root + DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91 // 91 EUD plugin failed to spawn the EUD binary + DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92 // 92 EUD plugin timed out + DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93 // 93 EUD process remains running after the plugin considers it finished + DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94 // 94 EUD process exited with a non-zero exit code + DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95 // 95 EUD test failed + DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96 // 96 We cannot create a file in this directory. 
+ DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97 // 97 Pause/Resume failed + DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98 // 98 PCIe test caught correctable errors + DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99 // 99 Expected nvlinks up per gpu + DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100 // 100 Expected nvlinks up per nvswitch + DCGM_FR_XID_ERROR HealthCheckErrorCode = 101 // 101 XID error detected + DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102 // 102 Single bit error detected + DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103 // 103 Double bit error detected + DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104 // 104 PCIe replay errors detected + DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105 // 105 SBE threshold violated + DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106 // 106 DBE threshold violated + DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // 107 PCIE replay count violated + DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108 // 108 The fabricmanager is not initialized + DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109 // 109 NvSwitch fatal error detected + DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 110 // 110 MUST BE THE LAST ERROR CODE ) diff --git a/pkg/dcgm/gpu_group.go b/pkg/dcgm/gpu_group.go index 94e938c..2f5b4aa 100644 --- a/pkg/dcgm/gpu_group.go +++ b/pkg/dcgm/gpu_group.go @@ -5,6 +5,7 @@ package dcgm #include "dcgm_structs.h" */ import "C" + import ( "encoding/binary" "fmt" @@ -92,3 +93,34 @@ func DestroyGroup(groupId GroupHandle) (err error) { return } + +type GroupInfo struct { + Version uint32 + GroupName string + EntityList []GroupEntityPair +} + +func GetGroupInfo(groupId GroupHandle) (*GroupInfo, error) { + response := C.dcgmGroupInfo_v2{ + version: C.dcgmGroupInfo_version2, + } + + result := C.dcgmGroupGetInfo(handle.handle, groupId.handle, &response) + if err := errorString(result); err != nil { + return nil, err + } + + ret 
:= &GroupInfo{ + Version: uint32(response.version), + GroupName: C.GoString(&response.groupName[0]), + } + + for i := 0; i < int(response.count); i++ { + ret.EntityList = append(ret.EntityList, GroupEntityPair{ + EntityId: uint(response.entityList[i].entityId), + EntityGroupId: Field_Entity_Group(response.entityList[i].entityGroupId), + }) + } + + return ret, nil +} diff --git a/pkg/dcgm/gpu_group_test.go b/pkg/dcgm/gpu_group_test.go index 527f6a3..c0b0f64 100644 --- a/pkg/dcgm/gpu_group_test.go +++ b/pkg/dcgm/gpu_group_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestGroupHandle(t *testing.T) { @@ -17,3 +18,30 @@ func TestGroupHandle(t *testing.T) { assert.Equal(t, input, gh.GetHandle(), "values mismatch") } } + +func TestGetGroupInfo(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + + runOnlyWithLiveGPUs(t) + gpus, err := withInjectionGPUs(t, 1) + require.NoError(t, err) + + gpuID := gpus[0] + + groupID, err := CreateGroup("test1") + require.NoError(t, err) + defer func() { + _ = DestroyGroup(groupID) + }() + err = AddEntityToGroup(groupID, FE_GPU, gpuID) + require.NoError(t, err) + + grInfo, err := GetGroupInfo(groupID) + require.NoError(t, err) + + assert.Equal(t, "test1", grInfo.GroupName) + assert.Len(t, grInfo.EntityList, 1) + assert.Equal(t, FE_GPU, grInfo.EntityList[0].EntityGroupId) + assert.Equal(t, gpuID, grInfo.EntityList[0].EntityId) +} diff --git a/pkg/dcgm/health.go b/pkg/dcgm/health.go index 3a5f6ad..9e31e23 100644 --- a/pkg/dcgm/health.go +++ b/pkg/dcgm/health.go @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package dcgm /* @@ -5,6 +21,7 @@ package dcgm #include "dcgm_structs.h" */ import "C" + import ( "fmt" "math/rand" @@ -23,14 +40,84 @@ type DeviceHealth struct { Watches []SystemWatch } -func setHealthWatches(groupId GroupHandle) (err error) { - result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) +// HealthSet enables the DCGM health check system for the given systems +func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) { + result := C.dcgmHealthSet(handle.handle, groupId.handle, C.dcgmHealthSystems_t(systems)) if err = errorString(result); err != nil { - return fmt.Errorf("Error setting health watches: %s", err) + return fmt.Errorf("error setting health watches: %w", err) } return } +// HealthGet retrieves the current state of the DCGM health check system +func HealthGet(groupId GroupHandle) (HealthSystem, error) { + var systems C.dcgmHealthSystems_t + + result := C.dcgmHealthGet(handle.handle, groupId.handle, (*C.dcgmHealthSystems_t)(unsafe.Pointer(&systems))) + if err := errorString(result); err != nil { + return HealthSystem(0), err + } + return HealthSystem(systems), nil +} + +type DiagErrorDetail struct { + Message string + Code HealthCheckErrorCode +} + +type Incident struct { + System HealthSystem + Health HealthResult + Error DiagErrorDetail + EntityInfo GroupEntityPair +} + +type HealthResponse struct { + OverallHealth HealthResult + Incidents []Incident +} + +// HealthCheck checks the configured watches for any errors/failures/warnings that have occurred +// since the last time this 
check was invoked. On the first call, stateful information +// about all of the enabled watches within a group is created but no error results are +// provided. On subsequent calls, any error information will be returned. +func HealthCheck(groupId GroupHandle) (HealthResponse, error) { + var healthResults C.dcgmHealthResponse_v4 + healthResults.version = makeVersion4(unsafe.Sizeof(healthResults)) + + result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) + + if err := errorString(result); err != nil { + return HealthResponse{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + response := HealthResponse{ + OverallHealth: HealthResult(healthResults.overallHealth), + } + + // number of watches that encountered error/warning + incidents := uint(healthResults.incidentCount) + + response.Incidents = make([]Incident, incidents) + + for i := uint(0); i < incidents; i++ { + response.Incidents[i] = Incident{ + System: HealthSystem(healthResults.incidents[i].system), + Health: HealthResult(healthResults.incidents[i].health), + Error: DiagErrorDetail{ + Message: *stringPtr(&healthResults.incidents[i].error.msg[0]), + Code: HealthCheckErrorCode(healthResults.incidents[i].error.code), + }, + EntityInfo: GroupEntityPair{ + EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId), + EntityId: uint(healthResults.incidents[i].entityInfo.entityId), + }, + } + } + + return response, nil +} + func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { name := fmt.Sprintf("health%d", rand.Uint64()) groupId, err := CreateGroup(name) @@ -43,32 +130,28 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { return } - err = setHealthWatches(groupId) + err = HealthSet(groupId, DCGM_HEALTH_WATCH_ALL) if err != nil { return } - var healthResults C.dcgmHealthResponse_v4 - healthResults.version = makeVersion4(unsafe.Sizeof(healthResults)) - 
- result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) - - if err = errorString(result); err != nil { - return deviceHealth, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + result, err := HealthCheck(groupId) + if err != nil { + return } - status := healthStatus(int8(healthResults.overallHealth)) + status := healthStatus(result.OverallHealth) watches := []SystemWatch{} // number of watches that encountred error/warning - incidents := uint(healthResults.incidentCount) + incidents := len(result.Incidents) - for j := uint(0); j < incidents; j++ { + for j := 0; j < incidents; j++ { watch := SystemWatch{ - Type: systemWatch(int(healthResults.incidents[j].system)), - Status: healthStatus(int8(healthResults.incidents[j].health)), + Type: systemWatch(result.Incidents[j].System), + Status: healthStatus(result.Incidents[j].Health), - Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), + Error: result.Incidents[j].Error.Message, } watches = append(watches, watch) } @@ -82,7 +165,7 @@ func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { return } -func healthStatus(status int8) string { +func healthStatus(status HealthResult) string { switch status { case 0: return "Healthy" @@ -94,7 +177,7 @@ func healthStatus(status int8) string { return "N/A" } -func systemWatch(watch int) string { +func systemWatch(watch HealthSystem) string { switch watch { case 1: return "PCIe watches" diff --git a/pkg/dcgm/health_test.go b/pkg/dcgm/health_test.go new file mode 100644 index 0000000..e3f7087 --- /dev/null +++ b/pkg/dcgm/health_test.go @@ -0,0 +1,121 @@ +//go:build linux && cgo + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHealthWhenInvalidGroupID(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + runOnlyWithLiveGPUs(t) + + var invalidGroupID uintptr = 99 + gh := GroupHandle{} + gh.SetHandle(invalidGroupID) + err := HealthSet(gh, DCGM_HEALTH_WATCH_PCIE) + assert.Error(t, err) + assert.Contains(t, err.Error(), "Setting not configured") + + _, err = HealthGet(gh) + assert.Error(t, err) + assert.Contains(t, err.Error(), "Setting not configured") + + _, err = HealthGet(gh) + assert.Error(t, err) + assert.Contains(t, err.Error(), "Setting not configured") +} + +func TestHealthCheckPCIE(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + + runOnlyWithLiveGPUs(t) + gpus, err := withInjectionGPUs(t, 1) + require.NoError(t, err) + + gpuID := gpus[0] + + groupID, err := CreateGroup("test1") + require.NoError(t, err) + defer func() { + _ = DestroyGroup(groupID) + }() + err = AddEntityToGroup(groupID, FE_GPU, gpuID) + require.NoError(t, err) + + err = HealthSet(groupID, DCGM_HEALTH_WATCH_PCIE) + require.NoError(t, err) + + system, err := HealthGet(groupID) + require.NoError(t, err) + require.Equal(t, DCGM_HEALTH_WATCH_PCIE, system) + + skipTestIfUnhealthy(t, groupID) + + err = InjectFieldValue(gpuID, + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + DCGM_FT_INT64, + 0, + time.Now().Add(-50*time.Second).UnixMicro(), + int64(0), + ) + require.NoError(t, err) + + response, err := 
HealthCheck(groupID) + require.NoError(t, err) + require.Equal(t, DCGM_HEALTH_RESULT_PASS, response.OverallHealth) + + // inject an error into PCI + err = InjectFieldValue(gpuID, + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + DCGM_FT_INT64, + 0, + time.Now().Add(100*time.Second).UnixMicro(), + int64(10), + ) + require.NoError(t, err) + response, err = HealthCheck(groupID) + require.NoError(t, err) + require.Equal(t, DCGM_HEALTH_RESULT_WARN, response.OverallHealth) + require.Len(t, response.Incidents, 1) + assert.Equal(t, gpuID, response.Incidents[0].EntityInfo.EntityId) + assert.Equal(t, DCGM_HEALTH_WATCH_PCIE, response.Incidents[0].System) + assert.Equal(t, DCGM_FR_PCI_REPLAY_RATE, response.Incidents[0].Error.Code) +} + +func skipTestIfUnhealthy(t *testing.T, groupId GroupHandle) { + health, err := HealthCheck(groupId) + require.NoError(t, err) + if health.OverallHealth != DCGM_HEALTH_RESULT_PASS { + msg := "Skipping health check test because we are already unhealthy: " + incidents := []string{} + for _, incident := range health.Incidents { + incidents = append(incidents, incident.Error.Message) + } + + t.Skip(msg + strings.Join(incidents, ", ")) + } +} diff --git a/pkg/dcgm/internal.go b/pkg/dcgm/internal.go index 8a1f2ff..bb6a37d 100644 --- a/pkg/dcgm/internal.go +++ b/pkg/dcgm/internal.go @@ -25,6 +25,7 @@ package dcgm #include "dcgm_structs_internal.h" */ import "C" + import ( "unsafe" ) diff --git a/pkg/dcgm/policy_test.go b/pkg/dcgm/policy_test.go index a9c8d4d..27391ad 100644 --- a/pkg/dcgm/policy_test.go +++ b/pkg/dcgm/policy_test.go @@ -119,7 +119,7 @@ func TestPolicyErrors(t *testing.T) { int64(10), ) if err == nil { - //inject a SBE too so that the health check code gets past its internal checks + // inject a SBE too so that the health check code gets past its internal checks t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_RETIRED_SBE", gpu) err = InjectFieldValue(gpu, DCGM_FI_DEV_RETIRED_SBE, @@ -206,12 +206,12 @@ func TestPolicyErrors(t *testing.T) { }, }, { - 
//testcase: register multiple policy conditions + // testcase: register multiple policy conditions policy: []policyCondition{NvlinkPolicy, XidPolicy}, numErrors: 2, injectError: func() error { gpu := uint(rand.Intn(8) + 1) - //Inject a DBE error; since it has not registered DBEPolicy it will not get this event. + // Inject a DBE error; since it has not registered DBEPolicy it will not get this event. t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_ECC_DBE_VOL_DEV", gpu) err := InjectFieldValue(gpu, DCGM_FI_DEV_ECC_DBE_VOL_DEV, @@ -220,6 +220,10 @@ func TestPolicyErrors(t *testing.T) { time.Now().Add(60*time.Second).UnixMicro(), int64(1), ) + if err != nil { + return err + } + gpu = uint(rand.Intn(8) + 1) t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) err = InjectFieldValue(gpu, @@ -229,6 +233,10 @@ func TestPolicyErrors(t *testing.T) { time.Now().Add(60*time.Second).UnixMicro(), int64(16), ) + if err != nil { + return err + } + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", gpu) err = InjectFieldValue(gpu, DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, @@ -239,20 +247,24 @@ func TestPolicyErrors(t *testing.T) { ) return err }, - assert: func(cb PolicyViolation, en int) { - switch en { - case 1: - require.NotNil(t, cb) - assert.Equal(t, XidPolicy, cb.Condition) + assert: func(cb PolicyViolation, _ int) { + require.NotNil(t, cb) + + switch cb.Condition { + case XidPolicy: require.IsType(t, XidPolicyCondition{}, cb.Data) xidPolicyCondition := cb.Data.(XidPolicyCondition) assert.Equal(t, uint(16), xidPolicyCondition.ErrNum) - case 2: - require.NotNil(t, cb) - assert.Equal(t, NvlinkPolicy, cb.Condition) + case NvlinkPolicy: require.IsType(t, NvlinkPolicyCondition{}, cb.Data) nvlinkPolicyCondition := cb.Data.(NvlinkPolicyCondition) assert.Equal(t, uint(1), nvlinkPolicyCondition.Counter) + default: + require.FailNowf( + t, + "unexpected condition %s", + string(cb.Condition), + ) } }, }, diff --git 
a/pkg/dcgm/test_utils.go b/pkg/dcgm/test_utils.go
index ae46bd2..0722a4f 100644
--- a/pkg/dcgm/test_utils.go
+++ b/pkg/dcgm/test_utils.go
@@ -20,6 +20,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 func setupTest(t *testing.T) func(t *testing.T) {
@@ -39,3 +40,29 @@ func runOnlyWithLiveGPUs(t *testing.T) {
 		t.Skip("Skipping test that requires live GPUs. None were found")
 	}
 }
+
+// withInjectionGPUs creates gpuCount fake GPU entities through
+// CreateFakeEntities and returns whatever that call returns.
+//
+// The test is skipped when the devices reported by GetAllDeviceCount plus
+// the requested fake GPUs would exceed MAX_NUM_DEVICES, and it fails
+// immediately if the device count cannot be read.
+func withInjectionGPUs(t *testing.T, gpuCount int) ([]uint, error) {
+	t.Helper()
+	numGPUs, err := GetAllDeviceCount()
+	require.NoError(t, err)
+
+	// Account for every requested fake GPU, not just one, so the capacity
+	// check still holds when gpuCount > 1.
+	if int(numGPUs)+gpuCount > int(MAX_NUM_DEVICES) {
+		t.Skipf("Unable to add fake GPU with more than %d gpus", MAX_NUM_DEVICES)
+	}
+
+	entityList := make([]MigHierarchyInfo, gpuCount)
+	for i := range entityList {
+		entityList[i] = MigHierarchyInfo{
+			Entity: GroupEntityPair{EntityGroupId: FE_GPU},
+		}
+	}
+	return CreateFakeEntities(entityList)
+}