diff --git a/Makefile b/Makefile index e74ddcc..2102aad 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,7 @@ binary: cd samples/processInfo; go build cd samples/restApi; go build cd samples/topology; go build + cd samples/diag; go build test-main: go test -race ./tests @@ -46,4 +47,4 @@ clean: rm -f samples/topology/topology lint: - golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix \ No newline at end of file + golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix diff --git a/pkg/dcgm/admin.go b/pkg/dcgm/admin.go index 2c22321..14313e7 100644 --- a/pkg/dcgm/admin.go +++ b/pkg/dcgm/admin.go @@ -60,7 +60,7 @@ var ( func initDcgm(m mode, args ...string) (err error) { const ( - dcgmLib = "libdcgm.so" + dcgmLib = "libdcgm.so.4" ) lib := C.CString(dcgmLib) defer freeCString(lib) diff --git a/pkg/dcgm/const.go b/pkg/dcgm/const.go index 9365ea3..1f57ea3 100644 --- a/pkg/dcgm/const.go +++ b/pkg/dcgm/const.go @@ -48,424 +48,477 @@ const ( DCGM_FT_STR_NOT_SUPPORTED = "<<>>" DCGM_FT_STR_NOT_PERMISSIONED = "<<>>" - DCGM_FI_UNKNOWN = 0 - DCGM_FI_DRIVER_VERSION = 1 - DCGM_FI_NVML_VERSION = 2 - DCGM_FI_PROCESS_NAME = 3 - DCGM_FI_DEV_COUNT = 4 - DCGM_FI_CUDA_DRIVER_VERSION = 5 - DCGM_FI_DEV_NAME = 50 - DCGM_FI_DEV_BRAND = 51 - DCGM_FI_DEV_NVML_INDEX = 52 - DCGM_FI_DEV_SERIAL = 53 - DCGM_FI_DEV_UUID = 54 - DCGM_FI_DEV_MINOR_NUMBER = 55 - DCGM_FI_DEV_OEM_INFOROM_VER = 56 - DCGM_FI_DEV_PCI_BUSID = 57 - DCGM_FI_DEV_PCI_COMBINED_ID = 58 - DCGM_FI_DEV_PCI_SUBSYS_ID = 59 - DCGM_FI_GPU_TOPOLOGY_PCI = 60 - DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 - DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 - DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 - DCGM_FI_DEV_COMPUTE_MODE = 65 - DCGM_FI_DEV_PERSISTENCE_MODE = 66 - DCGM_FI_DEV_MIG_MODE = 67 - DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 - DCGM_FI_DEV_MIG_MAX_SLICES = 69 - DCGM_FI_DEV_CPU_AFFINITY_0 = 70 - DCGM_FI_DEV_CPU_AFFINITY_1 = 71 - DCGM_FI_DEV_CPU_AFFINITY_2 = 72 - DCGM_FI_DEV_CPU_AFFINITY_3 = 73 
- DCGM_FI_DEV_CC_MODE = 74 - DCGM_FI_DEV_MIG_ATTRIBUTES = 75 - DCGM_FI_DEV_MIG_GI_INFO = 76 - DCGM_FI_DEV_MIG_CI_INFO = 77 - DCGM_FI_DEV_ECC_INFOROM_VER = 80 - DCGM_FI_DEV_POWER_INFOROM_VER = 81 - DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 - DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 - DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 - DCGM_FI_DEV_VBIOS_VERSION = 85 - DCGM_FI_DEV_MEM_AFFINITY_0 = 86 - DCGM_FI_DEV_MEM_AFFINITY_1 = 87 - DCGM_FI_DEV_MEM_AFFINITY_2 = 88 - DCGM_FI_DEV_MEM_AFFINITY_3 = 89 - DCGM_FI_DEV_BAR1_TOTAL = 90 - DCGM_FI_SYNC_BOOST = 91 - DCGM_FI_DEV_BAR1_USED = 92 - DCGM_FI_DEV_BAR1_FREE = 93 - DCGM_FI_DEV_SM_CLOCK = 100 - DCGM_FI_DEV_MEM_CLOCK = 101 - DCGM_FI_DEV_VIDEO_CLOCK = 102 - DCGM_FI_DEV_APP_SM_CLOCK = 110 - DCGM_FI_DEV_APP_MEM_CLOCK = 111 - DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 - DCGM_FI_DEV_MAX_SM_CLOCK = 113 - DCGM_FI_DEV_MAX_MEM_CLOCK = 114 - DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 - DCGM_FI_DEV_AUTOBOOST = 120 - DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 - DCGM_FI_DEV_MEMORY_TEMP = 140 - DCGM_FI_DEV_GPU_TEMP = 150 - DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 - DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 - DCGM_FI_DEV_POWER_USAGE = 155 - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 - DCGM_FI_DEV_POWER_USAGE_INSTANT = 157 - DCGM_FI_DEV_SLOWDOWN_TEMP = 158 - DCGM_FI_DEV_SHUTDOWN_TEMP = 159 - DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 - DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 - DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 - DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 - DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 - DCGM_FI_DEV_PSTATE = 190 - DCGM_FI_DEV_FAN_SPEED = 191 - DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 - DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 - DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 - DCGM_FI_DEV_GPU_UTIL = 203 - DCGM_FI_DEV_MEM_COPY_UTIL = 204 - DCGM_FI_DEV_ACCOUNTING_DATA = 205 - DCGM_FI_DEV_ENC_UTIL = 206 - DCGM_FI_DEV_DEC_UTIL = 207 - DCGM_FI_DEV_XID_ERRORS = 230 - DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 - DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 - DCGM_FI_DEV_PCIE_LINK_GEN = 237 - DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 - 
DCGM_FI_DEV_POWER_VIOLATION = 240 - DCGM_FI_DEV_THERMAL_VIOLATION = 241 - DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 - DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 - DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 - DCGM_FI_DEV_FB_TOTAL = 250 - DCGM_FI_DEV_FB_FREE = 251 - DCGM_FI_DEV_FB_USED = 252 - DCGM_FI_DEV_FB_RESERVED = 253 - DCGM_FI_DEV_FB_USED_PERCENT = 254 - DCGM_FI_DEV_C2C_LINK_COUNT = 285 - DCGM_FI_DEV_C2C_LINK_STATUS = 286 - DCGM_FI_DEV_C2C_MAX_BANDWIDTH = 287 - DCGM_FI_DEV_ECC_CURRENT = 300 - DCGM_FI_DEV_ECC_PENDING = 301 - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 - DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 - DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 - DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 - DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 - DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 - DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 - DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 - DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 - DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 - DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 - DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 - DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 - DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 - DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 - DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 - DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 - DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 - DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 - DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 - DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 - DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 - DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 - DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX = 385 - DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH = 386 - DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL = 387 - DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW = 388 - DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE = 389 - DCGM_FI_DEV_RETIRED_SBE = 390 - DCGM_FI_DEV_RETIRED_DBE = 391 - DCGM_FI_DEV_RETIRED_PENDING = 392 - DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 - DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 - DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 - 
DCGM_FI_DEV_ROW_REMAP_PENDING = 396 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 - DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 - 
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 - 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 - DCGM_FI_DEV_VIRTUAL_MODE = 500 - DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 - DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 - DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 - DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 - DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 - DCGM_FI_DEV_ENC_STATS = 506 - DCGM_FI_DEV_FBC_STATS = 507 - DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 - DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 - DCGM_FI_DEV_VGPU_TYPE_INFO = 510 - DCGM_FI_DEV_VGPU_TYPE_NAME = 511 - DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 - DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 - DCGM_FI_DEV_VGPU_VM_ID = 520 - DCGM_FI_DEV_VGPU_VM_NAME = 521 - DCGM_FI_DEV_VGPU_TYPE = 522 - DCGM_FI_DEV_VGPU_UUID = 523 - DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 - DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 - DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 - DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 - DCGM_FI_DEV_VGPU_ENC_STATS = 528 - DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 - DCGM_FI_DEV_VGPU_FBC_STATS = 530 - DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 - DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 - DCGM_FI_DEV_VGPU_PCI_ID = 533 - DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 - DCGM_FI_INTERNAL_FIELDS_0_START = 600 - DCGM_FI_INTERNAL_FIELDS_0_END = 699 - DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT = 701 - 
DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ = 702 - DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV = 703 - DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD = 704 - DCGM_FI_DEV_NVSWITCH_POWER_VDD = 705 - DCGM_FI_DEV_NVSWITCH_POWER_DVDD = 706 - DCGM_FI_DEV_NVSWITCH_POWER_HVDD = 707 - DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 - DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 - DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 - DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 - DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 - DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785 - DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 - DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 - DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 - DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 - DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 - DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 - DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 - DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 - DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 - 
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 - DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 - DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 - DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 - DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 - DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858 - DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 - DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 - DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 - DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 - DCGM_FI_DEV_NVSWITCH_PHYS_ID = 863 - DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED = 864 - DCGM_FI_DEV_NVSWITCH_LINK_ID = 865 - DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN = 866 - DCGM_FI_DEV_NVSWITCH_PCIE_BUS = 867 - DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE = 868 - DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION = 869 - DCGM_FI_DEV_NVSWITCH_LINK_STATUS = 870 - DCGM_FI_DEV_NVSWITCH_LINK_TYPE = 871 - DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN = 872 - DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS = 873 - DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE = 874 - DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION = 875 - DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID = 876 - DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID = 877 - DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID = 878 - DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 - DCGM_FI_PROF_SM_ACTIVE = 1002 - DCGM_FI_PROF_SM_OCCUPANCY = 1003 - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 - DCGM_FI_PROF_DRAM_ACTIVE = 1005 - DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 - DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 - DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 - DCGM_FI_PROF_PCIE_TX_BYTES = 1009 - DCGM_FI_PROF_PCIE_RX_BYTES = 1010 - DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 - DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 - DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 - DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 - DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 - DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 - DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 - DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 - DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 - DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 - DCGM_FI_PROF_NVDEC4_ACTIVE = 
1021 - DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 - DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 - DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 - DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 - DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 - DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 - DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 - DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 - DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 - DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 - DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 - DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 - DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 - DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 - DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 - DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 - DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 - DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 - DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 - DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 - DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 - DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 - DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 - DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 - DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 - DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 - DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 - DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 - DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056 - DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 - DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 - DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 - DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 - DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 - DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 - DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 - DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 - DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 - DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 - DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 - DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 - DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 - DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 - DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 - DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 - DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 - DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 - DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 - DCGM_FI_DEV_CPU_UTIL_TOTAL = 1100 - DCGM_FI_DEV_CPU_UTIL_USER = 1101 
- DCGM_FI_DEV_CPU_UTIL_NICE = 1102 - DCGM_FI_DEV_CPU_UTIL_SYS = 1103 - DCGM_FI_DEV_CPU_UTIL_IRQ = 1104 - DCGM_FI_DEV_CPU_TEMP_CURRENT = 1110 - DCGM_FI_DEV_CPU_TEMP_WARNING = 1111 - DCGM_FI_DEV_CPU_TEMP_CRITICAL = 1112 - DCGM_FI_DEV_CPU_CLOCK_CURRENT = 1120 - DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT = 1130 - DCGM_FI_DEV_CPU_POWER_LIMIT = 1131 - DCGM_FI_DEV_CPU_VENDOR = 1140 - DCGM_FI_DEV_CPU_MODEL = 1141 - DCGM_FI_MAX_FIELDS = 1142 + DCGM_FI_UNKNOWN = 0 + DCGM_FI_DRIVER_VERSION = 1 + DCGM_FI_NVML_VERSION = 2 + DCGM_FI_PROCESS_NAME = 3 + DCGM_FI_DEV_COUNT = 4 + DCGM_FI_CUDA_DRIVER_VERSION = 5 + DCGM_FI_DEV_NAME = 50 + DCGM_FI_DEV_BRAND = 51 + DCGM_FI_DEV_NVML_INDEX = 52 + DCGM_FI_DEV_SERIAL = 53 + DCGM_FI_DEV_UUID = 54 + DCGM_FI_DEV_MINOR_NUMBER = 55 + DCGM_FI_DEV_OEM_INFOROM_VER = 56 + DCGM_FI_DEV_PCI_BUSID = 57 + DCGM_FI_DEV_PCI_COMBINED_ID = 58 + DCGM_FI_DEV_PCI_SUBSYS_ID = 59 + DCGM_FI_GPU_TOPOLOGY_PCI = 60 + DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 + DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 + DCGM_FI_DEV_COMPUTE_MODE = 65 + DCGM_FI_DEV_PERSISTENCE_MODE = 66 + DCGM_FI_DEV_MIG_MODE = 67 + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 + DCGM_FI_DEV_MIG_MAX_SLICES = 69 + DCGM_FI_DEV_CPU_AFFINITY_0 = 70 + DCGM_FI_DEV_CPU_AFFINITY_1 = 71 + DCGM_FI_DEV_CPU_AFFINITY_2 = 72 + DCGM_FI_DEV_CPU_AFFINITY_3 = 73 + DCGM_FI_DEV_CC_MODE = 74 + DCGM_FI_DEV_MIG_ATTRIBUTES = 75 + DCGM_FI_DEV_MIG_GI_INFO = 76 + DCGM_FI_DEV_MIG_CI_INFO = 77 + DCGM_FI_DEV_ECC_INFOROM_VER = 80 + DCGM_FI_DEV_POWER_INFOROM_VER = 81 + DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 + DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 + DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 + DCGM_FI_DEV_VBIOS_VERSION = 85 + DCGM_FI_DEV_MEM_AFFINITY_0 = 86 + DCGM_FI_DEV_MEM_AFFINITY_1 = 87 + DCGM_FI_DEV_MEM_AFFINITY_2 = 88 + DCGM_FI_DEV_MEM_AFFINITY_3 = 89 + DCGM_FI_DEV_BAR1_TOTAL = 90 + DCGM_FI_SYNC_BOOST = 91 + DCGM_FI_DEV_BAR1_USED = 92 + DCGM_FI_DEV_BAR1_FREE = 93 + DCGM_FI_DEV_GPM_SUPPORT = 94 + DCGM_FI_DEV_SM_CLOCK = 100 + 
DCGM_FI_DEV_MEM_CLOCK = 101 + DCGM_FI_DEV_VIDEO_CLOCK = 102 + DCGM_FI_DEV_APP_SM_CLOCK = 110 + DCGM_FI_DEV_APP_MEM_CLOCK = 111 + DCGM_FI_DEV_CLOCKS_EVENT_REASONS = 112 + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = DCGM_FI_DEV_CLOCKS_EVENT_REASONS + DCGM_FI_DEV_MAX_SM_CLOCK = 113 + DCGM_FI_DEV_MAX_MEM_CLOCK = 114 + DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 + DCGM_FI_DEV_AUTOBOOST = 120 + DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 + DCGM_FI_DEV_MEMORY_TEMP = 140 + DCGM_FI_DEV_GPU_TEMP = 150 + DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 + DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 + DCGM_FI_DEV_POWER_USAGE = 155 + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 + DCGM_FI_DEV_POWER_USAGE_INSTANT = 157 + DCGM_FI_DEV_SLOWDOWN_TEMP = 158 + DCGM_FI_DEV_SHUTDOWN_TEMP = 159 + DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 + DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 + DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK = 165 + DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK = 166 + DCGM_FI_DEV_VALID_POWER_PROFILE_MASK = 167 + DCGM_FI_DEV_FABRIC_MANAGER_STATUS = 170 + DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE = 171 + DCGM_FI_DEV_FABRIC_CLUSTER_UUID = 172 + DCGM_FI_DEV_FABRIC_CLIQUE_ID = 173 + DCGM_FI_DEV_PSTATE = 190 + DCGM_FI_DEV_FAN_SPEED = 191 + DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 + DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 + DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 + DCGM_FI_DEV_GPU_UTIL = 203 + DCGM_FI_DEV_MEM_COPY_UTIL = 204 + DCGM_FI_DEV_ACCOUNTING_DATA = 205 + DCGM_FI_DEV_ENC_UTIL = 206 + DCGM_FI_DEV_DEC_UTIL = 207 + DCGM_FI_DEV_XID_ERRORS = 230 + DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 + DCGM_FI_DEV_PCIE_LINK_GEN = 237 + DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 + DCGM_FI_DEV_POWER_VIOLATION = 240 + DCGM_FI_DEV_THERMAL_VIOLATION = 241 + DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 + DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 + DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 + 
DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 + DCGM_FI_DEV_FB_TOTAL = 250 + DCGM_FI_DEV_FB_FREE = 251 + DCGM_FI_DEV_FB_USED = 252 + DCGM_FI_DEV_FB_RESERVED = 253 + DCGM_FI_DEV_FB_USED_PERCENT = 254 + DCGM_FI_DEV_C2C_LINK_COUNT = 285 + DCGM_FI_DEV_C2C_LINK_STATUS = 286 + DCGM_FI_DEV_C2C_MAX_BANDWIDTH = 287 + DCGM_FI_DEV_ECC_CURRENT = 300 + DCGM_FI_DEV_ECC_PENDING = 301 + DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 + DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 + DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 + DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 + DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 + DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 + DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 + DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 + DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 + DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 + DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 + DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 + DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 + DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 + DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 + DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 + DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 + DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 + DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 + DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 + DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 + DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 + DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 + DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 + DCGM_FI_DEV_DIAG_MEMORY_RESULT = 350 + DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT = 351 + DCGM_FI_DEV_DIAG_PCIE_RESULT = 352 + DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT = 353 + DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT = 354 + DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT = 355 + DCGM_FI_DEV_DIAG_MEMTEST_RESULT = 356 + DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT = 357 + DCGM_FI_DEV_DIAG_EUD_RESULT = 358 + DCGM_FI_DEV_DIAG_CPU_EUD_RESULT = 359 + DCGM_FI_DEV_DIAG_SOFTWARE_RESULT = 360 + DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT = 361 + DCGM_FI_DEV_DIAG_STATUS = 362 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX = 385 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH = 386 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL = 387 + 
DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW = 388 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE = 389 + DCGM_FI_DEV_RETIRED_SBE = 390 + DCGM_FI_DEV_RETIRED_DBE = 391 + DCGM_FI_DEV_RETIRED_PENDING = 392 + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 + DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 + DCGM_FI_DEV_ROW_REMAP_PENDING = 396 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 + 
DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 + DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 + 
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 + DCGM_FI_DEV_NVLINK_ERROR_DL_CRC = 497 + DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY = 498 + DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY = 499 + DCGM_FI_DEV_VIRTUAL_MODE = 500 + DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 + DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 + DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 + DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 + DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 + DCGM_FI_DEV_ENC_STATS = 506 + DCGM_FI_DEV_FBC_STATS = 507 + DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 + DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 + DCGM_FI_DEV_VGPU_TYPE_INFO = 510 + DCGM_FI_DEV_VGPU_TYPE_NAME = 511 + DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 + DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 + DCGM_FI_DEV_VGPU_VM_ID = 520 + DCGM_FI_DEV_VGPU_VM_NAME = 521 + DCGM_FI_DEV_VGPU_TYPE = 522 + DCGM_FI_DEV_VGPU_UUID = 523 + DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 + DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 + DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 + DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 
527 + DCGM_FI_DEV_VGPU_ENC_STATS = 528 + DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 + DCGM_FI_DEV_VGPU_FBC_STATS = 530 + DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 + DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 + DCGM_FI_DEV_VGPU_PCI_ID = 533 + DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 + DCGM_FI_FIRST_VGPU_FIELD_ID = 520 + DCGM_FI_LAST_VGPU_FIELD_ID = 570 + DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 + DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT = 701 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ = 702 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV = 703 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD = 704 + DCGM_FI_DEV_NVSWITCH_POWER_VDD = 705 + DCGM_FI_DEV_NVSWITCH_POWER_DVDD = 706 + DCGM_FI_DEV_NVSWITCH_POWER_HVDD = 707 + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 + DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 + DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 + DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 + DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785 + DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 + 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 = 817 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 = 818 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 = 819 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 = 820 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 = 821 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 = 822 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 = 823 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 = 824 + DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 + DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 + DCGM_FI_DEV_NVSWITCH_PHYS_ID = 863 + DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED = 864 + DCGM_FI_DEV_NVSWITCH_LINK_ID = 865 + DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN = 866 + DCGM_FI_DEV_NVSWITCH_PCIE_BUS = 867 + DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE = 868 + DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION = 869 + DCGM_FI_DEV_NVSWITCH_LINK_STATUS = 870 + DCGM_FI_DEV_NVSWITCH_LINK_TYPE = 871 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN = 872 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS = 873 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE = 874 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION = 875 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID = 876 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID = 877 + 
DCGM_FI_DEV_NVSWITCH_DEVICE_UUID = 878 + DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 + DCGM_FI_PROF_SM_ACTIVE = 1002 + DCGM_FI_PROF_SM_OCCUPANCY = 1003 + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 + DCGM_FI_PROF_DRAM_ACTIVE = 1005 + DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 + DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 + DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 + DCGM_FI_PROF_PCIE_TX_BYTES = 1009 + DCGM_FI_PROF_PCIE_RX_BYTES = 1010 + DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 + DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 + DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 + DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 + DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 + DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 + DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 + DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 + DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 + DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 + DCGM_FI_PROF_NVDEC4_ACTIVE = 1021 + DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 + DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 + DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 + DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 + DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 + DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 + DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 + DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 + DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 + DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 + DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 + DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 + DCGM_FI_PROF_NVOFA1_ACTIVE = 1034 + DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 + DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 + DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 + DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 + DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 + DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 + DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 + DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 + DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 + DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 + DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 + DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 + DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 + DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 + DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 + DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 + DCGM_FI_PROF_NVLINK_L8_TX_BYTES 
= 1056 + DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 + DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 + DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 + DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 + DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 + DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 + DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 + DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 + DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 + DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 + DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 + DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 + DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 + DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 + DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 + DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 + DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 + DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 + DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 + DCGM_FI_DEV_CPU_UTIL_TOTAL = 1100 + DCGM_FI_DEV_CPU_UTIL_USER = 1101 + DCGM_FI_DEV_CPU_UTIL_NICE = 1102 + DCGM_FI_DEV_CPU_UTIL_SYS = 1103 + DCGM_FI_DEV_CPU_UTIL_IRQ = 1104 + DCGM_FI_DEV_CPU_TEMP_CURRENT = 1110 + DCGM_FI_DEV_CPU_TEMP_WARNING = 1111 + DCGM_FI_DEV_CPU_TEMP_CRITICAL = 1112 + DCGM_FI_DEV_CPU_CLOCK_CURRENT = 1120 + DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT = 1130 + DCGM_FI_DEV_CPU_POWER_LIMIT = 1131 + DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT = 1132 + DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT = 1133 + DCGM_FI_DEV_CPU_VENDOR = 1140 + DCGM_FI_DEV_CPU_MODEL = 1141 + DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS = 1200 + DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES = 1201 + DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS = 1202 + DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES = 1203 + DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS = 1204 + DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS = 1205 + DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS = 1206 + DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS = 1207 + DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS = 1208 + DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS = 1209 + DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS = 1210 + DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS = 1211 + 
DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS = 1212 + DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS = 1213 + DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS = 1214 + DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER = 1215 + DCGM_FI_MAX_FIELDS = 1216 DCGM_ST_OK = 0 DCGM_ST_BADPARAM = -1 @@ -522,6 +575,9 @@ const ( DCGM_ST_NVVS_KILLED = -53 DCGM_ST_PAUSED = -54 DCGM_ST_ALREADY_INITIALIZED = -55 + DCGM_ST_NVML_NOT_LOADED = -56 + DCGM_ST_NVML_DRIVER_TIMEOUT = -57 + DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58 ) var DCGM_FI = map[string]Short{ @@ -531,424 +587,477 @@ var DCGM_FI = map[string]Short{ "DCGM_FT_STRING": Short('s'), "DCGM_FT_TIMESTAMP": Short('t'), - "DCGM_FI_UNKNOWN": 0, - "DCGM_FI_DRIVER_VERSION": 1, - "DCGM_FI_NVML_VERSION": 2, - "DCGM_FI_PROCESS_NAME": 3, - "DCGM_FI_DEV_COUNT": 4, - "DCGM_FI_CUDA_DRIVER_VERSION": 5, - "DCGM_FI_DEV_NAME": 50, - "DCGM_FI_DEV_BRAND": 51, - "DCGM_FI_DEV_NVML_INDEX": 52, - "DCGM_FI_DEV_SERIAL": 53, - "DCGM_FI_DEV_UUID": 54, - "DCGM_FI_DEV_MINOR_NUMBER": 55, - "DCGM_FI_DEV_OEM_INFOROM_VER": 56, - "DCGM_FI_DEV_PCI_BUSID": 57, - "DCGM_FI_DEV_PCI_COMBINED_ID": 58, - "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, - "DCGM_FI_GPU_TOPOLOGY_PCI": 60, - "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, - "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, - "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, - "DCGM_FI_DEV_COMPUTE_MODE": 65, - "DCGM_FI_DEV_PERSISTENCE_MODE": 66, - "DCGM_FI_DEV_MIG_MODE": 67, - "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, - "DCGM_FI_DEV_MIG_MAX_SLICES": 69, - "DCGM_FI_DEV_CPU_AFFINITY_0": 70, - "DCGM_FI_DEV_CPU_AFFINITY_1": 71, - "DCGM_FI_DEV_CPU_AFFINITY_2": 72, - "DCGM_FI_DEV_CPU_AFFINITY_3": 73, - "DCGM_FI_DEV_CC_MODE": 74, - "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, - "DCGM_FI_DEV_MIG_GI_INFO": 76, - "DCGM_FI_DEV_MIG_CI_INFO": 77, - "DCGM_FI_DEV_ECC_INFOROM_VER": 80, - "DCGM_FI_DEV_POWER_INFOROM_VER": 81, - "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, - "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, - "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, - "DCGM_FI_DEV_VBIOS_VERSION": 85, - 
"DCGM_FI_DEV_MEM_AFFINITY_0": 86, - "DCGM_FI_DEV_MEM_AFFINITY_1": 87, - "DCGM_FI_DEV_MEM_AFFINITY_2": 88, - "DCGM_FI_DEV_MEM_AFFINITY_3": 89, - "DCGM_FI_DEV_BAR1_TOTAL": 90, - "DCGM_FI_SYNC_BOOST": 91, - "DCGM_FI_DEV_BAR1_USED": 92, - "DCGM_FI_DEV_BAR1_FREE": 93, - "DCGM_FI_DEV_SM_CLOCK": 100, - "DCGM_FI_DEV_MEM_CLOCK": 101, - "DCGM_FI_DEV_VIDEO_CLOCK": 102, - "DCGM_FI_DEV_APP_SM_CLOCK": 110, - "DCGM_FI_DEV_APP_MEM_CLOCK": 111, - "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, - "DCGM_FI_DEV_MAX_SM_CLOCK": 113, - "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, - "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, - "DCGM_FI_DEV_AUTOBOOST": 120, - "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, - "DCGM_FI_DEV_MEMORY_TEMP": 140, - "DCGM_FI_DEV_GPU_TEMP": 150, - "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, - "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, - "DCGM_FI_DEV_POWER_USAGE": 155, - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, - "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, - "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, - "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, - "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, - "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, - "DCGM_FI_DEV_PSTATE": 190, - "DCGM_FI_DEV_FAN_SPEED": 191, - "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, - "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, - "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, - "DCGM_FI_DEV_GPU_UTIL": 203, - "DCGM_FI_DEV_MEM_COPY_UTIL": 204, - "DCGM_FI_DEV_ACCOUNTING_DATA": 205, - "DCGM_FI_DEV_ENC_UTIL": 206, - "DCGM_FI_DEV_DEC_UTIL": 207, - "DCGM_FI_DEV_XID_ERRORS": 230, - "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, - "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, - "DCGM_FI_DEV_PCIE_LINK_GEN": 237, - "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, - "DCGM_FI_DEV_POWER_VIOLATION": 240, - "DCGM_FI_DEV_THERMAL_VIOLATION": 241, - "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, - "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, - "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, - "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, - 
"DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, - "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, - "DCGM_FI_DEV_FB_TOTAL": 250, - "DCGM_FI_DEV_FB_FREE": 251, - "DCGM_FI_DEV_FB_USED": 252, - "DCGM_FI_DEV_FB_RESERVED": 253, - "DCGM_FI_DEV_FB_USED_PERCENT": 254, - "DCGM_FI_DEV_C2C_LINK_COUNT": 285, - "DCGM_FI_DEV_C2C_LINK_STATUS": 286, - "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, - "DCGM_FI_DEV_ECC_CURRENT": 300, - "DCGM_FI_DEV_ECC_PENDING": 301, - "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, - "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, - "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, - "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, - "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, - "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, - "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, - "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, - "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, - "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, - "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, - "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, - "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, - "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, - "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, - "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, - "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, - "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, - "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, - "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, - "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, - "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, - "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, - "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, - "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, - "DCGM_FI_DEV_RETIRED_SBE": 390, - "DCGM_FI_DEV_RETIRED_DBE": 391, - "DCGM_FI_DEV_RETIRED_PENDING": 392, - "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, - "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, - "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, - "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, - 
"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, - "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, - 
"DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, - 
"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, - "DCGM_FI_DEV_VIRTUAL_MODE": 500, - "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, - "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, - "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, - "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, - "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, - "DCGM_FI_DEV_ENC_STATS": 506, - "DCGM_FI_DEV_FBC_STATS": 507, - "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, - "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, - "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, - "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, - "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, - "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, - "DCGM_FI_DEV_VGPU_VM_ID": 520, - "DCGM_FI_DEV_VGPU_VM_NAME": 521, - "DCGM_FI_DEV_VGPU_TYPE": 522, - "DCGM_FI_DEV_VGPU_UUID": 523, - "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, - "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, - "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, - "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, - "DCGM_FI_DEV_VGPU_ENC_STATS": 528, - "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, - "DCGM_FI_DEV_VGPU_FBC_STATS": 530, - "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, - "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, - "DCGM_FI_DEV_VGPU_PCI_ID": 533, - "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, - 
"DCGM_FI_INTERNAL_FIELDS_0_START": 600, - "DCGM_FI_INTERNAL_FIELDS_0_END": 699, - "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, - "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, - "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, - "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, - "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, - "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, - "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, - "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, - "DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, - "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, - "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, - "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, - "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, - 
"DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, - "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, - "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, - "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, - "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, - "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, - "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, - "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, - "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, - "DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, - "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, - "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, - "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, - "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, - "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, - "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, - "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, - "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, - "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, - "DCGM_FI_PROF_SM_ACTIVE": 1002, - "DCGM_FI_PROF_SM_OCCUPANCY": 1003, - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, - "DCGM_FI_PROF_DRAM_ACTIVE": 1005, - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, - "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, - "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, - "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, - "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, - "DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, - 
"DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, - "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, - "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, - "DCGM_FI_PROF_NVDEC0_ACTIVE": 1017, - "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, - "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, - "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, - "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, - "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, - "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, - "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, - "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, - "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, - "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, - "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, - "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, - "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, - "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, - "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, - "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, - "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 1040, - "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, - "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, - "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, - "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, - "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, - "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, - "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, - "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, - "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, - "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, - "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, - "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, - "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, - "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, - "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, - "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, - "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, - "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, - "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, - "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, - "DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, - "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, - "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, - "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, - "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, - "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, - "DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, - 
"DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, - "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, - "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, - "DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, - "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, - "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, - "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, - "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, - "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, - "DCGM_FI_DEV_CPU_UTIL_USER": 1101, - "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, - "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, - "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, - "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, - "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, - "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, - "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, - "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, - "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, - "DCGM_FI_DEV_CPU_VENDOR": 1140, - "DCGM_FI_DEV_CPU_MODEL": 1141, - "DCGM_FI_MAX_FIELDS": 1142, + "DCGM_FI_UNKNOWN": 0, + "DCGM_FI_DRIVER_VERSION": 1, + "DCGM_FI_NVML_VERSION": 2, + "DCGM_FI_PROCESS_NAME": 3, + "DCGM_FI_DEV_COUNT": 4, + "DCGM_FI_CUDA_DRIVER_VERSION": 5, + "DCGM_FI_DEV_NAME": 50, + "DCGM_FI_DEV_BRAND": 51, + "DCGM_FI_DEV_NVML_INDEX": 52, + "DCGM_FI_DEV_SERIAL": 53, + "DCGM_FI_DEV_UUID": 54, + "DCGM_FI_DEV_MINOR_NUMBER": 55, + "DCGM_FI_DEV_OEM_INFOROM_VER": 56, + "DCGM_FI_DEV_PCI_BUSID": 57, + "DCGM_FI_DEV_PCI_COMBINED_ID": 58, + "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, + "DCGM_FI_GPU_TOPOLOGY_PCI": 60, + "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, + "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, + "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, + "DCGM_FI_DEV_COMPUTE_MODE": 65, + "DCGM_FI_DEV_PERSISTENCE_MODE": 66, + "DCGM_FI_DEV_MIG_MODE": 67, + "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, + "DCGM_FI_DEV_MIG_MAX_SLICES": 69, + "DCGM_FI_DEV_CPU_AFFINITY_0": 70, + "DCGM_FI_DEV_CPU_AFFINITY_1": 71, + "DCGM_FI_DEV_CPU_AFFINITY_2": 72, + "DCGM_FI_DEV_CPU_AFFINITY_3": 73, + "DCGM_FI_DEV_CC_MODE": 74, + "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, + "DCGM_FI_DEV_MIG_GI_INFO": 76, + "DCGM_FI_DEV_MIG_CI_INFO": 77, + 
"DCGM_FI_DEV_ECC_INFOROM_VER": 80, + "DCGM_FI_DEV_POWER_INFOROM_VER": 81, + "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, + "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, + "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, + "DCGM_FI_DEV_VBIOS_VERSION": 85, + "DCGM_FI_DEV_MEM_AFFINITY_0": 86, + "DCGM_FI_DEV_MEM_AFFINITY_1": 87, + "DCGM_FI_DEV_MEM_AFFINITY_2": 88, + "DCGM_FI_DEV_MEM_AFFINITY_3": 89, + "DCGM_FI_DEV_BAR1_TOTAL": 90, + "DCGM_FI_SYNC_BOOST": 91, + "DCGM_FI_DEV_BAR1_USED": 92, + "DCGM_FI_DEV_BAR1_FREE": 93, + "DCGM_FI_DEV_GPM_SUPPORT": 94, + "DCGM_FI_DEV_SM_CLOCK": 100, + "DCGM_FI_DEV_MEM_CLOCK": 101, + "DCGM_FI_DEV_VIDEO_CLOCK": 102, + "DCGM_FI_DEV_APP_SM_CLOCK": 110, + "DCGM_FI_DEV_APP_MEM_CLOCK": 111, + "DCGM_FI_DEV_CLOCKS_EVENT_REASONS": 112, + "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + "DCGM_FI_DEV_MAX_SM_CLOCK": 113, + "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, + "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, + "DCGM_FI_DEV_AUTOBOOST": 120, + "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, + "DCGM_FI_DEV_MEMORY_TEMP": 140, + "DCGM_FI_DEV_GPU_TEMP": 150, + "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, + "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, + "DCGM_FI_DEV_POWER_USAGE": 155, + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, + "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, + "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, + "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, + "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, + "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, + "DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK": 165, + "DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK": 166, + "DCGM_FI_DEV_VALID_POWER_PROFILE_MASK": 167, + "DCGM_FI_DEV_FABRIC_MANAGER_STATUS": 170, + "DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE": 171, + "DCGM_FI_DEV_FABRIC_CLUSTER_UUID": 172, + "DCGM_FI_DEV_FABRIC_CLIQUE_ID": 173, + "DCGM_FI_DEV_PSTATE": 190, + "DCGM_FI_DEV_FAN_SPEED": 191, + "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, + 
"DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, + "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, + "DCGM_FI_DEV_GPU_UTIL": 203, + "DCGM_FI_DEV_MEM_COPY_UTIL": 204, + "DCGM_FI_DEV_ACCOUNTING_DATA": 205, + "DCGM_FI_DEV_ENC_UTIL": 206, + "DCGM_FI_DEV_DEC_UTIL": 207, + "DCGM_FI_DEV_XID_ERRORS": 230, + "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, + "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, + "DCGM_FI_DEV_PCIE_LINK_GEN": 237, + "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, + "DCGM_FI_DEV_POWER_VIOLATION": 240, + "DCGM_FI_DEV_THERMAL_VIOLATION": 241, + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, + "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, + "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, + "DCGM_FI_DEV_FB_TOTAL": 250, + "DCGM_FI_DEV_FB_FREE": 251, + "DCGM_FI_DEV_FB_USED": 252, + "DCGM_FI_DEV_FB_RESERVED": 253, + "DCGM_FI_DEV_FB_USED_PERCENT": 254, + "DCGM_FI_DEV_C2C_LINK_COUNT": 285, + "DCGM_FI_DEV_C2C_LINK_STATUS": 286, + "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, + "DCGM_FI_DEV_ECC_CURRENT": 300, + "DCGM_FI_DEV_ECC_PENDING": 301, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, + "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, + "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, + "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, + "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, + "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, + "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, + "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, + "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, + "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, + "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, + "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, + "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, + "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, + "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, + "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, + "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, + "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, + "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, + "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, + 
"DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, + "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, + "DCGM_FI_DEV_DIAG_MEMORY_RESULT": 350, + "DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT": 351, + "DCGM_FI_DEV_DIAG_PCIE_RESULT": 352, + "DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT": 353, + "DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT": 354, + "DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT": 355, + "DCGM_FI_DEV_DIAG_MEMTEST_RESULT": 356, + "DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT": 357, + "DCGM_FI_DEV_DIAG_EUD_RESULT": 358, + "DCGM_FI_DEV_DIAG_CPU_EUD_RESULT": 359, + "DCGM_FI_DEV_DIAG_SOFTWARE_RESULT": 360, + "DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT": 361, + "DCGM_FI_DEV_DIAG_STATUS": 362, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, + "DCGM_FI_DEV_RETIRED_SBE": 390, + "DCGM_FI_DEV_RETIRED_DBE": 391, + "DCGM_FI_DEV_RETIRED_PENDING": 392, + "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, + "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, + "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, + "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, + 
"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, + "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, + 
"DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, + 
"DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, + "DCGM_FI_DEV_NVLINK_ERROR_DL_CRC": 497, + "DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY": 498, + "DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY": 499, + "DCGM_FI_DEV_VIRTUAL_MODE": 500, + "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, + "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, + "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, + "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, + "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, + "DCGM_FI_DEV_ENC_STATS": 506, + "DCGM_FI_DEV_FBC_STATS": 507, + "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, + "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, + "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, + "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, + "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, + "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, + "DCGM_FI_DEV_VGPU_VM_ID": 520, + "DCGM_FI_DEV_VGPU_VM_NAME": 521, + "DCGM_FI_DEV_VGPU_TYPE": 522, + "DCGM_FI_DEV_VGPU_UUID": 523, + "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, + "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, + "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, + "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, + "DCGM_FI_DEV_VGPU_ENC_STATS": 528, + "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, + "DCGM_FI_DEV_VGPU_FBC_STATS": 530, + "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, + "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, + "DCGM_FI_DEV_VGPU_PCI_ID": 533, + "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, + "DCGM_FI_FIRST_VGPU_FIELD_ID": 520, + "DCGM_FI_LAST_VGPU_FIELD_ID": 570, + "DCGM_FI_FIRST_NVSWITCH_FIELD_ID": 700, + "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, + "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, + "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, + "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, + "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, + 
"DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, + "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, + "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, + "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4": 817, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5": 818, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6": 819, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7": 
820, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4": 821, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5": 822, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6": 823, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7": 824, + "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, + "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, + "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, + "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, + "DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, + "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, + "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, + "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, + "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, + "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, + "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, + "DCGM_FI_DEV_NVSWITCH_DEVICE_UUID": 878, + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, + "DCGM_FI_PROF_SM_ACTIVE": 1002, + "DCGM_FI_PROF_SM_OCCUPANCY": 1003, + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, + "DCGM_FI_PROF_DRAM_ACTIVE": 1005, + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, + "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, + "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, + "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, + "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, + "DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, + "DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, + "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, + "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, + "DCGM_FI_PROF_NVDEC0_ACTIVE": 
1017, + "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, + "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, + "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, + "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, + "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, + "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, + "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, + "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, + "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, + "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, + "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, + "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, + "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, + "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, + "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, + "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, + "DCGM_FI_PROF_NVOFA1_ACTIVE": 1034, + "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 1040, + "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, + "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, + "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, + "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, + "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, + "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, + "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, + "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, + "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, + "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, + "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, + "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, + "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, + "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, + "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, + "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, + "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, + "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, + "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, + "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, + "DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, + "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, + "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, + "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, + "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, + "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, + "DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, + "DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, + "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, + "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, + 
"DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, + "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, + "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, + "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, + "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, + "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, + "DCGM_FI_DEV_CPU_UTIL_USER": 1101, + "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, + "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, + "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, + "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, + "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, + "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, + "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, + "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, + "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, + "DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT": 1132, + "DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT": 1133, + "DCGM_FI_DEV_CPU_VENDOR": 1140, + "DCGM_FI_DEV_CPU_MODEL": 1141, + "DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS": 1200, + "DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES": 1201, + "DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS": 1202, + "DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES": 1203, + "DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS": 1204, + "DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS": 1205, + "DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS": 1206, + "DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS": 1207, + "DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS": 1208, + "DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS": 1209, + "DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS": 1210, + "DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS": 1211, + "DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS": 1212, + "DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS": 1213, + "DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS": 1214, + "DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER": 1215, + "DCGM_FI_MAX_FIELDS": 1216, } var OLD_DCGM_FI = map[string]Short{ @@ -1139,5 +1248,12 @@ const ( DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // 107 PCIE replay count violated DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108 // 108 The fabricmanager is not initialized DCGM_FR_SXID_ERROR 
HealthCheckErrorCode = 109 // 109 NvSwitch fatal error detected - DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 110 // 110 MUST BE THE LAST ERROR CODE + DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110 // 110 GPU GFLOPs threshold violated + DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111 // 111 NaN value detected on this GPU + DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112 // 112 Fabric Manager did not finish training + DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113 // 113 P2P copy test detected an error writing to this GPU over PCIE + DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114 // 114 P2P copy test detected an error writing from this GPU over PCIE + DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115 // 115 P2P copy test detected an error writing to this GPU over NVLink + DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116 // 116 P2P copy test detected an error writing from this GPU over NVLink + DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 117 //!< 117 MUST BE THE LAST ERROR CODE ) diff --git a/pkg/dcgm/dcgm_agent.h b/pkg/dcgm/dcgm_agent.h index d1eba62..ef9afa0 100644 --- a/pkg/dcgm/dcgm_agent.h +++ b/pkg/dcgm/dcgm_agent.h @@ -17,7 +17,7 @@ #ifndef DCGM_AGENT_H #define DCGM_AGENT_H -#define DCGM_PUBLIC_API +#include "dcgm_api_export.h" #include "dcgm_structs.h" #ifdef __cplusplus @@ -403,13 +403,14 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. 
* - \ref DCGM_ST_BADPARAM if any parameter is invalid */ -dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v3 *linkStatus); +dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v4 *linkStatus); /** * List supported CPUs and their cores present on the system * - * This and other CPU APIs only support datacenter NVIDIA CPUs + * This and other CPU APIs only support datacenter NVIDIA CPUs. Use \ref dcgmGetCpuHierarchy_v2 to + * get additional CPU information. * * @param dcgmHandle IN: DCGM Handle * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated @@ -422,6 +423,22 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dc */ dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v1 *cpuHierarchy); +/** + * List supported CPUs and their cores present on the system + * + * This and other CPU APIs only support datacenter NVIDIA CPUs. + * + * @param dcgmHandle IN: DCGM Handle + * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_NOT_SUPPORTED if the device is unsupported + * - \ref DCGM_ST_MODULE_NOT_LOADED if the sysmon module could not be loaded + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy_v2(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v2 *cpuHierarchy); + /** @} */ /***************************************************************************************************/ @@ -1544,23 +1561,21 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyGet(dcgmHandle_t pDcgmHandle, * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
* @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for * which to register a callback function - * @param beginCallback IN: A reference to a function that should be called should a violation occur. + * @param callback IN: A reference to a function that should be called should a violation occur. * This function will be called prior to any actions specified by the policy are taken. - * @param finishCallback IN: A reference to a function that should be called should a violation occur. - * This function will be called after any action specified by the policy are completed. + * @param userData IN: User data pointer to pass to the userData field of callback * * @return * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or - * \a finishCallback is NULL + * - \ref DCGM_ST_BADPARAM if \a groupId or \a condition is invalid, or \a callback is NULL * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId * */ -dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicyCondition_t condition, - fpRecvUpdates beginCallback, - fpRecvUpdates finishCallback); +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister_v2(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyCondition_t condition, + fpRecvUpdates callback, + uint64_t userData); /** * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t). 
@@ -1575,7 +1590,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, * * @return * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL + * - \ref DCGM_ST_BADPARAM if \a groupId or \a condition is invalid + * - \ref DCGM_ST_IN_USE if a callback from policy registration is in progress * */ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, @@ -1617,7 +1633,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyValidation_t validate, - dcgmDiagResponse_t *response); + dcgmDiagResponse_v11 *response); /** * Inform the action manager to perform a manual validation of a group of GPUs on the system @@ -1628,6 +1644,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle, * group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform * operation on all the GPUs. * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * Note: It is the caller's responsibility to make sure the response is zero-initialized, + * except for the version field. * * @return * - \ref DCGM_ST_OK if the call was successful @@ -1639,8 +1657,9 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle, * currently not allowed. 
*/ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, - dcgmRunDiag_v7 *drd, - dcgmDiagResponse_t *response); + dcgmRunDiag_v9 *drd, + dcgmDiagResponse_v11 *response); + /** * Run a diagnostic on a group of GPUs @@ -1667,7 +1686,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmReturn_t DCGM_PUBLIC_API dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmDiagnosticLevel_t diagLevel, - dcgmDiagResponse_t *diagResponse); + dcgmDiagResponse_v11 *diagResponse); /** @} */ // Closing for DCGMAPI_PO_MI @@ -1697,6 +1716,27 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle); /** @} */ // Closing for DCGMAPI_Admin_ExecCtrl +/** + * Gets device workload power profile information and status. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id for which workload power profile information should be fetched + * @param profilesInfo OUT: Information about each of the supported workload power profiles available on this + * device + * @param profileStatus OUT: Currently active, requested, and enforced workload power profiles on this device + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if \a gpuId, \a profilesInfo, or \a profileStatus were not valid. + * - \ref DCGM_ST_VER_MISMATCH if profilesInfo or profileStatus were not set to the correct versions. 
+ * + */ +dcgmReturn_t DCGM_PUBLIC_API +dcgmGetDeviceWorkloadPowerProfileInfo(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmWorkloadPowerProfileProfilesInfo_v1 *profilesInfo, + dcgmDeviceWorkloadPowerProfilesStatus_v1 *profileStatus); + /***************************************************************************************************/ /** @defgroup DCGMAPI_Topo Topology * @{ @@ -1893,7 +1933,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t * * See \ref dcgmGroupCreate for details on creating a GPU group - * See \ref dcgmProfWatchFields to actually watch a metric group + * See \ref dcgmWatchFields to actually watch the underlying profiling fields * * @param pDcgmHandle IN: DCGM Handle * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.
@@ -1909,46 +1949,6 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg dcgmReturn_t DCGM_PUBLIC_API dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, dcgmProfGetMetricGroups_t *metricGroups); -/** - * Request that DCGM start recording updates for a given list of profiling field IDs. - * - * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on - * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields, - * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues. - * - * @param pDcgmHandle IN: DCGM Handle - * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 - * for details of what should be put in each struct member. watchFields->version should be - * set to dcgmProfWatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given - * GPU group. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only - * support for homogenous groups of GPUs. - * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to - * requiring the hardware to gather them with multiple passes - * - */ -dcgmReturn_t DCGM_PUBLIC_API dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields); - -/** - * Request that DCGM stop recording updates for all profiling field IDs for all GPUs - * - * @param pDcgmHandle IN: DCGM Handle - * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref - * dcgmProfUnwatchFields_v1 for details of what should be put in each struct member. 
- * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t DCGM_PUBLIC_API dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); - /** * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. diff --git a/pkg/dcgm/dcgm_errors.h b/pkg/dcgm/dcgm_errors.h index 02d15ab..4f8289a 100644 --- a/pkg/dcgm/dcgm_errors.h +++ b/pkg/dcgm/dcgm_errors.h @@ -16,7 +16,7 @@ #ifndef DCGM_ERRORS_H #define DCGM_ERRORS_H -#define DCGM_PUBLIC_API +#include "dcgm_api_export.h" #include "dcgm_structs.h" /***************************************************************************************************/ @@ -30,50 +30,55 @@ */ typedef enum dcgmError_enum { - DCGM_FR_OK = 0, //!< 0 No error - DCGM_FR_UNKNOWN = 1, //!< 1 Unknown error code - DCGM_FR_UNRECOGNIZED = 2, //!< 2 Unrecognized error code - DCGM_FR_PCI_REPLAY_RATE = 3, //!< 3 Unacceptable rate of PCI errors - DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< 4 Uncorrectable volatile double bit error - DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< 5 Unacceptable rate of volatile single bit errors - DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< 6 Pending page retirements detected - DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< 7 Unacceptable total page retirements detected - DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< 8 Unacceptable total page retirements due to uncorrectable errors - DCGM_FR_CORRUPT_INFOROM = 9, //!< 9 Corrupt inforom found - DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< 10 Clocks being throttled due to overheating - DCGM_FR_POWER_UNREADABLE = 11, //!< 11 Cannot get a reading for power from NVML - DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< 12 Clock being throttled due to power restrictions - 
DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< 13 Unacceptable rate of NVLink errors - DCGM_FR_NVLINK_DOWN = 14, //!< 14 NVLink is down - DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< 15 Fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< 16 Non-fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_DOWN = 17, //!< 17 NVSwitch is down - NOT USED: DEPRECATED - DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< 18 Cannot access a file - DCGM_FR_NVML_API = 19, //!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED - DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< 20 Disagreement in GPU count between /dev and NVML - DCGM_FR_BAD_PARAMETER = 21, //!< 21 Bad parameter passed to API - DCGM_FR_CANNOT_OPEN_LIB = 22, //!< 22 Cannot open a library that must be accessed - DCGM_FR_DENYLISTED_DRIVER = 23, //!< 23 A driver on the denylist (nouveau) is active - DCGM_FR_NVML_LIB_BAD = 24, //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED - DCGM_FR_GRAPHICS_PROCESSES = 25, //!< 25 Graphics processes are active on this GPU - DCGM_FR_HOSTENGINE_CONN = 26, //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED - DCGM_FR_FIELD_QUERY = 27, //!< 27 Error querying a field from DCGM - DCGM_FR_BAD_CUDA_ENV = 28, //!< 28 The environment has variables that hurt CUDA - DCGM_FR_PERSISTENCE_MODE = 29, //!< 29 Persistence mode is disabled - DCGM_FR_LOW_BANDWIDTH = 30, //!< 30 The bandwidth is unacceptably low - DCGM_FR_HIGH_LATENCY = 31, //!< 31 Latency is too high - DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< 32 Cannot find a tag for a field - DCGM_FR_FIELD_VIOLATION = 33, //!< 33 The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD = 34, //!< 34 The value for the specified field is above the threshold - DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< 35 The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< 36 The value for the specified field is above the threshold - DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< 37 Field type 
cannot be supported - DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< 38 The value for the specified field is above the threshold - DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< 39 The value for the specified field is above the threshold - DCGM_FR_THERMAL_VIOLATIONS = 40, //!< 40 Thermal violations detected - DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 41 Thermal violations detected with a timestamp - DCGM_FR_TEMP_VIOLATION = 42, //!< 42 Temperature is too high - DCGM_FR_THROTTLING_VIOLATION = 43, //!< 43 Non-benign clock throttling is occurring + DCGM_FR_OK = 0, //!< 0 No error + DCGM_FR_UNKNOWN = 1, //!< 1 Unknown error code + DCGM_FR_UNRECOGNIZED = 2, //!< 2 Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE = 3, //!< 3 Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< 4 Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< 5 Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< 6 Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< 7 Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< 8 Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM = 9, //!< 9 Corrupt inforom found + DCGM_FR_CLOCKS_EVENT_THERMAL = 10, //!< 10 Clocks being optimized for thermal performance + DCGM_FR_CLOCK_THROTTLE_THERMAL + = DCGM_FR_CLOCKS_EVENT_THERMAL, //!< Deprecated: Use DCGM_FR_CLOCKS_EVENT_THERMAL instead + DCGM_FR_POWER_UNREADABLE = 11, //!< 11 Cannot get a reading for power from NVML + DCGM_FR_CLOCKS_EVENT_POWER = 12, //!< 12 Clock being optimized to meet the product's power limit requirements + DCGM_FR_CLOCK_THROTTLE_POWER = DCGM_FR_CLOCKS_EVENT_POWER, //!< Deprecated: Use DCGM_FR_CLOCKS_EVENT_POWER instead + DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< 13 Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN = 14, //!< 14 NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< 15 Fatal errors on the NVSwitch + 
DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< 16 Non-fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_DOWN = 17, //!< 17 NVSwitch is down - NOT USED: DEPRECATED + DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< 18 Cannot access a file + DCGM_FR_NVML_API = 19, //!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED + DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< 20 Disagreement in GPU count between /dev and NVML + DCGM_FR_BAD_PARAMETER = 21, //!< 21 Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB = 22, //!< 22 Cannot open a library that must be accessed + DCGM_FR_DENYLISTED_DRIVER = 23, //!< 23 A driver on the denylist (nouveau) is active + DCGM_FR_NVML_LIB_BAD = 24, //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED + DCGM_FR_GRAPHICS_PROCESSES = 25, //!< 25 Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN = 26, //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED + DCGM_FR_FIELD_QUERY = 27, //!< 27 Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV = 28, //!< 28 The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE = 29, //!< 29 Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH = 30, //!< 30 The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY = 31, //!< 31 Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< 32 Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION = 33, //!< 33 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD = 34, //!< 34 The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< 35 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< 36 The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< 37 Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< 38 The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< 39 The value for the specified field is above 
the threshold + DCGM_FR_THERMAL_VIOLATIONS = 40, //!< 40 Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 41 Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION = 42, //!< 42 Temperature is too high + DCGM_FR_CLOCKS_EVENT_VIOLATION = 43, //!< 43 Non-benign clocks event is occurring + DCGM_FR_THROTTLING_VIOLATION + = DCGM_FR_CLOCKS_EVENT_VIOLATION, //!< Deprecated: Use DCGM_FR_CLOCKS_EVENT_VIOLATION instead DCGM_FR_INTERNAL = 44, //!< 44 An internal error was detected DCGM_FR_PCIE_GENERATION = 45, //!< 45 PCIe generation is too low DCGM_FR_PCIE_WIDTH = 46, //!< 46 PCIe width is too low @@ -140,7 +145,18 @@ typedef enum dcgmError_enum DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION = 107, //!< 107 PCIE replay count violated DCGM_FR_CUDA_FM_NOT_INITIALIZED = 108, //!< 108 The fabricmanager is not initialized DCGM_FR_SXID_ERROR = 109, //!< 109 NvSwitch fatal error detected - DCGM_FR_ERROR_SENTINEL = 110, //!< 110 MUST BE THE LAST ERROR CODE + DCGM_FR_GFLOPS_THRESHOLD_VIOLATION = 110, //!< 110 GPU GFLOPs threshold violated + DCGM_FR_NAN_VALUE = 111, //!< 111 NaN value detected on this GPU + DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR = 112, //!< 112 Fabric Manager did not finish training + DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE = 113, //!< 113 P2P copy test detected an error writing to this GPU over PCIE + DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE + = 114, //!< 114 P2P copy test detected an error writing from this GPU over PCIE + DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE + = 115, //!< 115 P2P copy test detected an error writing to this GPU over NVLink + DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE + = 116, //!< 116 P2P copy test detected an error writing from this GPU over NVLink + DCGM_FR_TEST_SKIPPED = 117, //!< 117 Indicates that the test was skipped + DCGM_FR_ERROR_SENTINEL = 118, //!< 118 MUST BE THE LAST ERROR CODE } dcgmError_t; typedef enum dcgmErrorSeverity_enum @@ -221,11 +237,11 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; // gpu id #define 
DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." // gpu id -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." +#define DCGM_FR_CLOCKS_EVENT_THERMAL_MSG "Detected clocks event due to thermal violation in GPU %u." // gpu id #define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." // gpu id -#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." +#define DCGM_FR_CLOCKS_EVENT_POWER_MSG "Detected clocks event due to power violation in GPU %u." // nvlink errors detected, nvlink id, error threshold #define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ @@ -304,14 +320,14 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ "Thermal violations totaling %.1f seconds started at %.1f seconds " \ "into the test for GPU %u" -// observed temperature, gpu id, max allowed temperature -#define DCGM_FR_TEMP_VIOLATION_MSG \ - "Temperature %lld of GPU %u exceeded user-specified maximum " \ +// observed temperature, hbm memory on gpu/gpu, gpu id, max allowed temperature +#define DCGM_FR_TEMP_VIOLATION_MSG \ + "Temperature %lld of %s %u exceeded user-specified maximum " \ "allowed temperature %lld" -// gpu id, seconds into test, details about throttling -#define DCGM_FR_THROTTLING_VIOLATION_MSG \ - "Clocks are being throttled for GPU %u because of clock " \ - "throttling starting %.1f seconds into the test. %s" +// gpu id, seconds into test, details about clock event +#define DCGM_FR_CLOCKS_EVENT_VIOLATION_MSG \ + "Clocks event for GPU %u because of clocks event " \ + "starting %.1f seconds into the test. 
%s" // details about error #define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" // gpu id, PCIe generation, minimum allowed, parameter to control @@ -334,7 +350,7 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; // CUDA API name #define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" // count, gpu id -#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" +#define DCGM_FR_FAULTY_MEMORY_MSG "Found %lld faulty memory elements on GPU %u" // error detail #define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" // gpu id @@ -411,9 +427,21 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; // gpu id #define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u had memory errors and row remappings are pending" // gpu id, test name -#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to by GPU %u in a peer-to-peer test: %s" // gpu id, test name -#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data to GPU %u in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE_MSG \ + "GPU %u was unsuccessfully written to by GPU %u over PCIe in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE_MSG \ + "GPU %u unsuccessfully wrote data to GPU %u over PCIe in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE_MSG \ + "GPU %u was unsuccessfully written to by GPU %u over NVLink in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE_MSG \ + "GPU %u unsuccessfully wrote data to GPU %u over NVLink in a peer-to-peer test: %s" // nvswitch id, nvlink id #define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's 
NvLink %u is down." #define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */ @@ -442,6 +470,10 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" #define DCGM_FR_CUDA_FM_NOT_INITIALIZED_MSG "" #define DCGM_FR_SXID_ERROR_MSG "Detected fatal NvSwitch SXID %u" +#define DCGM_FR_GFLOPS_THRESHOLD_VIOLATION_MSG "Detected %.2f %s for GPU %u which is below the threshold %.2f" +#define DCGM_FR_NAN_VALUE_MSG "Found %lld NaN-value memory elements on GPU %u" +#define DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR_MSG "Fabric Manager (Cluster UUID: %s, Clique ID: %ld): %s." +#define DCGM_FR_TEST_SKIPPED_MSG "Test %s was skipped." #define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */ /* @@ -460,9 +492,9 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_CLOCKS_EVENT_THERMAL_NEXT DEBUG_COOLING_MSG #define DCGM_FR_POWER_UNREADABLE_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." +#define DCGM_FR_CLOCKS_EVENT_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." #define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." #define DCGM_FR_NVLINK_DOWN_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG @@ -521,12 +553,12 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_TEMP_VIOLATION_NEXT \ "Verify that the user-specified temperature maximum is set " \ "correctly. 
If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG -#define DCGM_FR_THROTTLING_VIOLATION_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG -#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG -#define DCGM_FR_ABORTED_NEXT "" -#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG +#define DCGM_FR_CLOCKS_EVENT_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG +#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG +#define DCGM_FR_ABORTED_NEXT "" +#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG #define DCGM_FR_CANNOT_GET_STAT_NEXT \ "If running a standalone nv-hostengine, verify that it is up " \ "and responsive." @@ -567,19 +599,23 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_GPU_OP_MODE_NEXT \ "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ "" -#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" -#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" -#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG -#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT -#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." 
-#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG -#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT "" -#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG -#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" +#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" +#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG +#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT +#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." +#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT "" +#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE_NEXT BUG_REPORT_MSG #define DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT \ "Please check fabric manager and initialization logs to figure out why the link is down. " \ "You may also need to run a field diagnostic." @@ -604,7 +640,13 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT "Ensure that the FabricManager is running without errors." 
#define DCGM_FR_SXID_ERROR_NEXT SYSTEM_TRIAGE_MSG -#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ +#define DCGM_FR_GFLOPS_THRESHOLD_VIOLATION_NEXT \ + "Please verify your user-specified variance tolerance is set appropriately; " \ + "if so, and if errors are persistent, please run a field diagnostic." +#define DCGM_FR_NAN_VALUE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR_NEXT DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT +#define DCGM_FR_TEST_SKIPPED_NEXT "" +#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ #ifdef __cplusplus extern "C" { diff --git a/pkg/dcgm/dcgm_fields.h b/pkg/dcgm/dcgm_fields.h index 19d1eae..d4d97b4 100644 --- a/pkg/dcgm/dcgm_fields.h +++ b/pkg/dcgm/dcgm_fields.h @@ -21,7 +21,7 @@ extern "C" { #endif -#define DCGM_PUBLIC_API +#include "dcgm_api_export.h" /***************************************************************************************************/ /** @defgroup dcgmFieldTypes Field Types @@ -93,24 +93,24 @@ extern "C" { * DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by * 16 bits of the minor version. These macros separate the two. */ -#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000) -#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF) +#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x) & 0xFFFF0000) +#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x) & 0x0000FFFF) /** - * DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. - * These macros are masks for relevant throttling, and are a 1:1 map to the NVML + * DCGM_FI_DEV_CLOCKS_EVENT_REASONS is a bitmap of reported clock events + * These macros are masks for relevant clocks events, and are a 1:1 map to the NVML * reasons documented in nvml.h. 
The notes for the header are copied blow: */ /** Nothing is running on the GPU and the clocks are dropping to Idle state * \note This limiter may be removed in a later release */ -#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL +#define DCGM_CLOCKS_EVENT_REASON_GPU_IDLE 0x0000000000000001LL /** GPU clocks are limited by current setting of applications clocks */ -#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL -/** SW Power Scaling algorithm is reducing the clocks below requested clocks +#define DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING 0x0000000000000002LL +/** The clocks have been optimized to ensure not to exceed currently set power limits */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL +#define DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP 0x0000000000000004LL /** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * * This is an indicator of: @@ -118,9 +118,9 @@ extern "C" { * - External Power Brake Assertion is triggered (e.g. by the system power supply) * - Power draw is too high and Fast Trigger protection is reducing the clocks * - May be also reported during PState or clock change - * - This behavior may be removed in a later release. + * - This behavior may be removed in a later release. */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL +#define DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN 0x0000000000000008LL /** Sync Boost * * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in @@ -129,29 +129,74 @@ extern "C" { * the throttle reasons for other GPUs in the system to see why those GPUs are * holding this one at lower clocks. 
*/ -#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL +#define DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST 0x0000000000000010LL /** SW Thermal Slowdown * - * This is an indicator of one or more of the following: - * - Current GPU temperature above the GPU Max Operating Temperature - * - Current memory temperature above the Memory Max Operating Temperature + * The current clocks have been optimized to ensure the following is true: + * - Current GPU temperature does not exceed GPU Max Operating Temperature + * - Current memory temperature does not exceed Memory Max Operating Temperature */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL +#define DCGM_CLOCKS_EVENT_REASON_SW_THERMAL 0x0000000000000020LL /** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * * This is an indicator of: * - temperature being too high */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL +#define DCGM_CLOCKS_EVENT_REASON_HW_THERMAL 0x0000000000000040LL /** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged * * This is an indicator of: * - External Power Brake Assertion being triggered (e.g.
by the system power supply) */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL +#define DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE 0x0000000000000080LL /** GPU clocks are limited by current setting of Display clocks */ -#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL +#define DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS 0x0000000000000100LL + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_GPU_IDLE instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE DCGM_CLOCKS_EVENT_REASON_GPU_IDLE +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_SW_THERMAL instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL DCGM_CLOCKS_EVENT_REASON_SW_THERMAL + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_HW_THERMAL instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL DCGM_CLOCKS_EVENT_REASON_HW_THERMAL + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE + +/** + * Deprecated: Use DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS instead + */ +#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS + /** * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE @@ -238,7 +283,6 @@ typedef unsigned int dcgm_field_eid_t; */ #define 
DCGM_FI_CUDA_DRIVER_VERSION 5 - /** * Name of the GPU device */ @@ -451,6 +495,11 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_BAR1_FREE 93 +/** + * * GPM support for the device + * */ +#define DCGM_FI_DEV_GPM_SUPPORT 94 + /** * SM clock for the device */ @@ -477,9 +526,14 @@ typedef unsigned int dcgm_field_eid_t; #define DCGM_FI_DEV_APP_MEM_CLOCK 111 /** - * Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) + * Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) */ -#define DCGM_FI_DEV_CLOCK_THROTTLE_REASONS 112 +#define DCGM_FI_DEV_CLOCKS_EVENT_REASONS 112 + +/** + * Deprecated: Use DCGM_FI_DEV_CLOCKS_EVENT_REASONS instead + */ +#define DCGM_FI_DEV_CLOCK_THROTTLE_REASONS DCGM_FI_DEV_CLOCKS_EVENT_REASONS /** * Maximum supported SM clock for the device @@ -577,6 +631,45 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 +/** + * Requested workload power profile mask(Blackwell and newer) + * + */ +#define DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK 165 + +/** + * Enforced workload power profile mask(Blackwell and newer) + * + */ +#define DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK 166 + +/** + * Valid workload power profile mask(Blackwell and newer) + * + */ +#define DCGM_FI_DEV_VALID_POWER_PROFILE_MASK 167 + +/** + * The status of the fabric manager - a value from dcgmFabricManagerStatus_t. + */ +#define DCGM_FI_DEV_FABRIC_MANAGER_STATUS 170 + +/** + * The failure that happened while starting the Fabric Manager, if any + * NOTE: this is not populated unless the fabric manager completed startup + */ +#define DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE 171 + +/** + * The uuid of the cluster to which this GPU belongs + */ +#define DCGM_FI_DEV_FABRIC_CLUSTER_UUID 172 + +/** + * The ID of the fabric clique to which this GPU belongs + */ +#define DCGM_FI_DEV_FABRIC_CLIQUE_ID 173 + /** * Performance state (P-State) 0-15.
0=highest */ @@ -887,6 +980,87 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 +/** + * Result of the GPU Memory test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_MEMORY_RESULT 350 + +/** + * Result of the Diagnostics test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT 351 + +/** + * Result of the PCIe + NVLink test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_PCIE_RESULT 352 + +/** + * Result of the Targeted Stress test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT 353 + +/** + * Result of the Targeted Power test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT 354 + +/** + * Result of the Memory Bandwidth test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT 355 + +/** + * Result of the Memory Stress test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_MEMTEST_RESULT 356 + +/** + * Result of the Input Energy Delayed Product power (EDPp) test (a.k.a. 
the + * pulse test) + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT 357 + +/** + * Result of the Extended Utility Diagnostics (EUD) test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_EUD_RESULT 358 + +/** + * Result of the CPU Extended Utility Diagnostics (CPU EUD) test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_CPU_EUD_RESULT 359 + +/** + * Result of the Software test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_SOFTWARE_RESULT 360 + +/** + * Result of the NVBandwidth test + * Refers to a `int64_t` storing a value drawn from `dcgmError_t` enumeration + */ +#define DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT 361 + +/* + * Status of the current diag run + * Refers to a binary blob of a `dcgmDiagStatus_t` struct + */ +#define DCGM_FI_DEV_DIAG_STATUS 362 + +/* Values from 363-380 reserved for future use */ + /** * Historical max available spare memory rows per memory bank */ @@ -1314,6 +1488,21 @@ typedef unsigned int dcgm_field_eid_t; #define DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 495 #define DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 496 +/* + * NVLink CRC Error Counter + */ +#define DCGM_FI_DEV_NVLINK_ERROR_DL_CRC 497 + +/* + * NVLink Recovery Error Counter + */ +#define DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY 498 + +/* + * NVLink Replay Error Counter + */ +#define DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY 499 + /** * Virtualization Mode corresponding to the GPU. 
* @@ -1418,6 +1607,10 @@ typedef unsigned int dcgm_field_eid_t; /** * License status of the vGPU + * + * 0 = vgpu is not licensed + * + * 1 = vgpu is licensed */ #define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 @@ -1722,6 +1915,46 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 816 +/** + * NvLink lane crc_err_count for lane 4 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 817 + +/** + * NvLink lane crc_err_count for lane 5 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 818 + +/** + * NvLink lane crc_err_count for lane 6 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 819 + +/** + * NvLink lane crc_err_count for lane 7 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 820 + +/** + * NvLink lane ecc_err_count for lane 4 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 821 + +/** + * NvLink lane ecc_err_count for lane 5 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 822 + +/** + * NvLink lane ecc_err_count for lane 6 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 823 + +/** + * NvLink lane ecc_err_count for lane 7 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 824 + /** * NVSwitch fatal error information. * Note: value field indicates the specific SXid reported @@ -1834,9 +2067,9 @@ typedef unsigned int dcgm_field_eid_t; #define DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID 877 /** - * NvLink device link uid. + * NvLink device switch/link uid. */ -#define DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID 878 +#define DCGM_FI_DEV_NVSWITCH_DEVICE_UUID 878 /** * Last field ID of the NVSwitch instance @@ -1976,6 +2209,7 @@ typedef unsigned int dcgm_field_eid_t; * Ratio of cycles each of the NVOFA engines are active. 
*/ #define DCGM_FI_PROF_NVOFA0_ACTIVE 1033 +#define DCGM_FI_PROF_NVOFA1_ACTIVE 1034 /** * The per-link number of bytes of active NvLink TX (transmit) or RX (transmit) data including both header and payload. @@ -2085,6 +2319,16 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_CPU_POWER_LIMIT 1131 +/** + * SoC power utilization + */ +#define DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT 1132 + +/** + * Module power utilization + */ +#define DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT 1133 + /** * CPU vendor name */ @@ -2095,10 +2339,90 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_CPU_MODEL 1141 +/** + * Total Tx packets on the link in NVLink5 + */ +#define DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS 1200 + +/** + * Total Tx bytes on the link in NVLink5 + */ +#define DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES 1201 + +/** + * Total Rx packets on the link in NVLink5 + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS 1202 + +/** + * Total Rx bytes on the link in NVLink5 + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES 1203 + +/** + * Number of packets Rx on a link where packets are malformed + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS 1204 + +/** + * Number of packets that were discarded on Rx due to buffer overrun + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS 1205 + +/** + * Total number of packets with errors Rx on a link + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS 1206 + +/** + * Total number of packets Rx - stomp/EBP marker + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS 1207 + +/** + * Total number of packets Rx with header mismatch + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS 1208 + +/** + * Total number of times that the count of local errors exceeded a threshold + */ +#define DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS 1209 + +/** + * Total number of tx error packets that were discarded + */ +#define DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS 1210 + +/** + * Number of times link went from Up to 
recovery, succeeded and link came back up + */ +#define DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS 1211 + +/** + * Number of times link went from Up to recovery, failed and link was declared down + */ +#define DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS 1212 + +/** + * Number of times link went from Up to recovery, irrespective of the result + */ +#define DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS 1213 + +/** + * Number of errors in rx symbols + */ +#define DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS 1214 + +/** + * BER for symbol errors + */ +#define DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER 1215 + /** * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated */ -#define DCGM_FI_MAX_FIELDS 1142 +#define DCGM_FI_MAX_FIELDS 1216 /** @} */ diff --git a/pkg/dcgm/dcgm_structs.h b/pkg/dcgm/dcgm_structs.h index 74bee6d..bb1613f 100644 --- a/pkg/dcgm/dcgm_structs.h +++ b/pkg/dcgm/dcgm_structs.h @@ -44,6 +44,11 @@ #ifndef DCGM_BLANK_VALUES #define DCGM_BLANK_VALUES +/** + * Base value for 8 bits integer blank. can be used as an unspecified blank + */ +#define DCGM_INT8_BLANK 0x70 + /** * Base value for 32 bits integer blank. can be used as an unspecified blank */ @@ -125,6 +130,11 @@ */ #define DCGM_STR_NOT_PERMISSIONED "<<>>" +/** + * Macro to check if a INT8 value is blank or not + */ +#define DCGM_INT8_IS_BLANK(val) (((val) >= DCGM_INT8_BLANK) ? 
1 : 0) + /** * Macro to check if a INT32 value is blank or not */ @@ -188,13 +198,18 @@ **/ #define DCGM_MAX_NUM_SWITCHES 12 +/** + * Max number of XID info to store + */ +#define DCGM_MAX_XID_INFO 10 + /** * Number of NvLink links per NvSwitch supported by DCGM */ -#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 64 +#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 256 /** - * Number of Lines per NvSwitch NvLink supported by DCGM + * Number of Lanes per NvSwitch NvLink supported by DCGM */ #define DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK 4 @@ -362,6 +377,9 @@ typedef enum dcgmReturn_enum DCGM_ST_NVVS_KILLED = -53, //!< The NVVS process was killed by a signal DCGM_ST_PAUSED = -54, //!< The hostengine and all modules are paused DCGM_ST_ALREADY_INITIALIZED = -55, //!< The object is already initialized + DCGM_ST_NVML_NOT_LOADED = -56, //!< Cannot perform operation because NVML isn't loaded + DCGM_ST_NVML_DRIVER_TIMEOUT = -57, //!< Cannot perform operation because an NVML driver timeout error was detected + DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58, //!< The NVVS returns no available tests (NVVS_ST_TEST_NOT_FOUND) } dcgmReturn_t; const char *errorString(dcgmReturn_t result); @@ -387,11 +405,13 @@ typedef enum dcgmGroupType_enum #define DCGM_GROUP_ALL_INSTANCES 0x7ffffffd #define DCGM_GROUP_ALL_COMPUTE_INSTANCES 0x7ffffffc #define DCGM_GROUP_ALL_ENTITIES 0x7ffffffb +#define DCGM_GROUP_NULL 0x7ffffffa /** * Maximum number of entities per entity group */ -#define DCGM_GROUP_MAX_ENTITIES 64 +#define DCGM_GROUP_MAX_ENTITIES_V1 64 +#define DCGM_GROUP_MAX_ENTITIES_V2 1024 /** * Simplified chip architecture. 
Note that these are made to match nvmlChipArchitecture_t and thus @@ -399,15 +419,16 @@ typedef enum dcgmGroupType_enum */ typedef enum dcgmChipArchitecture_enum { - DCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler - DCGM_CHIP_ARCH_KEPLER = 2, //!< All Kepler-architecture parts - DCGM_CHIP_ARCH_MAXWELL = 3, //!< All Maxwell-architecture parts - DCGM_CHIP_ARCH_PASCAL = 4, //!< All Pascal-architecture parts - DCGM_CHIP_ARCH_VOLTA = 5, //!< All Volta-architecture parts - DCGM_CHIP_ARCH_TURING = 6, //!< All Turing-architecture parts - DCGM_CHIP_ARCH_AMPERE = 7, //!< All Ampere-architecture parts - DCGM_CHIP_ARCH_ADA = 8, //!< All Ada-architecture parts - DCGM_CHIP_ARCH_HOPPER = 9, //!< All Hopper-architecture parts + DCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler + DCGM_CHIP_ARCH_KEPLER = 2, //!< All Kepler-architecture parts + DCGM_CHIP_ARCH_MAXWELL = 3, //!< All Maxwell-architecture parts + DCGM_CHIP_ARCH_PASCAL = 4, //!< All Pascal-architecture parts + DCGM_CHIP_ARCH_VOLTA = 5, //!< All Volta-architecture parts + DCGM_CHIP_ARCH_TURING = 6, //!< All Turing-architecture parts + DCGM_CHIP_ARCH_AMPERE = 7, //!< All Ampere-architecture parts + DCGM_CHIP_ARCH_ADA = 8, //!< All Ada-architecture parts + DCGM_CHIP_ARCH_HOPPER = 9, //!< All Hopper-architecture parts + DCGM_CHIP_ARCH_BLACKWELL = 10, //!< All Blackwell-architecture parts DCGM_CHIP_ARCH_COUNT, //!< Keep this second to last, exclude unknown @@ -473,8 +494,8 @@ typedef struct dcgm_link_s { struct { - dcgm_field_entity_group_t type : 8; /*!< Entity Group */ - uint8_t index : 8; /*!< Link Index Tx before Rx */ + dcgm_field_entity_group_t type : 8; /*!< Entity Group */ + uint32_t index : 32; /*!< Link Index Tx before Rx */ union { dcgm_field_eid_t gpuId : 16; /*!< Physical GPU ID */ @@ -581,26 +602,44 @@ typedef struct */ typedef struct { - unsigned int version; //!< Version Number (use dcgmGroupInfo_version2) - unsigned int count; //!< count of entityIds returned in \a entityList - char 
groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name - dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES]; //!< List of the entities that are in this group + unsigned int version; //!< Version Number (use dcgmGroupInfo_version2) + unsigned int count; //!< count of entityIds returned in \a entityList + char groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name + dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES_V1]; //!< List of the entities that are in this group } dcgmGroupInfo_v2; /** - * Typedef for \ref dcgmGroupInfo_v2 + * Version 2 for \ref dcgmGroupInfo_v2 */ -typedef dcgmGroupInfo_v2 dcgmGroupInfo_t; +#define dcgmGroupInfo_version2 MAKE_DCGM_VERSION(dcgmGroupInfo_v2, 2) /** - * Version 2 for \ref dcgmGroupInfo_v2 + * Structure to store information for DCGM group + * + * Added in DCGM 1.5.0 */ -#define dcgmGroupInfo_version2 MAKE_DCGM_VERSION(dcgmGroupInfo_v2, 2) +typedef struct +{ + unsigned int version; //!< Version Number (use dcgmGroupInfo_version3) + unsigned int count; //!< count of entityIds returned in \a entityList + char groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name + dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES_V2]; //!< List of the entities that are in this group +} dcgmGroupInfo_v3; + +/** + * Typedef for \ref dcgmGroupInfo_v3 + */ +typedef dcgmGroupInfo_v3 dcgmGroupInfo_t; + +/** + * Version 3 for \ref dcgmGroupInfo_v3 + */ +#define dcgmGroupInfo_version3 MAKE_DCGM_VERSION(dcgmGroupInfo_v3, 3) /** * Latest version for \ref dcgmGroupInfo_t */ -#define dcgmGroupInfo_version dcgmGroupInfo_version2 +#define dcgmGroupInfo_version dcgmGroupInfo_version3 /** * Enum for the different kinds of MIG profiles @@ -713,13 +752,35 @@ typedef struct } cpus[DCGM_MAX_NUM_CPUS]; } dcgmCpuHierarchy_v1; -typedef dcgmCpuHierarchy_v1 dcgmCpuHierarchy_t; - /** * Version 1 for dcgmCpuHierarchy_t */ #define dcgmCpuHierarchy_version1 MAKE_DCGM_VERSION(dcgmCpuHierarchy_v1, 1) +typedef struct +{ + unsigned int version; + unsigned int numCpus; + struct 
dcgmCpuHierarchyCpu_v2 + { + unsigned int cpuId; + dcgmCpuHierarchyOwnedCores_v1 ownedCores; + char serial[DCGM_MAX_STR_LENGTH]; + } cpus[DCGM_MAX_NUM_CPUS]; +} dcgmCpuHierarchy_v2; + +typedef dcgmCpuHierarchy_v2 dcgmCpuHierarchy_t; + +/** + * Version 2 for dcgmCpuHierarchy_t + */ +#define dcgmCpuHierarchy_version2 MAKE_DCGM_VERSION(dcgmCpuHierarchy_v2, 2) + +/** + * Latest version for \ref dcgmCpuHierarchy_t + */ +#define dcgmCpuHierarchy_version dcgmCpuHierarchy_version2 + /** * Maximum number of field groups that can exist */ @@ -1541,6 +1602,73 @@ typedef dcgmComputeInstanceProfiles_v1 dcgmComputeInstanceProfiles_t; */ #define DCGM_DEVICE_UUID_BUFFER_SIZE 80 +/** + * Workload power profile information + */ + +#define DCGM_POWER_PROFILE_ARRAY_SIZE 8 +#define DCGM_POWER_PROFILE_MASK_BITS_PER_ELEM 32 +#define DCGM_POWER_PROFILE_MAX_NUM (255) +typedef enum +{ + DCGM_POWER_PROFILE_MAX_P = 0, + DCGM_POWER_PROFILE_MAX_Q = 1, + DCGM_POWER_PROFILE_COMPUTE = 2, + DCGM_POWER_PROFILE_MEMORY_BOUND = 3, + DCGM_POWER_PROFILE_NETWORK = 4, + DCGM_POWER_PROFILE_BALANCED = 5, + DCGM_POWER_PROFILE_LLM_INFERENCE = 6, + DCGM_POWER_PROFILE_LLM_TRAINING = 7, + DCGM_POWER_PROFILE_RBM = 8, + DCGM_POWER_PROFILE_DCPCIE = 9, + DCGM_POWER_PROFILE_HMMA_SPARSE = 10, + DCGM_POWER_PROFILE_HMMA_DENSE = 11, + DCGM_POWER_PROFILE_SYNC_BALANCED = 12, + DCGM_POWER_PROFILE_HPC = 13, + DCGM_POWER_PROFILE_MIG = 14, + DCGM_POWER_PROFILE_MAX = 15, +} dcgmPowerProfileType_t; + +typedef struct +{ + unsigned int version; //!< the API version number + dcgmPowerProfileType_t + profileId; // 0 */ - dcgmFieldGrp_t fieldGroupId; /* Field group to retrive values for. This is onlu looked - at if fieldIdCount is 0 */ - unsigned int fieldIdCount; /* Number of field IDs in fieldIds[] that are valid. This - should only be set if fieldGroupId is not set */ + dcgmFieldGrp_t fieldGroupId; /* Field group to retrive values for. 
This is onlu looked + at if fieldIdCount is 0 */ + unsigned int fieldIdCount; /* Number of field IDs in fieldIds[] that are valid. This + should only be set if fieldGroupId is not set */ unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; /* Field IDs for which values should be retrieved. only looked at if fieldIdCount is > 0 */ unsigned int flags; /* Mask of DCGM_FV_FLAG_? #defines that affect this request */ -} dcgmGetMultipleLatestValues_v1, dcgmGetMultipleLatestValues_t; +} dcgmGetMultipleLatestValues_v2, dcgmGetMultipleLatestValues_t; -#define dcgmGetMultipleLatestValues_version1 MAKE_DCGM_VERSION(dcgmGetMultipleLatestValues_v1, 1) -#define dcgmGetMultipleLatestValues_version dcgmGetMultipleLatestValues_version1 +#define dcgmGetMultipleLatestValues_version2 MAKE_DCGM_VERSION(dcgmGetMultipleLatestValues_v2, 2) +#define dcgmGetMultipleLatestValues_version dcgmGetMultipleLatestValues_version2 /* Represents cached record metadata */ @@ -432,12 +432,12 @@ typedef enum dcgmEntityStatusType_enum /** * Typedef for \ref dcgmRunDiag_t */ -typedef dcgmRunDiag_v7 dcgmRunDiag_t; +typedef dcgmRunDiag_v9 dcgmRunDiag_t; /** * Latest version for \ref dcgmRunDiag_t */ -#define dcgmRunDiag_version dcgmRunDiag_version7 +#define dcgmRunDiag_version dcgmRunDiag_version9 /** * Version 1 of dcgmCreateGroup_t @@ -479,17 +479,34 @@ typedef struct typedef struct { - unsigned int entityGroup; //!< IN: Entity of group to list entities - unsigned int entities[DCGM_GROUP_MAX_ENTITIES]; //!< OUT: Array of entities for entityGroup - unsigned int numEntities; //!< IN/OUT: Upon calling, this should be the number of - // entities that entityList[] can hold. Upon - // return, this will contain the number of - // entities actually saved to entityList. - unsigned int flags; //!< IN: Flags to modify the behavior of this request. 
- // See DCGM_GEGE_FLAG_* - unsigned int cmdRet; //!< OUT: Error code generated + unsigned int entityGroup; //!< IN: Entity of group to list entities + unsigned int entities[DCGM_GROUP_MAX_ENTITIES_V1]; //!< OUT: Array of entities for entityGroup + unsigned int numEntities; //!< IN/OUT: Upon calling, this should be the number of + // entities that entityList[] can hold. Upon + // return, this will contain the number of + // entities actually saved to entityList. + unsigned int flags; //!< IN: Flags to modify the behavior of this request. + // See DCGM_GEGE_FLAG_* + unsigned int cmdRet; //!< OUT: Error code generated } dcgmGetEntityGroupEntities_v1; +/** + * Version 2 of dcgmGetEntityGroupEntities_t + */ + +typedef struct +{ + unsigned int entityGroup; //!< IN: Entity of group to list entities + unsigned int entities[DCGM_GROUP_MAX_ENTITIES_V2]; //!< OUT: Array of entities for entityGroup + unsigned int numEntities; //!< IN/OUT: Upon calling, this should be the number of + // entities that entityList[] can hold. Upon + // return, this will contain the number of + // entities actually saved to entityList. + unsigned int flags; //!< IN: Flags to modify the behavior of this request. 
+ // See DCGM_GEGE_FLAG_* + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetEntityGroupEntities_v2; + /** * Version 1 of dcgmGroupGetAllIds_t */ @@ -505,13 +522,25 @@ typedef struct * Version 1 of dcgmGroupGetInfo_t */ +typedef struct +{ + unsigned int groupId; //!< IN: Group ID for which information to be fetched + dcgmGroupInfo_v2 groupInfo; //!< OUT: Group Information + long long timestamp; //!< OUT: Timestamp of information + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupGetInfo_v1; + +/** + * Version 2 of dcgmGroupGetInfo_t + */ + typedef struct { unsigned int groupId; //!< IN: Group ID for which information to be fetched dcgmGroupInfo_t groupInfo; //!< OUT: Group Information long long timestamp; //!< OUT: Timestamp of information unsigned int cmdRet; //!< OUT: Error code generated -} dcgmGroupGetInfo_v1; +} dcgmGroupGetInfo_v2; #define SAMPLES_BUFFER_SIZE_V1 16384 @@ -520,9 +549,9 @@ typedef struct */ typedef struct { - unsigned int groupId; //!< IN: Optional group id for information to be fetched - dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for - unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES_V1]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. 
//!< This is ignored if fieldIdList[] is provided unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for @@ -540,9 +569,9 @@ typedef struct */ typedef struct { - unsigned int groupId; //!< IN: Optional group id for information to be fetched - dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for - unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES_V1]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. //!< This is ignored if fieldIdList[] is provided unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for @@ -553,6 +582,25 @@ typedef struct char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT: this field is last, and can be truncated for speed */ } dcgmEntitiesGetLatestValues_v2; +/** + * Version 3 of dcgmEntitiesGetLatestValues_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES_V2]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. + //!< This is ignored if fieldIdList[] is provided + unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for + unsigned int fieldIdCount; //!< IN: Number of field IDs in fieldIdList[] array. + unsigned int flags; //!< IN: Optional flags that affect how this request is processed. 
+ unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT: this field is last, and can be truncated for speed */ +} dcgmEntitiesGetLatestValues_v3; + /** * Version 1 of dcgmGetMultipleValuesForField */ @@ -761,9 +809,9 @@ typedef struct typedef struct { - dcgmNvLinkStatus_v3 ls; //!< IN/OUT: nvlink status populated on success + dcgmNvLinkStatus_v4 ls; //!< IN/OUT: nvlink status populated on success unsigned int cmdRet; //!< OUT: Error code generated -} dcgmGetNvLinkStatus_v2; +} dcgmGetNvLinkStatus_v3; typedef struct { @@ -815,16 +863,46 @@ typedef struct } dcgmMsgNvmlCreateInjectionGpu_v1; #ifdef INJECTION_LIBRARY_AVAILABLE -#define DCGM_MAX_EXTRA_KEYS 4 + typedef struct { - unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected - char key[DCGM_MAX_STR_LENGTH]; //!< IN: The key for the NVML injected value - injectNvmlVal_t extraKeys[DCGM_MAX_EXTRA_KEYS]; //!< IN: extra keys, optional - unsigned int extraKeyCount; //!< IN: the number of extra keys - injectNvmlVal_t value; //!< IN: the NVML value being injected - unsigned int cmdRet; //!< OUT: Error code generated + unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected + char key[DCGM_MAX_STR_LENGTH]; //!< IN: The key for the NVML injected value + injectNvmlVal_t extraKeys[NVML_INJECTION_MAX_EXTRA_KEYS + 1]; //!< IN: extra keys, optional + unsigned int extraKeyCount; //!< IN: the number of extra keys + injectNvmlRet_t injectNvmlRet; //!< IN: the return to the associate keys + unsigned int cmdRet; //!< OUT: Error code generated } dcgmMsgNvmlInjectDevice_v1; + +typedef struct +{ + unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected + char key[DCGM_MAX_STR_LENGTH]; //!< IN: The key for the NVML injected value + injectNvmlVal_t extraKeys[NVML_INJECTION_MAX_EXTRA_KEYS + 1]; //!< IN: extra keys, optional + unsigned int extraKeyCount; //!< IN: the number of 
extra keys + injectNvmlRet_t injectNvmlRets[NVML_INJECTION_MAX_RETURNS + 1]; //!< IN: the returns to the associate keys + unsigned int retCount; //!< IN: count of valid injectNvmlRets + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlInjectDeviceForFollowingCalls_v1; + +typedef struct +{ + unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlInjectedDeviceReset_v1; + +typedef struct +{ + injectNvmlFuncCallCounts_t funcCallCounts; //!< OUT: the NVML function call count info + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgGetNvmlInjectFuncCallCount_v1; + +typedef struct +{ + char uuid[DCGM_MAX_STR_LENGTH]; //!> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \ + (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type)) + +/** + * GPU Fabric Health Status Mask for various fields can be tested + * using the below macro. + * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE) + */ +#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \ + (NVML_GPU_FABRIC_HEALTH_GET(var, type) == \ + NVML_GPU_FABRIC_HEALTH_MASK##type##val) + +/** +* GPU Fabric information (v2). +* +* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field +* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask +* field to the end. This structure is not backwards-compatible with +* \ref nvmlGpuFabricInfo_t. +*/ +typedef struct { + unsigned int version; //!< Structure version identifier (set to nvmlGpuFabricInfo_v2) + unsigned char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs + nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". 
+ unsigned int cliqueId; //!< ID of the fabric clique to which this GPU belongs + nvmlGpuFabricState_t state; //!< Current state of GPU registration process + unsigned int healthMask; //!< GPU Fabric health Status Mask +} nvmlGpuFabricInfo_v2_t; + +typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; + +/** +* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version. +*/ +#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) + /** @} */ /***************************************************************************************************/ @@ -2467,6 +3338,76 @@ nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); */ nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. + * + * For S-class products. + * + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. + * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc + * + * @return + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); + +/** + * Retrieve the set of GPUs that have a CPU affinity with the given CPU number + * For all products. + * Supported on Linux only. + * + * @param cpuNumber The CPU number + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. 
When non-zero, \a deviceArray will be filled with \a count + * number of device handles. + * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Structure to store Driver branch information + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + char branch[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< driver branch +} nvmlSystemDriverBranchInfo_v1_t; +typedef nvmlSystemDriverBranchInfo_v1_t nvmlSystemDriverBranchInfo_t; +#define nvmlSystemDriverBranchInfo_v1 NVML_STRUCT_VERSION(SystemDriverBranchInfo, 1) + +/** + * Retrieves the driver branch of the NVIDIA driver installed on the system. + * + * For all products. + * + * The branch identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
+ * + * @param branchInfo Pointer to the driver branch information structure \a nvmlSystemDriverBranchInfo_t + * @param length The maximum allowed length of the driver branch string + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a branchInfo is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemGetDriverBranch(nvmlSystemDriverBranchInfo_t *branchInfo, unsigned int length); + + /** @} */ /***************************************************************************************************/ @@ -2632,24 +3573,6 @@ nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_ */ nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. 
- * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); /** @} */ /***************************************************************************************************/ @@ -2944,6 +3867,37 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index */ nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); +/** + * Get a unique identifier for the device module on the baseboard + * + * This API retrieves a unique identifier for each GPU module that exists on a given baseboard. + * For non-baseboard products, this ID would always be 0. 
+ * + * @param device The identifier of the target device + * @param moduleId Unique identifier for the GPU module + * + * @return + * - \ref NVML_SUCCESS if \a moduleId has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a moduleId is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetModuleId(nvmlDevice_t device, unsigned int *moduleId); + +/** + * Retrieves the Device's C2C Mode information + * + * @param device The identifier of the target device + * @param c2cModeInfo Output struct containing the device's C2C Mode info + * + * @return + * - \ref NVML_SUCCESS if \a C2C Mode Infor query is successful + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetC2cModeInfoV(nvmlDevice_t device, nvmlC2cModeInfo_v1_t *c2cModeInfo); /***************************************************************************************************/ @@ -3083,6 +4037,19 @@ nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); */ nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); +/** + * Get the NUMA node of the given GPU device. + * This only applies to platforms where the GPUs are NUMA nodes. 
+ * + * @param[in] device The device handle + * @param[out] node NUMA node ID of the device + * + * @returns + * - \ref NVML_SUCCESS if the NUMA node is retrieved successfully + * - \ref NVML_ERROR_NOT_SUPPORTED if request is not supported on the current platform + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device \a node is invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumaNodeId(nvmlDevice_t device, unsigned int *node); /** * Retrieve the common ancestor for two devices * For all products. @@ -3122,25 +4089,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, n */ nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - /** * Retrieve the status for a given p2p capability index between a given pair of GPU * @@ -3184,31 +4132,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t d */ nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); -/** - * Retrieve the MDEV UUID of a vGPU instance. - * - * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * MDEV UUID is displayed only on KVM platform. - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); - /** * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for * each GPU will have the form /dev/nvidia[minor number]. @@ -3344,6 +4267,27 @@ nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t devi */ nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); +/** + * Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. + * + * For all products with an inforom. 
+ * + * @param device The identifier of the target device + * @param timestamp The start timestamp of the last BBX Flush + * @param durationUs The duration (us) of the last BBX Flush + * + * @return + * - \ref NVML_SUCCESS if \a timestamp and \a durationUs are successfully retrieved + * - \ref NVML_ERROR_NOT_READY if the BBX object has not been flushed yet + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetLastBBXFlushTime(nvmlDevice_t device, unsigned long long *timestamp, + unsigned long *durationUs); + /** * Retrieves the display mode for the device. * @@ -3418,6 +4362,25 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableS */ nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); +/** + * Retrieves PCI attributes of this device. + * + * For all products. + * + * See \ref nvmlPciInfoExt_v1_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info + * + * @return + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfoExt(nvmlDevice_t device, nvmlPciInfoExt_t *pci); + /** * Retrieves the PCI attributes of this device. 
* @@ -3459,30 +4422,48 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); /** - * Retrieves the maximum PCIe link width possible with this device and system - * - * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report - * a max link width of 8. + * Retrieves the maximum PCIe link generation supported by this device * * For Fermi &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param maxLinkWidth Reference in which to return the max PCIe link generation + * @param maxLinkGenDevice Reference in which to return the max PCIe link generation * * @return - * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_SUCCESS if \a maxLinkGenDevice has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGenDevice is null * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGenDevice); /** - * Retrieves the current PCIe link generation - * - * For Fermi &tm; or newer fully supported devices. + * Retrieves the maximum PCIe link width possible with this device and system + * + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. 
+ * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + +/** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices. * * @param device The identifier of the target device * @param currLinkGen Reference in which to return the current PCIe link generation @@ -3601,6 +4582,20 @@ nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t */ nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); +/** + * Retrieve the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved GPCCLK VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); + /** * Retrieves the current setting of a clock that applications will use unless an 
overspec situation occurs. * Can be changed using \ref nvmlDeviceSetApplicationsClocks. @@ -3643,33 +4638,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClo */ nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - /** * Retrieves the clock speed for the clock specified by the clock type and clock ID. 
* @@ -3788,69 +4756,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, u */ nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); -/** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. 
- * - * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); - -/** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. - * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. 
- * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - /** * Retrieves the intended operating speed of the device's fan. * @@ -3901,6 +4806,30 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *sp */ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); +/** + * Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. + * + * For Maxwell &tm; or newer fully supported devices. + * + * For all discrete products with dedicated fans. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. 
+ * + * @param device The identifier of the target device + * @param fanSpeed Structure specifying the index of the target fan (input) and + * retrieved fan speed value (output) + * + * @return + * - \ref NVML_SUCCESS If everything worked + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a fan is not an acceptable + * index, or \a speed is NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED If the \a device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeedRPM(nvmlDevice_t device, nvmlFanSpeedInfo_t *fanSpeed); + /** * Retrieves the intended target speed of the device's specified fan. * @@ -3930,12 +4859,13 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); /** - * Sets the speed of the fan control policy to default. + * Retrieves the min and max fan speed that user can set for the GPU fan. * * For all cuda-capable discrete products with fans * * @param device The identifier of the target device - * @param fan The index of the fan, starting at zero + * @param minSpeed The minimum speed allowed to set + * @param maxSpeed The maximum speed allowed to set * * return * NVML_SUCCESS if speed has been adjusted @@ -3945,27 +4875,29 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned i * (doesn't have fans) * NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, + unsigned int * maxSpeed); /** - * Retrieves the min and max fan speed that user can set for the GPU fan. + * Gets current fan control policy. 
+ * + * For Maxwell &tm; or newer fully supported devices. * * For all cuda-capable discrete products with fans * - * @param device The identifier of the target device - * @param minSpeed The minimum speed allowed to set - * @param maxSpeed The maximum speed allowed to set + * device The identifier of the target \a device + * policy Reference in which to return the fan control \a policy * * return - * NVML_SUCCESS if speed has been adjusted + * NVML_SUCCESS if \a policy has been populated * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * NVML_ERROR_INVALID_ARGUMENT if device is invalid - * NVML_ERROR_NOT_SUPPORTED if the device does not support this - * (doesn't have fans) + * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference + * a fan that exists. + * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell * NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, - unsigned int * maxSpeed); +nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, + nvmlFanControlPolicy_t *policy); /** * Retrieves the number of fans on the device. @@ -3986,56 +4918,86 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned i nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans); /** - * Retrieves the current temperature readings for the device, in degrees C. + * @deprecated Use \ref nvmlDeviceGetTemperatureV instead + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); + +/** + * Retrieves the cooler's information. + * Returns a cooler's control signal characteristics. The possible types are restricted, Variable and Toggle. + * See \ref nvmlCoolerControl_t for details on available signal types. 
+ * Returns objects that cooler cools. Targets may be GPU, Memory, Power Supply or All of these. + * See \ref nvmlCoolerTarget_t for details on available targets. * - * For all products. + * For Maxwell &tm; or newer fully supported devices. * - * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. + * For all discrete products with dedicated fans. * - * @param device The identifier of the target device - * @param sensorType Flag that indicates which sensor reading to retrieve - * @param temp Reference in which to return the temperature reading + * @param[in] device The identifier of the target device + * @param[out] coolerInfo Structure specifying the cooler's control signal characteristics (out) + * and the target that cooler cools (out) * * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS If everything worked + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a signalType or \a target is NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED If the \a device does not support this feature */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); +nvmlReturn_t DECLDIR nvmlDeviceGetCoolerInfo(nvmlDevice_t device, nvmlCoolerInfo_t *coolerInfo); /** - * Retrieves the temperature threshold for the GPU with the specified 
threshold type in degrees C. + * Structure used to encapsulate temperature info + */ +typedef struct +{ + unsigned int version; + nvmlTemperatureSensors_t sensorType; + int temperature; +} nvmlTemperature_v1_t; + +typedef nvmlTemperature_v1_t nvmlTemperature_t; + +#define nvmlTemperature_v1 NVML_STRUCT_VERSION(Temperature, 1) + +/** + * Retrieves the current temperature readings (in degrees C) for the given device. * - * For Kepler &tm; or newer fully supported devices. + * For all products. * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. + * @param[in] device Target device identifier. + * @param[in,out] temperature Structure specifying the sensor type (input) and retrieved + * temperature value (output). * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value queried - * @param temp Reference in which to return the temperature reading * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR 
nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); +nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureV(nvmlDevice_t device, nvmlTemperature_t *temperature); + /** - * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. + * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. * - * For Maxwell &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. * * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. * + * Note: This API is no longer the preferred interface for retrieving the following temperature thresholds + * on Ada and later architectures: NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, + * NVML_TEMPERATURE_THRESHOLD_MEM_MAX and NVML_TEMPERATURE_THRESHOLD_GPU_MAX. + * + * Support for reading these temperature thresholds for Ada and later architectures would be removed from this + * API in future releases. Please use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_TEMPERATURE_* fields to retrieve + * temperature thresholds on these architectures. 
+ * * @param device The identifier of the target device - * @param thresholdType The type of threshold value to be set - * @param temp Reference which hold the value to be set + * @param thresholdType The type of threshold value queried + * @param temp Reference in which to return the temperature reading * @return * - \ref NVML_SUCCESS if \a temp has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized @@ -4044,7 +5006,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvml * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); +nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); /** * Used to execute a list of thermal system instructions. @@ -4084,50 +5046,60 @@ nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); /** - * Retrieves current clocks throttling reasons. + * Retrieves current clocks event reasons. * * For all fully supported products. * * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
* * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle + * @param clocksEventReasons Reference in which to return bitmask of active clocks event * reasons * * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set + * - \ref NVML_SUCCESS if \a clocksEventReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksEventReasons is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons + * @see nvmlClocksEventReasons + * @see nvmlDeviceGetSupportedClocksEventReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksEventReasons(nvmlDevice_t device, unsigned long long *clocksEventReasons); + +/** + * @deprecated Use \ref nvmlDeviceGetCurrentClocksEventReasons instead */ nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); /** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons + * Retrieves bitmask of supported clocks event reasons that can be returned by + * \ref nvmlDeviceGetCurrentClocksEventReasons * * For all fully supported products. * * This method is not supported in virtual machines running virtual GPU (vGPU). 
* * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons + * @param supportedClocksEventReasons Reference in which to return bitmask of supported + * clocks event reasons * * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set + * - \ref NVML_SUCCESS if \a supportedClocksEventReasons has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksEventReasons is NULL * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons + * @see nvmlClocksEventReasons + * @see nvmlDeviceGetCurrentClocksEventReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksEventReasons(nvmlDevice_t device, unsigned long long *supportedClocksEventReasons); + +/** + * @deprecated Use \ref nvmlDeviceGetSupportedClocksEventReasons instead */ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); @@ -4154,164 +5126,375 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t de nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); /** - * This API has been deprecated. - * - * Retrieves the power management mode associated with this device. - * - * For products from the Fermi family. - * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. - * - * For from the Kepler or newer families. - * - Does not require \a NVML_INFOROM_POWER object. 
- * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that - * that the driver will do so if the appropriate conditions are met. - * - * See \ref nvmlEnableState_t for details on allowed modes. + * Retrieve performance monitor samples from the associated subdevice. * - * @param device The identifier of the target device - * @param mode Reference in which to return the current power management mode + * @param device + * @param pDynamicPstatesInfo * * @return - * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); +nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); /** - * Retrieves the power management limit associated with this device. - * - * For Fermi &tm; or newer fully supported devices. - * - * The power limit defines the upper boundary for the card's power draw. If - * the card's total power draw reaches this limit the power management algorithm kicks in. - * - * This reading is only available if power management mode is supported. - * See \ref nvmlDeviceGetPowerManagementMode. + * Retrieve the MemClk (Memory Clock) VF offset value. 
+ * @param[in] device The identifier of the target device + * @param[out] offset The retrieved MemClk VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Retrieve min and max clocks of some clock domain for a given PState * * @param device The identifier of the target device - * @param limit Reference in which to return the power management limit in milliwatts + * @param type Clock domain + * @param pstate PState to query + * @param minClockMHz Reference in which to return min clock frequency + * @param maxClockMHz Reference in which to return max clock frequency * * @return - * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_SUCCESS if everything worked * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both + * \a minClockMHz and \a maxClockMHz are NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, + unsigned int * minClockMHz, unsigned int * maxClockMHz); /** - * Retrieves information about 
possible values of power management limits on this device. + * Get all supported Performance States (P-States) for the device. * - * For Kepler &tm; or newer fully supported devices. + * The returned array would contain a contiguous list of valid P-States supported by + * the device. If the number of supported P-States is fewer than the size of the array + * supplied, missing elements would contain \a NVML_PSTATE_UNKNOWN. + * + * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. * * @param device The identifier of the target device - * @param minLimit Reference in which to return the minimum power management limit in milliwatts - * @param maxLimit Reference in which to return the maximum power management limit in milliwatts + * @param pstates Container to return the list of performance states + * supported by device + * @param size Size of the supplied \a pstates array in bytes * * @return - * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set + * - \ref NVML_SUCCESS if \a pstates array has been retrieved + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the container supplied was not large enough to + * hold the resulting list * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, + nvmlPstates_t *pstates, unsigned int size); + +/** + * Retrieve the GPCCLK min max VF offset value.
+ * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved GPCCLK VF min offset value + * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a minOffset and \a maxOffset have been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minOffset or \a maxOffset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Retrieve the MemClk (Memory Clock) min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved MemClk VF min offset value + * @param[out] maxOffset The retrieved MemClk VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a minOffset and \a maxOffset have been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minOffset or \a maxOffset is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Retrieve min, max and current clock offset of some clock domain for a given PState + * + * For Maxwell &tm; or newer fully supported devices. + * + * Note: \ref nvmlDeviceGetGpcClkVfOffset, \ref nvmlDeviceGetMemClkVfOffset, \ref nvmlDeviceGetGpcClkMinMaxVfOffset and + * \ref nvmlDeviceGetMemClkMinMaxVfOffset will be deprecated in a future release. + * Use \ref nvmlDeviceGetClockOffsets instead.
+ * + * @param device The identifier of the target device + * @param info Structure specifying the clock type (input) and the pstate (input) + * retrieved clock offset value (output), min clock offset (output) + * and max clock offset (output) + * + * @return + * - \ref NVML_SUCCESS If everything worked + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a type or \a pstate are invalid or both + * \a minClockOffsetMHz and \a maxClockOffsetMHz are NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); + +/** + * Control current clock offset of some clock domain for a given PState + * + * For Maxwell &tm; or newer fully supported devices. + * + * Requires privileged user. + * + * @param device The identifier of the target device + * @param info Structure specifying the clock type (input), the pstate (input) + * and clock offset value (input) + * + * @return + * - \ref NVML_SUCCESS If everything worked + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION If the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a type or \a pstate are invalid or both + * \a clockOffsetMHz is out of allowed range. 
+ * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceSetClockOffsets(nvmlDevice_t device, nvmlClockOffset_t *info); + +/** + * Retrieves a performance mode string with all the + * performance modes defined for this device along with their associated + * GPU Clock and Memory Clock values. + * Not all tokens will be reported on all GPUs, and additional tokens + * may be added in the future. + * For backwards compatibility we still provide nvclock and memclock; + * those are the same as nvclockmin and memclockmin. + * + * Note: These clock values take into account the offset + * set by clients through \ref nvmlDeviceSetClockOffsets. + * + * Maximum available Pstate (P15) shows the minimum performance level (0) and vice versa. + * + * Each performance mode is returned as a comma-separated list of + * "token=value" pairs. Each set of performance mode tokens is separated + * by a ";".
Valid tokens: + * + * Token Value + * "perf" unsigned int - the Performance level + * "nvclock" unsigned int - the GPU clocks (in MHz) for the perf level + * "nvclockmin" unsigned int - the GPU clocks min (in MHz) for the perf level + * "nvclockmax" unsigned int - the GPU clocks max (in MHz) for the perf level + * "nvclockeditable" unsigned int - if the GPU clock domain is editable for the perf level + * "memclock" unsigned int - the memory clocks (in MHz) for the perf level + * "memclockmin" unsigned int - the memory clocks min (in MHz) for the perf level + * "memclockmax" unsigned int - the memory clocks max (in MHz) for the perf level + * "memclockeditable" unsigned int - if the memory clock domain is editable for the perf level + * "memtransferrate" unsigned int - the memory transfer rate (in MHz) for the perf level + * "memtransferratemin" unsigned int - the memory transfer rate min (in MHz) for the perf level + * "memtransferratemax" unsigned int - the memory transfer rate max (in MHz) for the perf level + * "memtransferrateeditable" unsigned int - if the memory transfer rate is editable for the perf level + * + * Example: + * + * perf=0, nvclock=324, nvclockmin=324, nvclockmax=324, nvclockeditable=0, + * memclock=324, memclockmin=324, memclockmax=324, memclockeditable=0, + * memtransferrate=648, memtransferratemin=648, memtransferratemax=648, + * memtransferrateeditable=0 ; + * perf=1, nvclock=324, nvclockmin=324, nvclockmax=640, nvclockeditable=0, + * memclock=810, memclockmin=810, memclockmax=810, memclockeditable=0, + * memtransferrate=1620, memtransferratemin=1620, memtransferratemax=1620, + * memtransferrateeditable=0 ; + * + * + * @param device The identifier of the target device + * @param perfModes Reference in which to return the performance level string + * + * @return + * - \ref NVML_SUCCESS if \a perfModes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a
device is invalid, or \a perfModes is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceModes(nvmlDevice_t device, nvmlDevicePerfModes_t *perfModes); + +/** + * Retrieves a string with the associated current GPU Clock and Memory Clock values. * - * @see nvmlDeviceSetPowerManagementLimit + * Not all tokens will be reported on all GPUs, and additional tokens + * may be added in the future. + * + * Note: These clock values take into account the offset + * set by clients through \ref nvmlDeviceSetClockOffsets. + * + * Clock values are returned as a comma-separated list of + * "token=value" pairs. + * Valid tokens: + * + * Token Value + * "perf" unsigned int - the Performance level + * "nvclock" unsigned int - the GPU clocks (in MHz) for the perf level + * "nvclockmin" unsigned int - the GPU clocks min (in MHz) for the perf level + * "nvclockmax" unsigned int - the GPU clocks max (in MHz) for the perf level + * "nvclockeditable" unsigned int - if the GPU clock domain is editable for the perf level + * "memclock" unsigned int - the memory clocks (in MHz) for the perf level + * "memclockmin" unsigned int - the memory clocks min (in MHz) for the perf level + * "memclockmax" unsigned int - the memory clocks max (in MHz) for the perf level + * "memclockeditable" unsigned int - if the memory clock domain is editable for the perf level + * "memtransferrate" unsigned int - the memory transfer rate (in MHz) for the perf level + * "memtransferratemin" unsigned int - the memory transfer rate min (in MHz) for the perf level + * "memtransferratemax" unsigned int - the memory transfer rate max (in MHz) for the perf level + * "memtransferrateeditable" unsigned int - if the memory transfer rate is editable for the perf level + * + * Example: + * + * nvclock=324,
nvclockmin=324, nvclockmax=324, nvclockeditable=0, + * memclock=324, memclockmin=324, memclockmax=324, memclockeditable=0, + * memtransferrate=648, memtransferratemin=648, memtransferratemax=648, + * memtransferrateeditable=0 ; + * + * + * @param device The identifier of the target device + * @param currentClockFreqs Reference in which to return the performance level string + * + * @return + * - \ref NVML_SUCCESS if \a currentClockFreqs has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClockFreqs(nvmlDevice_t device, nvmlDeviceCurrentClockFreqs_t *currentClockFreqs); /** - * Retrieves default power management limit on this device, in milliwatts. - * Default power management limit is a power management limit that the device boots with. + * This API has been deprecated. * - * For Kepler &tm; or newer fully supported devices. + * Retrieves the power management mode associated with this device. + * + * For products from the Fermi family. + * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. + * + * For from the Kepler or newer families. + * - Does not require \a NVML_INFOROM_POWER object. + * + * This flag indicates whether any power management algorithm is currently active on the device. An + * enabled state does not necessarily mean the device is being actively throttled -- only that + * that the driver will do so if the appropriate conditions are met. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
* * @param device The identifier of the target device - * @param defaultLimit Reference in which to return the default power management limit in milliwatts + * @param mode Reference in which to return the current power management mode * * @return - * - \ref NVML_SUCCESS if \a defaultLimit has been set + * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); /** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * Retrieves the power management limit associated with this device. * * For Fermi &tm; or newer fully supported devices. * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * The power limit defines the upper boundary for the card's power draw. If + * the card's total power draw reaches this limit the power management algorithm kicks in. * - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * This reading is only available if power management mode is supported. + * See \ref nvmlDeviceGetPowerManagementMode. 
* * @param device The identifier of the target device - * @param power Reference in which to return the power usage information + * @param limit Reference in which to return the power management limit in milliwatts * * @return - * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_SUCCESS if \a limit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); /** - * Retrieves current power mode on this device. + * Retrieves information about possible values of power management limits on this device. * - * %ADA_OR_NEWER% + * For Kepler &tm; or newer fully supported devices. 
* * @param device The identifier of the target device - * @param powerModeId Reference in which to return the power mode + * @param minLimit Reference in which to return the minimum power management limit in milliwatts + * @param maxLimit Reference in which to return the maximum power management limit in milliwatts * * @return - * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPowerManagementLimit */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerMode(nvmlDevice_t device, unsigned int *powerModeId); +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); /** - * Retrieves bitmask of supported power modes on this device. + * Retrieves default power management limit on this device, in milliwatts. + * Default power management limit is a power management limit that the device boots with. * - * %ADA_OR_NEWER% + * For Kepler &tm; or newer fully supported devices. 
* * @param device The identifier of the target device - * @param supportedPowerModes Reference in which to return the bitmask of supported power mode + * @param defaultLimit Reference in which to return the default power management limit in milliwatts * * @return - * - \ref NVML_SUCCESS if \a bitmask of supported power mode has been populated + * - \ref NVML_SUCCESS if \a defaultLimit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPowerModes(nvmlDevice_t device, unsigned int *supportedPowerModes); +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); /** - * Sets new power mode. + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. On Ampere + * (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval. On GA100 and + * older architectures, instantaneous power is returned. * - * %ADA_OR_NEWER% + * See \ref NVML_FI_DEV_POWER_AVERAGE and \ref NVML_FI_DEV_POWER_INSTANT to query specific power + * values. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. * * @param device The identifier of the target device - * @param powerModeId Power mode to set. 
+ * @param power Reference in which to return the power usage information * * @return * - \ref NVML_SUCCESS if \a power has been populated @@ -4321,7 +5504,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPowerModes(nvmlDevice_t device, unsig * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerMode(nvmlDevice_t device, unsigned int powerModeId); +nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); /** * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded @@ -4406,6 +5589,14 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuO * * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information. * + * @note On systems where GPUs are NUMA nodes, the accuracy of FB memory utilization + * provided by this API depends on the memory accounting of the operating system. + * This is because FB memory is managed by the operating system instead of the NVIDIA GPU driver. + * Typically, pages allocated from FB memory are not released even after + * the process terminates to enhance performance. In scenarios where + * the operating system is under memory pressure, it may resort to utilizing FB memory. + * Such actions can result in discrepancies in the accuracy of memory reporting. + * * @param device The identifier of the target device * @param memory Reference in which to return the memory information * @@ -4418,6 +5609,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuO * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); + +/** + * nvmlDeviceGetMemoryInfo_v2 accounts separately for reserved memory and includes it in the used memory amount. 
+ */ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory); /** @@ -4762,10 +5957,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned in * Retrieves information about active encoder sessions on a target device. * * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. @@ -4808,13 +6003,55 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned */ nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); +/** + * Retrieves the current utilization and sampling size in microseconds for the JPG + * + * %TURING_OR_NEWER% + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
+ * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for jpg utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetJpgUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current utilization and sampling size in microseconds for the OFA (Optical Flow Accelerator) + * + * %TURING_OR_NEWER% + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
+ * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for ofa utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetOfaUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + /** * Retrieves the active frame buffer capture sessions statistics for a given device. * * For Maxwell &tm; or newer fully supported devices. * * @param device The identifier of the target device -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats +* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats * * @return * - \ref NVML_SUCCESS if \a fbcStats is fetched @@ -4832,7 +6069,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t * * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * -* If the supplied buffer is not large enough to accomodate the active session array, the function returns +* If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. * To query the number of active FBC sessions, call this function with *sessionCount = 0. 
The code will return * NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. @@ -4859,11 +6096,11 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int /** * Retrieves the current and pending driver model for the device. * - * For Fermi &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. * For windows only. * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. + * On Windows platforms the device driver can run in either WDDM, MCDM or WDM (TCC) modes. If a display is attached + * to the device it must run in WDDM mode. MCDM mode is preferred if a display is not attached. TCC mode is deprecated. * * See \ref nvmlDriverModel_t for details on available driver models. * @@ -4879,9 +6116,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlDeviceSetDriverModel() + * @see nvmlDeviceSetDriverModel_v2() */ -nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); +nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); /** * Get VBIOS version of the device. 
@@ -5012,7 +6249,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); /** - * Get information about processes with a MPS compute context on a device + * Get information about processes with a Multi-Process Service (MPS) compute context on a device * * For Volta &tm; or newer fully supported devices. * @@ -5055,7 +6292,58 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t devic nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); /** - * Check if the GPU devices are on the same physical board. + * Get information about running processes on a device for input context + * + * For Hopper &tm; or newer fully supported devices. + * + * This function returns information only about running processes (e.g. CUDA application which have + * active context). + * + * To determine the size of the \a plist->procArray array to allocate, call the function with + * \a plist->numProcArrayEntries set to zero and \a plist->procArray set to NULL. The return + * code will be either NVML_ERROR_INSUFFICIENT_SIZE (if there are valid processes of type + * \a plist->mode to report on, in which case the \a plist->numProcArrayEntries field will + * indicate the required number of entries in the array) or NVML_SUCCESS (if no processes of type + * \a plist->mode exist). + * + * The usedGpuMemory field returned is all of the memory used by the application. + * The usedGpuCcProtectedMemory field returned is all of the protected memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a plist->procArray table in case new processes are spawned. 
+ * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in + * vGPU Host virtualization mode. + * Protected memory usage is currently not available in MIG mode and in windows. + * + * @param device The device handle or MIG device handle + * @param plist Reference in which to return the process detail list + * \a plist->version The api version + * \a plist->mode The process mode + * \a plist->procArray Reference in which to return the process information + * \a plist->numProcArrayEntries Proc array size of returned entries + * + * @return + * - \ref NVML_SUCCESS if \a plist->numProcArrayEntries and \a plist->procArray have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a plist->numProcArrayEntries indicates that the \a plist->procArray is too small + * \a plist->numProcArrayEntries will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a plist is NULL, \a plist->version is invalid, + * or \a plist->mode is invalid + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t *plist); + +/** + * Check if the GPU devices are on the same physical board. * * For all fully supported products. 
* @@ -5221,7 +6509,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqN * @param numCores The number of cores for the specified device * * @return - * - \ref NVML_SUCCESS if Gpu core count is successfully retrieved + * - \ref NVML_SUCCESS if GPU core count is successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a numCores is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device @@ -5250,7 +6538,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSour * Gets the device's memory bus width * * @param device The identifier of the target device - * @param maxSpeed The devices's memory bus width + * @param busWidth The devices's memory bus width * * @return * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved @@ -5269,7 +6557,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned i * @param maxSpeed The devices's PCIE Max Link speed in MBPS * * @return - * - \ref NVML_SUCCESS if Pcie Max Link Speed is successfully retrieved + * - \ref NVML_SUCCESS if PCIe Max Link Speed is successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device @@ -5297,7 +6585,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *p * Gets the device's Adaptive Clock status * * @param device The identifier of the target device - * @param adaptiveClockStatus The current adaptive clocking status + * @param adaptiveClockStatus The current adaptive clocking status, either + * NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED + * or NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED * * @return * - \ref NVML_SUCCESS if 
the current adaptive clocking status is successfully retrieved @@ -5310,661 +6600,603 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *p nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); /** - * @} + * Get the type of the GPU Bus (PCIe, PCI, ...) + * + * @param device The identifier of the target device + * @param type The PCI Bus type + * + * @return + * - \ref NVML_SUCCESS if the bus \a type is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a type is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ +nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); -/** @addtogroup nvmlAccountingStats - * @{ - */ -/** - * Queries the state of per process accounting mode. + /** + * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceGetGpuFabricInfoV instead * - * For Kepler &tm; or newer fully supported devices. + * Get fabric information associated with the device. + * + * For Hopper &tm; or newer fully supported devices. + * + * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager + * Upon successful registration, the GPU is added to the NVLink fabric to enable + * peer-to-peer communication. + * This API reports the current state of the GPU in the NVLink fabric + * along with other useful information. * - * See \ref nvmlDeviceGetAccountingStats for more details. 
- * See \ref nvmlDeviceSetAccountingMode * * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode + * @param gpuFabricInfo Information about GPU fabric state * * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); /** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. +* Versioned wrapper around \ref nvmlDeviceGetGpuFabricInfo that accepts a versioned +* \ref nvmlGpuFabricInfo_v2_t or later output structure. +* +* @note The caller must set the \ref nvmlGpuFabricInfoV_t.version field to the +* appropriate version prior to calling this function. For example: +* \code +* nvmlGpuFabricInfoV_t fabricInfo = +* { .version = nvmlGpuFabricInfo_v2 }; +* nvmlReturn_t result = nvmlDeviceGetGpuFabricInfoV(device,&fabricInfo); +* \endcode +* +* For Hopper &tm; or newer fully supported devices. +* +* @param device The identifier of the target device +* @param gpuFabricInfo Information about GPU fabric state +* +* @return +* - \ref NVML_SUCCESS Upon success +* - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, + nvmlGpuFabricInfoV_t *gpuFabricInfo); + +/** + * Get Conf Computing System capabilities. 
* - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. - * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * @param capabilities System CC capabilities * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * @return + * - \ref NVML_SUCCESS if \a capabilities were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); + +/** + * Get Conf Computing System State. * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats + * @param state System CC State * * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_SUCCESS if \a state were successfully queried * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); + +/** + * Get Conf Computing Protected and Unprotected Memory Sizes. * - * @see nvmlDeviceGetAccountingBufferSize + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device handle + * @param memInfo Protected/Unprotected Memory sizes + * + * @return + * - \ref NVML_SUCCESS if \a memInfo were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); /** - * Queries list of processes that can be queried for accounting stats. 
The list of processes returned - * can be in running or terminated state. + * Get Conf Computing GPUs ready state. * - * For Kepler &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * @param isAcceptingWork Returns GPU current work accepting state, + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE * - * For more details see \ref nvmlDeviceGetAccountingStats. + * @return + * - \ref NVML_SUCCESS if the current GPUs ready state was successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); + +/** + * Get Conf Computing protected memory usage. * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids + * @param memory Reference in which to return the memory information * * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_SUCCESS if \a memory has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); /** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. + * Get Conf Computing GPU certificate details. * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. + * @param gpuCert Reference in which to return the gpu certificate information * * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_SUCCESS if \a gpuCert has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a gpuCert is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, + nvmlConfComputeGpuCertificate_t *gpuCert); /** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * Get Conf Computing GPU attestation report. * - * For Kepler &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into + * @param device The identifier of the target device + * @param gpuAtstReport Reference in which to return the gpu attestation report * * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. + * - \ref NVML_SUCCESS if \a gpuAtstReport has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a gpuAtstReport is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses); - +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, + nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); /** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page 
that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's - * retirement. + * Get Conf Computing key rotation threshold detail. * - * For Kepler &tm; or newer fully supported devices. + * For Hopper &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 + * @param pKeyRotationThrInfo Reference in which to return the key rotation threshold data * * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
+ * - \ref NVML_SUCCESS if \a pKeyRotationThrInfo has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pKeyRotationThrInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeKeyRotationThresholdInfo( + nvmlConfComputeGetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); /** - * Check if any pages are pending retirement and need a reboot to fully retire. + * Set Conf Computing Unprotected Memory Size. * - * For Kepler &tm; or newer fully supported devices. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* - * @param device The identifier of the target device - * @param isPending Reference in which to return the pending status + * @param device Device Handle + * @param sizeKiB Unprotected Memory size to be set in KiB * * @return - * - \ref NVML_SUCCESS if \a isPending was populated + * - \ref NVML_SUCCESS if \a sizeKiB successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); +nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeUnprotectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); /** - * Get number of remapped rows. The number of rows reported will be based on - * the cause of the remapping. isPending indicates whether or not there are - * pending remappings. A reset will be required to actually remap the row. - * failureOccurred will be set if a row remapping ever failed in the past. A - * pending remapping won't affect future work on the GPU since - * error-containment and dynamic page blacklisting will take care of that. - * - * @note On MIG-enabled GPUs with active instances, querying the number of - * remapped rows is not supported + * Set Conf Computing GPUs ready state. * * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
* - * @param device The identifier of the target device - * @param corrRows Reference for number of rows remapped due to correctable errors - * @param uncRows Reference for number of rows remapped due to uncorrectable errors - * @param isPending Reference for whether or not remappings are pending - * @param failureOccurred Reference that is set when a remapping has failed in the past + * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN Unexpected error + * @return + * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device */ -nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, - unsigned int *isPending, unsigned int *failureOccurred); +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); /** - * Get the row remapper histogram. Returns the remap availability for each bank - * on the GPU. + * Set Conf Computing key rotation threshold. * - * @param device Device handle - * @param values Histogram values + * For Hopper &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * This function is to set the confidential compute key rotation threshold parameters. 
+ * \a pKeyRotationThrInfo->maxAttackerAdvantage should be in the range from + * NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MIN to NVML_CC_KEY_ROTATION_THRESHOLD_ATTACKER_ADVANTAGE_MAX. + * Default value is 60. + * + * @param pKeyRotationThrInfo Reference to the key rotation threshold data * * @return - * - \ref NVML_SUCCESS On success - * - \ref NVML_ERROR_UNKNOWN On any unexpected error + * - \ref NVML_SUCCESS if \a key rotation threshold max attacker advantage has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pKeyRotationThrInfo is NULL or invalid + * - \ref NVML_ERROR_INVALID_STATE if confidential compute GPU ready state is enabled + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeKeyRotationThresholdInfo( + nvmlConfComputeSetKeyRotationThresholdInfo_t *pKeyRotationThrInfo); /** - * Get architecture for device + * Get Conf Computing System Settings. * - * @param device The identifier of the target device - * @param arch Reference where architecture is returned, if call successful. - * Set to NVML_DEVICE_ARCH_* upon success + * For Hopper &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
+ * + * @param settings System CC settings * * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output refererence) are invalid + * - \ref NVML_SUCCESS If the query is success + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a counters is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeSettings(nvmlSystemConfComputeSettings_t *settings); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitCommands Unit Commands - * This chapter describes NVML operations that change the state of the unit. For S-class products. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ +/** + * Retrieve GSP firmware version. + * + * The caller passes in buffer via \a version and corresponding GSP firmware numbered version + * is returned with the same parameter in string format. 
+ * + * @param device Device handle + * @param version The retrieved GSP firmware version + * + * @return + * - \ref NVML_SUCCESS if GSP firmware version is successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -/***************************************************************************************************/ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); /** - * Set the LED state for the unit. The LED can be either green (0) or amber (1). + * Retrieve GSP firmware mode. * - * For S-class products. - * Requires root/admin permissions. + * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with + * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. * - * This operation takes effect immediately. + * @param device Device handle + * @param isEnabled Pointer to specify if GSP firmware is enabled + * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device * + * @return + * - \ref NVML_SUCCESS if GSP firmware mode is successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); + +/** + * Get SRAM ECC error status of this device. * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front - * and back LEDs will be toggled in unison regardless of which unit is specified with this command. 
+ * For Ampere &tm; or newer fully supported devices. + * Requires root/admin permissions. * - * See \ref nvmlLedColor_t for available colors. + * See \ref nvmlEccSramErrorStatus_v1_t for more information on the struct. * - * @param unit The identifier of the target unit - * @param color The target LED color + * @param device The identifier of the target device + * @param status Returns SRAM ECC error status * * @return - * - \ref NVML_SUCCESS if the LED color has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitGetLedState() + * - \ref NVML_SUCCESS If \a status has been populated + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a status is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a nvmlEccSramErrorStatus_t is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); +nvmlReturn_t DECLDIR nvmlDeviceGetSramEccErrorStatus(nvmlDevice_t device, + nvmlEccSramErrorStatus_t *status); -/** @} */ +/** + * @} + */ -/***************************************************************************************************/ -/** @defgroup nvmlDeviceCommands Device Commands - * This chapter describes NVML operations that change the state of the device. - * Each of these requires root/admin access. 
Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. +/** @addtogroup nvmlAccountingStats * @{ */ -/***************************************************************************************************/ /** - * Set the persistence mode for the device. - * - * For all products. - * For Linux only. - * Requires root/admin permissions. - * - * The persistence mode determines whether the GPU driver software is torn down after the last client - * exits. - * - * This operation takes effect immediately. It is not persistent across reboots. After each reboot the - * persistence mode is reset to "Disabled". + * Queries the state of per process accounting mode. * - * See \ref nvmlEnableState_t for available modes. + * For Kepler &tm; or newer fully supported devices. * - * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA - * memory, the given device handle will no longer be valid, and to continue to interact with this - * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This - * limitation is currently only applicable to devices that have a coherent NVLink connection to - * system memory. + * See \ref nvmlDeviceGetAccountingStats for more details. 
+ * See \ref nvmlDeviceSetAccountingMode * * @param device The identifier of the target device - * @param mode The target persistence mode + * @param mode Reference in which to return the current accounting mode * * @return - * - \ref NVML_SUCCESS if the persistence mode was set + * - \ref NVML_SUCCESS if the mode has been successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPersistenceMode() */ -nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); /** - * Set the compute mode for the device. - * - * For all products. - * Requires root/admin permissions. + * Queries process's accounting stats. * - * The compute mode determines whether a GPU can be used for compute operations and whether it can - * be shared across contexts. + * For Kepler &tm; or newer fully supported devices. * - * This operation takes effect immediately. Under Linux it is not persistent across reboots and - * always resets to "Default". Under windows it is persistent. + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. + * Accounting stats can be queried during life time of the process and after its termination. 
+ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. * - * Under windows compute mode may only be set to DEFAULT when running in WDDM + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. * - * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. + * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported * - * See \ref nvmlComputeMode_t for details on available compute modes. + * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. 
* * @param device The identifier of the target device - * @param mode The target compute mode + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats * * @return - * - \ref NVML_SUCCESS if the compute mode was set + * - \ref NVML_SUCCESS if stats have been successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL + * - \ref NVML_ERROR_NOT_FOUND if process stats were not found + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlDeviceGetComputeMode() + * @see nvmlDeviceGetAccountingBufferSize */ -nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); /** - * Set the ECC mode for the device. + * Queries list of processes that can be queried for accounting stats. The list of processes returned + * can be in running or terminated state. * * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires root/admin permissions. * - * The ECC mode determines whether the GPU enables its ECC support. 
+ * To query the number of processes under Accounting Mode, call this function with *count = 0 and pids=NULL. + * The return code will be NVML_ERROR_INSUFFICIENT_SIZE with an updated count value indicating the number of processes. * - * This operation takes effect after the next reboot. + * For more details see \ref nvmlDeviceGetAccountingStats. * - * See \ref nvmlEnableState_t for details on available modes. + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. * * @param device The identifier of the target device - * @param ecc The target ECC mode + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids * * @return - * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_SUCCESS if pids were successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. 
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to + * expected value) * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlDeviceGetEccMode() + * @see nvmlDeviceGetAccountingBufferSize */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); /** - * Clear the ECC error and other memory error counts for the device. + * Returns the number of processes that the circular buffer with accounting pids can hold. * * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. - * Requires root/admin permissions. - * Requires ECC Mode to be enabled. - * - * Sets all of the specified ECC counters to 0, including both detailed and total counts. * - * This operation takes effect immediately. - * - * See \ref nvmlMemoryErrorType_t for details on available counter types. + * This is the maximum number of processes that accounting information will be stored for before information + * about oldest processes will get overwritten by information about new processes. * * @param device The identifier of the target device - * @param counterType Flag that indicates which type of errors should be cleared. + * @param bufferSize Reference in which to provide the size (in number of elements) + * of the circular buffer for accounting stats. 
* * @return - * - \ref NVML_SUCCESS if the error counts were cleared + * - \ref NVML_SUCCESS if buffer size was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see - * - nvmlDeviceGetDetailedEccErrors() - * - nvmlDeviceGetTotalEccErrors() + * @see nvmlDeviceGetAccountingStats + * @see nvmlDeviceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); + +/** @} */ + +/** @addtogroup nvmlDeviceQueries + * @{ */ -nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); /** - * Set the driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * Requires root/admin permissions. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. - * - * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). - * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. - * - * This operation takes effect after the next reboot. 
- * - * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. - * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or - * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in Xid 63 * - * See \ref nvmlDriverModel_t for details on available driver models. - * See \ref nvmlFlagDefault and \ref nvmlFlagForce + * For Kepler &tm; or newer fully supported devices. * - * @param device The identifier of the target device - * @param driverModel The target driver model - * @param flags Flags that change the default behavior + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into * * @return - * - \ref NVML_SUCCESS if the driver model has been set + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. 
* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetDriverModel() */ -nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); - -typedef enum nvmlClockLimitId_enum { - NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, - NVML_CLOCK_LIMIT_ID_TDP, - NVML_CLOCK_LIMIT_ID_UNLIMITED -} nvmlClockLimitId_t; +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses); /** - * Set clocks that device will lock to. - * - * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. - * Setting this will supercede application clock values and take effect regardless if a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks - * - * Can be used as a setting to request constant performance. - * - * This can be called with a pair of integer clock frequencies in MHz, or a pair of /ref nvmlClockLimitId_t values. - * See the table below for valid combinations of these values. 
- * - * minGpuClock | maxGpuClock | Effect - * ------------+-------------+-------------------------------------------------- - * tdp | tdp | Lock clock to TDP - * unlimited | tdp | Upper bound is TDP but clock may drift below this - * tdp | unlimited | Lower bound is TDP but clock may boost above this - * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) - * - * If one arg takes one of these values, the other must be one of these values as - * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. - * - * Requires root/admin permissions. + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in Xid 63 * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetGpuLockedClocks. + * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's + * retirement. * - * For Volta &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. 
* - * @param device The identifier of the target device - * @param minGpuClockMHz Requested minimum gpu clock in MHz - * @param maxGpuClockMHz Requested maximum gpu clock in MHz + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 * * @return - * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. 
* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); /** - * Resets the gpu clock to the default value - * - * This is the gpu clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * @see nvmlDeviceSetGpuLockedClocks + * Check if any pages are pending retirement and need a reboot to fully retire. * - * For Volta &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. 
* - * @param device The identifier of the target device + * @param device The identifier of the target device + * @param isPending Reference in which to return the pending status * * @return - * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_SUCCESS if \a isPending was populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); /** - * Set memory clocks that device will lock to. - * - * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. - * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks - * - * Can be used as a setting to request constant performance. - * - * Requires root/admin permissions. + * Get number of remapped rows. The number of rows reported will be based on + * the cause of the remapping. isPending indicates whether or not there are + * pending remappings. A reset will be required to actually remap the row. + * failureOccurred will be set if a row remapping ever failed in the past. A + * pending remapping won't affect future work on the GPU since + * error-containment and dynamic page blacklisting will take care of that. 
* - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetMemoryLockedClocks. + * @note On MIG-enabled GPUs with active instances, querying the number of + * remapped rows is not supported * * For Ampere &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param minMemClockMHz Requested minimum memory clock in MHz - * @param maxMemClockMHz Requested maximum memory clock in MHz + * @param corrRows Reference for number of rows remapped due to correctable errors + * @param uncRows Reference for number of rows remapped due to uncorrectable errors + * @param isPending Reference for whether or not remappings are pending + * @param failureOccurred Reference that is set when a remapping has failed in the past * * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN Unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); +nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned 
int *uncRows, + unsigned int *isPending, unsigned int *failureOccurred); /** - * Resets the memory clock to the default value - * - * This is the memory clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * @see nvmlDeviceSetMemoryLockedClocks - * - * For Ampere &tm; or newer fully supported devices. + * Get the row remapper histogram. Returns the remap availability for each bank + * on the GPU. * - * @param device The identifier of the target device + * @param device Device handle + * @param values Histogram values * * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS On success + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); +nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); /** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. 
- * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. + * Get architecture for device * * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz + * @param arch Reference where architecture is returned, if call successful. + * Set to NVML_DEVICE_ARCH_* upon success * * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output reference) are invalid */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
+nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); /** * Retrieves the frequency monitor fault status for the device. @@ -5990,1391 +7222,2344 @@ nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsign nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); /** - * Set new power limit of this device. + * Retrieves the current utilization and process ID * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. + * For Maxwell &tm; or newer fully supported devices. * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. If no valid sample entries are found since the lastSeenTimeStamp, NVML_ERROR_NOT_FOUND + * is returned. * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. 
* - * @param device The identifier of the target device - * @param limit Power management limit in milliwatts to set + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
+ * @return - * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_SUCCESS if \a utilization has been populated * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a processSamplesCount is NULL * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPowerManagementLimitConstraints - * @see nvmlDeviceGetPowerManagementDefaultLimit */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); /** - * Sets new GOM. See \a nvmlGpuOperationMode_t for details. + * Retrieves the recent utilization and process ID for all running processes * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. - * Requires root/admin permissions. + * For Maxwell &tm; or newer fully supported devices. * - * Changing GOMs requires a reboot. - * The reboot requirement might be removed in the future. + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder, jpeg decoder, OFA (Optical Flow Accelerator) + * for all running processes.
Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a procesesUtilInfo->procUtilArray. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. * - * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when - * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. + * The caller should allocate a buffer of size processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t). If the buffer is too small, the API will + * return \a NVML_ERROR_INSUFFICIENT_SIZE, with the recommended minimal buffer size at \a procesesUtilInfo->processSamplesCount. The caller should + * invoke the function again with the allocated buffer passed in \a procesesUtilInfo->procUtilArray, and \a procesesUtilInfo->processSamplesCount + * set to the number no less than the recommended value by the previous API return. * - * @param device The identifier of the target device - * @param mode Target GOM + * On successful return, the function updates \a procesesUtilInfo->processSamplesCount with the number of process utilization info structures + * that were actually written. This may differ from a previously read value as instances are created or destroyed. 
* - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * \a procesesUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a procesesUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceGetGpuOperationMode + * \a procesesUtilInfo->version is the version number of the structure nvmlProcessesUtilizationInfo_t, the caller should set the correct version + * number to retrieve the specific version of processes utilization information. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param procesesUtilInfo Pointer to the caller-provided structure of nvmlProcessesUtilizationInfo_t. 
+ + * @return + * - \ref NVML_SUCCESS If \a procesesUtilInfo->procUtilArray has been populated + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a procesesUtilInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a procesesUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a procesesUtilInfo->procUtilArray is NULL, or the buffer size of procesesUtilInfo->procUtilArray is too small. + * The caller should check the minimum array size from the returned procesesUtilInfo->processSamplesCount, and call + * the function again with a buffer no smaller than procesesUtilInfo->processSamplesCount * sizeof(nvmlProcessUtilizationInfo_t) + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); +nvmlReturn_t DECLDIR nvmlDeviceGetProcessesUtilizationInfo(nvmlDevice_t device, nvmlProcessesUtilizationInfo_t *procesesUtilInfo); /** - * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. - * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction - * to query the current restriction settings. + * Get platform information of this device. * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. + * %BLACKWELL_OR_NEWER% + * + * See \ref nvmlPlatformInfo_v1_t for more information on the struct.
* * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted The target restriction + * @param platformInfo Pointer to the caller-provided structure of nvmlPlatformInfo_t. * * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G. Enabling/disabling auto - * boosted clocks is not supported by the device) - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t + * - \ref NVML_SUCCESS If \a platformInfo has been retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a platformInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + * - \ref NVML_ERROR_MEMORY if system memory is insufficient + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a nvmlPlatformInfo_t is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); +nvmlReturn_t DECLDIR nvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *platformInfo); -/** - * @} - */ +/** @} */ -/** @addtogroup nvmlAccountingStats +/***************************************************************************************************/ +/** @defgroup nvmlUnitCommands Unit Commands + * This chapter describes NVML operations that change the 
state of the unit. For S-class products. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. * @{ */ +/***************************************************************************************************/ /** - * Enables or disables per process accounting. + * Set the LED state for the unit. The LED can be either green (0) or amber (1). * - * For Kepler &tm; or newer fully supported devices. + * For S-class products. * Requires root/admin permissions. * - * @note This setting is not persistent and will default to disabled after driver unloads. - * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. + * This operation takes effect immediately. * - * @note Disabling accounting clears all accounting pids information. * - * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. + * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * and back LEDs will be toggled in unison regardless of which unit is specified with this command. * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceClearAccountingPids + * See \ref nvmlLedColor_t for available colors. 
* - * @param device The identifier of the target device - * @param mode The target accounting mode + * @param unit The identifier of the target unit + * @param color The target LED color * * @return - * - \ref NVML_SUCCESS if the new mode has been set + * - \ref NVML_SUCCESS if the LED color has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Clears accounting information about all processes that have already terminated. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. 
- * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * @see nvmlUnitGetLedState() */ -nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); +nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); /** @} */ /***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. +/** @defgroup nvmlDeviceCommands Device Commands + * This chapter describes NVML operations that change the state of the device. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. * @{ */ /***************************************************************************************************/ /** - * Retrieves the state of the device's NvLink for the link specified + * Set the persistence mode for the device. * - * For Pascal &tm; or newer fully supported devices. + * For all products. + * For Linux only. + * Requires root/admin permissions. + * + * The persistence mode determines whether the GPU driver software is torn down after the last client + * exits. + * + * This operation takes effect immediately. It is not persistent across reboots. After each reboot the + * persistence mode is reset to "Disabled". 
+ * + * See \ref nvmlEnableState_t for available modes. + * + * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA + * memory, the given device handle will no longer be valid, and to continue to interact with this + * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This + * limitation is currently only applicable to devices that have a coherent NVLink connection to + * system memory. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it - * is inactive + * @param mode The target persistence mode * * @return - * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_SUCCESS if the persistence mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPersistenceMode() */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); +nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); /** - * Retrieves the version of the device's NvLink for the link specified + * Set the compute mode for the device. 
* - * For Pascal &tm; or newer fully supported devices. + * For all products. + * Requires root/admin permissions. + * + * The compute mode determines whether a GPU can be used for compute operations and whether it can + * be shared across contexts. + * + * This operation takes effect immediately. Under Linux it is not persistent across reboots and + * always resets to "Default". Under windows it is persistent. + * + * Under windows compute mode may only be set to DEFAULT when running in WDDM + * + * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. + * + * See \ref nvmlComputeMode_t for details on available compute modes. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param version Requested NvLink version + * @param mode The target compute mode * * @return - * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_SUCCESS if the compute mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetComputeMode() */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); +nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); /** - * Retrieves the requested capability from the device's NvLink for the link 
specified - * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried - * The return value should be treated as a boolean. + * Set the ECC mode for the device. * - * For Pascal &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support. + * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is available + * @param ecc The target ECC mode * * @return - * - \ref NVML_SUCCESS if \a capResult has been set + * - \ref NVML_SUCCESS if the ECC mode was set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); +nvmlReturn_t DECLDIR 
nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); /** - * Retrieves the PCI information for the remote node on a NvLink link - * Note: pciSubSystemId is not filled in this function and is indeterminate + * Clear the ECC error and other memory error counts for the device. * - * For Pascal &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. + * Requires root/admin permissions. + * Requires ECC Mode to be enabled. + * + * Sets all of the specified ECC counters to 0, including both detailed and total counts. + * + * This operation takes effect immediately. + * + * See \ref nvmlMemoryErrorType_t for details on available counter types. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link + * @param counterType Flag that indicates which type of errors should be cleared. 
* * @return - * - \ref NVML_SUCCESS if \a pci has been set + * - \ref NVML_SUCCESS if the error counts were cleared * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see + * - nvmlDeviceGetDetailedEccErrors() + * - nvmlDeviceGetTotalEccErrors() */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); /** - * Retrieves the specified error counter value - * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available + * Set the driver model for the device. * - * For Pascal &tm; or newer fully supported devices. + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * Requires root/admin permissions. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. + * + * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). + * This should only be done if the host is subsequently powered down and the display is detached from the device + * before the next reboot. + * + * This operation takes effect after the next reboot. 
+ * + * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. + * + * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or + * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. + * + * See \ref nvmlDriverModel_t for details on available driver models. + * See \ref nvmlFlagDefault and \ref nvmlFlagForce * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the NvLink counter to be queried - * @param counterValue Returned counter value + * @param driverModel The target driver model + * @param flags Flags that change the default behavior * * @return - * - \ref NVML_SUCCESS if \a counter has been set + * - \ref NVML_SUCCESS if the driver model has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetDriverModel() */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, - nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); +nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); + +typedef enum nvmlClockLimitId_enum { + NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, + 
NVML_CLOCK_LIMIT_ID_TDP, + NVML_CLOCK_LIMIT_ID_UNLIMITED +} nvmlClockLimitId_t; /** - * Resets all error counters to zero - * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset + * Set clocks that device will lock to. * - * For Pascal &tm; or newer fully supported devices. + * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. + * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. + * See \ref nvmlDeviceSetApplicationsClocks + * + * Can be used as a setting to request constant performance. + * + * This can be called with a pair of integer clock frequencies in MHz, or a pair of \ref nvmlClockLimitId_t values. + * See the table below for valid combinations of these values. + * + * minGpuClock | maxGpuClock | Effect + * ------------+-------------+-------------------------------------------------- + * tdp | tdp | Lock clock to TDP + * unlimited | tdp | Upper bound is TDP but clock may drift below this + * tdp | unlimited | Lower bound is TDP but clock may boost above this + * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) + * + * If one arg takes one of these values, the other must be one of these values as + * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. + * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetGpuLockedClocks. + * + * For Volta &tm; or newer fully supported devices.
* * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried + * @param minGpuClockMHz Requested minimum gpu clock in MHz + * @param maxGpuClockMHz Requested maximum gpu clock in MHz * * @return - * - \ref NVML_SUCCESS if the reset is successful + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); +nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); /** - * Deprecated: Setting utilization counter control is no longer supported. + * Resets the gpu clock to the default value * - * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset - * of the counters if the reset parameter is non-zero. + * This is the gpu clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. * - * For Pascal &tm; or newer fully supported devices. + * @see nvmlDeviceSetGpuLockedClocks + * + * For Volta &tm; or newer fully supported devices. 
* * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set - * @param reset Resets the counters on set if non-zero * * @return - * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control, unsigned int reset); +nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); /** - * Deprecated: Getting utilization counter control is no longer supported. + * Set memory clocks that device will lock to. * - * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition + * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. + * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. + * See \ref nvmlDeviceSetApplicationsClocks * - * For Pascal &tm; or newer fully supported devices. + * Can be used as a setting to request constant performance. 
+ * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetMemoryLockedClocks. + * + * For Ampere &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information + * @param minMemClockMHz Requested minimum memory clock in MHz + * @param maxMemClockMHz Requested maximum memory clock in MHz * * @return - * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minMemClockMHz and \a maxMemClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control); - +nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); /** - * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. 
+ * Resets the memory clock to the default value * - * Retrieve the NVLINK utilization counter based on the current control for a specified counter. - * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl - * before reading the utilization counters as they have no default state + * This is the memory clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. * - * For Pascal &tm; or newer fully supported devices. + * @see nvmlDeviceSetMemoryLockedClocks + * + * For Ampere &tm; or newer fully supported devices. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be read (0 or 1). - * @param rxcounter Receive counter return value - * @param txcounter Transmit counter return value * * @return - * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, - unsigned long long *rxcounter, unsigned long long *txcounter); +nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); /** - * Deprecated: Freezing 
NVLINK utilization counters is no longer supported. + * Set clocks that applications will lock to. * - * Freeze the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function + * Sets the clocks that compute and graphics applications will be running at. + * e.g. CUDA driver requests these clocks during context creation which means this property + * defines clocks at which CUDA applications will be running unless some overspec event + * occurs (e.g. over power, over thermal or external HW brake). * - * For Pascal &tm; or newer fully supported devices. + * Can be used as a setting to request constant performance. + * + * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * + * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call + * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting + * above the clock value being set. + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * for details on how to list available clocks combinations. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetApplicationsClocks. * * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be frozen (0 or 1). 
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters - * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters + * @param memClockMHz Requested memory clock in MHz + * @param graphicsClockMHz Requested graphics clock in MHz * * @return - * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, - unsigned int counter, nvmlEnableState_t freeze); +nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); /** - * Deprecated: Resetting NVLINK utilization counters is no longer supported. + * Resets the application clock to the default value * - * Reset the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function + * This is the applications clock that will be used after system reboot or driver reload. + * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. * - * For Pascal &tm; or newer fully supported devices. 
+ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, + * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above + * base clocks as thermal limits allow. * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be reset - * @param counter Specifies the counter that should be reset (0 or 1) + * @see nvmlDeviceGetApplicationsClock + * @see nvmlDeviceSetApplicationsClocks * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); - -/** -* Get the NVLink device type of the remote device connected over the given link. 
-* -* @param device The device handle of the target GPU -* @param link The NVLink link index on the target GPU -* @param pNvLinkDeviceType Pointer in which the output remote device type is returned -* -* @return -* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or -* \a pNvLinkDeviceType is NULL -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is -* otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for - * some event to occur. - * @{ - */ -/***************************************************************************************************/ - -/** - * Create an empty set of events. - * Event set should be freed by \ref nvmlEventSetFree + * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. * - * For Fermi &tm; or newer fully supported devices. 
- * @param set Reference in which to return the event handle + * @param device The identifier of the target device * * @return - * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventSetFree */ -nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); +nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); /** - * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t - * - * For Fermi &tm; or newer fully supported devices. - * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) - * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) + * Try to set the current state of Auto Boosted clocks on a device. * - * For Linux only. + * For Kepler &tm; or newer fully supported devices. * - * \b IMPORTANT: Operations on \a set are not thread safe + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. * - * This call starts recording of events on specific device. - * All events that occurred before this call are not recorded. 
- * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * Non-root users may use this API by default but can be restricted by root from using this API by calling + * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. + * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. * - * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. - * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes - * are registered in that case. + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. * * @param device The identifier of the target device - * @param eventTypes Bitmask of \ref nvmlEventType to record - * @param set Set to which add new event types + * @param enabled What state to try to set Auto Boosted clocks of the target device to * * @return - * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlEventType - * @see nvmlDeviceGetSupportedEventTypes - * @see nvmlEventSetWait - * @see nvmlEventSetFree */ 
-nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); +nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); /** - * Returns information about events supported on device + * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will + * return to when no compute running processes (e.g. CUDA application which have an active context) are running * - * For Fermi &tm; or newer fully supported devices. + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. * - * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. * * @param device The identifier of the target device - * @param eventTypes Reference in which to return bitmask of supported events + * @param enabled What state to try to set default Auto Boosted clocks of the target device to + * @param flags Flags that change the default behavior. Currently Unused. 
* * @return - * - \ref NVML_SUCCESS if the eventTypes has been set + * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL + * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); /** - * Waits on events and delivers events - * - * For Fermi &tm; or newer fully supported devices. + * Sets the speed of the fan control policy to default. * - * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives - * but not longer than specified timeout. This function in certain conditions can return before - * specified timeout passes (e.g. when interrupt arrives) + * For all cuda-capable discrete products with fans * - * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. - * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error - * type is returned for all xid error events. 
+ * @param device The identifier of the target device + * @param fan The index of the fan, starting at zero * - * On Linux, every xid error event would return the associated event data and other information if applicable. + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); + +/** + * Sets current fan control policy. * - * In MIG mode, if device handle is provided, the API reports all the events for the available instances, - * only if the caller has appropriate privileges. In absence of required privileges, only the events which - * affect all the instances (i.e. whole device) are reported. + * For Maxwell &tm; or newer fully supported devices. * - * This API does not currently support per-instance event reporting using MIG device handles. + * Requires privileged user. 
* - * @param set Reference to set of events to wait on - * @param data Reference in which to return event data - * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * For all cuda-capable discrete products with fans * - * @return - * - \ref NVML_SUCCESS if the data has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL - * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived - * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * device The identifier of the target \a device + * policy The fan control \a policy to set * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents + * return + * NVML_SUCCESS if \a policy has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference + * a fan that exists. + * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell + * NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, + nvmlFanControlPolicy_t policy); /** - * Releases events in the set + * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. * - * For Fermi &tm; or newer fully supported devices. + * For Maxwell &tm; or newer fully supported devices. * - * @param set Reference to events to be released + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
* + * @param device The identifier of the target device + * @param thresholdType The type of threshold value to be set + * @param temp Reference which hold the value to be set * @return - * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_SUCCESS if \a temp has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states - * This chapter describes methods that NVML can perform against each device to control their drain state - * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to - * power on/off GPUs, enable robust reset scenarios, etc. - * @{ */ -/***************************************************************************************************/ +nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); /** - * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. - * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before - * this call is made. - * Must be called as administrator. - * For Linux only. + * Set new power limit of this device. * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
+ * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. * - * @param pciInfo The PCI address of the GPU drain state to be modified - * @param newState The drain state that should be entered, see \ref nvmlEnableState_t + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * @param device The identifier of the target device + * @param limit Power management limit in milliwatts to set * * @return - * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_SUCCESS if \a limit has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPowerManagementLimitConstraints + * @see nvmlDeviceGetPowerManagementDefaultLimit */ -nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); /** - * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining - * state. - * For Linux only. + * Sets new GOM. 
See \a nvmlGpuOperationMode_t for details. * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. + * Requires root/admin permissions. * - * @param pciInfo The PCI address of the GPU drain state to be queried - * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t + * Changing GOMs requires a reboot. + * The reboot requirement might be removed in the future. + * + * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when + * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. + * + * @param device The identifier of the target device + * @param mode Target GOM * * @return - * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_SUCCESS if \a mode has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceGetGpuOperationMode */ -nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); +nvmlReturn_t DECLDIR 
nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); /** - * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver - * as long as no other processes are attached. If other processes are attached, this call will return - * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the - * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the - * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled - * prior to this call. - * - * For long-running NVML processes please note that this will change the enumeration of current GPUs. - * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. - * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. - * For Linux only. + * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. + * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction + * to query the current restriction settings. * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. 
* - * @param pciInfo The PCI address of the GPU to be removed - * @param gpuState Whether the GPU is to be removed, from the OS - * see \ref nvmlDetachGpuState_t - * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted The target restriction * * @return - * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_SUCCESS if \a isRestricted has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support + * the feature that api restrictions are being set for (E.G. Enabling/disabling auto + * boosted clocks is not supported by the device) + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); +nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); /** - * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. 
- * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes - * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. + * Sets the speed of a specified fan. * - * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds - * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. + * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor + * the temperature and adjust the fan speed accordingly. + * If you set the fan speed too low you can burn your GPU! + * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. * - * Must be run as administrator. - * For Linux only. + * For all cuda-capable discrete products with fans that are Maxwell or Newer. * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. + * device The identifier of the target device + * fan The index of the fan, starting at zero + * speed The target speed of the fan [0-100] in % of max speed * - * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device - * fields are used in this call. + * return + * NVML_SUCCESS if the fan speed has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, + * or if the fan index doesn't reference an actual fan. + * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. + * NVML_ERROR_UNKNOWN if there was an unexpected error. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); + +/** + * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. It works + * on Maxwell onwards GPU architectures. 
+ * + * Set the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[in] offset The GPCCLK VF offset value to set * * @return - * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_SUCCESS if \a offset has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature - * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueQueries Field Value Queries - * This chapter describes NVML operations that are associated with retrieving Field Values from NVML - * @{ - */ -/***************************************************************************************************/ +nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); /** - * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. - * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs - * will be populated from a single call rather than making a driver call for each fieldId. + * Deprecated: Will be deprecated in a future release. Use \ref nvmlDeviceSetClockOffsets instead. 
It works + * on Maxwell onwards GPU architectures. * - * @param device The device handle of the GPU to request field values for - * @param valuesCount Number of entries in values that should be retrieved - * @param values Array of \a valuesCount structures to hold field values. - * Each value's fieldId must be populated prior to this call + * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. + * @param[in] device The identifier of the target device + * @param[in] offset The MemClk VF offset value to set * * @return - * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must - * check the nvmlReturn field of each value for each individual - * status - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - - -/** @} */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); -/***************************************************************************************************/ -/** @defgroup vGPU Enums, Constants and Structs - * @{ +/** + * @} */ -/** @} */ -/***************************************************************************************************/ -/***************************************************************************************************/ -/** @defgroup nvmlVirtualGpuQueries vGPU APIs - * This chapter describes operations that are associated with NVIDIA vGPU Software 
products. +/** @addtogroup nvmlAccountingStats * @{ */ -/***************************************************************************************************/ /** - * This method is used to get the virtualization mode corresponding to the GPU. + * Enables or disables per process accounting. * * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. * - * @param device Identifier of the target device - * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * @note This setting is not persistent and will default to disabled after driver unloads. + * Enable persistence mode to be sure the setting doesn't switch off to disabled. * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); - -/** - * Queries if SR-IOV host operation is supported on a vGPU supported device. + * @note Enabling accounting mode has no negative impact on the GPU performance. * - * Checks whether SR-IOV host capability is supported by the device and the - * driver, and indicates device is in SR-IOV mode if both of these conditions - * are true. + * @note Disabling accounting clears all accounting pids information. * - * @param device The identifier of the target device - * @param pHostVgpuMode Reference in which to return the current vGPU mode + * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. 
+ * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceClearAccountingPids + * + * @param device The identifier of the target device + * @param mode The target accounting mode * * @return - * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pVgpuMode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. - * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred + * - \ref NVML_SUCCESS if the new mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); +nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); /** - * This method is used to set the virtualization mode corresponding to the GPU. + * Clears accounting information about all processes that have already terminated. * * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
+ * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device * * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); +nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); /** - * Retrieve the vGPU Software licensable features. + * Set new power limit of this device. * - * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. 
+ * + * See \ref nvmlPowerValue_v2_t for more information on the struct. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * This API replaces nvmlDeviceSetPowerManagementLimit. It can be used as a drop-in replacement for the older version. + * + * @param device The identifier of the target device + * @param powerValue Power management limit in milliwatts to set * * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a powerValue is NULL or contains invalid values + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see NVML_FI_DEV_POWER_AVERAGE + * @see NVML_FI_DEV_POWER_INSTANT + * @see NVML_FI_DEV_POWER_MIN_LIMIT + * @see NVML_FI_DEV_POWER_MAX_LIMIT + * @see NVML_FI_DEV_POWER_CURRENT_LIMIT */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit_v2(nvmlDevice_t device, nvmlPowerValue_v2_t *powerValue); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup NvLink NvLink Methods + * This chapter describes methods that NVML can perform on NVLINK enabled devices. 
+ * @{ + */ +/***************************************************************************************************/ /** - * Retrieves the current utilization and process ID + * Retrieves the state of the device's NvLink for the link specified * - * For Maxwell &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. - * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
- * - * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that + * the link is active and NVML_FEATURE_DISABLED indicates it + * is inactive * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_SUCCESS if \a isActive has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); /** - * Retrieve GSP firmware version. 
+ * Retrieves the version of the device's NvLink for the link specified * - * The caller passes in buffer via \a version and corresponding GSP firmware numbered version - * is returned with the same parameter in string format. + * For Pascal &tm; or newer fully supported devices. * - * @param device Device handle - * @param version The retrieved GSP firmware version + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param version Requested NvLink version from nvmlNvlinkVersion_t * * @return - * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); /** - * Retrieve GSP firmware mode. + * Retrieves the requested capability from the device's NvLink for the link specified + * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried + * The return value should be treated as a boolean. * - * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with - * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * For Pascal &tm; or newer fully supported devices. 
* - * @param device Device handle - * @param isEnabled Pointer to specify if GSP firmware is enabled - * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is available * * @return - * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_SUCCESS if \a capResult has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpu vGPU Management - * @{ - * - * This chapter describes APIs supporting NVIDIA vGPU. - */ -/***************************************************************************************************/ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, + nvmlNvLinkCapability_t capability, unsigned int *capResult); /** - * Retrieve the supported vGPU types on a physical GPU (device). - * - * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. 
The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. + * Retrieves the PCI information for the remote node on a NvLink link + * Note: pciSubSystemId is not filled in this function and is indeterminate * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. + * For Pascal &tm; or newer fully supported devices. * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param pci \a nvmlPciInfo_t of the remote node for the specified link * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if \a pci has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR 
nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); /** - * Retrieve the currently creatable vGPU types on a physical GPU (device). - * - * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types - * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable - * list will be restricted to whatever vGPU type is already running on the device. + * Retrieves the specified error counter value + * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. + * For Pascal &tm; or newer fully supported devices. 
* - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the NvLink counter to be queried + * @param counterValue Returned counter value * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if \a counterValue has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, + nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); /** - * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * Resets all error counters to zero + * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset * - * For Kepler &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. 
* - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeClass Pointer to string array to return class in - * @param size Size of string + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be reset * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if the reset is successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); /** - * Retrieve the vGPU type name. + * Deprecated: Setting utilization counter control is no longer supported. * - * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not - * exceed 64 characters in length (including the NUL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset + * of the counters if the reset parameter is non-zero. * - * For Kepler &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. 
* - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeName Pointer to buffer to return name - * @param size Size of buffer + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set + * @param reset Resets the counters on set if non-zero * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control, unsigned int reset); /** - * Retrieve the GPU Instance Profile ID for the given vGPU type ID. - * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is - * returned. + * Deprecated: Getting utilization counter control is no longer supported. * - * For Kepler &tm; or newer fully supported devices. + * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. 
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition * - * @param vgpuTypeId Handle to vGPU type - * @param gpuInstanceProfileId GPU Instance Profile ID + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control); + /** - * Retrieve the device ID of a vGPU type. + * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. * - * For Kepler &tm; or newer fully supported devices. + * Retrieve the NVLINK utilization counter based on the current control for a specified counter. 
+ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl + * before reading the utilization counters as they have no default state * - * @param vgpuTypeId Handle to vGPU type - * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be read (0 or 1). + * @param rxcounter Receive counter return value + * @param txcounter Transmit counter return value * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, + unsigned long long *rxcounter, unsigned long long *txcounter); /** - * Retrieve the vGPU framebuffer size in bytes. + * Deprecated: Freezing NVLINK utilization counters is no longer supported. * - * For Kepler &tm; or newer fully supported devices. 
+ * Freeze the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function * - * @param vgpuTypeId Handle to vGPU type - * @param fbSize Pointer to framebuffer size in bytes + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be frozen (0 or 1). + * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters + * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); +nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, + unsigned int counter, nvmlEnableState_t freeze); /** - * Retrieve count of vGPU's supported display heads. + * Deprecated: Resetting NVLINK utilization counters is no longer supported. * - * For Kepler &tm; or newer fully supported devices. + * Reset the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function * - * @param vgpuTypeId Handle to vGPU type - * @param numDisplayHeads Pointer to number of display heads + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be reset + * @param counter Specifies the counter that should be reset (0 or 1) * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); /** - * Retrieve vGPU display head's maximum supported resolution. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param displayIndex Zero-based index of display head - * @param xdim Pointer to maximum number of pixels in X dimension - * @param ydim Pointer to maximum number of pixels in Y dimension - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex - * is out of range. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); +* Get the NVLink device type of the remote device connected over the given link. 
+* +* @param device The device handle of the target GPU +* @param link The NVLink link index on the target GPU +* @param pNvLinkDeviceType Pointer in which the output remote device type is returned +* +* @return +* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or +* \a pNvLinkDeviceType is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is +* otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); /** - * Retrieve license requirements for a vGPU type + * Set NvLink Low Power Threshold for device. * - * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form - * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, - * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". + * For Hopper &tm; or newer fully supported devices. * - * The total length of the returned string will not exceed 128 characters, including the NUL terminator. - * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. + * @param device The identifier of the target device + * @param info Reference to \a nvmlNvLinkPowerThres_t struct + * input parameters * - * For Kepler &tm; or newer fully supported devices. 
+ * @return + * - \ref NVML_SUCCESS if the \a Threshold is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeLicenseString Pointer to buffer to return license info - * @param size Size of \a vgpuTypeLicenseString buffer + **/ +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info); + +/** + * Set the global nvlink bandwidth mode * + * @param nvlinkBwMode nvlink bandwidth mode * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided + * - \ref NVML_ERROR_IN_USE if P2P object exists + * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. + * - \ref NVML_ERROR_NO_PERMISSION if not root user */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); +nvmlReturn_t DECLDIR nvmlSystemSetNvlinkBwMode(unsigned int nvlinkBwMode); /** - * Retrieve the static frame rate limit value of the vGPU type - * - * For Kepler &tm; or newer fully supported devices.
+ * Get the global nvlink bandwidth mode * - * @param vgpuTypeId Handle to vGPU type - * @param frameRateLimit Reference to return the frame rate limit value + * @param nvlinkBwMode reference of nvlink bandwidth mode * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + * - \ref NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture. + * - \ref NVML_ERROR_NO_PERMISSION if not root user */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); +nvmlReturn_t DECLDIR nvmlSystemGetNvlinkBwMode(unsigned int *nvlinkBwMode); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. + * @{ + */ +/***************************************************************************************************/ /** - * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree * - * For Kepler &tm; or newer fully supported devices. + * For Fermi &tm; or newer fully supported devices.
+ * @param set Reference in which to return the event handle * - * @param device The identifier of the target device - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCount Pointer to get the max number of vGPU instances - * that can be created on a deicve for given vgpuTypeId * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if the event has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, - * or \a vgpuInstanceCount is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); +nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); /** - * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type + * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t * - * For Kepler &tm; or newer fully supported devices. + * For Fermi &tm; or newer fully supported devices. + * ECC events are available only on ECC-enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) + * + * For Linux only. + * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used.
None of the requested eventTypes + * are registered in that case. + * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which add new event types * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if the event has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); +nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); /** - * Retrieve the active vGPU instances on a device. - * - * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances - * written to the buffer. 
+ * Returns information about events supported on device * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. - * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return - * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. + * For Fermi &tm; or newer fully supported devices. * - * For Kepler &tm; or newer fully supported devices. + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. * - * @param device The identifier of the target device - * @param vgpuCount Pointer which passes in the array size as well as get - * back the number of types - * @param vgpuInstances Pointer to array in which to return list of vGPU instances + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events * * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if the eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents */ -nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned 
int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); /** - * Retrieve the VM ID associated with a vGPU instance. + * Waits on events and delivers events * - * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * For Fermi &tm; or newer fully supported devices. * - * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) * - * For Kepler &tm; or newer fully supported devices. + * On Windows, in case of Xid error, the function returns the most recent Xid error type seen by the system. + * If there are multiple Xid errors generated before nvmlEventSetWait is invoked then the last seen Xid error + * type is returned for all Xid error events. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vmId Pointer to caller-supplied buffer to hold VM ID - * @param size Size of buffer in bytes - * @param vmIdType Pointer to hold VM ID type + * On Linux, every Xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles. 
+ * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if the data has been set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); +nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); /** - * Retrieve the UUID of a vGPU instance. - * - * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * Releases events in the set * - * For Kepler &tm; or newer fully supported devices. + * For Fermi &tm; or newer fully supported devices. 
* - * @param vgpuInstance Identifier of the target vGPU instance - * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID - * @param size Size of buffer in bytes + * @param set Reference to events to be released * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if the event has been successfully released * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); +nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlZPI Drain states + * This chapter describes methods that NVML can perform against each device to control their drain state + * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to + * power on/off GPUs, enable robust reset scenarios, etc. + * @{ + */ +/***************************************************************************************************/ /** - * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. - * - * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version - * string will not exceed 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. 
The guest VM driver version is - * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the - * NVIDIA driver is loaded and initialized. + * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. + * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before + * this call is made. + * Must be called as administrator. + * For Linux only. * - * For Kepler &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param version Caller-supplied buffer to return driver version string - * @param length Size of \a version buffer + * @param pciInfo The PCI address of the GPU drain state to be modified + * @param newState The drain state that should be entered, see \ref nvmlEnableState_t * * @return - * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); +nvmlReturn_t DECLDIR 
nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); /** - * Retrieve the framebuffer usage in bytes. - * - * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. + * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining + * state. + * For Linux only. * - * For Kepler &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. * - * @param vgpuInstance The identifier of the target instance - * @param fbUsage Pointer to framebuffer usage in bytes + * @param pciInfo The PCI address of the GPU drain state to be queried + * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t * * @return - * - \ref NVML_SUCCESS successful completion + * - \ref NVML_SUCCESS if the drain state was successfully queried * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); +nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); /** - * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2. - * - * Retrieve the current licensing state of the vGPU instance. + * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver + * as long as no other processes are attached.
If other processes are attached, this call will return + * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the + * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called + * to initiate the draining state is if that process was using, and is still using, a GPU before the + * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled + * prior to this call. * - * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. + * For long-running NVML processes please note that this will change the enumeration of current GPUs. + * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. + * Also, device handles after the removed GPU will not be valid and must be re-established. + * Must be run as administrator. + * For Linux only. * - * For Kepler &tm; or newer fully supported devices. + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. 
* - * @param vgpuInstance Identifier of the target vGPU instance - * @param licensed Reference to return the licensing status + * @param pciInfo The PCI address of the GPU to be removed + * @param gpuState Whether the GPU is to be removed, from the OS + * see \ref nvmlDetachGpuState_t + * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t * * @return - * - \ref NVML_SUCCESS if \a licensed has been set + * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); /** - * Retrieve the vGPU type of a vGPU instance. + * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that + * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. + * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes + * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. * - * Returns the vGPU type ID of vgpu assigned to the vGPU instance. 
+ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds + * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. * - * For Kepler &tm; or newer fully supported devices. + * Must be run as administrator. + * For Linux only. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vgpuTypeId Reference to return the vgpuTypeId + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device + * fields are used in this call. * * @return - * - \ref NVML_SUCCESS if \a vgpuTypeId has been set + * - \ref NVML_SUCCESS if counters were successfully reset * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature + * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); +nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueQueries Field Value Queries + * This chapter describes NVML operations that are associated with retrieving Field Values from NVML + * @{ + */ 
+/***************************************************************************************************/ /** - * Retrieve the frame rate limit set for the vGPU instance. - * - * Returns the value of the frame rate limit set for the vGPU instance - * - * For Kepler &tm; or newer fully supported devices. + * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. + * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs + * will be populated from a single call rather than making a driver call for each fieldId. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param frameRateLimit Reference to return the frame rate limit + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be retrieved + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call * * @return - * - \ref NVML_SUCCESS if \a frameRateLimit has been set - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if any values in \a values were populated. 
Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); +nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); /** - * Retrieve the current ECC mode of vGPU instance. + * Clear values for a list of fields for a device. This API allows multiple fields to be cleared at once. * - * @param vgpuInstance The identifier of the target vGPU instance - * @param eccMode Reference in which to return the current ECC mode + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be cleared + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call * * @return - * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if any values in \a values were cleared. 
Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); +nvmlReturn_t DECLDIR nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVirtualGpuQueries vGPU APIs + * This chapter describes operations that are associated with NVIDIA vGPU Software products. + * @{ + */ +/***************************************************************************************************/ /** - * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * This method is used to get the virtualization mode corresponding to the GPU. * - * For Maxwell &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * @param device Identifier of the target device + * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
* * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if \a pVirtualMode is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); +nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); /** - * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * Queries if SR-IOV host operation is supported on a vGPU supported device. * - * For Maxwell &tm; or newer fully supported devices. + * Checks whether SR-IOV host capability is supported by the device and the + * driver, and indicates device is in SR-IOV mode if both of these conditions + * are true. 
* - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Unsigned int for the encoder capacity value + * @param device The identifier of the target device + * @param pHostVgpuMode Reference in which to return the current vGPU mode * * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pHostVgpuMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. + * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); +nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); /** - * Retrieves the current encoder statistics of a vGPU Instance + * This method is used to set the virtualization mode corresponding to the GPU. * - * For Maxwell &tm; or newer fully supported devices. + * For Kepler &tm; or newer fully supported devices. * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * @param device Identifier of the target device + * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
* * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_SUCCESS if \a virtualMode is set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a virtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. + * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); +nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); /** - * Retrieves information about all active encoder sessions on a vGPU Instance. + * Get the vGPU heterogeneous mode for the device. + * + * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. + * + * On successful return, the function returns \a pHeterogeneousMode->mode with the current vGPU heterogeneous mode. + * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should + * set the correct version number to retrieve the vGPU heterogeneous mode. + * \a pHeterogeneousMode->mode can either be \ref NVML_FEATURE_ENABLED or \ref NVML_FEATURE_DISABLED. 
+ * + * @param device The identifier of the target device + * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a pHeterogeneousMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support this feature + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuHeterogeneousMode(nvmlDevice_t device, nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); + +/** + * Enable or disable vGPU heterogeneous mode for the device. + * + * When in heterogeneous mode, a vGPU can concurrently host timesliced vGPUs with differing framebuffer sizes. + * + * API would return an appropriate error code upon unsuccessful activation. For example, the heterogeneous mode + * set will fail with error \ref NVML_ERROR_IN_USE if any vGPU instance is active on the device. The caller of this API + * is expected to shutdown the vGPU VMs and retry setting the \a mode. + * On successful return, the function updates the vGPU heterogeneous mode with the user provided \a pHeterogeneousMode->mode. + * \a pHeterogeneousMode->version is the version number of the structure nvmlVgpuHeterogeneousMode_t, the caller should + * set the correct version number to set the vGPU heterogeneous mode. 
+ * + * @param device Identifier of the target device + * @param pHeterogeneousMode Pointer to the caller-provided structure of nvmlVgpuHeterogeneousMode_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a pHeterogeneousMode is NULL or \a pHeterogeneousMode->mode is invalid + * - \ref NVML_ERROR_IN_USE If the \a device is in use + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or \a device doesn't support this feature + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pHeterogeneousMode is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuHeterogeneousMode(nvmlDevice_t device, const nvmlVgpuHeterogeneousMode_t *pHeterogeneousMode); + +/** + * Query the placement ID of active vGPU instance. + * + * When in vGPU heterogeneous mode, this function returns a valid placement ID as \a pPlacement->placementId + * else NVML_INVALID_VGPU_PLACEMENT_ID is returned. + * \a pPlacement->version is the version number of the structure nvmlVgpuPlacementId_t, the caller should + * set the correct version number to get placement id of the vGPU instance \a vgpuInstance. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param pPlacement Pointer to vGPU placement ID structure \a nvmlVgpuPlacementId_t + * + * @return + * - \ref NVML_SUCCESS If information is successfully retrieved + * - \ref NVML_ERROR_NOT_FOUND If \a vgpuInstance does not match a valid active vGPU instance + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuInstance is invalid or \a pPlacement is NULL + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacement is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetPlacementId(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuPlacementId_t *pPlacement); + +/** + * Query the supported vGPU placement ID of the vGPU type. + * + * The function returns an array of supported vGPU placement IDs for the specified vGPU type ID in the buffer provided + * by the caller at \a pPlacementList->placementIds. The required memory for the placementIds array must be allocated + * based on the maximum number of vGPU type instances, which is retrievable through \ref nvmlVgpuTypeGetMaxInstances(). + * If the provided count by the caller is insufficient, the function will return NVML_ERROR_INSUFFICIENT_SIZE along with + * the number of required entries in \a pPlacementList->count. The caller should then reallocate a buffer with the size + * of pPlacementList->count * sizeof(pPlacementList->placementIds) and invoke the function again. + * + * To obtain a list of homogeneous placement IDs, the caller needs to set \a pPlacementList->mode to NVML_VGPU_PGPU_HOMOGENEOUS_MODE. + * For heterogeneous placement IDs, \a pPlacementList->mode should be set to NVML_VGPU_PGPU_HETEROGENEOUS_MODE. + * By default, a list of heterogeneous placement IDs is returned. + * + * @param device Identifier of the target device + * @param vgpuTypeId Handle to vGPU type. 
The vGPU type ID + * @param pPlacementList Pointer to the vGPU placement structure \a nvmlVgpuPlacementList_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE If the buffer is small, element count is returned in \a pPlacementList->count + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeSupportedPlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); + +/** + * Query the creatable vGPU placement ID of the vGPU type. + * + * An array of creatable vGPU placement IDs for the vGPU type ID indicated by \a vgpuTypeId is returned in the + * caller-supplied buffer of \a pPlacementList->placementIds. Memory needed for the placementIds array should be + * allocated based on maximum instances of a vGPU type which can be queried via \ref nvmlVgpuTypeGetMaxInstances(). + * The creatable vGPU placement IDs may differ over time, as there may be restrictions on what type of vGPU the + * vGPU instance is running. + * + * The function will return \ref NVML_ERROR_NOT_SUPPORTED if the \a device is not in vGPU heterogeneous mode. + * + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type. 
The vGPU type ID + * @param pPlacementList Pointer to the list of vGPU placement structure \a nvmlVgpuPlacementList_t + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a vgpuTypeId is invalid or \a pPlacementList is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device or \a vgpuTypeId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a pPlacementList is invalid + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuTypeCreatablePlacements(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuPlacementList_t *pPlacementList); + +/** + * Retrieve the static GSP heap size of the vGPU type in bytes + * + * @param vgpuTypeId Handle to vGPU type + * @param gspHeapSize Reference to return the GSP heap size value + * @return + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a gspHeapSize is NULL + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetGspHeapSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *gspHeapSize); + +/** + * Retrieve the static framebuffer reservation of the vGPU type in bytes + * + * @param vgpuTypeId Handle to vGPU type + * @param fbReservation Reference to return the framebuffer reservation + * @return + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a vgpuTypeId is invalid, or \a fbReservation is NULL + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR 
nvmlVgpuTypeGetFbReservation(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbReservation); + +/** + * Set the desirable vGPU capability of a device + * + * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be set. + * See \ref nvmlEnableState_t for available state. + * + * @param device The identifier of the target device + * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be set + * @param state The target capability mode + * + * @return + * - \ref NVML_SUCCESS Successful completion + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a capability is invalid, or \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state, or \a device not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN On any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, nvmlEnableState_t state); + +/** + * Retrieve the vGPU Software licensable features. + * + * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. 
+ * + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned + * + * @return + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpu vGPU Management + * @{ + * + * This chapter describes APIs supporting NVIDIA vGPU. + */ +/***************************************************************************************************/ + +/** + * Retrieve the requested vGPU driver capability. + * + * Refer to the \a nvmlVgpuDriverCapability_t structure for the specific capabilities that can be queried. + * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability + * is supported. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param capability Specifies the \a nvmlVgpuDriverCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is supported + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capability is invalid, or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a devices not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int *capResult); + +/** + * Retrieve the requested vGPU capability for GPU. 
+ * + * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried. + * The return value in \a capResult reports a non-zero value indicating that the capability + * is supported, and also reports the capability's data based on the queried capability. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried + * @param capResult Specifies that the queried capability is supported, and also returns capability's data + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a capability is invalid, or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a device not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int *capResult); + +/** + * Retrieve the supported vGPU types on a physical GPU (device). + * + * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. 
+ * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the currently creatable vGPU types on a physical GPU (device). + * + * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types + * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable + * list will be restricted to whatever vGPU type is already running on the device. + * + * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types that can be created for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. 
+ * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeClass Pointer to string array to return class in + * @param size Size of string + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); + +/** + * Retrieve the vGPU type name. + * + * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not + * exceed 64 characters in length (including the NUL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeName Pointer to buffer to return name + * @param size Size of buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); + +/** + * Retrieve the GPU Instance Profile ID for the given vGPU type ID. + * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is + * returned. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param gpuInstanceProfileId GPU Instance Profile ID + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); + +/** + * Retrieve the device ID of a vGPU type. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value + * @param subsystemID Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); + +/** + * Retrieve the vGPU framebuffer size in bytes. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param fbSize Pointer to framebuffer size in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); + +/** + * Retrieve count of vGPU's supported display heads. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param numDisplayHeads Pointer to number of display heads + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); + +/** + * Retrieve vGPU display head's maximum supported resolution. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param displayIndex Zero-based index of display head + * @param xdim Pointer to maximum number of pixels in X dimension + * @param ydim Pointer to maximum number of pixels in Y dimension + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex + * is out of range. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); + +/** + * Retrieve license requirements for a vGPU type + * + * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form + * "<license name>,<version>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license, + * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". + * + * The total length of the returned string will not exceed 128 characters, including the NUL terminator. + * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. 
+ * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeLicenseString Pointer to buffer to return license info + * @param size Size of \a vgpuTypeLicenseString buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); + +/** + * Retrieve the static frame rate limit value of the vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param frameRateLimit Reference to return the frame rate limit value + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); + +/** + * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCount Pointer to get the max number of vGPU instances + * that can be created on a device for given vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, + * or \a vgpuInstanceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); + +/** + * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); + +/** + * Retrieve the BAR1 info for given vGPU type. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param bar1Info Pointer to the vGPU type BAR1 information structure \a nvmlVgpuTypeBar1Info_t + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a bar1Info is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetBAR1Info(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuTypeBar1Info_t *bar1Info); + +/** + * Retrieve the active vGPU instances on a device. + * + * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The + * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances + * written to the buffer. + * + * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. + * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return + * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param vgpuCount Pointer which passes in the array size as well as get + * back the number of types + * @param vgpuInstances Pointer to array in which to return list of vGPU instances + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); + +/** + * Retrieve the VM ID associated with a vGPU instance. + * + * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vmId Pointer to caller-supplied buffer to hold VM ID + * @param size Size of buffer in bytes + * @param vmIdType Pointer to hold VM ID type + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); + +/** + * Retrieve the UUID of a vGPU instance. + * + * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); + +/** + * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. + * + * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version + * string will not exceed 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + * + * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is + * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the + * NVIDIA driver is loaded and initialized. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param version Caller-supplied buffer to return driver version string + * @param length Size of \a version buffer + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); + +/** + * Retrieve the framebuffer usage in bytes. + * + * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance The identifier of the target instance + * @param fbUsage Pointer to framebuffer usage in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); + +/** + * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2. + * + * Retrieve the current licensing state of the vGPU instance. + * + * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licensed Reference to return the licensing status + * + * @return + * - \ref NVML_SUCCESS if \a licensed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); + +/** + * Retrieve the vGPU type of a vGPU instance. + * + * Returns the vGPU type ID of vgpu assigned to the vGPU instance. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vgpuTypeId Reference to return the vgpuTypeId + * + * @return + * - \ref NVML_SUCCESS if \a vgpuTypeId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); + +/** + * Retrieve the frame rate limit set for the vGPU instance. + * + * Returns the value of the frame rate limit set for the vGPU instance + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param frameRateLimit Reference to return the frame rate limit + * + * @return + * - \ref NVML_SUCCESS if \a frameRateLimit has been set + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); + +/** + * Retrieve the current ECC mode of vGPU instance. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param eccMode Reference in which to return the current ECC mode + * + * @return + * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a eccMode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); + +/** + * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); + +/** + * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Unsigned int for the encoder capacity value + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); + +/** + * Retrieves the current encoder statistics of a vGPU Instance + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL + * or \a vgpuInstance is 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about all active encoder sessions on a vGPU Instance. * * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. 
@@ -7404,7 +9589,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuI * For Maxwell &tm; or newer fully supported devices. * * @param vgpuInstance Identifier of the target vGPU instance -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats +* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats * * @return * - \ref NVML_SUCCESS if \a fbcStats is fetched @@ -7422,7 +9607,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions * written to the buffer. * -* If the supplied buffer is not large enough to accomodate the active session array, the function returns +* If the supplied buffer is not large enough to accommodate the active session array, the function returns * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. * To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return * NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. @@ -7503,6 +9688,31 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance */ nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); +/** + * Retrieve the MDEV UUID of a vGPU instance. + * + * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * MDEV UUID is displayed only on KVM platform. + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); + /** @} */ /***************************************************************************************************/ @@ -7532,7 +9742,7 @@ typedef struct nvmlVgpuMetadata_st char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host unsigned int reserved[6]; //!< Reserved for internal use - unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld + unsigned int vgpuVirtualizationCaps; //!< vGPU virtualization capabilities bitfield unsigned int guestVgpuVersion; //!< vGPU version of guest driver unsigned int opaqueDataSize; //!< Size of opaque data field in bytes char opaqueData[4]; //!< Opaque data @@ -7546,7 +9756,7 @@ typedef struct nvmlVgpuPgpuMetadata_st unsigned int version; //!< Current version of the structure unsigned int revision; //!< Current revision of the structure char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld + unsigned int 
pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield unsigned int reserved[5]; //!< Reserved for internal use nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver unsigned int opaqueDataSize; //!< Size of opaque data field in bytes @@ -7644,7 +9854,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpu * * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. + * with the physical GPU is limited, a limit code indicates the factor limiting compatibility. * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). * * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to @@ -7681,6 +9891,91 @@ nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, */ nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); +/** + * Returns the vGPU Software scheduler logs. + * \a pSchedulerLog points to a caller-allocated structure to contain the logs. The number of elements returned will + * never exceed \a NVML_SCHEDULER_SW_MAX_LOG_ENTRIES. + * + * To get the entire logs, call the function at least 5 times a second. + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target \a device + * @param pSchedulerLog Reference in which \a pSchedulerLog is written + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler logs were successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerLog is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t *pSchedulerLog); + +/** + * Returns the vGPU scheduler state. + * The information returned in \a nvmlVgpuSchedulerGetState_t is not relevant if the BEST EFFORT policy is set. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target \a device + * @param pSchedulerState Reference in which \a pSchedulerState is returned + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler state is successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState); + +/** + * Returns the vGPU scheduler capabilities. + * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from + * the NVML_VGPU_SCHEDULER_POLICY_*. This list enumerates the supported scheduler policies + * if the engine is Graphics type. + * The other values in \a nvmlVgpuSchedulerCapabilities_t are also applicable if the engine is + * Graphics type. For other engine types, it is BEST EFFORT policy. + * If ARR is supported and enabled, scheduling frequency and averaging factor are applicable + * else timeSlice is applicable. 
+ * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target \a device + * @param pCapabilities Reference in which \a pCapabilities is written + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler capabilities were successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pCapabilities is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities); + +/** + * Sets the vGPU scheduler state. + * + * For Pascal &tm; or newer fully supported devices. + * + * The scheduler state change won't persist across module load/unload. + * Scheduler state and params will be allowed to set only when no VM is running. + * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode is enabled then + * provide avgFactorForARR and frequency as input. If enableARRMode is disabled + * then provide timeslice as input. + * + * @param device The identifier of the target \a device + * @param pSchedulerState vGPU \a pSchedulerState to set + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid + * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, + * reboot is required to overcome from this error. 
+ * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * or if any vGPU instance currently exists on the \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); + /* * Virtual GPU (vGPU) version * @@ -7798,6 +10093,52 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); +/** + * Retrieves recent utilization for vGPU instances running on a physical GPU (device). + * + * For Kepler &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for vGPU + * instances running on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied + * buffer pointed at by \a vgpuUtilInfo->vgpuUtilArray. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a vgpuUtilInfo->sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a vgpuUtilInfo->vgpuUtilArray set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuUtilInfo->vgpuInstanceCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t). 
Invoke the function again with + * the allocated buffer passed in \a vgpuUtilInfo->vgpuUtilArray, and \a vgpuUtilInfo->vgpuInstanceCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuUtilInfo->vgpuInstanceCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * \a vgpuUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set \a vgpuUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param vgpuUtilInfo Pointer to the caller-provided structure of nvmlVgpuInstancesUtilizationInfo_t + + * @return + * - \ref NVML_SUCCESS If utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, \a vgpuUtilInfo is NULL, or \a vgpuUtilInfo->vgpuInstanceCount is 0 + * - \ref NVML_ERROR_NOT_SUPPORTED If vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a vgpuUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a vgpuUtilInfo->vgpuUtilArray is NULL, or the buffer size of vgpuUtilInfo->vgpuInstanceCount is too small. 
+ * The caller should check the current vGPU instance count from the returned vgpuUtilInfo->vgpuInstanceCount, and call + * the function again with a buffer of size vgpuUtilInfo->vgpuInstanceCount * sizeof(nvmlVgpuInstanceUtilizationInfo_t) + * - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuInstancesUtilizationInfo(nvmlDevice_t device, + nvmlVgpuInstancesUtilizationInfo_t *vgpuUtilInfo); + /** * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). * @@ -7844,6 +10185,52 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, unsigned int *vgpuProcessSamplesCount, nvmlVgpuProcessUtilizationSample_t *utilizationSamples); + +/** + * Retrieves recent utilization for processes running on vGPU instances on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, video decoder, jpeg decoder, and OFA for processes running + * on vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied + * buffer pointed at by \a vgpuProcUtilInfo->vgpuProcUtilArray. One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a vgpuProcUtilInfo->vgpuProcUtilArray set to NULL. 
The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current processes' count + * running on vGPU instances in \a vgpuProcUtilInfo->vgpuProcessCount. The caller should allocate a buffer of size + * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a vgpuProcUtilInfo->vgpuProcUtilArray, and \a vgpuProcUtilInfo->vgpuProcessCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a vgpuProcUtilInfo->vgpuProcessCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * vgpuProcUtilInfo->lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set vgpuProcUtilInfo->lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param vgpuProcUtilInfo Pointer to the caller-provided structure of nvmlVgpuProcessesUtilizationInfo_t + + * @return + * - \ref NVML_SUCCESS If utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid, or \a vgpuProcUtilInfo is null + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the version of \a vgpuProcUtilInfo is invalid + * - \ref NVML_ERROR_INSUFFICIENT_SIZE If \a vgpuProcUtilInfo->vgpuProcUtilArray is null, or supplied \a vgpuProcUtilInfo->vgpuProcessCount + * is too small to return samples for all processes on vGPU instances currently executing on the device. 
+ * The caller should check the current processes count from the returned \a vgpuProcUtilInfo->vgpuProcessCount, + * and call the function again with a buffer of size + * vgpuProcUtilInfo->vgpuProcessCount * sizeof(nvmlVgpuProcessUtilizationSample_t) + * - \ref NVML_ERROR_NOT_SUPPORTED If vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND If sample entries are not found + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessesUtilizationInfo(nvmlDevice_t device, nvmlVgpuProcessesUtilizationInfo_t *vgpuProcUtilInfo); + /** * Queries the state of per process accounting mode on vGPU. * @@ -8055,6 +10442,24 @@ nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlEx #define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 #define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA +/** + * MIG GPU instance profile capability. + * + * Bit field values representing MIG profile capabilities + * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities + */ +#define NVML_GPU_INSTANCE_PROFILE_CAPS_P2P 0x1 +#define NVML_GPU_INTSTANCE_PROFILE_CAPS_P2P 0x1 //!< Deprecated, do not use +#define NVML_GPU_INSTANCE_PROFILE_CAPS_GFX 0x2 + +/** + * MIG compute instance profile capability. + * + * Bit field values representing MIG profile capabilities + * \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities + */ +#define NVML_COMPUTE_INSTANCE_PROFILE_CAPS_GFX 0x1 + typedef struct nvmlGpuInstancePlacement_st { unsigned int start; //!< Index of first occupied memory slice @@ -8080,18 +10485,45 @@ typedef struct nvmlGpuInstanceProfileInfo_st } nvmlGpuInstanceProfileInfo_t; /** - * GPU instance profile information (v2). + * GPU instance profile information (v2). 
+ * + * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field + * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name + * field to the end. This structure is not backwards-compatible with + * \ref nvmlGpuInstanceProfileInfo_t. + */ +typedef struct nvmlGpuInstanceProfileInfo_v2_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlGpuInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. + */ +#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) + +/** + * GPU instance profile information (v3). * - * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field - * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name - * field to the end. This structure is not backwards-compatible with - * \ref nvmlGpuInstanceProfileInfo_t. + * Version 3 removes the isP2pSupported field of \ref nvmlGpuInstanceProfileInfo_v2_t and adds the + * \ref nvmlGpuInstanceProfileInfo_v3_t.capabilities field. 
*/ -typedef struct nvmlGpuInstanceProfileInfo_v2_st +typedef struct nvmlGpuInstanceProfileInfo_v3_st { - unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v3) unsigned int id; //!< Unique profile ID within the device - unsigned int isP2pSupported; //!< Peer-to-Peer support unsigned int sliceCount; //!< GPU Slice count unsigned int instanceCount; //!< GPU instance count unsigned int multiprocessorCount; //!< Streaming Multiprocessor count @@ -8102,12 +10534,13 @@ typedef struct nvmlGpuInstanceProfileInfo_v2_st unsigned int ofaCount; //!< OFA Engine count unsigned long long memorySizeMB; //!< Memory size in MBytes char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name -} nvmlGpuInstanceProfileInfo_v2_t; + unsigned int capabilities; //!< Additional capabilities +} nvmlGpuInstanceProfileInfo_v3_t; /** - * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v3_t.version. 
*/ -#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) +#define nvmlGpuInstanceProfileInfo_v3 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 3) typedef struct nvmlGpuInstanceInfo_st { @@ -8188,979 +10621,722 @@ typedef struct nvmlComputeInstanceProfileInfo_v2_st */ #define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) -typedef struct nvmlComputeInstanceInfo_st -{ - nvmlDevice_t device; //!< Parent device - nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance - unsigned int id; //!< Unique instance ID within the GPU instance - unsigned int profileId; //!< Unique profile ID within the GPU instance - nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} -} nvmlComputeInstanceInfo_t; - -typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; - -/** - * Set MIG mode for the device. - * - * For Ampere &tm; or newer fully supported devices. - * Requires root user. - * - * This mode determines whether a GPU instance can be created. - * - * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the - * device, such as minor number, might change. The caller of this API is expected to query such attributes again. - * - * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM - * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. - * - * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device - * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API - * is expected to idle the device and retry setting the \a mode. - * - * @note On Windows, only disabling MIG mode is supported. 
\a activationStatus would return \ref - * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. - * - * @param device The identifier of the target device - * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * @param activationStatus The activationStatus status - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device,\a mode or \a activationStatus are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); - -/** - * Get MIG mode for the device. - * - * For Ampere &tm; or newer fully supported devices. - * - * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the - * next activation trigger. - * - * @param device The identifier of the target device - * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); - -/** - * Get GPU instance profile information. - * - * Information provided by this API is immutable throughout the lifetime of a MIG mode. 
- * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* - * @param info Returns detailed profile information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, - nvmlGpuInstanceProfileInfo_t *info); - -/** - * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned - * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. - * - * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the - * appropriate version prior to calling this function. For example: - * \code - * nvmlGpuInstanceProfileInfo_v2_t profileInfo = - * { .version = nvmlGpuInstanceProfileInfo_v2 }; - * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, - * profile, - * &profileInfo); - * \endcode - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* - * @param info Returns detailed profile information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, - nvmlGpuInstanceProfileInfo_v2_t *info); - -/** - * Get GPU instance placements. - * - * A placement represents the location of a GPU instance within a device. This API only returns all the possible - * placements for the given profile. - * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will - * fail if there is overlap with the already occupied memory slices. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param placements Returns placements allowed for the profile. Can be NULL to discover number - * of allowed placements for this profile. If non-NULL must be large enough - * to accommodate the placements supported by the profile. - * @param count Returns number of allowed placemenets for the profile. 
- * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstancePlacement_t *placements, - unsigned int *count); - -/** - * Get GPU instance profile capacity. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param count Returns remaining instance count for the profile ID - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, - unsigned int *count); - -/** - * Create GPU instance. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would - * become invalid. The GPU instance must be recreated to acquire a valid handle. 
- * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param gpuInstance Returns the GPU instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created - */ -nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstance_t *gpuInstance); - /** - * Create GPU instance with the specified placement. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would - * become invalid. The GPU instance must be recreated to acquire a valid handle. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param placement The requested placement. See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 - * @param gpuInstance Returns the GPU instance handle + * Compute instance profile information (v3). 
* - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId, \a placement or \a gpuInstance - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + * Version 3 adds the \ref nvmlComputeInstanceProfileInfo_v3_t.capabilities field + * \ref nvmlComputeInstanceProfileInfo_t. */ -nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, - const nvmlGpuInstancePlacement_t *placement, - nvmlGpuInstance_t *gpuInstance); -/** - * Destroy GPU instance. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The GPU instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes - * (e.g. CUDA application) or compute instances are active on the - * GPU instance. 
+typedef struct nvmlComputeInstanceProfileInfo_v3_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v3) + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name + unsigned int capabilities; //!< Additional capabilities +} nvmlComputeInstanceProfileInfo_v3_t; + +/** + * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v3_t.version. */ -nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); +#define nvmlComputeInstanceProfileInfo_v3 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 3) + +typedef struct nvmlComputeInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance + unsigned int id; //!< Unique instance ID within the GPU instance + unsigned int profileId; //!< Unique profile ID within the GPU instance + nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} +} nvmlComputeInstanceInfo_t; + +typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; /** - * Get GPU instances for given profile ID. + * Set MIG mode for the device. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. + * Requires root user. 
* - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param gpuInstances Returns pre-exiting GPU instances, the buffer must be large enough to - * accommodate the instances supported by the profile. - * See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param count The count of returned GPU instances + * This mode determines whether a GPU instance can be created. * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstance_t *gpuInstances, unsigned int *count); - -/** - * Get GPU instances for given instance ID. + * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the + * device, such as minor number, might change. The caller of this API is expected to query such attributes again. * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. + * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM + * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. + * + * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device + * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API + * is expected to idle the device and retry setting the \a mode. 
+ * + * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref + * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. * * @param device The identifier of the target device - * @param id The GPU instance ID - * @param gpuInstance Returns GPU instance + * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param activationStatus The activation status * * @return * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a mode or \a activationStatus are invalid * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); +nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); /** - * Get GPU instance information. + * Get MIG mode for the device. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. * - * @param gpuInstance The GPU instance handle - * @param info Return GPU instance information + * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the + * next activation trigger. 
+ * + * @param device The identifier of the target device + * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE * * @return * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); +nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); /** - * Get compute instance profile information. + * Get GPU instance profile information * * Information provided by this API is immutable throughout the lifetime of a MIG mode. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. 
* - * @param gpuInstance The identifier of the target GPU instance - * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* - * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* * @param info Returns detailed profile information * * @return * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profile isn't supported * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, - unsigned int engProfile, - nvmlComputeInstanceProfileInfo_t *info); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_t *info); /** - * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned - * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. + * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned + * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. * * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the * appropriate version prior to calling this function. 
For example: * \code - * nvmlComputeInstanceProfileInfo_v2_t profileInfo = - * { .version = nvmlComputeInstanceProfileInfo_v2 }; - * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, - * profile, - * engProfile, - * &profileInfo); + * nvmlGpuInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlGpuInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, + * profile, + * &profileInfo); * \endcode * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. * - * @param gpuInstance The identifier of the target GPU instance - * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* - * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* * @param info Returns detailed profile information * * @return * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, - unsigned int engProfile, - nvmlComputeInstanceProfileInfo_v2_t *info); - -/** - * Get compute instance profile capacity. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. 
- * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param count Returns remaining instance count for the profile ID - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, - unsigned int profileId, unsigned int *count); - -/** - * Create compute instance. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed - * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire - * a valid handle. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. 
- * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param computeInstance Returns the compute instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, - nvmlComputeInstance_t *computeInstance); - -/** - * Destroy compute instance. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param computeInstance The compute instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if - * processes (e.g. CUDA application) are active on the compute instance. */ -nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_v2_t *info); /** - * Get compute instances for given profile ID. 
- * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param computeInstances Returns pre-exiting compute instances, the buffer must be large enough to - * accommodate the instances supported by the profile. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param count The count of returned compute instances + * Get GPU instance placements. * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, - nvmlComputeInstance_t *computeInstances, unsigned int *count); - -/** - * Get compute instance for given instance ID. + * A placement represents the location of a GPU instance within a device. This API only returns all the possible + * placements for the given profile regardless of whether MIG is enabled or not. + * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will + * fail if there is overlap with the already occupied memory slices. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. * Requires privileged user. 
* - * @param gpuInstance The identifier of the target GPU instance - * @param id The compute instance ID - * @param computeInstance Returns compute instance - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, - nvmlComputeInstance_t *computeInstance); - -/** - * Get compute instance information. - * - * For Ampere &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param computeInstance The compute instance handle - * @param info Return compute instance information + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placements Returns placements allowed for the profile. Can be NULL to discover number + * of allowed placements for this profile. If non-NULL must be large enough + * to accommodate the placements supported by the profile. + * @param count Returns number of allowed placements for the profile. 
* * @return * - \ref NVML_SUCCESS Upon success * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG or \a profileId isn't supported * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstancePlacement_t *placements, + unsigned int *count); /** - * Test if the given handle refers to a MIG device. - * - * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. - * These overloaded references can be used (with some restrictions) interchangeably - * with a GPU device handle to execute queries at a per-compute instance granularity. + * Get GPU instance profile capacity. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user. * - * @param device NVML handle to test - * @param isMigDevice True when handle refers to a MIG device + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID * * @return - * - \ref NVML_SUCCESS if \a device status was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, + unsigned int *count); /** - * Get GPU instance ID for the given MIG device handle. - * - * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. + * Create GPU instance. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user. * - * @param device Target MIG device handle - * @param id GPU instance ID + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstance Returns the GPU instance handle * * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstance); /** - * Get compute instance ID for the given MIG device handle. - * - * Compute instance IDs are unique per GPU instance and remain valid until the compute instance - * is destroyed. + * Create GPU instance with the specified placement. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user. * - * @param device Target MIG device handle - * @param id Compute instance ID + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placement The requested placement. See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 + * @param gpuInstance Returns the GPU instance handle * * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId, \a placement or \a gpuInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); - +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, + const nvmlGpuInstancePlacement_t *placement, + nvmlGpuInstance_t *gpuInstance); /** - * Get the maximum number of MIG devices that can exist under a given parent NVML device. - * - * Returns zero if MIG is not supported or enabled. + * Destroy GPU instance. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user. 
* - * @param device Target device handle - * @param count Count of MIG devices + * @param gpuInstance The GPU instance handle * * @return - * - \ref NVML_SUCCESS if \a count was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes + * (e.g. CUDA application) or compute instances are active on the + * GPU instance. */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); +nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); /** - * Get MIG device handle for the given index under its parent NVML device. - * - * If the compute instance is destroyed either explicitly or by destroying, - * resetting or unbinding the parent GPU instance or the GPU device itself - * the MIG device handle would remain invalid and must be requested again - * using this API. Handles may be reused and their properties can change in - * the process. + * Get GPU instances for given profile ID. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user. * - * @param device Reference to the parent GPU device handle - * @param index Index of the MIG device - * @param migDevice Reference to the MIG device handle + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstances Returns pre-existing GPU instances, the buffer must be large enough to + * accommodate the instances supported by the profile. + * See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count The count of returned GPU instances * * @return - * - \ref NVML_SUCCESS if \a migDevice handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, - nvmlDevice_t *migDevice); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstances, unsigned int *count); /** - * Get parent device handle from a MIG device handle. + * Get GPU instances for given instance ID. * * For Ampere &tm; or newer fully supported devices. * Supported on Linux only. + * Requires privileged user.
* - * @param migDevice MIG device handle - * @param device Device handle + * @param device The identifier of the target device + * @param id The GPU instance ID + * @param gpuInstance Returns GPU instance * * @return - * - \ref NVML_SUCCESS if \a device handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. */ -nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); /** - * Get the type of the GPU Bus (PCIe, PCI, ...) - * - * @param device The identifier of the target device - * @param type The PCI Bus type + * Get GPU instance information. * - * return - * - \ref NVML_SUCCESS if the bus \a type is successfully retreived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \device is invalid or \type is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); - -/** - * Retrieve performance monitor samples from the associated subdevice. + * For Ampere &tm; or newer fully supported devices. 
+ * Supported on Linux only. * - * @param device - * @param pDynamicPstatesInfo + * @param gpuInstance The GPU instance handle + * @param info Return GPU instance information * * @return - * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); /** - * Sets the speed of a specified fan. - * - * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor - * the temperature and adjust the fan speed accordingly. - * If you set the fan speed too low you can burn your GPU! - * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. - * - * For all cuda-capable discrete products with fans that are Maxwell or Newer. + * Get compute instance profile information. * - * device The identifier of the target device - * fan The index of the fan, starting at zero - * speed The target speed of the fan [0-100] in % of max speed + * Information provided by this API is immutable throughout the lifetime of a MIG mode. 
* - * return - * NVML_SUCCESS if the fan speed has been set - * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, - * or if the fan index doesn't reference an actual fan. - * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. - * NVML_ERROR_UNKNOWN if there was an unexpected error. - */ -nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); - -/** - * Retrieve the GPCCLK VF offset value - * @param[in] device The identifier of the target device - * @param[out] offset The retrieved GPCCLK VF offset value + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. * - * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); - -/** - * Set the GPCCLK VF offset value - * @param[in] device The identifier of the target device - * @param[in] offset The GPCCLK VF offset value to set + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information * * @return - * - \ref NVML_SUCCESS if \a offset has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref 
NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_t *info); /** - * Retrieve the MemClk (Memory Clock) VF offset value. - * @param[in] device The identifier of the target device - * @param[out] offset The retrieved MemClk VF offset value + * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned + * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. For example: + * \code + * nvmlComputeInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlComputeInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, + * profile, + * engProfile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_v2_t *info); /** - * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. - * @param[in] device The identifier of the target device - * @param[in] offset The MemClk VF offset value to set + * Get compute instance profile capacity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. 
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID * * @return - * - \ref NVML_SUCCESS if \a offset has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, unsigned int *count); /** - * Retrieve min and max clocks of some clock domain for a given PState + * Get compute instance placements. * - * @param device The identifier of the target device - * @param type Clock domain - * @param pstate PState to query - * @param minClockMHz Reference in which to return min clock frequency - * @param maxClockMHz Reference in which to return max clock frequency + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * A placement represents the location of a compute instance within a GPU instance. This API only returns all the possible + * placements for the given profile. + * A created compute instance occupies compute slices described by its placement. 
Creation of new compute instance will + * fail if there is overlap with the already occupied compute slices. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param placements Returns placements allowed for the profile. Can be NULL to discover number + * of allowed placements for this profile. If non-NULL must be large enough + * to accommodate the placements supported by the profile. + * @param count Returns number of allowed placements for the profile. * * @return - * - \ref NVML_SUCCESS if everything worked - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both - * \a minClockMHz and \a maxClockMHz are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, - unsigned int * minClockMHz, unsigned int * maxClockMHz); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, + nvmlComputeInstancePlacement_t *placements, + unsigned int *count); /** - * Get all supported Performance States (P-States) for the device. + * Create compute instance. * - * The returned array would contain a contiguous list of valid P-States supported by - * the device.
If the number of supported P-States is fewer than the size of the array - * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. * - * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. * - * @param device The identifier of the target device - * @param pstates Container to return the list of performance states - * supported by device - * @param size Size of the supplied \a pstates array in bytes + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstance Returns the compute instance handle * * @return - * - \ref NVML_SUCCESS if \a pstates array has been retrieved - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the the container supplied was not large enough to - * hold the resulting list - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref 
NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, - nvmlPstates_t *pstates, unsigned int size); +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstance); /** - * Retrieve the GPCCLK min max VF offset value. - * @param[in] device The identifier of the target device - * @param[out] minOffset The retrieved GPCCLK VF min offset value - * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * Create compute instance with the specified placement. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param placement The requested placement. 
See \ref nvmlGpuInstanceGetComputeInstancePossiblePlacements + * @param computeInstance Returns the compute instance handle * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, - int *minOffset, int *maxOffset); +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstanceWithPlacement(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + const nvmlComputeInstancePlacement_t *placement, + nvmlComputeInstance_t *computeInstance); /** - * Retrieve the MemClk (Memory Clock) min max VF offset value. - * @param[in] device The identifier of the target device - * @param[out] minOffset The retrieved MemClk VF min offset value - * @param[out] maxOffset The retrieved MemClk VF max offset value + * Destroy compute instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. 
+ * + * @param computeInstance The compute instance handle * * @return - * - \ref NVML_SUCCESS if \a offset has been successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if + * processes (e.g. CUDA application) are active on the compute instance. */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, - int *minOffset, int *maxOffset); +nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); /** - * Get Conf Computing System capabilities. + * Get compute instances for given profile ID. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. + * Requires privileged user. * - * @param capabilities System CC capabilities + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstances Returns pre-existing compute instances, the buffer must be large enough to + * accommodate the instances supported by the profile.
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count The count of returned compute instances * * @return - * - \ref NVML_SUCCESS if \a capabilities were successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstances, unsigned int *count); /** - * Get Conf Computing System State. + * Get compute instance for given instance ID. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. + * Requires privileged user. 
* - * @param state System CC State + * @param gpuInstance The identifier of the target GPU instance + * @param id The compute instance ID + * @param computeInstance Returns compute instance * * @return - * - \ref NVML_SUCCESS if \a state were successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a id or \a computeInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. */ -nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, + nvmlComputeInstance_t *computeInstance); /** - * Get Conf Computing Protected and Unprotected Memory Sizes. + * Get compute instance information. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. - * - * @param device Device handle - * @param memInfo Protected/Unprotected Memory sizes + * Supported on Linux only. 
* - * @return - * - \ref NVML_SUCCESS if \a memInfo were successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * @param computeInstance The compute instance handle + * @param info Return compute instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation */ -nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); /** - * Set Conf Computing Protected Memory Size. + * Test if the given handle refers to a MIG device. + * + * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. + * These overloaded references can be used (with some restrictions) interchangeably + * with a GPU device handle to execute queries at a per-compute instance granularity. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param device Device Handle - * @param sizeKiB Protected Memory size to be set in KiB + * @param device NVML handle to test + * @param isMigDevice True when handle refers to a MIG device * * @return - * - \ref NVML_SUCCESS if \a sizeKiB successfully set + * - \ref NVML_SUCCESS if \a device status was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeProtectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); +nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); /** - * Set Conf Computing GPUs ready state. + * Get GPU instance ID for the given MIG device handle. + * + * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or - * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * @param device Target MIG device handle + * @param id GPU instance ID * - * return - * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); /** - * Get Conf Computing GPUs ready state. + * Get compute instance ID for the given MIG device handle. + * + * Compute instance IDs are unique per GPU instance and remain valid until the compute instance + * is destroyed. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param isAcceptingWork Returns GPU current work accepting state, - * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or - * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * @param device Target MIG device handle + * @param id Compute instance ID * - * return - * - \ref NVML_SUCCESS if \a current GPUs ready state were successfully queried + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); +nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); /** - * Get Conf Computing protected memory usage. + * Get the maximum number of MIG devices that can exist under a given parent NVML device. + * + * Returns zero if MIG is not supported or enabled. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param device The identifier of the target device - * @param memory Reference in which to return the memory information + * @param device Target device handle + * @param count Count of MIG devices * * @return - * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_SUCCESS if \a count was successfully retrieved * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); +nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); /** - * Get Conf Computing Gpu certificate details. + * Get MIG device handle for the given index under its parent NVML device. + * + * If the compute instance is destroyed either explicitly or by destroying, + * resetting or unbinding the parent GPU instance or the GPU device itself + * the MIG device handle would remain invalid and must be requested again + * using this API. Handles may be reused and their properties can change in + * the process. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param device The identifier of the target device - * @param gpuCert Reference in which to return the gpu certificate information + * @param device Reference to the parent GPU device handle + * @param index Index of the MIG device + * @param migDevice Reference to the MIG device handle * * @return - * - \ref NVML_SUCCESS if \a gpu certificate info has been populated + * - \ref NVML_SUCCESS if \a migDevice handle was successfully created * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, - nvmlConfComputeGpuCertificate_t *gpuCert); +nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, + nvmlDevice_t *migDevice); /** - * Get Conf Computing Gpu attestation report. + * Get parent device handle from a MIG device handle. * * For Ampere &tm; or newer fully supported devices. - * Supported on Linux, Windows TCC. + * Supported on Linux only. 
* - * @param device The identifier of the target device - * @param gpuAtstReport Reference in which to return the gpu attestation report + * @param migDevice MIG device handle + * @param device Device handle * * @return - * - \ref NVML_SUCCESS if \a gpu attestation report has been populated + * - \ref NVML_SUCCESS if \a device handle was successfully created * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device * - \ref NVML_ERROR_UNKNOWN on any unexpected error */ -nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, - nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); +nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); + +/** @} */ // @defgroup nvmlMultiInstanceGPU -/** @} */ /***************************************************************************************************/ /** @defgroup GPM NVML GPM @@ -9172,81 +11348,83 @@ nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t d */ /***************************************************************************************************/ -/* GPM Metric Identifiers */ +/** + * GPM Metric Identifiers + */ typedef enum { - NVML_GPM_METRIC_GRAPHICS_UTIL = 1, /* Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 */ - NVML_GPM_METRIC_SM_UTIL = 2, /* Percentage of SMs that were busy. 0.0 - 100.0 */ - NVML_GPM_METRIC_SM_OCCUPANCY = 3, /* Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 */ - NVML_GPM_METRIC_INTEGER_UTIL = 4, /* Percentage of time the GPU's SMs were doing integer operations. 
0.0 - 100.0 */ - NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, /* Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, /* Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, /* Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, /* Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 */ - NVML_GPM_METRIC_DRAM_BW_UTIL = 10, /* Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP64_UTIL = 11, /* Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP32_UTIL = 12, /* Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_FP16_UTIL = 13, /* Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 */ - NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, /* PCIe traffic from this GPU in MiB/sec */ - NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, /* PCIe traffic to this GPU in MiB/sec */ - NVML_GPM_METRIC_NVDEC_0_UTIL = 30, /* Percent utilization of NVDEC 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_1_UTIL = 31, /* Percent utilization of NVDEC 1. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_2_UTIL = 32, /* Percent utilization of NVDEC 2. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_3_UTIL = 33, /* Percent utilization of NVDEC 3. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_4_UTIL = 34, /* Percent utilization of NVDEC 4. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_5_UTIL = 35, /* Percent utilization of NVDEC 5. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_6_UTIL = 36, /* Percent utilization of NVDEC 6. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVDEC_7_UTIL = 37, /* Percent utilization of NVDEC 7. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_0_UTIL = 40, /* Percent utilization of NVJPG 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_1_UTIL = 41, /* Percent utilization of NVJPG 1. 
0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_2_UTIL = 42, /* Percent utilization of NVJPG 2. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_3_UTIL = 43, /* Percent utilization of NVJPG 3. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_4_UTIL = 44, /* Percent utilization of NVJPG 4. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_5_UTIL = 45, /* Percent utilization of NVJPG 5. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_6_UTIL = 46, /* Percent utilization of NVJPG 6. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVJPG_7_UTIL = 47, /* Percent utilization of NVJPG 7. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVOFA_0_UTIL = 50, /* Percent utilization of NVOFA 0. 0.0 - 100.0 */ - NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, /* NvLink read bandwidth for all links in MiB/sec */ - NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, /* NvLink write bandwidth for all links in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, /* NvLink read bandwidth for link 0 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, /* NvLink write bandwidth for link 0 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, /* NvLink read bandwidth for link 1 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, /* NvLink write bandwidth for link 1 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, /* NvLink read bandwidth for link 2 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, /* NvLink write bandwidth for link 2 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, /* NvLink read bandwidth for link 3 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, /* NvLink write bandwidth for link 3 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, /* NvLink read bandwidth for link 4 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, /* NvLink write bandwidth for link 4 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, /* NvLink read bandwidth for link 5 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, /* NvLink write bandwidth for link 5 in MiB/sec */ - 
NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, /* NvLink read bandwidth for link 6 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, /* NvLink write bandwidth for link 6 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, /* NvLink read bandwidth for link 7 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, /* NvLink write bandwidth for link 7 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, /* NvLink read bandwidth for link 8 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, /* NvLink write bandwidth for link 8 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, /* NvLink read bandwidth for link 9 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, /* NvLink write bandwidth for link 9 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, /* NvLink read bandwidth for link 10 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, /* NvLink write bandwidth for link 10 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, /* NvLink read bandwidth for link 11 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, /* NvLink write bandwidth for link 11 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, /* NvLink read bandwidth for link 12 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, /* NvLink write bandwidth for link 12 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, /* NvLink read bandwidth for link 13 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, /* NvLink write bandwidth for link 13 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, /* NvLink read bandwidth for link 14 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, /* NvLink write bandwidth for link 14 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, /* NvLink read bandwidth for link 15 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, /* NvLink write bandwidth for link 15 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, /* NvLink read 
bandwidth for link 16 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, /* NvLink write bandwidth for link 16 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, /* NvLink read bandwidth for link 17 in MiB/sec */ - NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, /* NvLink write bandwidth for link 17 in MiB/sec */ - NVML_GPM_METRIC_MAX = 98, /* Maximum value above +1. Note that changing this - should also change NVML_GPM_METRICS_GET_VERSION - due to struct size change */ + NVML_GPM_METRIC_GRAPHICS_UTIL = 1, //!< Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 + NVML_GPM_METRIC_SM_UTIL = 2, //!< Percentage of SMs that were busy. 0.0 - 100.0 + NVML_GPM_METRIC_SM_OCCUPANCY = 3, //!< Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 + NVML_GPM_METRIC_INTEGER_UTIL = 4, //!< Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 + NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, //!< Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, //!< Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, //!< Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, //!< Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 + NVML_GPM_METRIC_DRAM_BW_UTIL = 10, //!< Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 + NVML_GPM_METRIC_FP64_UTIL = 11, //!< Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 + NVML_GPM_METRIC_FP32_UTIL = 12, //!< Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 + NVML_GPM_METRIC_FP16_UTIL = 13, //!< Percentage of time the GPU's SMs were doing non-tensor FP16 math. 
0.0 - 100.0 + NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, //!< PCIe traffic from this GPU in MiB/sec + NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, //!< PCIe traffic to this GPU in MiB/sec + NVML_GPM_METRIC_NVDEC_0_UTIL = 30, //!< Percent utilization of NVDEC 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_1_UTIL = 31, //!< Percent utilization of NVDEC 1. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_2_UTIL = 32, //!< Percent utilization of NVDEC 2. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_3_UTIL = 33, //!< Percent utilization of NVDEC 3. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_4_UTIL = 34, //!< Percent utilization of NVDEC 4. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_5_UTIL = 35, //!< Percent utilization of NVDEC 5. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_6_UTIL = 36, //!< Percent utilization of NVDEC 6. 0.0 - 100.0 + NVML_GPM_METRIC_NVDEC_7_UTIL = 37, //!< Percent utilization of NVDEC 7. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_0_UTIL = 40, //!< Percent utilization of NVJPG 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_1_UTIL = 41, //!< Percent utilization of NVJPG 1. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_2_UTIL = 42, //!< Percent utilization of NVJPG 2. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_3_UTIL = 43, //!< Percent utilization of NVJPG 3. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_4_UTIL = 44, //!< Percent utilization of NVJPG 4. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_5_UTIL = 45, //!< Percent utilization of NVJPG 5. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_6_UTIL = 46, //!< Percent utilization of NVJPG 6. 0.0 - 100.0 + NVML_GPM_METRIC_NVJPG_7_UTIL = 47, //!< Percent utilization of NVJPG 7. 0.0 - 100.0 + NVML_GPM_METRIC_NVOFA_0_UTIL = 50, //!< Percent utilization of NVOFA 0. 0.0 - 100.0 + NVML_GPM_METRIC_NVOFA_1_UTIL = 51, //!< Percent utilization of NVOFA 1. 
0.0 - 100.0 + NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, //!< NvLink read bandwidth for all links in MiB/sec + NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, //!< NvLink write bandwidth for all links in MiB/sec + NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, //!< NvLink read bandwidth for link 0 in MiB/sec + NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, //!< NvLink write bandwidth for link 0 in MiB/sec + NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, //!< NvLink read bandwidth for link 1 in MiB/sec + NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, //!< NvLink write bandwidth for link 1 in MiB/sec + NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, //!< NvLink read bandwidth for link 2 in MiB/sec + NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, //!< NvLink write bandwidth for link 2 in MiB/sec + NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, //!< NvLink read bandwidth for link 3 in MiB/sec + NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, //!< NvLink write bandwidth for link 3 in MiB/sec + NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, //!< NvLink read bandwidth for link 4 in MiB/sec + NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, //!< NvLink write bandwidth for link 4 in MiB/sec + NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, //!< NvLink read bandwidth for link 5 in MiB/sec + NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, //!< NvLink write bandwidth for link 5 in MiB/sec + NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, //!< NvLink read bandwidth for link 6 in MiB/sec + NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, //!< NvLink write bandwidth for link 6 in MiB/sec + NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, //!< NvLink read bandwidth for link 7 in MiB/sec + NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, //!< NvLink write bandwidth for link 7 in MiB/sec + NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, //!< NvLink read bandwidth for link 8 in MiB/sec + NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, //!< NvLink write bandwidth for link 8 in MiB/sec + NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, //!< NvLink read bandwidth for link 9 in 
MiB/sec + NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, //!< NvLink write bandwidth for link 9 in MiB/sec + NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, //!< NvLink read bandwidth for link 10 in MiB/sec + NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, //!< NvLink write bandwidth for link 10 in MiB/sec + NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, //!< NvLink read bandwidth for link 11 in MiB/sec + NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, //!< NvLink write bandwidth for link 11 in MiB/sec + NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, //!< NvLink read bandwidth for link 12 in MiB/sec + NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, //!< NvLink write bandwidth for link 12 in MiB/sec + NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, //!< NvLink read bandwidth for link 13 in MiB/sec + NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, //!< NvLink write bandwidth for link 13 in MiB/sec + NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, //!< NvLink read bandwidth for link 14 in MiB/sec + NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, //!< NvLink write bandwidth for link 14 in MiB/sec + NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, //!< NvLink read bandwidth for link 15 in MiB/sec + NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, //!< NvLink write bandwidth for link 15 in MiB/sec + NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, //!< NvLink read bandwidth for link 16 in MiB/sec + NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, //!< NvLink write bandwidth for link 16 in MiB/sec + NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, //!< NvLink read bandwidth for link 17 in MiB/sec + NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, //!< NvLink write bandwidth for link 17 in MiB/sec + //Put new metrics for BLACKWELL here... + NVML_GPM_METRIC_MAX = 98, //!< Maximum value above +1. 
Note that changing this should also change NVML_GPM_METRICS_GET_VERSION due to struct size change } nvmlGpmMetricId_t; /** @} */ // @defgroup nvmlGpmEnums @@ -9258,39 +11436,48 @@ typedef enum */ /***************************************************************************************************/ -/* Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc() - Free this with nvmlGpmSampleFree() */ +/** + * Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc(). Free this with nvmlGpmSampleFree(). + */ typedef struct nvmlGpmSample_st* nvmlGpmSample_t; +/** + * GPM metric information. + */ typedef struct { - unsigned int metricId; /* IN: NVML_GPM_METRIC_? #define of which metric to retrieve */ - nvmlReturn_t nvmlReturn; /* OUT: Status of this metric. If this is nonzero, then value is not valid */ - double value; /* OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) */ + unsigned int metricId; //!< IN: NVML_GPM_METRIC_? define of which metric to retrieve + nvmlReturn_t nvmlReturn; //!< OUT: Status of this metric. If this is nonzero, then value is not valid + double value; //!< OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) struct { char *shortName; char *longName; char *unit; - } metricInfo; /* OUT: Metric name and unit. Those can be NULL if not defined */ + } metricInfo; //!< OUT: Metric name and unit. Those can be NULL if not defined } nvmlGpmMetric_t; +/** + * GPM buffer information. + */ typedef struct { - unsigned int version; /* IN: Set to NVML_GPM_METRICS_GET_VERSION */ - unsigned int numMetrics; /* IN: How many metrics to retrieve in metrics[] */ - nvmlGpmSample_t sample1; /* IN: Sample buffer */ - nvmlGpmSample_t sample2; /* IN: Sample buffer */ - nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; /* IN/OUT: Array of metrics. Set metricId on call. 
- see nvmlReturn and value on return */ + unsigned int version; //!< IN: Set to NVML_GPM_METRICS_GET_VERSION + unsigned int numMetrics; //!< IN: How many metrics to retrieve in metrics[] + nvmlGpmSample_t sample1; //!< IN: Sample buffer + nvmlGpmSample_t sample2; //!< IN: Sample buffer + nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; //!< IN/OUT: Array of metrics. Set metricId on call. See nvmlReturn and value on return } nvmlGpmMetricsGet_t; #define NVML_GPM_METRICS_GET_VERSION 1 +/** + * GPM device information. + */ typedef struct { - unsigned int version; /* IN: Set to NVML_GPM_SUPPORT_VERSION */ - unsigned int isSupportedDevice; /* OUT: Indicates device support */ + unsigned int version; //!< IN: Set to NVML_GPM_SUPPORT_VERSION + unsigned int isSupportedDevice; //!< OUT: Indicates device support } nvmlGpmSupport_t; #define NVML_GPM_SUPPORT_VERSION 1 @@ -9306,10 +11493,9 @@ typedef struct /** * Calculate GPM metrics from two samples. * + * For Hopper &tm; or newer fully supported devices. * - * @param metricsGet IN/OUT: populated nvmlGpmMetricsGet_t struct - * - * %HOPPER_OR_NEWER% + * @param metricsGet IN/OUT: populated \a nvmlGpmMetricsGet_t struct * * @return * - \ref NVML_SUCCESS on success @@ -9321,7 +11507,7 @@ nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); /** * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param gpmSample Sample to free * @@ -9336,7 +11522,7 @@ nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); * Allocate a sample buffer to be used with NVML GPM . You will need to allocate * at least two of these buffers to use with the NVML GPM feature * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. 
* * @param gpmSample Where the allocated sample will be stored * @@ -9352,7 +11538,7 @@ nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); * two samples are gathered, you can call nvmlGpmMetricGet on those samples to * retrive metrics * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param device Device to get samples for * @param gpmSample Buffer to read samples into @@ -9363,25 +11549,13 @@ nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); */ nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample); -/** - * Indicate whether the supplied device supports GPM - * - * @param device NVML device to query for - * @param gpmSupport Structure to indicate GPM support. Indicates - * GPM support per system for the supplied device - * - * @return - * - NVML_SUCCESS on success - * - Nonzero NVML_ERROR_? enum if there is an error in processing the query - */ - /** * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance. * * After two samples are gathered, you can call nvmlGpmMetricGet on those * samples to retrive metrics * - * %HOPPER_OR_NEWER% + * For Hopper &tm; or newer fully supported devices. * * @param device Device to get samples for * @param gpuInstanceId MIG GPU Instance ID @@ -9393,11 +11567,290 @@ nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSa */ nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample); +/** + * Indicate whether the supplied device supports GPM + * + * @param device NVML device to query for + * @param gpmSupport Structure to indicate GPM support \a nvmlGpmSupport_t. Indicates + * GPM support per system for the supplied device + * + * @return + * - NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? 
enum if there is an error in processing the query + */ nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); +/* GPM Stream State */ +/** + * Get GPM stream state. + * + * For Hopper &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param state Returns GPM stream state + * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED + * + * @return + * - \ref NVML_SUCCESS if the current GPM stream state was successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a state is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlGpmQueryIfStreamingEnabled(nvmlDevice_t device, unsigned int *state); + +/** + * Set GPM stream state. + * + * For Hopper &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param state GPM stream state, + * NVML_FEATURE_DISABLED or NVML_FEATURE_ENABLED + * + * @return + * - \ref NVML_SUCCESS if the current GPM stream state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlGpmSetStreamingEnabled(nvmlDevice_t device, unsigned int state); + /** @} */ // @defgroup nvmlGpmFunctions /** @} */ // @defgroup GPM +#define NVML_DEV_CAP_EGM (1 << 0) // Extended GPU memory +/** + * Device capabilities + */ +typedef struct +{ + unsigned int version; //!< the API version number + unsigned int capMask; //!< OUT: Bit mask of capabilities. 
+} nvmlDeviceCapabilities_v1_t; +typedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t; +#define nvmlDeviceCapabilities_v1 NVML_STRUCT_VERSION(DeviceCapabilities, 1) + +/** + * Get device capabilities + * + * See \ref nvmlDeviceCapabilities_v1_t for more information on the struct. + * + * @param device The identifier of the target device + * @param caps Returns GPU's capabilities + * + * @return + * - \ref NVML_SUCCESS If the query is success + * - \ref NVML_ERROR_UNINITIALIZED If the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device is invalid or \a counters is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED If the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST If the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_ARGUMENT_VERSION_MISMATCH If the provided version is invalid/unsupported + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCapabilities(nvmlDevice_t device, + nvmlDeviceCapabilities_t *caps); + +/* + * Generic bitmask to hold 255 bits, represented by 8 elements of 32 bits + */ +#define NVML_255_MASK_BITS_PER_ELEM 32 +#define NVML_255_MASK_NUM_ELEMS 8 +#define NVML_255_MASK_BIT_SET(index, nvmlMask) \ + nvmlMask.mask[index / NVML_255_MASK_BITS_PER_ELEM] |= (1 << (index % NVML_255_MASK_BITS_PER_ELEM)) + +#define NVML_255_MASK_BIT_GET(index, nvmlMask) \ + nvmlMask.mask[index / NVML_255_MASK_BITS_PER_ELEM] & (1 << (index % NVML_255_MASK_BITS_PER_ELEM)) + +#define NVML_255_MASK_BIT_SET_PTR(index, nvmlMask) \ + nvmlMask->mask[index / NVML_255_MASK_BITS_PER_ELEM] |= (1 << (index % NVML_255_MASK_BITS_PER_ELEM)) + +#define NVML_255_MASK_BIT_GET_PTR(index, nvmlMask) \ + nvmlMask->mask[index / NVML_255_MASK_BITS_PER_ELEM] & (1 << (index % NVML_255_MASK_BITS_PER_ELEM)) + +typedef struct +{ + unsigned int mask[NVML_255_MASK_NUM_ELEMS]; //