From a835619abff6be9ba3f494187fd9c778e9f6429d Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Tue, 26 Dec 2023 20:49:42 -0600 Subject: [PATCH 1/4] The gcdm.Policy function was refactored, plus unit tests were added. Signed-off-by: Vadym Fedorov --- .vscode/launch.json | 24 + Makefile | 2 +- go.mod | 1 + go.sum | 17 + pkg/dcgm/dcgm_nvml.h | 4 + pkg/dcgm/dcgm_structs_internal.h | 906 +++ pkg/dcgm/dcgm_test_apis.h | 451 ++ pkg/dcgm/dcgm_test_structs.h | 202 + pkg/dcgm/internal.go | 101 + pkg/dcgm/nvml.h | 9459 ++++++++++++++++++++++++++++++ pkg/dcgm/policy.go | 51 +- pkg/dcgm/policy_test.go | 229 + pkg/dcgm/structs.go | 41 + 13 files changed, 11450 insertions(+), 38 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 pkg/dcgm/dcgm_nvml.h create mode 100755 pkg/dcgm/dcgm_structs_internal.h create mode 100644 pkg/dcgm/dcgm_test_apis.h create mode 100644 pkg/dcgm/dcgm_test_structs.h create mode 100644 pkg/dcgm/internal.go create mode 100644 pkg/dcgm/nvml.h create mode 100644 pkg/dcgm/policy_test.go create mode 100644 pkg/dcgm/structs.go diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..604aead --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,24 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Launch test function", + "type": "go", + "request": "launch", + "mode": "test", + "program": "${workspaceFolder}/${relativeFileDirname}", + "args": [ + "-test.run", + "^TestPolicyForDoubleBitErrors$" + ], + "env": { + "GOEXPERIMENT": "cgocheck2" + }, + "showLog": true + } + + ] +} \ No newline at end of file diff --git a/Makefile b/Makefile index 4cc3d62..c86cfbc 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ binary: cd samples/topology; go build test-main: - go test ./tests + go test -race ./tests check-format: test $$(gofmt -l . | tee /dev/stderr | wc -l) -eq 0 diff --git a/go.mod b/go.mod index de18c08..71b5744 100644 --- a/go.mod +++ b/go.mod @@ -6,4 +6,5 @@ require ( github.com/Masterminds/semver v1.5.0 github.com/bits-and-blooms/bitset v1.2.1 github.com/gorilla/mux v1.8.0 + github.com/stretchr/testify v1.8.4 ) diff --git a/go.sum b/go.sum index 07fad37..51d3295 100644 --- a/go.sum +++ b/go.sum @@ -2,5 +2,22 @@ github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3Q github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/bits-and-blooms/bitset v1.2.1 h1:M+/hrU9xlMp7t4TyTDQW97d3tRPVuKFC6zBEK16QnXY= github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/dcgm/dcgm_nvml.h b/pkg/dcgm/dcgm_nvml.h new file mode 100644 index 0000000..02b24e8 --- /dev/null +++ b/pkg/dcgm/dcgm_nvml.h @@ -0,0 +1,4 @@ +#pragma once + +#define NVML_NO_UNVERSIONED_FUNC_DEFS +#include "nvml.h" diff --git a/pkg/dcgm/dcgm_structs_internal.h b/pkg/dcgm/dcgm_structs_internal.h new file mode 100755 index 0000000..a224635 --- /dev/null +++ b/pkg/dcgm/dcgm_structs_internal.h @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * File: dcgm_structs_internal.h + */ + +#ifndef DCGM_STRUCTS_INTERNAL_H +#define DCGM_STRUCTS_INTERNAL_H + +/* Make sure that dcgm_structs.h is loaded first. This file depends on it */ +#include "dcgm_agent.h" +#include "dcgm_structs.h" +#include "dcgm_test_structs.h" +#include + +#ifdef INJECTION_LIBRARY_AVAILABLE +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * The following is a compile time assertion. It makes use of the + * restriction that you cannot have an array with a negative size. + * If the expression resolves to 0, then the index to the array is + * defined as -1, and a compile time error is generated. Note that + * all three macros are needed because of the way the preprocessor + * evaluates the directives. Also note that the line number is + * embedded in the name of the array so that the array name is unique + * and we can have multiple calls to the assert with the same msg. + * + * Usage would be like this: + * DCGM_CASSERT(DCGM_VGPU_NAME_BUFFER_SIZE == NVML_VGPU_NAME_BUFFER_SIZE, DCGM_VGPU_NAME_BUFFER_SIZE); + * + */ +#define _DCGM_CASSERT_SYMBOL_INNER(line, msg) COMPILE_TIME_ASSERT_DETECTED_AT_LINE_##line##__##msg +#define _DCGM_CASSERT_SYMBOL(line, msg) _DCGM_CASSERT_SYMBOL_INNER(line, msg) +#define DCGM_CASSERT(expression, msg) \ + __attribute__((unused)) typedef char _DCGM_CASSERT_SYMBOL(__LINE__, msg)[((expression) ? 1 : -1)] + +/** + * Max length of the DCGM string field + */ +#define DCGM_MAX_STR_LENGTH 256 + +typedef struct +{ + unsigned int gpuId; /* DCGM GPU ID */ + char uuid[DCGM_MAX_STR_LENGTH]; /* UUID String */ +} dcgmGpuInfo_t; + +/* Below is a test API simply to make sure versioning is working correctly + */ + +typedef struct +{ + // version must always be first + unsigned int version; + + unsigned int a; +} dcgmVersionTest_v1; + +typedef struct +{ + // version must always be first + unsigned int version; + + unsigned int a; + unsigned int b; +} dcgmVersionTest_v2; + +typedef dcgmVersionTest_v2 dcgmVersionTest_t; +#define dcgmVersionTest_version1 MAKE_DCGM_VERSION(dcgmVersionTest_v1, 1) +#define dcgmVersionTest_version2 MAKE_DCGM_VERSION(dcgmVersionTest_v2, 2) +#define dcgmVersionTest_version3 MAKE_DCGM_VERSION(dcgmVersionTest_v2, 3) +#define dcgmVersionTest_version dcgmVersionTest_version2 + +/** + * Represents a command to save or load a JSON file to/from the DcgmCacheManager + */ + +typedef enum dcgmStatsFileType_enum +{ + DCGM_STATS_FILE_TYPE_JSON = 0 /* JSON */ +} dcgmStatsFileType_t; + +typedef struct +{ + // version must always be first + unsigned int version; + + dcgmStatsFileType_t fileType; /* File type to save to/load from */ + char filename[256]; /* Filename to save to/load from */ +} dcgmCacheManagerSave_v1_t; + +#define dcgmCacheManagerSave_version1 MAKE_DCGM_VERSION(dcgmCacheManagerSave_v1_t, 1) +#define dcgmCacheManagerSave_version dcgmCacheManagerSave_version1 + +typedef dcgmCacheManagerSave_v1_t dcgmCacheManagerSave_t; + +/* Same message contents for now */ +typedef dcgmCacheManagerSave_v1_t dcgmCacheManagerLoad_v1_t; + +typedef dcgmCacheManagerLoad_v1_t dcgmCacheManagerLoad_t; + +#define dcgmCacheManagerLoad_version1 MAKE_DCGM_VERSION(dcgmCacheManagerLoad_v1_t, 1) +#define dcgmCacheManagerLoad_version dcgmCacheManagerLoad_version1 + +#define dcgmWatchFieldValue_version1 1 +#define dcgmWatchFieldValue_version dcgmWatchFieldValue_version1 + +#define dcgmUpdateAllFields_version1 1 +#define dcgmUpdateAllFields_version dcgmUpdateAllFields_version1 + +#define dcgmGetMultipleValuesForField_version1 1 +#define dcgmGetMultipleValuesForField_version dcgmGetMultipleValuesForField_version1 + +#define dcgmUnwatchFieldValue_version1 1 +#define dcgmUnwatchFieldValue_version dcgmUnwatchFieldValue_version1 + +/** + * This structure is used to represent a field value to be injected into + * the cache manager + */ +typedef dcgmFieldValue_v1 dcgmInjectFieldValue_v1; +typedef dcgmInjectFieldValue_v1 dcgmInjectFieldValue_t; +#define dcgmInjectFieldValue_version1 MAKE_DCGM_VERSION(dcgmInjectFieldValue_v1, 1) +#define dcgmInjectFieldValue_version dcgmInjectFieldValue_version1 + +#define dcgmGetMultipleValuesForFieldResponse_version1 1 +#define dcgmGetMultipleValuesForFieldResponse_version dcgmGetMultipleValuesForFieldResponse_version1 + +/* Underlying structure for the GET_MULTIPLE_LATEST_VALUES request */ +typedef struct +{ + unsigned int version; /* Set this to dcgmGetMultipleLatestValues_version1 */ + dcgmGpuGrp_t groupId; /* Entity group to retrieve values for. This is only + looked at if entitiesCount is 0 */ + unsigned int entitiesCount; /* Number of entities provided in entities[]. This + should only be provided if you aren't also setting + entityGroupId */ + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; /* Entities to retrieve values for. + Only looked at if entitiesCount > 0 */ + dcgmFieldGrp_t fieldGroupId; /* Field group to retrive values for. This is onlu looked + at if fieldIdCount is 0 */ + unsigned int fieldIdCount; /* Number of field IDs in fieldIds[] that are valid. This + should only be set if fieldGroupId is not set */ + unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; /* Field IDs for which values should + be retrieved. only looked at if fieldIdCount is > 0 */ + unsigned int flags; /* Mask of DCGM_FV_FLAG_? #defines that affect this + request */ + +} dcgmGetMultipleLatestValues_v1, dcgmGetMultipleLatestValues_t; + +#define dcgmGetMultipleLatestValues_version1 MAKE_DCGM_VERSION(dcgmGetMultipleLatestValues_v1, 1) +#define dcgmGetMultipleLatestValues_version dcgmGetMultipleLatestValues_version1 + +/* Represents cached record metadata */ + +/* Represents a unique watcher of an entity in DCGM */ + +/* Watcher types. Each watcher type's watches are tracked separately within subsystems */ +typedef enum +{ + DcgmWatcherTypeClient = 0, /* Embedded or remote client via external APIs */ + DcgmWatcherTypeHostEngine = 1, /* Watcher is DcgmHostEngineHandler */ + DcgmWatcherTypeHealthWatch = 2, /* Watcher is DcgmHealthWatch */ + DcgmWatcherTypePolicyManager = 3, /* Watcher is DcgmPolicyMgr */ + DcgmWatcherTypeCacheManager = 4, /* Watcher is DcgmCacheManager */ + DcgmWatcherTypeConfigManager = 5, /* Watcher is DcgmConfigMgr */ + DcgmWatcherTypeNvSwitchManager = 6, /* Watcher is NvSwitchManager */ + + DcgmWatcherTypeCount /* Should always be last */ +} DcgmWatcherType_t; + + +/* ID of a remote client connection within the host engine */ +typedef unsigned int dcgm_connection_id_t; + +/* Special constant for not connected */ +#define DCGM_CONNECTION_ID_NONE ((dcgm_connection_id_t)0) + +/* Cache Manager Info flags */ +#define DCGM_CMI_F_WATCHED 0x00000001 /* Is this field being watched? */ + +/* This structure mirrors the DcgmWatcher object */ +typedef struct dcgm_cm_field_info_watcher_t +{ + DcgmWatcherType_t watcherType; /* Type of watcher. See DcgmWatcherType_t */ + dcgm_connection_id_t connectionId; /* Connection ID of the watcher */ + long long monitorIntervalUsec; /* How often this field should be sampled */ + long long maxAgeUsec; /* Maximum time to cache samples of this + field. If 0, the class default is used */ +} dcgm_cm_field_info_watcher_t, *dcgm_cm_field_info_watcher_p; + +/** + * Number of watchers to show for each field + */ +#define DCGM_CM_FIELD_INFO_NUM_WATCHERS 10 + +typedef struct dcgmCacheManagerFieldInfo_v4_t +{ + unsigned int version; /* Version. Check against dcgmCacheManagerInfo_version */ + unsigned int flags; /* Bitmask of DCGM_CMI_F_? #defines that apply to this field */ + unsigned int entityId; /* ordinal id for this entity */ + unsigned int entityGroupId; /* the type of entity, see dcgm_field_entity_group_t */ + unsigned short fieldId; /* Field ID of this field */ + short lastStatus; /* Last nvml status returned for this field when taking a sample */ + long long oldestTimestamp; /* Timestamp of the oldest record. 0=no records or single + non-time series record */ + long long newestTimestamp; /* Timestamp of the newest record. 0=no records or + single non-time series record */ + long long monitorIntervalUsec; /* How often is this field updated in usec */ + long long maxAgeUsec; /* How often is this field updated */ + long long execTimeUsec; /* Cumulative time spent updating this + field since the cache manager started */ + long long fetchCount; /* Number of times that this field has been + fetched from the driver */ + int numSamples; /* Number of samples currently cached for this field */ + int numWatchers; /* Number of watchers that are valid in watchers[] */ + dcgm_cm_field_info_watcher_t watchers[DCGM_CM_FIELD_INFO_NUM_WATCHERS]; /* Who are the first 10 + watchers of this field? */ +} dcgmCacheManagerFieldInfo_v4_t, *dcgmCacheManagerFieldInfo_v4_p; + +#define dcgmCacheManagerFieldInfo_version4 MAKE_DCGM_VERSION(dcgmCacheManagerFieldInfo_v4_t, 4) + +/** + * The maximum number of topology elements possible given DCGM_MAX_NUM_DEVICES + * calculated using arithmetic sequence formula + * (DCGM_MAX_NUM_DEVICES - 1) * (1 + (DCGM_MAX_NUM_DEVICES-2)/2) + */ +#define DCGM_TOPOLOGY_MAX_ELEMENTS 496 + +/** + * Topology element structure + */ +typedef struct +{ + unsigned int dcgmGpuA; //!< GPU A + unsigned int dcgmGpuB; //!< GPU B + unsigned int AtoBNvLinkIds; //!< bits representing the links connected from GPU A to GPU B + //!< e.g. if this field == 3, links 0 and 1 are connected, + //!< field is only valid if NVLINKS actually exist between GPUs + unsigned int BtoANvLinkIds; //!< bits representing the links connected from GPU B to GPU A + //!< e.g. if this field == 3, links 0 and 1 are connected, + //!< field is only valid if NVLINKS actually exist between GPUs + dcgmGpuTopologyLevel_t path; //!< path between A and B +} dcgmTopologyElement_t; + +/** + * Topology results structure + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmTopology_version) + unsigned int numElements; //!< number of valid dcgmTopologyElement_t elements + + dcgmTopologyElement_t element[DCGM_TOPOLOGY_MAX_ELEMENTS]; +} dcgmTopology_v1; + +/** + * Typedef for \ref dcgmTopology_v1 + */ +typedef dcgmTopology_v1 dcgmTopology_t; + +/** + * Version 1 for \ref dcgmTopology_v1 + */ +#define dcgmTopology_version1 MAKE_DCGM_VERSION(dcgmTopology_v1, 1) + +/** + * Latest version for \ref dcgmTopology_t + */ +#define dcgmTopology_version dcgmTopology_version1 + +typedef struct +{ + unsigned int numGpus; + struct + { + unsigned int dcgmGpuId; + unsigned long bitmask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; + } affinityMasks[DCGM_MAX_NUM_DEVICES]; +} dcgmAffinity_t; + + +typedef struct +{ + unsigned int version; //!< IN: Version number (dcgmCreateFakeEntities_version) + unsigned int numToCreate; //!< IN: Number of fake entities to create + dcgmMigHierarchyInfo_t entityList[DCGM_MAX_HIERARCHY_INFO]; //!< IN: specifies who to create and the parent +} dcgmCreateFakeEntities_v2; + +typedef dcgmCreateFakeEntities_v2 dcgmCreateFakeEntities_t; + +/** + * Version 2 for \ref dcgmCreateFakeEntities_t + */ +#define dcgmCreateFakeEntities_version2 MAKE_DCGM_VERSION(dcgmCreateFakeEntities_v2, 2) + +/** + * Latest version for \ref dcgmCreateFakeEntities_t + */ +#define dcgmCreateFakeEntities_version dcgmCreateFakeEntities_version2 + + +/* Field watch predefined groups */ +typedef enum +{ + DCGM_WATCH_PREDEF_INVALID = 0, + DCGM_WATCH_PREDEF_PID, /*!< PID stats */ + DCGM_WATCH_PREDEF_JOB, /*!< Job stats */ +} dcgmWatchPredefinedType_t; + +typedef struct +{ + unsigned int version; + dcgmWatchPredefinedType_t watchPredefType; /*!< Which type of predefined watch are we adding? */ + + dcgmGpuGrp_t groupId; /*!< GPU group to watch fields for */ + long long updateFreq; /*!< How often to update the fields in usec */ + double maxKeepAge; /*!< How long to keep values for the fields in seconds */ + int maxKeepSamples; /*!< Maximum number of samples we should keep at a time */ +} dcgmWatchPredefined_v1; + +typedef dcgmWatchPredefined_v1 dcgmWatchPredefined_t; + +/** + * Version 1 for \ref dcgmWatchPredefined_t + */ +#define dcgmWatchPredefined_version1 MAKE_DCGM_VERSION(dcgmWatchPredefined_v1, 1) + +/** + * Latest version for \ref dcgmWatchPredefined_t + */ +#define dcgmWatchPredefined_version dcgmWatchPredefined_version1 + +/** + * Request to set a NvLink link state for an entity + */ +typedef struct +{ + unsigned int version; /*!< Version. Should be dcgmSetNvLinkLinkState_version1 */ + dcgm_field_entity_group_t entityGroupId; /*!< Entity group of the entity to set the link state of */ + dcgm_field_eid_t entityId; /*!< ID of the entity to set the link state of */ + unsigned int linkId; /*!< Link (or portId) of the link to set the state of */ + dcgmNvLinkLinkState_t linkState; /*!< State to set the link to */ + unsigned int unused; /*!< Not used for now. Set to 0 */ +} dcgmSetNvLinkLinkState_v1; + +#define dcgmSetNvLinkLinkState_version1 MAKE_DCGM_VERSION(dcgmSetNvLinkLinkState_v1, 1) + + +/** + * Request to add a module ID to the denylist + */ +typedef struct +{ + unsigned int version; /*!< Version. Should be dcgmModuleDenylist_version */ + dcgmModuleId_t moduleId; /*!< Module to add to the denylist */ +} dcgmModuleDenylist_v1; + +#define dcgmModuleDenylist_version1 MAKE_DCGM_VERSION(dcgmModuleDenylist_v1, 1) + + +/** + * Counter to use for NvLink + */ +#define DCGMCM_NVLINK_COUNTER_BYTES 0 + +/** + * The Brand of the GPU. These are 1:1 with NVML_BRAND_*. There's a DCGM_CASSERT() below that tests that + */ +typedef enum dcgmGpuBrandType_enum +{ + DCGM_GPU_BRAND_UNKNOWN = 0, + DCGM_GPU_BRAND_QUADRO = 1, + DCGM_GPU_BRAND_TESLA = 2, + DCGM_GPU_BRAND_NVS = 3, + DCGM_GPU_BRAND_GRID = 4, + DCGM_GPU_BRAND_GEFORCE = 5, + DCGM_GPU_BRAND_TITAN = 6, + /* The following are new as of r460 TRD2's nvml.h */ + DCGM_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications + DCGM_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC + DCGM_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server + DCGM_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation + DCGM_BRAND_NVIDIA_VGAMING = 11, // NVIDIA vGaming + DCGM_BRAND_QUADRO_RTX = 12, + DCGM_BRAND_NVIDIA_RTX = 13, + DCGM_BRAND_NVIDIA = 14, + DCGM_BRAND_GEFORCE_RTX = 15, + DCGM_BRAND_TITAN_RTX = 16, + // Keep this last + DCGM_GPU_BRAND_COUNT +} dcgmGpuBrandType_t; + +/*****************************************************************************/ +typedef enum dcgmEntityStatusType_enum +{ + DcgmEntityStatusUnknown = 0, /* Entity has not been referenced yet */ + DcgmEntityStatusOk, /* Entity is known and OK */ + DcgmEntityStatusUnsupported, /* Entity is unsupported by DCGM */ + DcgmEntityStatusInaccessible, /* Entity is inaccessible, usually due to cgroups */ + DcgmEntityStatusLost, /* Entity has been lost. Usually set from NVML + returning NVML_ERROR_GPU_IS_LOST */ + DcgmEntityStatusFake, /* Entity is a fake, injection-only entity for testing */ + DcgmEntityStatusDisabled, /* Don't collect values from this GPU */ + DcgmEntityStatusDetached /* Entity is detached, not good for any uses */ +} DcgmEntityStatus_t; + +/** + * Making these internal so that client apps must be explicit with struct versions. + */ + +/** + * Typedef for \ref dcgmRunDiag_t + */ +typedef dcgmRunDiag_v7 dcgmRunDiag_t; + +/** + * Latest version for \ref dcgmRunDiag_t + */ +#define dcgmRunDiag_version dcgmRunDiag_version7 + +/** + * Version 1 of dcgmCreateGroup_t + */ + +typedef struct +{ + dcgmGroupType_t groupType; //!< Type of group to create + char groupName[1024]; //!< Name to give new group + dcgmGpuGrp_t newGroupId; //!< On success, the ID of the newly created group + dcgmReturn_t cmdRet; //!< Error code generated when creating new group +} dcgmCreateGroup_v1; + +/** + * Version 1 of dcgmRemoveEntity_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group id from which entity should be removed + unsigned int entityGroupId; //!< IN: Entity group that entity belongs to + unsigned int entityId; //!< IN: Entity id to remove + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmAddRemoveEntity_v1; + +/** + * Version 1 of dcgmGroupDestroy_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group to remove + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupDestroy_v1; + +/** + * Version 1 of dcgmGetEntityGroupEntities_t + */ + +typedef struct +{ + unsigned int entityGroup; //!< IN: Entity of group to list entities + unsigned int entities[DCGM_GROUP_MAX_ENTITIES]; //!< OUT: Array of entities for entityGroup + unsigned int numEntities; //!< IN/OUT: Upon calling, this should be the number of + // entities that entityList[] can hold. Upon + // return, this will contain the number of + // entities actually saved to entityList. + unsigned int flags; //!< IN: Flags to modify the behavior of this request. + // See DCGM_GEGE_FLAG_* + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetEntityGroupEntities_v1; + +/** + * Version 1 of dcgmGroupGetAllIds_t + */ + +typedef struct +{ + unsigned int groupIds[DCGM_MAX_NUM_GROUPS]; //!< OUT: List of group ids + unsigned int numGroups; //!< OUT: Number of group ids in the list + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupGetAllIds_v1; + +/** + * Version 1 of dcgmGroupGetInfo_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID for which information to be fetched + dcgmGroupInfo_t groupInfo; //!< OUT: Group Information + long long timestamp; //!< OUT: Timestamp of information + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupGetInfo_v1; + +#define SAMPLES_BUFFER_SIZE_V1 16384 + +/** + * Version 1 of dcgmEntitiesGetLatestValues_t + */ +typedef struct +{ + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. + //!< This is ignored if fieldIdList[] is provided + unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for + unsigned int fieldIdCount; //!< IN: Number of field IDs in fieldIdList[] array. + unsigned int flags; //!< IN: Optional flags that affect how this request is processed. + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V1]; //!< OUT: this field is last, and can be truncated for speed */ +} dcgmEntitiesGetLatestValues_v1; + +#define SAMPLES_BUFFER_SIZE_V2 4186112 // 4MB - 8k for header + +/** + * Version 2 of dcgmEntitiesGetLatestValues_t + */ +typedef struct +{ + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. + //!< This is ignored if fieldIdList[] is provided + unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for + unsigned int fieldIdCount; //!< IN: Number of field IDs in fieldIdList[] array. + unsigned int flags; //!< IN: Optional flags that affect how this request is processed. + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT: this field is last, and can be truncated for speed */ +} dcgmEntitiesGetLatestValues_v2; + +/** + * Version 1 of dcgmGetMultipleValuesForField + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: Optional group id for information to be fetched + unsigned int entityId; //!< IN: Optional entity id for information to be fetched + unsigned int fieldId; //!< IN: Field id to fetch + long long startTs; //!< IN: Starting timestamp + long long endTs; //!< IN: End timestamp + unsigned int order; //!< IN: Order for output data, see dcgmOrder_t + unsigned int count; //!< IN: Number of values to retrieve (may be limited by size of buffer) + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V1]; //!< OUT:: this field is last, and can be truncated for speed */ +} dcgmGetMultipleValuesForField_v1; + +/** + * Version 2 of dcgmGetMultipleValuesForField + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: Optional group id for information to be fetched + unsigned int entityId; //!< IN: Optional entity id for information to be fetched + unsigned int fieldId; //!< IN: Field id to fetch + long long startTs; //!< IN: Starting timestamp + long long endTs; //!< IN: End timestamp + unsigned int order; //!< IN: Order for output data, see dcgmOrder_t + unsigned int count; //!< IN: Number of values to retrieve (may be limited by size of buffer) + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT:: this field is last, and can be truncated for speed */ +} dcgmGetMultipleValuesForField_v2; + +/** + * Version 1 of dcgmJobCmd_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: optional group id + char jobId[64]; //!< IN: job id + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmJobCmd_v1; + +/** + * Version 1 of dcgmJobGetStats_t + */ + +typedef struct +{ + char jobId[64]; //!< IN: job id + dcgmJobInfo_t jobStats; //!< OUT: job stats + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmJobGetStats_v1; + +/** + * Version 1 of dcgmWatchFieldValue_t (DCGM 2.x) + */ +typedef struct +{ + int gpuId; //!< IN: GPU ID to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field ID to watch + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 0=no limit + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFieldValue_v1; + +/** + * Version 2 of dcgmWatchFieldValue_t (DCGM 3.x+) + */ +typedef struct +{ + unsigned int entityId; //!< IN: entityId (gpuId for GPUs) to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field ID to watch + unsigned char unused[6]; //!< IN: Unused. Aligns next member to 8-byte boundary + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 0=no limit + int updateOnFirstWatcher; //!< IN: Should we do an UpdateAllFields() automatically if we are the first watcher? + //!< 1=yes. 0=no. + int wereFirstWatcher; //!< OUT: Returns 1 if we were the first watcher. 0 if not */ + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFieldValue_v2; + +/** + * Version 1 of dcgmUpdateAllFields_v1 + */ +typedef struct +{ + int waitForUpdate; //!< IN: Whether or not to wait for the update loop to complete before returning to the + // caller 1=wait. 0=do not wait. + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmUpdateAllFields_v1; + +/** + * Version 1 of dcgmUnwatchFieldValue_t + */ +typedef struct +{ + int gpuId; //!< IN: GPU ID to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field id to unwatch + int clearCache; //!< IN: Whether or not to clear all cached data for + // the field after the watch is removed + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmUnwatchFieldValue_v1; + +/** + * Version 1 of dcgmInjectFieldValue_t + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: entity group id + unsigned int entityId; //!< IN: entity id + dcgmFieldValue_v1 fieldValue; //!< IN: field value to insert + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmInjectFieldValueMsg_v1; + +#define dcgmInjectFieldValueMsg_version1 MAKE_DCGM_VERSION(dcgmInjectFieldValueMsg_v1, 1) +#define dcgmInjectFieldValueMsg_version dcgmInjectFieldValueMsg_version1 +typedef dcgmInjectFieldValueMsg_v1 dcgmInjectFieldValueMsg_t; + +/** + * Version 2 of dcgmGetCacheManagerFieldInfo_t + */ +typedef struct +{ + dcgmCacheManagerFieldInfo_v4_t + fieldInfo; //!< IN/OUT: Structure to populate. fieldInfo->gpuId and fieldInfo->fieldId must + // be populated on calling for this call to work + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetCacheManagerFieldInfo_v2; + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + unsigned int fieldGroupId; //!< IN: Fields to watch. + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 0=no limit + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFields_v1; + +#define dcgmWatchFields_version1 1 +#define dcgmWatchFields_version dcgmWatchFields_version1 + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + dcgmTopology_t topology; //!< OUT: populated struct + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetTopologyMsg_v1; + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + dcgmAffinity_t affinity; //!< OUT: populated struct + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetTopologyAffinityMsg_v1; + +typedef struct +{ + uint64_t inputGpus; //!< IN: bitmask of available gpus + uint32_t numGpus; //!< IN: number of gpus needed + uint64_t flags; //!< IN: Hints to ignore certain factors for the scheduling hint + uint64_t outputGpus; //!< OUT: bitmask of selected gpus + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmSelectGpusByTopologyMsg_v1; + +typedef struct +{ + int supported; //!< IN: boolean to ONLY include Ids of supported GPUs + unsigned int devices[DCGM_MAX_NUM_DEVICES]; //!< OUT: GPU Ids present on the system. + int count; //!< OUT: Number of devices returned in "devices" + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetAllDevicesMsg_v1; + +typedef struct +{ + int persistAfterDisconnect; //!< IN: boolean whether to persist groups, etc after client is disconnected + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmClientLogin_v1; + +typedef struct +{ + dcgmFieldGroupInfo_t fg; //!< IN/OUT: field group info populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmFieldGroupOp_v1; + +typedef struct +{ + unsigned int groupId; //!< IN: group id for query + dcgmPidInfo_t pidInfo; //!< IN/OUT: pid info populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmPidGetInfo_v1; + +typedef struct +{ + dcgmFieldSummaryRequest_t fsr; //!< IN/OUT: field summary populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetFieldSummary_v1; + +typedef struct +{ + dcgmNvLinkStatus_v3 ls; //!< IN/OUT: nvlink status populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetNvLinkStatus_v2; + +typedef struct +{ + dcgmCreateFakeEntities_v2 fe; //!< IN/OUT: fake entity info, populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgCreateFakeEntities_v1; + +typedef struct +{ + dcgmWatchPredefined_t wpf; //!< IN: watch info + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchPredefinedFields_v1; + +typedef struct +{ + unsigned int moduleId; //!< IN: Module to add to the denylist + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgModuleDenylist_v1; + +typedef struct +{ + dcgmModuleGetStatuses_t st; //!< IN/OUT: module status + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgModuleStatus_v1; + +typedef struct +{ + unsigned int overallHealth; //!< IN/OUT: hostengine health + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgHostEngineHealth_v1; + +typedef struct +{ + dcgmAllFieldGroup_t fg; //!< IN/OUT: hostengine health + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetAllFieldGroup_v1; + +typedef struct +{ + dcgmMigHierarchy_v2 data; //!< OUT: populated on success + + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgGetGpuInstanceHierarchy_v1; + +typedef struct +{ + unsigned int index; //!< IN: the index of the GPU to create + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlCreateInjectionGpu_v1; + +#ifdef INJECTION_LIBRARY_AVAILABLE +#define DCGM_MAX_EXTRA_KEYS 4 +typedef struct +{ + unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected + char key[DCGM_MAX_STR_LENGTH]; //!< IN: The key for the NVML injected value + injectNvmlVal_t extraKeys[DCGM_MAX_EXTRA_KEYS]; //!< IN: extra keys, optional + unsigned int extraKeyCount; //!< IN: the number of extra keys + injectNvmlVal_t value; //!< IN: the NVML value being injected + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlInjectDevice_v1; +#endif + +/** + * Verify that DCGM definitions that are copies of NVML ones match up with their NVML counterparts + */ +DCGM_CASSERT(DCGM_VGPU_NAME_BUFFER_SIZE == NVML_VGPU_NAME_BUFFER_SIZE, NVML_VGPU_NAME_BUFFER_SIZE); +DCGM_CASSERT(DCGM_GRID_LICENSE_BUFFER_SIZE == NVML_GRID_LICENSE_BUFFER_SIZE, NVML_GRID_LICENSE_BUFFER_SIZE); +DCGM_CASSERT(DCGM_DEVICE_UUID_BUFFER_SIZE == NVML_DEVICE_UUID_BUFFER_SIZE, NVML_DEVICE_UUID_BUFFER_SIZE); +DCGM_CASSERT(DCGM_NVLINK_MAX_LINKS_PER_GPU == NVML_NVLINK_MAX_LINKS, NVML_NVLINK_MAX_LINKS); +DCGM_CASSERT((int)DCGM_GPU_BRAND_COUNT == (int)NVML_BRAND_COUNT, NVML_BRAND_COUNT); + +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_NONE == (int)NVML_GPU_VIRTUALIZATION_MODE_NONE, + NVML_GPU_VIRTUALIZATION_MODE_NONE); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH == (int)NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH, + NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_VGPU == (int)NVML_GPU_VIRTUALIZATION_MODE_VGPU, + NVML_GPU_VIRTUALIZATION_MODE_VGPU); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU == (int)NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU, + NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA == (int)NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA, + NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA); + +/** + * Verify correct version of APIs that use a versioned structure + */ + +DCGM_CASSERT(dcgmPidInfo_version == (long)0x02004528, 1); +DCGM_CASSERT(dcgmConfig_version == (long)16777256, 1); +DCGM_CASSERT(dcgmConnectV2Params_version1 == (long)16777224, 1); +DCGM_CASSERT(dcgmConnectV2Params_version == (long)0x02000010, 1); +DCGM_CASSERT(dcgmCpuHierarchyOwnedCores_version1 == (long)0x1000088, 1); +DCGM_CASSERT(dcgmCpuHierarchy_version1 == (long)0x1000488, 1); +DCGM_CASSERT(dcgmFieldGroupInfo_version == (long)16777744, 1); +DCGM_CASSERT(dcgmAllFieldGroup_version == (long)16811016, 1); +DCGM_CASSERT(dcgmDeviceAttributes_version3 == (long)0x3001464, 1); +DCGM_CASSERT(dcgmDeviceAttributes_version == (long)0x3001464, 1); +DCGM_CASSERT(dcgmHealthResponse_version4 == (long)0x0401050C, 1); +DCGM_CASSERT(dcgmIntrospectMemory_version == (long)16777232, 1); +DCGM_CASSERT(dcgmIntrospectCpuUtil_version == (long)16777248, 1); +DCGM_CASSERT(dcgmJobInfo_version == (long)0x030098A8, 1); +DCGM_CASSERT(dcgmPolicy_version == (long)16777360, 1); +DCGM_CASSERT(dcgmPolicyCallbackResponse_version == (long)16777240, 1); +DCGM_CASSERT(dcgmDiagResponse_version7 == (long)0x07099290, 1); +DCGM_CASSERT(dcgmDiagResponse_version8 == (long)0x80d9690, 8); +DCGM_CASSERT(dcgmDiagResponse_version9 == (long)0x914f4dc, 9); +DCGM_CASSERT(dcgmDiagResponse_version == (long)0x914f4dc, 9); +DCGM_CASSERT(dcgmRunDiag_version7 == (long)0x70054D0, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version6 == (long)16787744, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version7 == (long)117451168, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version == (long)117451168, 1); +DCGM_CASSERT(dcgmVgpuInstanceAttributes_version == (long)16777556, 1); +DCGM_CASSERT(dcgmVgpuConfig_version == (long)16777256, 1); +DCGM_CASSERT(dcgmModuleGetStatuses_version == (long)0x01000088, 1); +DCGM_CASSERT(dcgmModuleDenylist_version1 == (long)0x01000008, 1); +DCGM_CASSERT(dcgmSettingsSetLoggingSeverity_version1 == (long)0x01000008, 1); +DCGM_CASSERT(dcgmVersionInfo_version == (long)0x2000204, 1); +DCGM_CASSERT(dcgmStartEmbeddedV2Params_version1 == (long)0x01000048, 1); +DCGM_CASSERT(dcgmStartEmbeddedV2Params_version2 == (long)0x02000050, 2); +DCGM_CASSERT(dcgmInjectFieldValue_version1 == (long)0x1001018, 1); +DCGM_CASSERT(dcgmInjectFieldValue_version == (long)0x1001018, 1); +DCGM_CASSERT(dcgmNvLinkStatus_version3 == (long)0x30015bc, 3); + +#ifndef DCGM_ARRAY_CAPACITY +#ifdef __cplusplus +#define DCGM_ARRAY_CAPACITY(a) std::extent::value +static_assert(NVML_COMPUTE_INSTANCE_PROFILE_COUNT == 0x08); +static_assert(NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 == 0x09); +#endif +#endif + +#ifndef DCGM_ARRAY_CAPACITY +#define DCGM_ARRAY_CAPACITY(a) (sizeof(a) / sizeof(a[0])) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_STRUCTS_H */ diff --git a/pkg/dcgm/dcgm_test_apis.h b/pkg/dcgm/dcgm_test_apis.h new file mode 100644 index 0000000..6b8bf78 --- /dev/null +++ b/pkg/dcgm/dcgm_test_apis.h @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * File: dcgm_test_apis.h + */ + +#ifndef DCGM_AGENT_INTERNAL_H +#define DCGM_AGENT_INTERNAL_H + +#include "dcgm_api_export.h" +#include "dcgm_structs.h" +#include "dcgm_structs_internal.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/***************************************************************************** + *****************************************************************************/ +/***************************************************************************** + * DCGM Test Methods, only used for testing, not officially supported + *****************************************************************************/ +/***************************************************************************** + *****************************************************************************/ + +#define DCGM_EMBEDDED_HANDLE 0x7fffffff + +/** + * This method starts the Host Engine Server + * + * @param portNumber IN: TCP port to listen on. This is only used if isTcp == 1. + * @param socketPath IN: This is the path passed to bind() when creating the socket + * For isConnectionTCP == 1, this is the bind address. "" or NULL = All interfaces + * For isConnectionTCP == 0, this is the path to the domain socket to use + * @param isConnectionTCP IN: Whether to listen on a TCP/IP socket (1) or a unix domain socket (0) + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEngineRun(unsigned short portNumber, + char const *socketPath, + unsigned int isConnectionTCP); + +/** + * This method is used to get values corresponding to the fields. + * @return + * - \ref DCGM_ST_SUCCESS On Success. Even when the API returns success, check for + * individual status inside each field. + * Look at values[index].status. The field values will be + * populated only when status in each field is DCGM_ST_SUCCESS + * - DCGM_ST_? In case of error + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldIds[], + unsigned int count, + dcgmFieldValue_v1 values[]); + +/** + * This method is used to get multiple values for a single field + * + * @return + * - \ref DCGM_ST_SUCCESS on success. + * - DCGM_ST_? error code on failure + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetMultipleValuesForField(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + int *count, + long long startTs, + long long endTs, + dcgmOrder_t order, + dcgmFieldValue_v1 values[]); + +/** + * Request updates for all field values that have updated since a given timestamp + * + * @param groupId IN: Group ID representing a collection of one or more GPUs + * Refer to \ref dcgmEngineGroupCreate for details on creating a group + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will + * be returned in nextSinceTimestamp for subsequent calls + * 0 = request all data + * @param fieldIds IN: Fields to return data for + * @param numFieldIds IN: Number of entries in fieldIds + * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function + * @param enumCB IN: Callback to invoke for every field value update. Note that + * multiple updates can be returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetFieldValuesSince(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long sinceTimestamp, + unsigned short *fieldIds, + int numFieldIds, + long long *nextSinceTimestamp, + dcgmFieldValueEnumeration_f enumCB, + void *userData); + +/** + * This method is used to tell the cache manager to watch a field value + * + * @param gpuId GPU ID to watch field on + * @param fieldId Field ID to watch + * @param updateFreq How often to update this field in usec + * @param maxKeepAge How long to keep data for this field in seconds + * @param maxKeepSamples Maximum number of samples to keep. 0=no limit + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a gpuId, \a fieldId, \a updateFreq, \a maxKeepAge, + * or \a maxKeepSamples are invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmWatchFieldValue(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); + +/** + * This method is used to tell the cache manager to unwatch a field value + * + * @param gpuId GPU ID to watch field on + * @param fieldId Field ID to watch + * @param clearCache Whether or not to clear all cached data for + * the field after the watch is removed + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a gpuId, \a fieldId, or \a clearCache is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmUnwatchFieldValue(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + int clearCache); + +/*************************************************************************/ +/** + * Used to set vGPU configuration for the group of one or more GPUs identified by \a groupId. + * + * The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the + * group. Since DCGM groups are a logical grouping of GPUs, the configuration settings Set for a group + * stay intact for the individual GPUs even after the group is destroyed. + * + * If the user wishes to ignore the configuration of one or more properties in the input + * \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, + * \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the + * property to be ignored. + * + * If any of the properties fail to be configured for any of the GPUs in the group then the API + * returns an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @param groupId IN: Group ID representing collection of one or more GPUs. Look + * at \ref dcgmGroupCreate for details on creating the group. + * applied for all the GPU in the group represented by + * \a groupId. The caller must populate the version field of + * \a pDeviceConfig. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully set. + * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. + * - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigSet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmVgpuConfig_t *pDeviceConfig, + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Used to get vGPU configuration for all the GPUs present in the group. + * + * This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. + * Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration + * properties are maintained by DCGM and are automatically enforced after a GPU reset or + * reinitialization is completed. + * + * The method can also be used to get the actual configuration state for the GPUs in the group. + * Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the + * actual configuration state will be exact same as the target configuration state. + * + * If any of the property in the target configuration is unknown then the property value in the + * output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or + * DCGM_STR_BLANK based on the data type of the property. + * + * If any of the property in the current configuration state is not supported then the property + * value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, + * DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. + * + * If any of the properties can't be fetched for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look + * at \ref dcgmGroupCreate for details on creating the group. + * @param type IN: Type of configuration values to be fetched. + * @param count IN: The number of entries that \a deviceConfigList array can store. + * @param deviceConfigList OUT: Pointer to memory to hold requested configuration + * corresponding to all the GPUs in the group (\a groupId). The + * size of the memory must be greater than or equal to hold + * output information for the number of GPUs present in the + * group (\a groupId). + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully fetched. + * - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, + * or \a deviceConfigList is invalid. + * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. + * - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfigType_t type, + int count, + dcgmVgpuConfig_t deviceConfigList[], + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Used to enforce previously set vGPU configuration for all the GPUs present in the group. + * + * This API provides a mechanism to the users to manually enforce the configuration at any point of + * time. The configuration can only be enforced if it's already configured using the API \ref + * dcgmConfigSet. + * + * If any of the properties can't be enforced for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at + * \ref dcgmGroupCreate for details on creating the group. + * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS + * to perform operation on all the GPUs. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully enforced. + * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. + * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigEnforce(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Gets vGPU device attributes corresponding to the \a gpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * dcgm_structs.h. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id corresponding to which the attributes + * should be fetched + * @param pDcgmVgpuAttr IN/OUT: vGPU Device attributes corresponding to \a gpuId.
+ * .version should be set to \ref dcgmVgpuDeviceAttributes_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if version is not set or is invalid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetVgpuDeviceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmVgpuDeviceAttributes_t *pDcgmVgpuAttr); + +/*************************************************************************/ +/** + * Gets vGPU attributes corresponding to the \a vgpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * dcgm_structs.h. + * + * @param pDcgmHandle IN: DCGM Handle + * @param vgpuId IN: vGPU Id corresponding to which the attributes should be fetched + * @param pDcgmVgpuInstanceAttr IN/OUT: vGPU attributes corresponding to \a vgpuId.
.version should be set to + * \ref dcgmVgpuInstanceAttributes_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetVgpuInstanceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int vgpuId, + dcgmVgpuInstanceAttributes_t *pDcgmVgpuInstanceAttr); + +/** + * Stop a diagnostic if there is one currently running. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStopDiagnostic(dcgmHandle_t pDcgmHandle); + +/** + * This method injects a sample into the cache manager + * + * @param gpuId + * @param dcgmInjectFieldValue + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmInjectFieldValue(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmInjectFieldValue_t *dcgmInjectFieldValue); + +/** + * This method retries the state of a field within the cache manager + * + * @param fieldInfo Structure to populate. fieldInfo->gpuId and fieldInfo->fieldId must + * be populated on calling for this call to work + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetCacheManagerFieldInfo(dcgmHandle_t pDcgmHandle, + dcgmCacheManagerFieldInfo_v4_t *fieldInfo); + +/** + * This method returns the status of the gpu + * + * @param gpuId + * @param DcgmEntityStatus_t + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuStatus(dcgmHandle_t pDcgmHandle, unsigned int gpuId, DcgmEntityStatus_t *status); + +/** + * Create fake entities for injection testing + * + * @param createFakeEntities Details about the number and type of entities to create + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmCreateFakeEntities(dcgmHandle_t pDcgmHandle, + dcgmCreateFakeEntities_t *createFakeEntities); + +/** + * This method injects a sample into the cache manager + * + * @param entityGroupId + * @param entityId + * @param dcgmInjectFieldValue + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEntityInjectFieldValue(dcgmHandle_t pDcgmHandle, + dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmInjectFieldValue_t *dcgmInjectFieldValue); + +/** + * This method sets the link state of an entity's NvLink + * + * dcgmHandle_t dcgmHandle + * linkState contains details about the link state to set + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmSetEntityNvLinkLinkState(dcgmHandle_t dcgmHandle, + dcgmSetNvLinkLinkState_v1 *linkState); + +/** + * Creates a MIG entity with according to the specifications in the struct + * + * @param dcgmHandle IN: DCGM Handle + * @param cme IN: struct stating which kind of entity is being created, who the parent entity is, flags + * for processing it, and the profile to specify what size of that entity to create. + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the hostengine is not running as root + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmCreateMigEntity(dcgmHandle_t dcgmHandle, dcgmCreateMigEntity_t *cme); + +/** + * Delete the specified MIG entity + * + * @param dcgmHandle IN: DCGM Handle + * @param dme IN: struct specifying which entity to delete with flags to suggest how to process it. + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the hostengine is not running as root + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmDeleteMigEntity(dcgmHandle_t dcgmHandle, dcgmDeleteMigEntity_t *dme); + +/** + * @brief Pauses all DCGM modules from updating field values + * + * This method sends a pause message to each loaded module. + * It's up to the module to decide whether to handle or ignore the message. + * + * @param[in] pDcgmHandle DCGM Handle of an active connection + * + * @return + * - \ref DCGM_ST_OK if successful + * - \ref DCGM_ST_* on error + * + * @note If this function fails, the modules may be in an inconsistent state. + * @note You may call \ref dcgmModuleGetStatuses to see which modules are paused. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPauseTelemetryForDiag(dcgmHandle_t pDcgmHandle); + +/** + * @brief Resumes all DCGM modules to updating field values + * + * This method sends a resume message to each loaded module. + * It's up to the module to decide whether to handle or ignore the message. + * + * @param[in] pDcgmHandle DCGM Handle of an active connection + * + * @return + * - \ref DCGM_ST_OK if successful + * - \ref DCGM_ST_* on error + * + * @note If this function fails, the modules may be in an inconsistent state. + * @note You may call \ref dcgmModuleGetStatuses to see which modules are resumed. The satus of the resumed modules + * should be \ref dcgmModuleStatus_t::DcgmModuleStatusLoaded. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmResumeTelemetryForDiag(dcgmHandle_t pDcgmHandle); + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_AGENT_INTERNAL_H */ diff --git a/pkg/dcgm/dcgm_test_structs.h b/pkg/dcgm/dcgm_test_structs.h new file mode 100644 index 0000000..83e3418 --- /dev/null +++ b/pkg/dcgm/dcgm_test_structs.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * File: dcgm_test_structs.h + */ + +#ifndef DCGM_TEST_STRUCTS_H +#define DCGM_TEST_STRUCTS_H + +#include "dcgm_fields.h" +#include "dcgm_structs.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure to represent default and target vgpu configuration for a device + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmConfig_version) + unsigned int gpuId; //!< GPU ID + unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) + unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore) + dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) + dcgmConfigPowerLimit_t powerLimit; //!< Power Limits +} dcgmVgpuConfig_v1; + +/** + * Typedef for \ref dcgmVgpuConfig_v1 + */ +typedef dcgmVgpuConfig_v1 dcgmVgpuConfig_t; + +/** + * Version 1 for \ref dcgmVgpuConfig_v1 + */ +#define dcgmVgpuConfig_version1 MAKE_DCGM_VERSION(dcgmVgpuConfig_v1, 1) + +/** + * Latest version for \ref dcgmVgpuConfig_t + */ +#define dcgmVgpuConfig_version dcgmVgpuConfig_version1 + +/** + * Represents the vGPU attributes corresponding to a physical device + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuDeviceAttributes_version) + unsigned int activeVgpuInstanceCount; //!< Count of active vGPU instances on the device + unsigned int activeVgpuInstanceIds[DCGM_MAX_VGPU_INSTANCES_PER_PGPU]; //!< List of vGPU instances + unsigned int creatableVgpuTypeCount; //!< Creatable vGPU type count + unsigned int creatableVgpuTypeIds[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< List of Creatable vGPU types + unsigned int supportedVgpuTypeCount; //!< Supported vGPU type count + dcgmDeviceVgpuTypeInfo_v1 + supportedVgpuTypeInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Info related to vGPUs supported on the device + dcgmDeviceVgpuUtilInfo_v1 vgpuUtilInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Utilization specific to vGPU instance + unsigned int gpuUtil; //!< GPU utilization + unsigned int memCopyUtil; //!< Memory utilization + unsigned int encUtil; //!< Encoder utilization + unsigned int decUtil; //!< Decoder utilization +} dcgmVgpuDeviceAttributes_v6; + +/** + * Version 6 for \ref dcgmVgpuDeviceAttributes_v6 + */ +#define dcgmVgpuDeviceAttributes_version6 MAKE_DCGM_VERSION(dcgmVgpuDeviceAttributes_v6, 1) + +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuDeviceAttributes_version) + unsigned int activeVgpuInstanceCount; //!< Count of active vGPU instances on the device + unsigned int activeVgpuInstanceIds[DCGM_MAX_VGPU_INSTANCES_PER_PGPU]; //!< List of vGPU instances + unsigned int creatableVgpuTypeCount; //!< Creatable vGPU type count + unsigned int creatableVgpuTypeIds[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< List of Creatable vGPU types + unsigned int supportedVgpuTypeCount; //!< Supported vGPU type count + dcgmDeviceVgpuTypeInfo_v2 + supportedVgpuTypeInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Info related to vGPUs supported on the device + dcgmDeviceVgpuUtilInfo_v1 vgpuUtilInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Utilization specific to vGPU instance + unsigned int gpuUtil; //!< GPU utilization + unsigned int memCopyUtil; //!< Memory utilization + unsigned int encUtil; //!< Encoder utilization + unsigned int decUtil; //!< Decoder utilization +} dcgmVgpuDeviceAttributes_v7; + +/** + * * Typedef for \ref dcgmVgpuDeviceAttributes_v7 + * */ +typedef dcgmVgpuDeviceAttributes_v7 dcgmVgpuDeviceAttributes_t; + +/** + * Version 7 for \ref dcgmVgpuDeviceAttributes_v7 + */ +#define dcgmVgpuDeviceAttributes_version7 MAKE_DCGM_VERSION(dcgmVgpuDeviceAttributes_v7, 7) + +/** + * Latest version for \ref dcgmVgpuDeviceAttributes_t + */ +#define dcgmVgpuDeviceAttributes_version dcgmVgpuDeviceAttributes_version7 + +/** + * Represents attributes specific to vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuInstanceAttributes_version) + char vmId[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< VM ID of the vGPU instance + char vmName[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< VM name of the vGPU instance + unsigned int vgpuTypeId; //!< Type ID of the vGPU instance + char vgpuUuid[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< UUID of the vGPU instance + char vgpuDriverVersion[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< Driver version of the vGPU instance + unsigned int fbUsage; //!< Fb usage of the vGPU instance + unsigned int licenseStatus; //!< License status of the vGPU instance + unsigned int frameRateLimit; //!< Frame rate limit of the vGPU instance +} dcgmVgpuInstanceAttributes_v1; + +/** + * Typedef for \ref dcgmVgpuInstanceAttributes_v1 + */ +typedef dcgmVgpuInstanceAttributes_v1 dcgmVgpuInstanceAttributes_t; + +/** + * Version 1 for \ref dcgmVgpuInstanceAttributes_v1 + */ +#define dcgmVgpuInstanceAttributes_version1 MAKE_DCGM_VERSION(dcgmVgpuInstanceAttributes_v1, 1) + +/** + * Latest version for \ref dcgmVgpuInstanceAttributes_t + */ +#define dcgmVgpuInstanceAttributes_version dcgmVgpuInstanceAttributes_version1 + +/* Flags to ask nv-hostengine to process MIG events differently. */ +/* + * This flag is only meant to be used when running many commands that will trigger the + * MIG configuration to get loaded again. The intended use is that if you are running + * many commands that will cause the MIG configuration to change, then ask the hostengine + * to only process the last one in order to prevent conflicts in how you are updating the + * MIG information. For example, if you delete one compute instance on a GPU and + * the hostengine processes the event from NVML before you delete the next one, the ID + * of the compute instance will have changed in between. Using the flag asks the + * hostengine to ignore those events temporarily while you are performing updates */ +#define DCGM_MIG_RECONFIG_DELAY_PROCESSING 0x1 + +typedef struct +{ + unsigned int version; //!< Version number of this struct + dcgm_field_entity_group_t entityGroupId; //!< entity group of the MIG entity being deleted + dcgm_field_eid_t entityId; //!< entity id of the MIG entity being deleted + unsigned int flags; //!< flags to influence nv-hostengine's processing of the request +} dcgmDeleteMigEntity_v1; + +#define dcgmDeleteMigEntity_version1 MAKE_DCGM_VERSION(dcgmDeleteMigEntity_v1, 1) + +#define dcgmDeleteMigEntity_version dcgmDeleteMigEntity_version1 + +typedef dcgmDeleteMigEntity_v1 dcgmDeleteMigEntity_t; + +/** + * Enum for the kinds of MIG creations + */ +typedef enum +{ + DcgmMigCreateGpuInstance = 0, /*!< Create a GPU instance */ + DcgmMigCreateComputeInstance = 1, /*!< Create a compute instance */ +} dcgmMigCreate_t; + +typedef struct +{ + unsigned int version; //!< Version number of this request + dcgm_field_eid_t parentId; //!< The entity id of the parent (entity group is inferred from createOption + dcgmMigProfile_t profile; //!< Specify the MIG profile to create + dcgmMigCreate_t createOption; //!< Specify if we're creating a GPU instance or compute instance + unsigned int flags; //!< flags to influence nv-hostengine's processing of the request +} dcgmCreateMigEntity_v1; + +#define dcgmCreateMigEntity_version1 MAKE_DCGM_VERSION(dcgmCreateMigEntity_v1, 1) + +#define dcgmCreateMigEntity_version dcgmCreateMigEntity_version1 + +typedef dcgmCreateMigEntity_v1 dcgmCreateMigEntity_t; + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_STRUCTS_H */ diff --git a/pkg/dcgm/internal.go b/pkg/dcgm/internal.go new file mode 100644 index 0000000..01b3ac4 --- /dev/null +++ b/pkg/dcgm/internal.go @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -ldl -Wl,--export-dynamic -Wl,-undefined,dynamic_lookup + +#include "dcgm_test_apis.h" +#include "dcgm_test_structs.h" +#include "dcgm_structs_internal.h" +*/ +import "C" +import ( + "unsafe" +) + +type MigHierarchyInfo struct { + Entity GroupEntityPair + Parent GroupEntityPair + SliceProfile MigProfile +} + +func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error) { + ccfe := C.dcgmCreateFakeEntities_v2{ + version: C.dcgmCreateFakeEntities_version2, + numToCreate: C.uint(len(entities)), + entityList: [C.DCGM_MAX_HIERARCHY_INFO]C.dcgmMigHierarchyInfo_t{}, + } + for i := range entities { + if i >= C.DCGM_MAX_HIERARCHY_INFO { + break + } + entity := entities[i] + ccfe.entityList[i] = C.dcgmMigHierarchyInfo_t{ + entity: C.dcgmGroupEntityPair_t{ + entityGroupId: C.dcgm_field_entity_group_t(entity.Entity.EntityGroupId), + entityId: C.uint(entity.Entity.EntityId), + }, + parent: C.dcgmGroupEntityPair_t{ + entityGroupId: C.dcgm_field_entity_group_t(entity.Parent.EntityGroupId), + entityId: C.uint(entity.Parent.EntityId), + }, + sliceProfile: C.dcgmMigProfile_t(entity.SliceProfile), + } + } + result := C.dcgmCreateFakeEntities(handle.handle, &ccfe) + + if err := errorString(result); err != nil { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + gpuIDs := make([]uint, ccfe.numToCreate) + for i := 0; i < int(ccfe.numToCreate); i++ { + gpuIDs[i] = uint(ccfe.entityList[i].entity.entityId) + } + + return gpuIDs, nil +} + +func InjectFieldValue(gpu uint, fieldID int, fieldType uint, status int, ts int64, value interface{}) error { + field := C.dcgmInjectFieldValue_t{ + version: C.dcgmInjectFieldValue_version1, + fieldId: C.ushort(fieldID), + fieldType: C.ushort(fieldType), + status: C.int(status), + ts: C.long(ts), + } + + switch fieldType { + case DCGM_FT_INT64: + i64Val := value.(int64) + ptr := (*C.int64_t)(unsafe.Pointer(&field.value[0])) + *ptr = C.int64_t(i64Val) + case DCGM_FT_DOUBLE: + dbVal := value.(float64) + ptr := (*C.double)(unsafe.Pointer(&field.value[0])) + *ptr = C.double(dbVal) + } + + result := C.dcgmInjectFieldValue(handle.handle, C.uint(gpu), &field) + + if err := errorString(result); err != nil { + return &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + return nil +} diff --git a/pkg/dcgm/nvml.h b/pkg/dcgm/nvml.h new file mode 100644 index 0000000..6f4e8a8 --- /dev/null +++ b/pkg/dcgm/nvml.h @@ -0,0 +1,9459 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +/* +NVML API Reference + +The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and +managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building +3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi +tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. + +API Documentation + +Supported platforms: +- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit +- Linux: 32-bit and 64-bit +- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 + +Supported products: +- Full Support + - All Tesla products, starting with the Fermi architecture + - All Quadro products, starting with the Fermi architecture + - All vGPU Software products, starting with the Kepler architecture + - Selected GeForce Titan products +- Limited Support + - All Geforce products, starting with the Fermi architecture + +The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is +not be added to the system path by default. To dynamically link to NVML, add this path to the PATH +environmental variable. To dynamically load NVML, call LoadLibrary with this path. + +On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit +and 64 bit NVML libraries will be installed. + +Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html +*/ + +#ifndef __nvml_nvml_h__ +#define __nvml_nvml_h__ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On Windows, set up methods for DLL export + * define NVML_STATIC_IMPORT when using nvml_loader library + */ +#if defined _WINDOWS + #if !defined NVML_STATIC_IMPORT + #if defined NVML_LIB_EXPORT + #define DECLDIR __declspec(dllexport) + #else + #define DECLDIR __declspec(dllimport) + #endif + #else + #define DECLDIR + #endif +#else + #define DECLDIR +#endif + +/** + * NVML API versioning support + */ +#define NVML_API_VERSION 11 +#define NVML_API_VERSION_STR "11" +/** + * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. + * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this + * guard if you need to support older versions of the API + */ +#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS + #define nvmlInit nvmlInit_v2 + #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 + #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 + #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 + #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 + #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 + #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 + #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v4 + #define nvmlEventSetWait nvmlEventSetWait_v2 + #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 + #define nvmlComputeInstanceGetInfo nvmlComputeInstanceGetInfo_v2 + #define nvmlDeviceGetComputeRunningProcesses nvmlDeviceGetComputeRunningProcesses_v3 + #define nvmlDeviceGetGraphicsRunningProcesses nvmlDeviceGetGraphicsRunningProcesses_v3 + #define nvmlDeviceGetMPSComputeRunningProcesses nvmlDeviceGetMPSComputeRunningProcesses_v3 + #define nvmlBlacklistDeviceInfo_t nvmlExcludedDeviceInfo_t + #define nvmlGetBlacklistDeviceCount nvmlGetExcludedDeviceCount + #define nvmlGetBlacklistDeviceInfoByIndex nvmlGetExcludedDeviceInfoByIndex + #define nvmlDeviceGetGpuInstancePossiblePlacements nvmlDeviceGetGpuInstancePossiblePlacements_v2 + #define nvmlVgpuInstanceGetLicenseInfo nvmlVgpuInstanceGetLicenseInfo_v2 +#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS + +#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \ + (ver << 24U)) + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceStructs Device Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Special constant that some fields take when they are not available. + * Used when only part of the struct is not available. + * + * Each structure explicitly states when to check for this value. + */ +#define NVML_VALUE_NOT_AVAILABLE (-1) + +typedef struct nvmlDevice_st* nvmlDevice_t; + +/** + * Buffer size guaranteed to be large enough for pci bus id + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 + +/** + * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 + +/** + * PCI information about a GPU device. + */ +typedef struct nvmlPciInfo_st +{ + char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) +} nvmlPciInfo_t; + +/** + * PCI format string for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" + +/** + * PCI format string for ::busId + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" + +/** + * Utility macro for filling the pci bus id format from a nvmlPciInfo_t + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ + (pciInfo)->bus, \ + (pciInfo)->device + +/** + * Detailed ECC error counts for a device. + * + * @deprecated Different GPU families can have different memory error counters + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef struct nvmlEccErrorCounts_st +{ + unsigned long long l1Cache; //!< L1 cache errors + unsigned long long l2Cache; //!< L2 cache errors + unsigned long long deviceMemory; //!< Device memory errors + unsigned long long registerFile; //!< Register file errors +} nvmlEccErrorCounts_t; + +/** + * Utilization information for a device. + * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. + */ +typedef struct nvmlUtilization_st +{ + unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU + unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written +} nvmlUtilization_t; + +/** + * Memory allocation information for a device (v1). + * The total amount is equal to the sum of the amounts of free and used memory. + */ +typedef struct nvmlMemory_st +{ + unsigned long long total; //!< Total physical device memory (in bytes) + unsigned long long free; //!< Unallocated device memory (in bytes) + unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes). + //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_t; + +/** + * Memory allocation information for a device (v2). + * + * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output. + * @note The \ref nvmlMemory_v2_t.used amount also includes the \ref nvmlMemory_v2_t.reserved amount. + */ +typedef struct nvmlMemory_v2_st +{ + unsigned int version; //!< Structure format version (must be 2) + unsigned long long total; //!< Total physical device memory (in bytes) + unsigned long long reserved; //!< Device memory (in bytes) reserved for system use (driver or firmware) + unsigned long long free; //!< Unallocated device memory (in bytes) + unsigned long long used; //!< Allocated device memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_v2_t; + +#define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2) + +/** + * BAR1 Memory allocation Information for a device + */ +typedef struct nvmlBAR1Memory_st +{ + unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) + unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) + unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) +}nvmlBAR1Memory_t; + +/** + * Information about running compute processes on the GPU, legacy version + * for older versions of the API. + */ +typedef struct nvmlProcessInfo_v1_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver +} nvmlProcessInfo_v1_t; + +/** + * Information about running compute processes on the GPU + */ +typedef struct nvmlProcessInfo_v2_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver + unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. +} nvmlProcessInfo_v2_t; + +/** + * Information about running compute processes on the GPU + * Version 2 adds versioning for the struct and the conf compute protected memory in output. + */ +typedef struct nvmlProcessInfo_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver + unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned long long usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes. +} nvmlProcessInfo_t; + +typedef struct nvmlDeviceAttributes_st +{ + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + unsigned int gpuInstanceSliceCount; //!< GPU instance slice count + unsigned int computeInstanceSliceCount; //!< Compute instance slice count + unsigned long long memorySizeMB; //!< Device memory size (in MiB) +} nvmlDeviceAttributes_t; + +/** + * Possible values that classify the remap availability for each bank. The max + * field will contain the number of banks that have maximum remap availability + * (all reserved rows are available). None means that there are no reserved + * rows available. + */ +typedef struct nvmlRowRemapperHistogramValues_st +{ + unsigned int max; + unsigned int high; + unsigned int partial; + unsigned int low; + unsigned int none; +} nvmlRowRemapperHistogramValues_t; + +/** + * Enum to represent type of bridge chip + */ +typedef enum nvmlBridgeChipType_enum +{ + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 +}nvmlBridgeChipType_t; + +/** + * Maximum number of NvLink links supported + */ +#define NVML_NVLINK_MAX_LINKS 18 + +/** + * Enum to represent the NvLink utilization counter packet units + */ +typedef enum nvmlNvLinkUtilizationCountUnits_enum +{ + NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles + NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets + NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes + NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use + // this must be last + NVML_NVLINK_COUNTER_UNIT_COUNT +} nvmlNvLinkUtilizationCountUnits_t; + +/** + * Enum to represent the NvLink utilization counter packet types to count + * ** this is ONLY applicable with the units as packets or bytes + * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t + * ** all packet filter descriptions are target GPU centric + * ** these can be "OR'd" together + */ +typedef enum nvmlNvLinkUtilizationCountPktTypes_enum +{ + NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets + NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets + NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets + NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests + NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data + NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data + NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets +} nvmlNvLinkUtilizationCountPktTypes_t; + +/** + * Struct to define the NVLINK counter controls + */ +typedef struct nvmlNvLinkUtilizationControl_st +{ + nvmlNvLinkUtilizationCountUnits_t units; + nvmlNvLinkUtilizationCountPktTypes_t pktfilter; +} nvmlNvLinkUtilizationControl_t; + +/** + * Enum to represent NvLink queryable capabilities + */ +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +/** + * Enum to represent NvLink queryable error counters + */ +typedef enum nvmlNvLinkErrorCounter_enum +{ + NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter + NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter + NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter + NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter + NVML_NVLINK_ERROR_DL_ECC_DATA = 4, // Data link receive data ECC error counter + + // this must be last + NVML_NVLINK_ERROR_COUNT +} nvmlNvLinkErrorCounter_t; + +/** + * Enum to represent NvLink's remote device type + */ +typedef enum nvmlIntNvLinkDeviceType_enum +{ + NVML_NVLINK_DEVICE_TYPE_GPU = 0x00, + NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01, + NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02, + NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF +} nvmlIntNvLinkDeviceType_t; + +/** + * Represents level relationships within a system between two GPUs + * The enums are spaced to allow for future relationships + */ +typedef enum nvmlGpuLevel_enum +{ + NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80 + NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch + NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge + NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge + NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges + NVML_TOPOLOGY_SYSTEM = 50 // all devices in the system + + // there is purposefully no COUNT here because of the need for spacing above +} nvmlGpuTopologyLevel_t; + +/* Compatibility for CPU->NODE renaming */ +#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN + +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +}nvmlGpuP2PCapsIndex_t; + +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + +/** + * Information about the Bridge Chip Firmware + */ +typedef struct nvmlBridgeChipInfo_st +{ + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable +}nvmlBridgeChipInfo_t; + +/** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. + */ +typedef struct nvmlBridgeChipHierarchy_st +{ + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board +}nvmlBridgeChipHierarchy_t; + +/** + * Represents Type of Sampling Event + */ +typedef enum nvmlSamplingType_enum +{ + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT +}nvmlSamplingType_t; + +/** + * Represents the queryable PCIe utilization counters + */ +typedef enum nvmlPcieUtilCounter_enum +{ + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT +} nvmlPcieUtilCounter_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Information for Sample + */ +typedef struct nvmlSample_st +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value +}nvmlSample_t; + +/** + * Represents type of perf policy for which violation times can be queried + */ +typedef enum nvmlPerfPolicyType_enum +{ + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT +}nvmlPerfPolicyType_t; + +/** + * Struct to hold perf policy violation status data + */ +typedef struct nvmlViolationTime_st +{ + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds +}nvmlViolationTime_t; + +#define NVML_MAX_THERMAL_SENSORS_PER_GPU 3 + +typedef enum +{ + NVML_THERMAL_TARGET_NONE = 0, + NVML_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle + + NVML_THERMAL_TARGET_ALL = 15, + NVML_THERMAL_TARGET_UNKNOWN = -1, +} nvmlThermalTarget_t; + +typedef enum +{ + NVML_THERMAL_CONTROLLER_NONE = 0, + NVML_THERMAL_CONTROLLER_GPU_INTERNAL, + NVML_THERMAL_CONTROLLER_ADM1032, + NVML_THERMAL_CONTROLLER_ADT7461, + NVML_THERMAL_CONTROLLER_MAX6649, + NVML_THERMAL_CONTROLLER_MAX1617, + NVML_THERMAL_CONTROLLER_LM99, + NVML_THERMAL_CONTROLLER_LM89, + NVML_THERMAL_CONTROLLER_LM64, + NVML_THERMAL_CONTROLLER_G781, + NVML_THERMAL_CONTROLLER_ADT7473, + NVML_THERMAL_CONTROLLER_SBMAX6649, + NVML_THERMAL_CONTROLLER_VBIOSEVT, + NVML_THERMAL_CONTROLLER_OS, + NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS, + NVML_THERMAL_CONTROLLER_NVSYSCON_E551, + NVML_THERMAL_CONTROLLER_MAX6649R, + NVML_THERMAL_CONTROLLER_ADT7473S, + NVML_THERMAL_CONTROLLER_UNKNOWN = -1, +} nvmlThermalController_t; + +typedef struct +{ + unsigned int count; + struct + { + nvmlThermalController_t controller; + unsigned int defaultMinTemp; + unsigned int defaultMaxTemp; + unsigned int currentTemp; + nvmlThermalTarget_t target; + } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; + +} nvmlGpuThermalSettings_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceEnumvs Device Enums + * @{ + */ +/***************************************************************************************************/ + +/** + * Generic enable/disable enum. + */ +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. +#define nvmlFlagDefault 0x00 +//! Generic flag used to force some behavior. See description of particular functions for details. +#define nvmlFlagForce 0x01 + +/** + * * The Brand of the GPU + * */ +typedef enum nvmlBrandType_enum +{ + NVML_BRAND_UNKNOWN = 0, + NVML_BRAND_QUADRO = 1, + NVML_BRAND_TESLA = 2, + NVML_BRAND_NVS = 3, + NVML_BRAND_GRID = 4, // Deprecated from API reporting. Keeping definition for backward compatibility. + NVML_BRAND_GEFORCE = 5, + NVML_BRAND_TITAN = 6, + NVML_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications + NVML_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC + NVML_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server + NVML_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation + NVML_BRAND_NVIDIA_CLOUD_GAMING = 11, // NVIDIA Cloud Gaming + NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING, // Deprecated from API reporting. Keeping definition for backward compatibility. + NVML_BRAND_QUADRO_RTX = 12, + NVML_BRAND_NVIDIA_RTX = 13, + NVML_BRAND_NVIDIA = 14, + NVML_BRAND_GEFORCE_RTX = 15, // Unused + NVML_BRAND_TITAN_RTX = 16, // Unused + + // Keep this last + NVML_BRAND_COUNT +} nvmlBrandType_t; + +/** + * Temperature thresholds. + */ +typedef enum nvmlTemperatureThresholds_enum +{ + NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will + // shut down for HW protection + NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will + // begin HW slowdown + NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will + // begin SW slowdown + NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU + // can be throttled below base clock + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4, // Minimum GPU Temperature that can be + // set as acoustic threshold + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, // Current temperature that is set as + // acoustic threshold. + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6, // Maximum GPU temperature that can be + // set as acoustic threshold. + // Keep this last + NVML_TEMPERATURE_THRESHOLD_COUNT +} nvmlTemperatureThresholds_t; + +/** + * Temperature sensors. + */ +typedef enum nvmlTemperatureSensors_enum +{ + NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die + + // Keep this last + NVML_TEMPERATURE_COUNT +} nvmlTemperatureSensors_t; + +/** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. + */ +typedef enum nvmlComputeMode_enum +{ + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT +} nvmlComputeMode_t; + +/** + * Max Clock Monitors available + */ +#define MAX_CLK_DOMAINS 32 + +/** + * Clock Monitor error types + */ +typedef struct nvmlClkMonFaultInfo_struct { + /** + * The Domain which faulted + */ + unsigned int clkApiDomain; + + /** + * Faults Information + */ + unsigned int clkDomainFaultMask; +} nvmlClkMonFaultInfo_t; + +/** + * Clock Monitor Status + */ +typedef struct nvmlClkMonStatus_status { + /** + * Fault status Indicator + */ + unsigned int bGlobalStatus; + + /** + * Total faulted domain numbers + */ + unsigned int clkMonListSize; + + /** + * The fault Information structure + */ + nvmlClkMonFaultInfo_t clkMonList[MAX_CLK_DOMAINS]; +} nvmlClkMonStatus_t; + +/** + * ECC bit types. + * + * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type + */ +#define nvmlEccBitType_t nvmlMemoryErrorType_t + +/** + * Single bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED + */ +#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED + +/** + * Double bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED + */ +#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED + +/** + * Memory error types + */ +typedef enum nvmlMemoryErrorType_enum +{ + /** + * A memory error that was corrected + * + * For ECC errors, these are single bit errors + * For Texture memory, these are errors fixed by resend + */ + NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, + /** + * A memory error that was not corrected + * + * For ECC errors, these are double bit errors + * For Texture memory, these are errors where the resend fails + */ + NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, + + + // Keep this last + NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types + +} nvmlMemoryErrorType_t; + +/** + * ECC counter types. + * + * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. + * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver + * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app + * is run. + */ +typedef enum nvmlEccCounterType_enum +{ + NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. + NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) + + // Keep this last + NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types +} nvmlEccCounterType_t; + +/** + * Clock types. + * + * All speeds are in Mhz. + */ +typedef enum nvmlClockType_enum +{ + NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain + NVML_CLOCK_SM = 1, //!< SM clock domain + NVML_CLOCK_MEM = 2, //!< Memory clock domain + NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain + + // Keep this last + NVML_CLOCK_COUNT //!< Count of clock types +} nvmlClockType_t; + +/** + * Clock Ids. These are used in combination with nvmlClockType_t + * to specify a single clock value. + */ +typedef enum nvmlClockId_enum +{ + NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value + NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock + NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target + NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate + + //Keep this last + NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. +} nvmlClockId_t; + +/** + * Driver models. + * + * Windows only. + */ + +typedef enum nvmlDriverModel_enum +{ + NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device + NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device +} nvmlDriverModel_t; + +#define NVML_MAX_GPU_PERF_PSTATES 16 + +/** + * Allowed PStates. + */ +typedef enum nvmlPStates_enum +{ + NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance + NVML_PSTATE_1 = 1, //!< Performance state 1 + NVML_PSTATE_2 = 2, //!< Performance state 2 + NVML_PSTATE_3 = 3, //!< Performance state 3 + NVML_PSTATE_4 = 4, //!< Performance state 4 + NVML_PSTATE_5 = 5, //!< Performance state 5 + NVML_PSTATE_6 = 6, //!< Performance state 6 + NVML_PSTATE_7 = 7, //!< Performance state 7 + NVML_PSTATE_8 = 8, //!< Performance state 8 + NVML_PSTATE_9 = 9, //!< Performance state 9 + NVML_PSTATE_10 = 10, //!< Performance state 10 + NVML_PSTATE_11 = 11, //!< Performance state 11 + NVML_PSTATE_12 = 12, //!< Performance state 12 + NVML_PSTATE_13 = 13, //!< Performance state 13 + NVML_PSTATE_14 = 14, //!< Performance state 14 + NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance + NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state +} nvmlPstates_t; + +/** + * GPU Operation Mode + * + * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. + * + * Each GOM is designed to meet specific user needs. + */ +typedef enum nvmlGom_enum +{ + NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed + + NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations + //!< are not allowed + + NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require + //!< high bandwidth double precision +} nvmlGpuOperationMode_t; + +/** + * Available infoROM objects. + */ +typedef enum nvmlInforomObject_enum +{ + NVML_INFOROM_OEM = 0, //!< An object defined by OEM + NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support + NVML_INFOROM_POWER = 2, //!< The power management object + + // Keep this last + NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about +} nvmlInforomObject_t; + +/** + * Return values for NVML API calls. + */ +typedef enum nvmlReturn_enum +{ + // cppcheck-suppress * + NVML_SUCCESS = 0, //!< The operation was successful + NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() + NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid + NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device + NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation + NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting + NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful + NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough + NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached + NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded + NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed + NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU + NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded + NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function + NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted + NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible + NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again + NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch + NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use + NVML_ERROR_MEMORY = 20, //!< Insufficient memory + NVML_ERROR_NO_DATA = 21, //!< No data + NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled + NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory + NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory + NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported + NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred +} nvmlReturn_t; + +/** + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef enum nvmlMemoryLocation_enum +{ + NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache + NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache + NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM + NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory + NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File + NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory + NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory + NVML_MEMORY_LOCATION_CBU = 6, //!< CBU + NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM + // Keep this last + NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about +} nvmlMemoryLocation_t; + +/** + * Causes for page retirement + */ +typedef enum nvmlPageRetirementCause_enum +{ + NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error + NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error + + // Keep this last + NVML_PAGE_RETIREMENT_CAUSE_COUNT +} nvmlPageRetirementCause_t; + +/** + * API types that allow changes to default permission restrictions + */ +typedef enum nvmlRestrictedAPI_enum +{ + NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks + //!< and see nvmlDeviceResetApplicationsClocks + NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks + //!< see nvmlDeviceSetAutoBoostedClocksEnabled + // Keep this last + NVML_RESTRICTED_API_COUNT +} nvmlRestrictedAPI_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup virtualGPU + * @{ + */ +/***************************************************************************************************/ +/** @defgroup nvmlVirtualGpuEnums vGPU Enums + * @{ + */ +/***************************************************************************************************/ + +/*! + * GPU virtualization mode types. + */ +typedef enum nvmlGpuVirtualizationMode { + NVML_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU + NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthorugh + NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. + NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode + NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 //!< Device is associated with VGX hypervisor in vSGA mode +} nvmlGpuVirtualizationMode_t; + +/** + * Host vGPU modes + */ +typedef enum nvmlHostVgpuMode_enum +{ + NVML_HOST_VGPU_MODE_NON_SRIOV = 0, //!< Non SR-IOV mode + NVML_HOST_VGPU_MODE_SRIOV = 1 //!< SR-IOV mode +} nvmlHostVgpuMode_t; + +/*! + * Types of VM identifiers + */ +typedef enum nvmlVgpuVmIdType { + NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID + NVML_VGPU_VM_ID_UUID = 1 //!< VM ID represents UUID +} nvmlVgpuVmIdType_t; + +/** + * vGPU GUEST info state + */ +typedef enum nvmlVgpuGuestInfoState_enum +{ + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 //!< Guest-dependent fields initialized +} nvmlVgpuGuestInfoState_t; + +/** + * vGPU software licensable features + */ +typedef enum { + NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0, //!< Unknown + NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU + NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2, //!< Nvidia RTX + NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, //!< Deprecated, do not use. + NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3, //!< Gaming + NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 //!< Compute +} nvmlGridLicenseFeatureCode_t; + +/** + * Status codes for license expiry + */ +#define NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE 0 //!< Expiry information not available +#define NVML_GRID_LICENSE_EXPIRY_INVALID 1 //!< Invalid expiry or error fetching expiry +#define NVML_GRID_LICENSE_EXPIRY_VALID 2 //!< Valid expiry +#define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE 3 //!< Expiry not applicable +#define NVML_GRID_LICENSE_EXPIRY_PERMANENT 4 //!< Permanent expiry + +/** + * vGPU queryable capabilities + */ +typedef enum nvmlVgpuCapability_enum +{ + NVML_VGPU_CAP_NVLINK_P2P = 0, //!< P2P over NVLink is supported + NVML_VGPU_CAP_GPUDIRECT = 1, //!< GPUDirect capability is supported + // Keep this last + NVML_VGPU_CAP_COUNT +} nvmlVgpuCapability_t; + +/** @} */ + +/***************************************************************************************************/ + +/** @defgroup nvmlVgpuConstants vGPU Constants + * @{ + */ +/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense + */ +#define NVML_GRID_LICENSE_BUFFER_SIZE 128 + +#define NVML_VGPU_NAME_BUFFER_SIZE 64 + +#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 + +#define INVALID_GPU_INSTANCE_PROFILE_ID 0xFFFFFFFF + +#define INVALID_GPU_INSTANCE_ID 0xFFFFFFFF + +/*! + * Macros for vGPU instance's virtualization capabilities bitfield. + */ +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 + +/*! + * Macros for pGPU's virtualization capabilities bitfield. + */ +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuStructs vGPU Structs + * @{ + */ +/***************************************************************************************************/ + +typedef unsigned int nvmlVgpuTypeId_t; + +typedef unsigned int nvmlVgpuInstance_t; + +/** + * Structure to store Utilization Value and vgpuInstance + */ +typedef struct nvmlVgpuInstanceUtilizationSample_st +{ + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value + nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value + nvmlValue_t encUtil; //!< Encoder Util Value + nvmlValue_t decUtil; //!< Decoder Util Value +} nvmlVgpuInstanceUtilizationSample_t; + +/** + * Structure to store Utilization Value, vgpuInstance and subprocess information + */ +typedef struct nvmlVgpuProcessUtilizationSample_st +{ + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + unsigned int pid; //!< PID of process running within the vGPU VM + char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; //!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value +} nvmlVgpuProcessUtilizationSample_t; + +/** + * Structure to store the vGPU license expiry details + */ +typedef struct nvmlVgpuLicenseExpiry_st +{ + unsigned int year; //!< Year of license expiry + unsigned short month; //!< Month of license expiry + unsigned short day; //!< Day of license expiry + unsigned short hour; //!< Hour of license expiry + unsigned short min; //!< Minutes of license expiry + unsigned short sec; //!< Seconds of license expiry + unsigned char status; //!< License expiry status +} nvmlVgpuLicenseExpiry_t; + +/** + * vGPU license state + */ +#define NVML_GRID_LICENSE_STATE_UNKNOWN 0 //!< Unknown state +#define NVML_GRID_LICENSE_STATE_UNINITIALIZED 1 //!< Uninitialized state +#define NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED 2 //!< Unlicensed unrestricted state +#define NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED 3 //!< Unlicensed restricted state +#define NVML_GRID_LICENSE_STATE_UNLICENSED 4 //!< Unlicensed state +#define NVML_GRID_LICENSE_STATE_LICENSED 5 //!< Licensed state + +typedef struct nvmlVgpuLicenseInfo_st +{ + unsigned char isLicensed; //!< License status + nvmlVgpuLicenseExpiry_t licenseExpiry; //!< License expiry information + unsigned int currentState; //!< Current license state +} nvmlVgpuLicenseInfo_t; + +/** + * Structure to store utilization value and process Id + */ +typedef struct nvmlProcessUtilizationSample_st +{ + unsigned int pid; //!< PID of process + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; //!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value +} nvmlProcessUtilizationSample_t; + +/** + * Structure to store license expiry date and time values + */ +typedef struct nvmlGridLicenseExpiry_st +{ + unsigned int year; //!< Year value of license expiry + unsigned short month; //!< Month value of license expiry + unsigned short day; //!< Day value of license expiry + unsigned short hour; //!< Hour value of license expiry + unsigned short min; //!< Minutes value of license expiry + unsigned short sec; //!< Seconds value of license expiry + unsigned char status; //!< License expiry status +} nvmlGridLicenseExpiry_t; + +/** + * Structure containing vGPU software licensable feature information + */ +typedef struct nvmlGridLicensableFeature_st +{ + nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code + unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero + char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Deprecated. + char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Product name of feature + unsigned int featureEnabled; //!< Non-zero if feature is enabled, otherwise zero + nvmlGridLicenseExpiry_t licenseExpiry; //!< License expiry structure containing date and time +} nvmlGridLicensableFeature_t; + +/** + * Structure to store vGPU software licensable features + */ +typedef struct nvmlGridLicensableFeatures_st +{ + int isGridLicenseSupported; //!< Non-zero if vGPU Software Licensing is supported on the system, otherwise zero + unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array + nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. +} nvmlGridLicensableFeatures_t; + +/** + * GSP firmware + */ +#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40 + +/** + * Simplified chip architecture + */ +#define NVML_DEVICE_ARCH_KEPLER 2 // Devices based on the NVIDIA Kepler architecture +#define NVML_DEVICE_ARCH_MAXWELL 3 // Devices based on the NVIDIA Maxwell architecture +#define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture +#define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture +#define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture + +#define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture + +#define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture + +#define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture + +#define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer + +typedef unsigned int nvmlDeviceArchitecture_t; + +/** + * PCI bus types + */ +#define NVML_BUS_TYPE_UNKNOWN 0 +#define NVML_BUS_TYPE_PCI 1 +#define NVML_BUS_TYPE_PCIE 2 +#define NVML_BUS_TYPE_FPCI 3 +#define NVML_BUS_TYPE_AGP 4 + +typedef unsigned int nvmlBusType_t; + +/** + * Device Power Modes + */ +#define NVML_POWER_MODE_ID_BALANCED 0 +#define NVML_POWER_MODE_ID_MAX 1 + +/** + * Device Power Source + */ +#define NVML_POWER_SOURCE_AC 0x00000000 +#define NVML_POWER_SOURCE_BATTERY 0x00000001 + +typedef unsigned int nvmlPowerSource_t; + +/* + * Device PCIE link Max Speed + */ +#define NVML_PCIE_LINK_MAX_SPEED_INVALID 0x00000000 +#define NVML_PCIE_LINK_MAX_SPEED_2500MBPS 0x00000001 +#define NVML_PCIE_LINK_MAX_SPEED_5000MBPS 0x00000002 +#define NVML_PCIE_LINK_MAX_SPEED_8000MBPS 0x00000003 +#define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004 +#define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005 + +/* + * Adaptive clocking status + */ +#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000 +#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED 0x00000001 + +#define NVML_MAX_GPU_UTILIZATIONS 8 +typedef enum nvmlGpuUtilizationDomainId_t +{ + NVML_GPU_UTILIZATION_DOMAIN_GPU = 0, //!< Graphics engine domain + NVML_GPU_UTILIZATION_DOMAIN_FB = 1, //!< Frame buffer domain + NVML_GPU_UTILIZATION_DOMAIN_VID = 2, //!< Video engine domain + NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain +} nvmlGpuUtilizationDomainId_t; + +typedef struct nvmlGpuDynamicPstatesInfo_st +{ + unsigned int flags; //!< Reserved for future use + struct + { + unsigned int bIsPresent; //!< Set if this utilization domain is present on this GPU + unsigned int percentage; //!< Percentage of time where the domain is considered busy in the last 1-second interval + unsigned int incThreshold; //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed + unsigned int decThreshold; //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed + } utilization[NVML_MAX_GPU_UTILIZATIONS]; +} nvmlGpuDynamicPstatesInfo_t; + +/** @} */ +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueEnums Field Value Enums + * @{ + */ +/***************************************************************************************************/ + +/** + * Field Identifiers. + * + * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. + */ +#define NVML_FI_DEV_ECC_CURRENT 1 //!< Current ECC mode. 1=Active. 0=Inactive +#define NVML_FI_DEV_ECC_PENDING 2 //!< Pending ECC mode. 1=Active. 0=Inactive +/* ECC Count Totals */ +#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL 3 //!< Total single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL 4 //!< Total double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL 5 //!< Total single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL 6 //!< Total double bit aggregate (persistent) ECC errors +/* Individual ECC locations */ +#define NVML_FI_DEV_ECC_SBE_VOL_L1 7 //!< L1 cache single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_L1 8 //!< L1 cache double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_L2 9 //!< L2 cache single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_L2 10 //!< L2 cache double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_DEV 11 //!< Device memory single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_DEV 12 //!< Device memory double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_REG 13 //!< Register file single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_REG 14 //!< Register file double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_TEX 15 //!< Texture memory single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_TEX 16 //!< Texture memory double bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_CBU 17 //!< CBU double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_L1 18 //!< L1 cache single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_L1 19 //!< L1 cache double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_L2 20 //!< L2 cache single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_L2 21 //!< L2 cache double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_DEV 22 //!< Device memory single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_DEV 23 //!< Device memory double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_REG 24 //!< Register File single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_REG 25 //!< Register File double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_TEX 26 //!< Texture memory single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_TEX 27 //!< Texture memory double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_CBU 28 //!< CBU double bit aggregate ECC errors + +/* Page Retirement */ +#define NVML_FI_DEV_RETIRED_SBE 29 //!< Number of retired pages because of single bit errors +#define NVML_FI_DEV_RETIRED_DBE 30 //!< Number of retired pages because of double bit errors +#define NVML_FI_DEV_RETIRED_PENDING 31 //!< If any pages are pending retirement. 1=yes. 0=no. + +/* NvLink Flit Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 32 //!< NVLink flow control CRC Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 33 //!< NVLink flow control CRC Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 34 //!< NVLink flow control CRC Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 35 //!< NVLink flow control CRC Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 36 //!< NVLink flow control CRC Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 37 //!< NVLink flow control CRC Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes + +/* NvLink CRC Data Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 39 //!< NVLink data CRC Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 40 //!< NVLink data CRC Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 41 //!< NVLink data CRC Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 42 //!< NVLink data CRC Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 43 //!< NVLink data CRC Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 44 //!< NVLink data CRC Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NvLink data CRC Error Counter total for all Lanes + +/* NvLink Replay Error Counters */ +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 46 //!< NVLink Replay Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 47 //!< NVLink Replay Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 48 //!< NVLink Replay Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 49 //!< NVLink Replay Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 50 //!< NVLink Replay Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 51 //!< NVLink Replay Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 52 //!< NVLink Replay Error Counter total for all Lanes + +/* NvLink Recovery Error Counters */ +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 53 //!< NVLink Recovery Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 54 //!< NVLink Recovery Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 55 //!< NVLink Recovery Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 56 //!< NVLink Recovery Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 57 //!< NVLink Recovery Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 58 //!< NVLink Recovery Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes + +/* NvLink Bandwidth Counters */ +/* + * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. + * Please use the following field values instead: + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX + */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL 66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes + +/* NvLink Bandwidth Counters */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL 73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes + +/* NVML Perf Policy Counters */ +#define NVML_FI_DEV_PERF_POLICY_POWER 74 //!< Perf Policy Counter for Power Policy +#define NVML_FI_DEV_PERF_POLICY_THERMAL 75 //!< Perf Policy Counter for Thermal Policy +#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST 76 //!< Perf Policy Counter for Sync boost Policy +#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT 77 //!< Perf Policy Counter for Board Limit +#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION 78 //!< Perf Policy Counter for Low GPU Utilization Policy +#define NVML_FI_DEV_PERF_POLICY_RELIABILITY 79 //!< Perf Policy Counter for Reliability Policy +#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS 80 //!< Perf Policy Counter for Total App Clock Policy +#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS 81 //!< Perf Policy Counter for Total Base Clocks Policy + +/* Memory temperatures */ +#define NVML_FI_DEV_MEMORY_TEMP 82 //!< Memory temperature for the device + +/* Energy Counter */ +#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 84 //!< NVLink Speed in MBps for Link 0 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 85 //!< NVLink Speed in MBps for Link 1 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 86 //!< NVLink Speed in MBps for Link 2 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 87 //!< NVLink Speed in MBps for Link 3 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 88 //!< NVLink Speed in MBps for Link 4 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 89 //!< NVLink Speed in MBps for Link 5 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links + +#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device + +#define NVML_FI_DEV_RETIRED_PENDING_SBE 92 //!< If any pages are pending retirement due to SBE. 1=yes. 0=no. +#define NVML_FI_DEV_RETIRED_PENDING_DBE 93 //!< If any pages are pending retirement due to DBE. 1=yes. 0=no. + +#define NVML_FI_DEV_PCIE_REPLAY_COUNTER 94 //!< PCIe replay counter +#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER 95 //!< PCIe replay rollover counter + +/* NvLink Flit Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 96 //!< NVLink flow control CRC Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 97 //!< NVLink flow control CRC Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 98 //!< NVLink flow control CRC Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 99 //!< NVLink flow control CRC Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 100 //!< NVLink flow control CRC Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 101 //!< NVLink flow control CRC Error Counter for Lane 11 + +/* NvLink CRC Data Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 102 //!< NVLink data CRC Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 103 //!< NVLink data CRC Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 104 //!< NVLink data CRC Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 105 //!< NVLink data CRC Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 106 //!< NVLink data CRC Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 107 //!< NVLink data CRC Error Counter for Lane 11 + +/* NvLink Replay Error Counters */ +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 108 //!< NVLink Replay Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 109 //!< NVLink Replay Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 110 //!< NVLink Replay Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 111 //!< NVLink Replay Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 112 //!< NVLink Replay Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 113 //!< NVLink Replay Error Counter for Lane 11 + +/* NvLink Recovery Error Counters */ +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 114 //!< NVLink Recovery Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 115 //!< NVLink Recovery Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 116 //!< NVLink Recovery Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 117 //!< NVLink Recovery Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 118 //!< NVLink Recovery Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 119 //!< NVLink Recovery Error Counter for Lane 11 + +/* NvLink Bandwidth Counters */ +/* + * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. + * Please use the following field values instead: + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX + */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11 + +/* NvLink Bandwidth Counters */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11 + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 132 //!< NVLink Speed in MBps for Link 6 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 133 //!< NVLink Speed in MBps for Link 7 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 134 //!< NVLink Speed in MBps for Link 8 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 135 //!< NVLink Speed in MBps for Link 9 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 136 //!< NVLink Speed in MBps for Link 10 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 137 //!< NVLink Speed in MBps for Link 11 + +/** + * NVLink throughput counters field values + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. + * A scopeId of UINT_MAX returns aggregate value summed up across all links + * for the specified counter type in fieldId. + */ +#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX 138 //!< NVLink TX Data throughput in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX 139 //!< NVLink RX Data throughput in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 //!< NVLink TX Data + protocol overhead in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 //!< NVLink RX Data + protocol overhead in KiB + +/* Row Remapper */ +#define NVML_FI_DEV_REMAPPED_COR 142 //!< Number of remapped rows due to correctable errors +#define NVML_FI_DEV_REMAPPED_UNC 143 //!< Number of remapped rows due to uncorrectable errors +#define NVML_FI_DEV_REMAPPED_PENDING 144 //!< If any rows are pending remapping. 1=yes 0=no +#define NVML_FI_DEV_REMAPPED_FAILURE 145 //!< If any rows failed to be remapped 1=yes 0=no + +/** + * Remote device NVLink ID + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. + */ +#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID + +/** + * NVSwitch: connected NVLink count + */ +#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch + +/* NvLink ECC Data Error Counters + * + * Lane ID needs to be specified in the scopeId field in nvmlFieldValue_t. + * + */ +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 148 //!< NVLink data ECC Error Counter for Link 0 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 149 //!< NVLink data ECC Error Counter for Link 1 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 150 //!< NVLink data ECC Error Counter for Link 2 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 151 //!< NVLink data ECC Error Counter for Link 3 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 152 //!< NVLink data ECC Error Counter for Link 4 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 153 //!< NVLink data ECC Error Counter for Link 5 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 154 //!< NVLink data ECC Error Counter for Link 6 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 155 //!< NVLink data ECC Error Counter for Link 7 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 156 //!< NVLink data ECC Error Counter for Link 8 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 157 //!< NVLink data ECC Error Counter for Link 9 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 158 //!< NVLink data ECC Error Counter for Link 10 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NvLink data ECC Error Counter total for all Links + +#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 161 +#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 +#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 +#define NVML_FI_DEV_NVLINK_GET_STATE 165 +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 + +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +/** + * Retrieves power usage for this GPU in milliwatts. + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and + * \ref nvmlDeviceGetPowerUsage. + * + * scopeId needs to be specified. It signifies: + * 0 - GPU Only Scope - Metrics for GPU are retrieved + * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved. + * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported + */ +#define NVML_FI_DEV_POWER_AVERAGE 185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures. +#define NVML_FI_DEV_POWER_INSTANT 186 //!< Current GPU power, supported on all architectures. +#define NVML_FI_DEV_POWER_MIN_LIMIT 187 //!< Minimum power limit in milliwatts. +#define NVML_FI_DEV_POWER_MAX_LIMIT 188 //!< Maximum power limit in milliwatts. +#define NVML_FI_DEV_POWER_DEFAULT_LIMIT 189 //!< Default power limit in milliwatts (limit which device boots with). +#define NVML_FI_DEV_POWER_CURRENT_LIMIT 190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band). +#define NVML_FI_DEV_ENERGY 191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU. +#define NVML_FI_DEV_POWER_REQUESTED_LIMIT 192 //!< Power limit requested by NVML or any other userspace client. +#define NVML_FI_MAX 193 //!< One greater than the largest field ID defined above + +/** + * Information for a Field Value Sample + */ +typedef struct nvmlFieldValue_st +{ + unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. + unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. + long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 + long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. + nvmlValueType_t valueType; //!< Type of the value stored in value + nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS + nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS +} nvmlFieldValue_t; + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitStructs Unit Structs + * @{ + */ +/***************************************************************************************************/ + +typedef struct nvmlUnit_st* nvmlUnit_t; + +/** + * Description of HWBC entry + */ +typedef struct nvmlHwbcEntry_st +{ + unsigned int hwbcId; + char firmwareVersion[32]; +} nvmlHwbcEntry_t; + +/** + * Fan state enum. + */ +typedef enum nvmlFanState_enum +{ + NVML_FAN_NORMAL = 0, //!< Fan is working properly + NVML_FAN_FAILED = 1 //!< Fan has failed +} nvmlFanState_t; + +/** + * Led color enum. + */ +typedef enum nvmlLedColor_enum +{ + NVML_LED_COLOR_GREEN = 0, //!< GREEN, indicates good health + NVML_LED_COLOR_AMBER = 1 //!< AMBER, indicates problem +} nvmlLedColor_t; + + +/** + * LED states for an S-class unit. + */ +typedef struct nvmlLedState_st +{ + char cause[256]; //!< If amber, a text description of the cause + nvmlLedColor_t color; //!< GREEN or AMBER +} nvmlLedState_t; + +/** + * Static S-class unit info. + */ +typedef struct nvmlUnitInfo_st +{ + char name[96]; //!< Product name + char id[96]; //!< Product identifier + char serial[96]; //!< Product serial number + char firmwareVersion[96]; //!< Firmware version +} nvmlUnitInfo_t; + +/** + * Power usage information for an S-class unit. + * The power supply state is a human readable string that equals "Normal" or contains + * a combination of "Abnormal" plus one or more of the following: + * + * - High voltage + * - Fan failure + * - Heatsink temperature + * - Current limit + * - Voltage below UV alarm threshold + * - Low-voltage + * - SI2C remote off command + * - MOD_DISABLE input + * - Short pin transition +*/ +typedef struct nvmlPSUInfo_st +{ + char state[256]; //!< The power supply state + unsigned int current; //!< PSU current (A) + unsigned int voltage; //!< PSU voltage (V) + unsigned int power; //!< PSU power draw (W) +} nvmlPSUInfo_t; + +/** + * Fan speed reading for a single fan in an S-class unit. + */ +typedef struct nvmlUnitFanInfo_st +{ + unsigned int speed; //!< Fan speed (RPM) + nvmlFanState_t state; //!< Flag that indicates whether fan is working properly +} nvmlUnitFanInfo_t; + +/** + * Fan speed readings for an entire S-class unit. + */ +typedef struct nvmlUnitFanSpeeds_st +{ + nvmlUnitFanInfo_t fans[24]; //!< Fan speed data for each fan + unsigned int count; //!< Number of fans in unit +} nvmlUnitFanSpeeds_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup nvmlEvents + * @{ + */ +/***************************************************************************************************/ + +/** + * Handle to an event set + */ +typedef struct nvmlEventSet_st* nvmlEventSet_t; + +/** @defgroup nvmlEventType Event Types + * @{ + * Event Types which user can be notified about. + * See description of particular functions for details. + * + * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices + * support each event. + * + * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents + */ +//! Event about single bit ECC errors +/** + * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event + */ +#define nvmlEventTypeSingleBitEccError 0x0000000000000001LL + +//! Event about double bit ECC errors +/** + * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event + */ +#define nvmlEventTypeDoubleBitEccError 0x0000000000000002LL + +//! Event about PState changes +/** + * \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to + * no work being executed on the GPU, power capping or thermal capping. In a typical situation, + * Fermi-based GPU should stay in P0 for the duration of the execution of the compute process. + */ +#define nvmlEventTypePState 0x0000000000000004LL + +//! Event that Xid critical error occurred +#define nvmlEventTypeXidCriticalError 0x0000000000000008LL + +//! Event about clock changes +/** + * Kepler only + */ +#define nvmlEventTypeClock 0x0000000000000010LL + +//! Event about AC/Battery power source changes +#define nvmlEventTypePowerSourceChange 0x0000000000000080LL + +//! Event about MIG configuration changes +#define nvmlEventMigConfigChange 0x0000000000000100LL + +//! Mask with no events +#define nvmlEventTypeNone 0x0000000000000000LL + +//! Mask of all events +#define nvmlEventTypeAll (nvmlEventTypeNone \ + | nvmlEventTypeSingleBitEccError \ + | nvmlEventTypeDoubleBitEccError \ + | nvmlEventTypePState \ + | nvmlEventTypeClock \ + | nvmlEventTypeXidCriticalError \ + | nvmlEventTypePowerSourceChange \ + | nvmlEventMigConfigChange \ + ) +/** @} */ + +/** + * Information about occurred event + */ +typedef struct nvmlEventData_st +{ + nvmlDevice_t device; //!< Specific device where the event occurred + unsigned long long eventType; //!< Information about what specific event occurred + unsigned long long eventData; //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, + // eventData is 0 for any other event. eventData is set as 999 for unknown xid error. + unsigned int gpuInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU + // instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF + // otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a + // compute instance, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. +} nvmlEventData_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup nvmlClocksThrottleReasons + * @{ + */ +/***************************************************************************************************/ + +/** Nothing is running on the GPU and the clocks are dropping to Idle state + * \note This limiter may be removed in a later release + */ +#define nvmlClocksThrottleReasonGpuIdle 0x0000000000000001LL + +/** GPU clocks are limited by current setting of applications clocks + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetApplicationsClock + */ +#define nvmlClocksThrottleReasonApplicationsClocksSetting 0x0000000000000002LL + +/** + * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting + * as the name describes the situation more accurately. + */ +#define nvmlClocksThrottleReasonUserDefinedClocks nvmlClocksThrottleReasonApplicationsClocksSetting + +/** SW Power Scaling algorithm is reducing the clocks below requested clocks + * + * @see nvmlDeviceGetPowerUsage + * @see nvmlDeviceSetPowerManagementLimit + * @see nvmlDeviceGetPowerManagementLimit + */ +#define nvmlClocksThrottleReasonSwPowerCap 0x0000000000000004LL + +/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + * - External Power Brake Assertion is triggered (e.g. by the system power supply) + * - Power draw is too high and Fast Trigger protection is reducing the clocks + * - May be also reported during PState or clock change + * - This behavior may be removed in a later release. + * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwSlowdown 0x0000000000000008LL + +/** Sync Boost + * + * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in + * order to maximize performance per watt. All GPUs in the sync boost group + * will boost to the minimum possible clocks across the entire group. Look at + * the throttle reasons for other GPUs in the system to see why those GPUs are + * holding this one at lower clocks. + * + */ +#define nvmlClocksThrottleReasonSyncBoost 0x0000000000000010LL + +/** SW Thermal Slowdown + * + * This is an indicator of one or more of the following: + * - Current GPU temperature above the GPU Max Operating Temperature + * - Current memory temperature above the Memory Max Operating Temperature + * + */ +#define nvmlClocksThrottleReasonSwThermalSlowdown 0x0000000000000020LL + +/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwThermalSlowdown 0x0000000000000040LL + +/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - External Power Brake Assertion being triggered (e.g. by the system power supply) + * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwPowerBrakeSlowdown 0x0000000000000080LL + +/** GPU clocks are limited by current setting of Display clocks + * + * @see bug 1997531 + */ +#define nvmlClocksThrottleReasonDisplayClockSetting 0x0000000000000100LL + +/** Bit mask representing no clocks throttling + * + * Clocks are as high as possible. + * */ +#define nvmlClocksThrottleReasonNone 0x0000000000000000LL + +/** Bit mask representing all supported clocks throttling reasons + * New reasons might be added to this list in the future + */ +#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \ + | nvmlClocksThrottleReasonGpuIdle \ + | nvmlClocksThrottleReasonApplicationsClocksSetting \ + | nvmlClocksThrottleReasonSwPowerCap \ + | nvmlClocksThrottleReasonHwSlowdown \ + | nvmlClocksThrottleReasonSyncBoost \ + | nvmlClocksThrottleReasonSwThermalSlowdown \ + | nvmlClocksThrottleReasonHwThermalSlowdown \ + | nvmlClocksThrottleReasonHwPowerBrakeSlowdown \ + | nvmlClocksThrottleReasonDisplayClockSetting \ +) +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlAccountingStats Accounting Statistics + * @{ + * + * Set of APIs designed to provide per process information about usage of GPU. + * + * @note All accounting statistics and accounting mode live in nvidia driver and reset + * to default (Disabled) when driver unloads. + * It is advised to run with persistence mode enabled. + * + * @note Enabling accounting mode has no negative impact on the GPU performance. + */ +/***************************************************************************************************/ + +/** + * Describes accounting statistics of a process. + */ +typedef struct nvmlAccountingStats_st { + unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU. + //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a + //! process (not just the last sample period). + //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported + + unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) memory was being read or written. + //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported + + unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. + //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported + + + unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if + //!< the process is not terminated + + unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process + + unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) + + unsigned int reserved[5]; //!< Reserved for future use +} nvmlAccountingStats_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEncoderStructs Encoder Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Represents type of encoder for capacity can be queried + */ +typedef enum nvmlEncoderQueryType_enum +{ + NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder + NVML_ENCODER_QUERY_HEVC = 1 //!< HEVC encoder +}nvmlEncoderType_t; + +/** + * Structure to hold encoder session data + */ +typedef struct nvmlEncoderSessionInfo_st +{ + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) + nvmlEncoderType_t codecType; //!< Video encoder type + unsigned int hResolution; //!< Current encode horizontal resolution + unsigned int vResolution; //!< Current encode vertical resolution + unsigned int averageFps; //!< Moving average encode frames per second + unsigned int averageLatency; //!< Moving average encode latency in microseconds +}nvmlEncoderSessionInfo_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures +* @{ +*/ +/***************************************************************************************************/ + +/** + * Represents frame buffer capture session type + */ +typedef enum nvmlFBCSessionType_enum +{ + NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon + NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys + NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda + NVML_FBC_SESSION_TYPE_VID, //!< Vid + NVML_FBC_SESSION_TYPE_HWENC //!< HEnc +} nvmlFBCSessionType_t; + +/** + * Structure to hold frame buffer capture sessions stats + */ +typedef struct nvmlFBCStats_st +{ + unsigned int sessionsCount; //!< Total no of sessions + unsigned int averageFPS; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} nvmlFBCStats_t; + +#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. +#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. + +/** + * Structure to hold FBC session data + */ +typedef struct nvmlFBCSessionInfo_st +{ + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) + unsigned int displayOrdinal; //!< Display identifier + nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session + unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX). + unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session + unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session + unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call + unsigned int vResolution; //!< Vertical resolution requested by caller in capture call + unsigned int averageFPS; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} nvmlFBCSessionInfo_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDrainDefs definitions related to the drain state + * @{ + */ +/***************************************************************************************************/ + +/** + * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() + */ +typedef enum nvmlDetachGpuState_enum +{ + NVML_DETACH_GPU_KEEP = 0, + NVML_DETACH_GPU_REMOVE +} nvmlDetachGpuState_t; + +/** + * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() + */ +typedef enum nvmlPcieLinkState_enum +{ + NVML_PCIE_LINK_KEEP = 0, + NVML_PCIE_LINK_SHUT_DOWN +} nvmlPcieLinkState_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlSystem/nvmlDevice definitions related to Confidential Computing + * @{ + */ +/***************************************************************************************************/ +/** + * Confidential Compute CPU Capabilities values + */ +#define NVML_CC_SYSTEM_CPU_CAPS_NONE 0 +#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV 1 + +/** + * Confidenial Compute GPU Capabilities values + */ +#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0 +#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE 1 + +typedef struct nvmlConfComputeSystemCaps_st { + unsigned int cpuCaps; + unsigned int gpusCaps; +} nvmlConfComputeSystemCaps_t; + +/** + * Confidential Compute DEV Mode values + */ +#define NVML_CC_SYSTEM_DEV_MODE_OFF 0 +#define NVML_CC_SYSTEM_DEV_MODE_ON 1 + +/** + * Confidential Compute Environment values + */ +#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0 +#define NVML_CC_SYSTEM_ENVIRONMENT_SIM 1 +#define NVML_CC_SYSTEM_ENVIRONMENT_PROD 2 + +/** + * Confidential Compute Feature Status values + */ +#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 +#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 + +typedef struct nvmlConfComputeSystemState_st { + unsigned int environment; + unsigned int ccFeature; + unsigned int devMode; +} nvmlConfComputeSystemState_t; + +/** + * Protected memory size + */ +typedef struct +nvmlConfComputeMemSizeInfo_st +{ + unsigned long long protectedMemSizeKib; + unsigned long long unprotectedMemSizeKib; +} nvmlConfComputeMemSizeInfo_t; + +/** + * Confidential Compute GPUs/System Ready State values + */ +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0 +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE 1 + +/** + * GPU Certificate Details + */ +#define NVML_GPU_CERT_CHAIN_SIZE 0x1000 +#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1000 + +typedef struct nvmlConfComputeGpuCertificate_st { + unsigned int certChainSize; + unsigned int attestationCertChainSize; + unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE]; + unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE]; +} nvmlConfComputeGpuCertificate_t; + +/** + * GPU Attestation Report + */ +#define NVML_CC_GPU_CEC_NONCE_SIZE 0x20 +#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000 +#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000 +#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0 +#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1 + +typedef struct nvmlConfComputeGpuAttestationReport_st { + unsigned int isCecAttestationReportPresent; + unsigned int attestationReportSize; + unsigned int cecAttestationReportSize; + unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE]; + unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE]; + unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE]; +} nvmlConfComputeGpuAttestationReport_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup + * This chapter describes the methods that handle NVML initialization and cleanup. + * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and + * nvmlShutdown() once NVML is no longer being used. + * @{ + */ +/***************************************************************************************************/ + +#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found +#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs + +/** + * Initialize NVML, but don't initialize any GPUs yet. + * + * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values + * modifying the behaviour of nvmlInit(). + * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that + * did initialize all GPU devices in the system. + * + * This allows NVML to communicate with a GPU + * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are + * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. + * + * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in + * a bad or unstable state. + * + * For all products. + * + * This method, should be called once before invoking any other methods in the library. + * A reference count of the number of initializations is maintained. Shutdown only occurs + * when the reference count reaches zero. + * + * @return + * - \ref NVML_SUCCESS if NVML has been properly initialized + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running + * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlInit_v2(void); + +/** + * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values + * modifying the behaviour of nvmlInit(). + * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. + * + * For all products. + * + * @param flags behaviour modifier flags + * + * @return + * - \ref NVML_SUCCESS if NVML has been properly initialized + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running + * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); + +/** + * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). + * + * For all products. + * + * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() + * A reference count of the number of initializations is maintained. Shutdown only occurs + * when the reference count reaches zero. For backwards compatibility, no error is reported if + * nvmlShutdown() is called more times than nvmlInit(). + * + * @return + * - \ref NVML_SUCCESS if NVML has been properly shut down + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlShutdown(void); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlErrorReporting Error reporting + * This chapter describes helper functions for error reporting routines. + * @{ + */ +/***************************************************************************************************/ + +/** + * Helper method for converting NVML error codes into readable strings. + * + * For all products. + * + * @param result NVML error code to convert + * + * @return String representation of the error. + * + */ +const DECLDIR char* nvmlErrorString(nvmlReturn_t result); +/** @} */ + + +/***************************************************************************************************/ +/** @defgroup nvmlConstants Constants + * @{ + */ +/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion + */ +#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 + +/** + * Buffer size guaranteed to be large enough for storing GPU identifiers. + */ +#define NVML_DEVICE_UUID_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID + */ +#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber + */ +#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion + */ +#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion + */ +#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for storing GPU device names. + */ +#define NVML_DEVICE_NAME_BUFFER_SIZE 64 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName + */ +#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial + */ +#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion + */ +#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlSystemQueries System Queries + * This chapter describes the queries that NVML can perform against the local system. These queries + * are not device-specific. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves the version of the system's graphics driver. + * + * For all products. + * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); + +/** + * Retrieves the version of the NVML library. + * + * For all products. + * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. + * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); + +/** + * Retrieves the version of the CUDA driver. + * + * For all products. + * + * The CUDA driver version returned will be retreived from the currently installed version of CUDA. + * If the cuda library is not found, this function will return a known supported version number. + * + * @param cudaDriverVersion Reference in which to return the version identifier + * + * @return + * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL + */ +nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); + +/** + * Retrieves the version of the CUDA driver from the shared library. + * + * For all products. + * + * The returned CUDA driver version by calling cuDriverGetVersion() + * + * @param cudaDriverVersion Reference in which to return the version identifier + * + * @return + * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL + * - \ref NVML_ERROR_LIBRARY_NOT_FOUND if \a libcuda.so.1 or libcuda.dll is not found + * - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library + */ +nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); + +/** + * Macros for converting the CUDA driver version number to Major and Minor version numbers. + */ +#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000) +#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10) + +/** + * Gets name of the process with provided process id + * + * For all products. + * + * Returned process name is cropped to provided length. + * name string is encoded in ANSI. + * + * @param pid The identifier of the process + * @param name Reference in which to return the process name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. + * - \ref NVML_ERROR_NOT_FOUND if process doesn't exists + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitQueries Unit Queries + * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. + * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by + * calling \ref nvmlUnitGetHandleByIndex(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of units in the system. + * + * For S-class products. + * + * @param unitCount Reference in which to return the number of units + * + * @return + * - \ref NVML_SUCCESS if \a unitCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); + +/** + * Acquire the handle for a particular unit, based on its index. + * + * For S-class products. + * + * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). + * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. + * + * The order in which NVML enumerates units has no guarantees of consistency between reboots. + * + * @param index The index of the target unit, >= 0 and < \a unitCount + * @param unit Reference in which to return the unit handle + * + * @return + * - \ref NVML_SUCCESS if \a unit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); + +/** + * Retrieves the static information associated with a unit. + * + * For S-class products. + * + * See \ref nvmlUnitInfo_t for details on available unit info. + * + * @param unit The identifier of the target unit + * @param info Reference in which to return the unit information + * + * @return + * - \ref NVML_SUCCESS if \a info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL + */ +nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); + +/** + * Retrieves the LED state associated with this unit. + * + * For S-class products. + * + * See \ref nvmlLedState_t for details on allowed states. + * + * @param unit The identifier of the target unit + * @param state Reference in which to return the current LED state + * + * @return + * - \ref NVML_SUCCESS if \a state has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitSetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); + +/** + * Retrieves the PSU stats for the unit. + * + * For S-class products. + * + * See \ref nvmlPSUInfo_t for details on available PSU info. + * + * @param unit The identifier of the target unit + * @param psu Reference in which to return the PSU information + * + * @return + * - \ref NVML_SUCCESS if \a psu has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); + +/** + * Retrieves the temperature readings for the unit, in degrees C. + * + * For S-class products. + * + * Depending on the product, readings may be available for intake (type=0), + * exhaust (type=1) and board (type=2). + * + * @param unit The identifier of the target unit + * @param type The type of reading to take + * @param temp Reference in which to return the intake temperature + * + * @return + * - \ref NVML_SUCCESS if \a temp has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); + +/** + * Retrieves the fan speed readings for the unit. + * + * For S-class products. + * + * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. + * + * @param unit The identifier of the target unit + * @param fanSpeeds Reference in which to return the fan speed information + * + * @return + * - \ref NVML_SUCCESS if \a fanSpeeds has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); + +/** + * Retrieves the set of GPU devices that are attached to the specified unit. + * + * For S-class products. + * + * The \a deviceCount argument is expected to be set to the size of the input \a devices array. + * + * @param unit The identifier of the target unit + * @param deviceCount Reference in which to provide the \a devices array size, and + * to return the number of attached GPU devices + * @param devices Reference in which to return the references to the attached GPU devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); + +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. + * + * For S-class products. + * + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. + * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc + * + * @return + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceQueries Device Queries + * This chapter describes that queries that NVML can perform against each device. + * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by + * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), + * \ref nvmlDeviceGetHandleByPciBusId_v2(). or \ref nvmlDeviceGetHandleByUUID(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of compute devices in the system. A compute device is a single GPU. + * + * For all products. + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * @param deviceCount Reference in which to return the number of accessible devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); + +/** + * Get attributes (engine counts etc.) for the given NVML device handle. + * + * @note This API currently only supports MIG device handles. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device NVML device handle + * @param attributes Device attributes + * + * @return + * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); + +/** + * Acquire the handle for a particular device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or UUID. See + * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. + * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't + * need to worry about that. + * + * @param index The index of the target GPU, >= 0 and < \a accessibleDevices + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetIndex + * @see nvmlDeviceGetCount + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its board serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. + * + * For all products. + * + * @param uuid The UUID of the target GPU or MIG instance + * @param device Reference in which to return the device handle or MIG device handle + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null + * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its PCI bus id. + * + * For all products. + * + * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND + * instead of NVML_ERROR_NO_PERMISSION. + * + * @param pciBusId The PCI bus id of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); + +/** + * Retrieves the name of this device. + * + * For all products. + * + * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not + * exceed 96 characters in length (including the NULL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. + * + * When used with MIG device handles the API returns MIG device names which can be used to identify devices + * based on their attributes. + * + * @param device The identifier of the target device + * @param name Reference in which to return the product name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); + +/** + * Retrieves the brand of this device. + * + * For all products. + * + * The type is a member of \ref nvmlBrandType_t defined above. + * + * @param device The identifier of the target device + * @param type Reference in which to return the product brand type + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); + +/** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). + * + * When used with MIG device handles this API returns indices that can be + * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. + * MIG device indices are unique within a device. + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + +/** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. + * + * @param device The identifier of the target device + * @param serial Reference in which to return the board/module serial number + * @param length The maximum allowed length of the string returned in \a serial + * + * @return + * - \ref NVML_SUCCESS if \a serial has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + + +/***************************************************************************************************/ + +/** @defgroup nvmlAffinity CPU and Memory Affinity + * This chapter describes NVML operations that are associated with CPU and memory + * affinity. + * @{ + */ +/***************************************************************************************************/ + +//! Scope of NUMA node for affinity queries +#define NVML_AFFINITY_SCOPE_NODE 0 +//! Scope of processor socket for affinity queries +#define NVML_AFFINITY_SCOPE_SOCKET 1 + +typedef unsigned int nvmlAffinityScope_t; + +/** + * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with + * the ideal memory affinity within node or socket for the device. + * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, + * result[0] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the memory affinity for the immediate non-I/O + * ancestor of the device. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param nodeSetSize The size of the nodeSet array that is safe to access + * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); + +/** + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the + * ideal CPU affinity within node or socket for the device. + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the CPU affinity for the immediate non-I/O + * ancestor of the device. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or sope is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); + +/** + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); + +/** + * Sets the ideal affinity for the calling thread and device using the guidelines + * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. + * Older versions set the affinity for a calling process and all children. + * Currently supports up to 1024 processors. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully bound + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); + +/** + * Clear all affinity bindings for the calling thread. Note, this is a change as of version + * 8.0 as older versions cleared the affinity for a calling process and all children. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully unbound + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); + +/** + * Retrieve the common ancestor for two devices + * For all products. + * Supported on Linux only. + * + * @param device1 The identifier of the first device + * @param device2 The identifier of the second device + * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type + * + * @return + * - \ref NVML_SUCCESS if \a pathInfo has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ + +/** @} */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); + +/** + * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level + * For all products. + * Supported on Linux only. + * + * @param device The identifier of the first device + * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. + * @param deviceArray An array of device handles for GPUs found at \a level + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the set of GPUs that have a CPU affinity with the given CPU number + * For all products. + * Supported on Linux only. + * + * @param cpuNumber The CPU number + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. + * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the status for a given p2p capability index between a given pair of GPU + * + * @param device1 The first device + * @param device2 The second device + * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 + * @param p2pStatus Reference in which to return the status of the \a p2pIndex + * between \a device1 and \a device2 + * @return + * - \ref NVML_SUCCESS if \a p2pStatus has been populated + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); + +/** + * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, + * that augments the immutable, board serial identifier. + * + * For all products. + * + * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. + * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. + * + * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG + * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. + * + * @param device The identifier of the target device + * @param uuid Reference in which to return the GPU UUID + * @param length The maximum allowed length of the string returned in \a uuid + * + * @return + * - \ref NVML_SUCCESS if \a uuid has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); + +/** + * Retrieve the MDEV UUID of a vGPU instance. + * + * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * MDEV UUID is displayed only on KVM platform. + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); + +/** + * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for + * each GPU will have the form /dev/nvidia[minor number]. + * + * For all products. + * Supported only for Linux + * + * @param device The identifier of the target device + * @param minorNumber Reference in which to return the minor number for the device + * @return + * - \ref NVML_SUCCESS if the minor number is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); + +/** + * Retrieves the the device board part number which is programmed into the board's InfoROM + * + * For all products. + * + * @param device Identifier of the target device + * @param partNumber Reference to the buffer to return + * @param length Length of the buffer reference + * + * @return + * - \ref NVML_SUCCESS if \a partNumber has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a serial is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); + +/** + * Retrieves the version information for the device's infoROM object. + * + * For all products with an inforom. + * + * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate + * ECC counts. The version of the data structures in this memory may change from time to time. It will not + * exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. + * + * See \ref nvmlInforomObject_t for details on the available infoROM objects. + * + * @param device The identifier of the target device + * @param object The target infoROM object + * @param version Reference in which to return the infoROM version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomImageVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); + +/** + * Retrieves the global infoROM image version + * + * For all products with an inforom. + * + * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board + * in contrast to infoROM object version which is only an indicator of supported features. + * Version string will not exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. + * + * @param device The identifier of the target device + * @param version Reference in which to return the infoROM image version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Retrieves the checksum of the configuration stored in the device's infoROM. + * + * For all products with an inforom. + * + * Can be used to make sure that two GPUs have the exact same configuration. + * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. + * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC) + * + * @param device The identifier of the target device + * @param checksum Reference in which to return the infoROM configuration checksum + * + * @return + * - \ref NVML_SUCCESS if \a checksum has been set + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); + +/** + * Reads the infoROM from the flash and verifies the checksums. + * + * For all products with an inforom. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if infoROM is not corrupted + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); + +/** + * Retrieves the display mode for the device. + * + * For all products. + * + * This method indicates whether a physical display (e.g. monitor) is currently connected to + * any of the device's connectors. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param display Reference in which to return the display mode + * + * @return + * - \ref NVML_SUCCESS if \a display has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); + +/** + * Retrieves the display active state for the device. + * + * For all products. + * + * This method indicates whether a display is initialized on the device. + * For example whether X Server is attached to this device and has allocated memory for the screen. + * + * Display can be active even when no monitor is physically attached. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param isActive Reference in which to return the display active state + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); + +/** + * Retrieves the persistence mode associated with this device. + * + * For all products. + * For Linux only. + * + * When driver persistence mode is enabled the driver software state is not torn down when the last + * client disconnects. By default this feature is disabled. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current driver persistence mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the PCI attributes of this device. + * + * For all products. + * + * See \ref nvmlPciInfo_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info + * + * @return + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); + +/** + * Retrieves the maximum PCIe link generation possible with this device and system + * + * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will + * report is generation 1. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGen Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); + +/** + * Retrieves the maximum PCIe link width possible with this device and system + * + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + +/** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param currLinkGen Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); + +/** + * Retrieves the current PCIe link width + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param currLinkWidth Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); + +/** + * Retrieve PCIe utilization information. + * This function is querying a byte counter over a 20ms interval and thus is the + * PCIe throughput over that interval. + * + * For Maxwell &tm; or newer fully supported devices. + * + * This method is not supported in virtual machines running virtual GPU (vGPU). + * + * @param device The identifier of the target device + * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t + * @param value Reference in which to return throughput in KB/s + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); + +/** + * Retrieve the PCIe replay counter. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param value Reference in which to return the counter's value + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); + +/** + * Retrieves the current clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information. + * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the maximum clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information. + * + * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks + * by few MHz. + * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. + * Can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the default applications clock that GPU boots with or + * defaults to after \ref nvmlDeviceResetApplicationsClocks call. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the default clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * \see nvmlDeviceGetApplicationsClock + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Resets the application clock to the default value + * + * This is the applications clock that will be used after system reboot or driver reload. + * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, + * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above + * base clocks as thermal limits allow. + * + * @see nvmlDeviceGetApplicationsClock + * @see nvmlDeviceSetApplicationsClocks + * + * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); + +/** + * Retrieves the clock speed for the clock specified by the clock type and clock ID. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockId Identify which clock in the domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); + +/** + * Retrieves the customer defined maximum boost clock speed specified by the given clock type. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of + * required elements) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedGraphicsClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param memoryClockMHz Memory clock for which to return possible graphics clocks + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clocks in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedMemoryClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. + * + * On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. + * + * @param device The identifier of the target device + * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device + * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will + * revert to when no applications are using the GPU + * + * @return + * - \ref NVML_SUCCESS If \a isEnabled has been been set with the Auto Boosted clocks state of \a device + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); + +/** + * Try to set the current state of Auto Boosted clocks on a device. + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * Non-root users may use this API by default but can be restricted by root from using this API by calling + * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. + * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. + * + * @param device The identifier of the target device + * @param enabled What state to try to set Auto Boosted clocks of the target device to + * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); + +/** + * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will + * return to when no compute running processes (e.g. CUDA application which have an active context) are running + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. + * + * @param device The identifier of the target device + * @param enabled What state to try to set default Auto Boosted clocks of the target device to + * @param flags Flags that change the default behavior. Currently Unused. + * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); + + +/** + * Retrieves the intended operating speed of the device's fan. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param speed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); + + +/** + * Retrieves the intended operating speed of the device's specified fan. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. + * @param speed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); + +/** + * Retrieves the intended target speed of the device's specified fan. + * + * Normally, the driver dynamically adjusts the fan based on + * the needs of the GPU. But when user set fan speed using nvmlDeviceSetFanSpeed_v2, + * the driver will attempt to make the fan achieve the setting in + * nvmlDeviceSetFanSpeed_v2. The actual current speed of the fan + * is reported in nvmlDeviceGetFanSpeed_v2. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. + * @param targetSpeed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); + +/** + * Sets the speed of the fan control policy to default. + * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param fan The index of the fan, starting at zero + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); + +/** + * Retrieves the min and max fan speed that user can set for the GPU fan. + * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param minSpeed The minimum speed allowed to set + * @param maxSpeed The maximum speed allowed to set + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, + unsigned int * maxSpeed); + +/** + * Retrieves the number of fans on the device. + * + * For all discrete products with dedicated fans. + * + * @param device The identifier of the target device + * @param numFans The number of fans + * + * @return + * - \ref NVML_SUCCESS if \a fan number query was successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a numFans is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans); + +/** + * Retrieves the current temperature readings for the device, in degrees C. + * + * For all products. + * + * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. + * + * @param device The identifier of the target device + * @param sensorType Flag that indicates which sensor reading to retrieve + * @param temp Reference in which to return the temperature reading + * + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); + +/** + * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. + * + * @param device The identifier of the target device + * @param thresholdType The type of threshold value queried + * @param temp Reference in which to return the temperature reading + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); + +/** + * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. + * + * For Maxwell &tm; or newer fully supported devices. + * + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. + * + * @param device The identifier of the target device + * @param thresholdType The type of threshold value to be set + * @param temp Reference which hold the value to be set + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); + +/** + * Used to execute a list of thermal system instructions. + * + * @param device The identifier of the target device + * @param sensorIndex The index of the thermal sensor + * @param pThermalSettings Reference in which to return the thermal sensor information + * + * @return + * - \ref NVML_SUCCESS if \a pThermalSettings has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pThermalSettings is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings); + +/** + * Retrieves the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. + * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * Retrieves current clocks throttling reasons. + * + * For all fully supported products. + * + * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. + * + * @param device The identifier of the target device + * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle + * reasons + * + * @return + * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetSupportedClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); + +/** + * Retrieves bitmask of supported clocks throttle reasons that can be returned by + * \ref nvmlDeviceGetCurrentClocksThrottleReasons + * + * For all fully supported products. + * + * This method is not supported in virtual machines running virtual GPU (vGPU). + * + * @param device The identifier of the target device + * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported + * clocks throttle reasons + * + * @return + * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetCurrentClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); + +/** + * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. + * + * Retrieve the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. + * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * This API has been deprecated. + * + * Retrieves the power management mode associated with this device. + * + * For products from the Fermi family. + * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. + * + * For from the Kepler or newer families. + * - Does not require \a NVML_INFOROM_POWER object. + * + * This flag indicates whether any power management algorithm is currently active on the device. An + * enabled state does not necessarily mean the device is being actively throttled -- only that + * that the driver will do so if the appropriate conditions are met. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current power management mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the power management limit associated with this device. + * + * For Fermi &tm; or newer fully supported devices. + * + * The power limit defines the upper boundary for the card's power draw. If + * the card's total power draw reaches this limit the power management algorithm kicks in. + * + * This reading is only available if power management mode is supported. + * See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves information about possible values of power management limits on this device. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minLimit Reference in which to return the minimum power management limit in milliwatts + * @param maxLimit Reference in which to return the maximum power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPowerManagementLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); + +/** + * Retrieves default power management limit on this device, in milliwatts. + * Default power management limit is a power management limit that the device boots with. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param defaultLimit Reference in which to return the default power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a defaultLimit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); + +/** + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param power Reference in which to return the power usage information + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + +/** + * Retrieves current power mode on this device. + * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param powerModeId Reference in which to return the power mode + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerMode(nvmlDevice_t device, unsigned int *powerModeId); + +/** + * Retrieves bitmask of supported power modes on this device. + * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param supportedPowerModes Reference in which to return the bitmask of supported power mode + * + * @return + * - \ref NVML_SUCCESS if \a bitmask of supported power mode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPowerModes(nvmlDevice_t device, unsigned int *supportedPowerModes); + +/** + * Sets new power mode. + * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param powerModeId Power mode to set. + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerMode(nvmlDevice_t device, unsigned int powerModeId); + +/** + * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param energy Reference in which to return the energy consumption information + * + * @return + * - \ref NVML_SUCCESS if \a energy has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); + +/** + * Get the effective power limit that the driver enforces after taking into account all limiters + * + * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere + * This includes the out of band power limit interface + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The device to communicate with + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. + * + * @param device The identifier of the target device + * @param current Reference in which to return the current GOM + * @param pending Reference in which to return the pending GOM + * + * @return + * - \ref NVML_SUCCESS if \a mode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceSetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); + +/** + * Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. + * The reserved amount is supported on version 2 only. + * + * For all products. + * + * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. + * Under WDDM most device memory is allocated and managed on startup by Windows. + * + * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated + * by all active channels on the device. + * + * See \ref nvmlMemory_v2_t for details on available memory info. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate + * information, only if the caller has appropriate privileges. Per-instance + * information can be queried by using specific MIG device handles. + * + * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information. + * + * @param device The identifier of the target device + * @param memory Reference in which to return the memory information + * + * @return + * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory); + +/** + * Retrieves the current compute mode for the device. + * + * For all products. + * + * See \ref nvmlComputeMode_t for details on allowed compute modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current compute mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); + +/** + * Retrieves the CUDA compute capability of the device. + * + * For all products. + * + * Returns the major and minor compute capability version numbers of the + * device. The major and minor versions are equivalent to the + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be + * returned by CUDA's cuDeviceGetAttribute(). + * + * @param device The identifier of the target device + * @param major Reference in which to return the major CUDA compute capability + * @param minor Reference in which to return the minor CUDA compute capability + * + * @return + * - \ref NVML_SUCCESS if \a major and \a minor have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); + +/** + * Retrieves the current and pending ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); + +/** + * Retrieves the default ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a default is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); + +/** + * Retrieves the device boardId from 0-N. + * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * always return those values but they will always be different from each other). + * + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param boardId Reference in which to return the device's board ID + * + * @return + * - \ref NVML_SUCCESS if \a boardId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); + +/** + * Retrieves whether the device is on a Multi-GPU Board + * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param multiGpuBool Reference in which to return a zero or non-zero value + * to indicate whether the device is on a multi GPU board + * + * @return + * - \ref NVML_SUCCESS if \a multiGpuBool has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); + +/** + * Retrieves the total ECC error counts for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires ECC Mode to be enabled. + * + * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of + * errors across the entire device. + * + * See \ref nvmlMemoryErrorType_t for a description of available error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. + * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); + +/** + * Retrieves the detailed ECC error counts for the device. + * + * @deprecated This API supports only a fixed set of ECC error locations + * On different GPU architectures different locations are supported + * See \ref nvmlDeviceGetMemoryErrorCounter + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. + * Requires ECC Mode to be enabled. + * + * Detailed errors provide separate ECC counts for specific parts of the memory system. + * + * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. + * + * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. + * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); + +/** + * Retrieves the requested memory error counter for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. + * + * Only applicable to devices with ECC. + * + * Requires ECC Mode to be enabled. + * + * @note On MIG-enabled GPUs, per instance information can be queried using specific + * MIG device handles. Per instance information is currently only supported for + * non-DRAM uncorrectable volatile errors. Querying volatile errors using device + * handles is currently not supported. + * + * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of error. + * @param counterType Flag that specifies the counter-type of the errors. + * @param locationType Specifies the location of the counter. + * @param count Reference in which to return the ECC counter + * + * @return + * - \ref NVML_SUCCESS if \a count has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a bitTyp,e \a counterType or \a locationType is + * invalid, or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, + nvmlEccCounterType_t counterType, + nvmlMemoryLocation_t locationType, unsigned long long *count); + +/** + * Retrieves the current utilization rates for the device's major subsystems. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlUtilization_t for details on available utilization rates. + * + * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. + * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. + * + * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference in which to return the utilization information + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Encoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for encoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param encoderQueryType Type of encoder to query + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encodeQueryType + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); + +/** + * Retrieves the current encoder statistics for a given device. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, + * or \a averageLatency is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about active encoder sessions on a target device. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The + * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. + * @param sessionInfos Reference in which to return the session information + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfos is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Decoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for decoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** +* Retrieves the active frame buffer capture sessions statistics for a given device. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param device The identifier of the target device +* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats +* +* @return +* - \ref NVML_SUCCESS if \a fbcStats is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); + +/** +* Retrieves information about active frame buffer capture sessions on a target device. +* +* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The +* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions +* written to the buffer. +* +* If the supplied buffer is not large enough to accomodate the active session array, the function returns +* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. +* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return +* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may +* be zero if there are no new frames captured since the session started. +* +* @param device The identifier of the target device +* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. +* @param sessionInfo Reference in which to return the session information +* +* @return +* - \ref NVML_SUCCESS if \a sessionInfo is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); + +/** + * Retrieves the current and pending driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. + * + * See \ref nvmlDriverModel_t for details on available driver models. + * + * @param device The identifier of the target device + * @param current Reference in which to return the current driver model + * @param pending Reference in which to return the pending driver model + * + * @return + * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); + +/** + * Get VBIOS version of the device. + * + * For all products. + * + * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. + * + * @param device The identifier of the target device + * @param version Reference to which to return the VBIOS version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Get Bridge Chip Information for all the bridge chips on the board. + * + * For all fully supported products. + * Only applicable to multi-GPU products. + * + * @param device The identifier of the target device + * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy + * + * @return + * - \ref NVML_SUCCESS if bridge chip exists + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); + +/** + * Get information about processes with a compute context on a device + * + * For Fermi &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Get information about processes with a graphics context on a device + * + * For Kepler &tm; or newer fully supported devices. + * + * This function returns information only about graphics based processes + * (eg. applications using OpenGL, DirectX) + * + * To query the current number of running graphics processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new graphics processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Get information about processes with a MPS compute context on a device + * + * For Volta &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by + * this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Check if the GPU devices are on the same physical board. + * + * For all fully supported products. + * + * @param device1 The first GPU device + * @param device2 The second GPU device + * @param onSameBoard Reference in which to return the status. + * Non-zero indicates that the GPUs are on the same board. + * + * @return + * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + +/** + * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. + * + * For all fully supported products. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted Reference in which to return the current restriction + * NVML_FEATURE_ENABLED indicates that the API is root-only + * NVML_FEATURE_DISABLED indicates that the API is accessible to all users + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support + * the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is + * not supported by the device) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); + +/** + * Gets recent samples for the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by + * the driver. + * + * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. + * + * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. + * The returned samplesCount will provide the number of samples that can be queried. The user needs to + * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). + * + * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the + * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query + * to get more recent samples. + * + * This method fetches the number of entries which can be accommodated in the provided samples array, and the + * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this + * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost. + * + * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES + * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported. + * + * @param device The identifier for the target device + * @param type Type of sampling event + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t + * @param sampleCount Reference to provide the number of elements which can be queried in samples array + * @param samples Reference in which samples are returned + + * @return + * - \ref NVML_SUCCESS if samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or + * reference to \a sampleCount is 0 for non null \a samples + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); + +/** + * Gets Total, Available and Used size of BAR1 memory. + * + * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party + * devices (peer-to-peer on the PCIE bus). + * + * @note In MIG mode, if device handle is provided, the API returns aggregate + * information, only if the caller has appropriate privileges. Per-instance + * information can be queried by using specific MIG device handles. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param bar1Memory Reference in which BAR1 memory + * information is returned. + * + * @return + * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); + +/** + * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power + * or thermal constraints. + * + * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The + * difference in violation times at two different reference times gives the indication of GPU throttling event. + * + * Violation for thermal capping is not supported at this time. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param perfPolicyType Represents Performance policy which can trigger GPU throttling + * @param violTime Reference to which violation time related information is returned + * + * + * @return + * - \ref NVML_SUCCESS if violation time is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); + +/** + * Gets the device's interrupt number + * + * @param device The identifier of the target device + * @param irqNum The interrupt number associated with the specified device + * + * @return + * - \ref NVML_SUCCESS if irq number is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a irqNum is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqNum); + +/** + * Gets the device's core count + * + * @param device The identifier of the target device + * @param numCores The number of cores for the specified device + * + * @return + * - \ref NVML_SUCCESS if Gpu core count is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a numCores is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int *numCores); + +/** + * Gets the devices power source + * + * @param device The identifier of the target device + * @param powerSource The power source of the device + * + * @return + * - \ref NVML_SUCCESS if the current power source was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a powerSource is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t *powerSource); + +/** + * Gets the device's memory bus width + * + * @param device The identifier of the target device + * @param maxSpeed The devices's memory bus width + * + * @return + * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a busWidth is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int *busWidth); + +/** + * Gets the device's PCIE Max Link speed in MBPS + * + * @param device The identifier of the target device + * @param maxSpeed The devices's PCIE Max Link speed in MBPS + * + * @return + * - \ref NVML_SUCCESS if Pcie Max Link Speed is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed); + +/** + * Gets the device's PCIe Link speed in Mbps + * + * @param device The identifier of the target device + * @param pcieSpeed The devices's PCIe Max Link speed in Mbps + * + * @return + * - \ref NVML_SUCCESS if \a pcieSpeed has been retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pcieSpeed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support PCIe speed getting + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed); + +/** + * Gets the device's Adaptive Clock status + * + * @param device The identifier of the target device + * @param adaptiveClockStatus The current adaptive clocking status + * + * @return + * - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a adaptiveClockStatus is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Queries the state of per process accounting mode. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlDeviceGetAccountingStats for more details. + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Queries process's accounting stats. + * + * For Kepler &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. + * Accounting stats can be queried during life time of the process and after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * + * @param device The identifier of the target device + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL + * - \ref NVML_ERROR_NOT_FOUND if process stats were not found + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Queries list of processes that can be queried for accounting stats. The list of processes returned + * can be in running or terminated state. + * + * For Kepler &tm; or newer fully supported devices. + * + * To just query the number of processes ready to be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlDeviceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to + * expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); + +/** + * Returns the number of processes that the circular buffer with accounting pids can hold. + * + * For Kepler &tm; or newer fully supported devices. + * + * This is the maximum number of processes that accounting information will be stored for before information + * about oldest processes will get overwritten by information about new processes. + * + * @param device The identifier of the target device + * @param bufferSize Reference in which to provide the size (in number of elements) + * of the circular buffer for accounting stats. + * + * @return + * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingStats + * @see nvmlDeviceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); + +/** @} */ + +/** @addtogroup nvmlDeviceQueries + * @{ + */ + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses); + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's + * retirement. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); + +/** + * Check if any pages are pending retirement and need a reboot to fully retire. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param isPending Reference in which to return the pending status + * + * @return + * - \ref NVML_SUCCESS if \a isPending was populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); + +/** + * Get number of remapped rows. The number of rows reported will be based on + * the cause of the remapping. isPending indicates whether or not there are + * pending remappings. A reset will be required to actually remap the row. + * failureOccurred will be set if a row remapping ever failed in the past. A + * pending remapping won't affect future work on the GPU since + * error-containment and dynamic page blacklisting will take care of that. + * + * @note On MIG-enabled GPUs with active instances, querying the number of + * remapped rows is not supported + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param corrRows Reference for number of rows remapped due to correctable errors + * @param uncRows Reference for number of rows remapped due to uncorrectable errors + * @param isPending Reference for whether or not remappings are pending + * @param failureOccurred Reference that is set when a remapping has failed in the past + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN Unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, + unsigned int *isPending, unsigned int *failureOccurred); + +/** + * Get the row remapper histogram. Returns the remap availability for each bank + * on the GPU. + * + * @param device Device handle + * @param values Histogram values + * + * @return + * - \ref NVML_SUCCESS On success + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); + +/** + * Get architecture for device + * + * @param device The identifier of the target device + * @param arch Reference where architecture is returned, if call successful. + * Set to NVML_DEVICE_ARCH_* upon success + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output refererence) are invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitCommands Unit Commands + * This chapter describes NVML operations that change the state of the unit. For S-class products. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the LED state for the unit. The LED can be either green (0) or amber (1). + * + * For S-class products. + * Requires root/admin permissions. + * + * This operation takes effect immediately. + * + * + * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * and back LEDs will be toggled in unison regardless of which unit is specified with this command. + * + * See \ref nvmlLedColor_t for available colors. + * + * @param unit The identifier of the target unit + * @param color The target LED color + * + * @return + * - \ref NVML_SUCCESS if the LED color has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitGetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceCommands Device Commands + * This chapter describes NVML operations that change the state of the device. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the persistence mode for the device. + * + * For all products. + * For Linux only. + * Requires root/admin permissions. + * + * The persistence mode determines whether the GPU driver software is torn down after the last client + * exits. + * + * This operation takes effect immediately. It is not persistent across reboots. After each reboot the + * persistence mode is reset to "Disabled". + * + * See \ref nvmlEnableState_t for available modes. + * + * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA + * memory, the given device handle will no longer be valid, and to continue to interact with this + * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. This + * limitation is currently only applicable to devices that have a coherent NVLink connection to + * system memory. + * + * @param device The identifier of the target device + * @param mode The target persistence mode + * + * @return + * - \ref NVML_SUCCESS if the persistence mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Set the compute mode for the device. + * + * For all products. + * Requires root/admin permissions. + * + * The compute mode determines whether a GPU can be used for compute operations and whether it can + * be shared across contexts. + * + * This operation takes effect immediately. Under Linux it is not persistent across reboots and + * always resets to "Default". Under windows it is persistent. + * + * Under windows compute mode may only be set to DEFAULT when running in WDDM + * + * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. + * + * See \ref nvmlComputeMode_t for details on available compute modes. + * + * @param device The identifier of the target device + * @param mode The target compute mode + * + * @return + * - \ref NVML_SUCCESS if the compute mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); + +/** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support. + * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. + * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + +/** + * Clear the ECC error and other memory error counts for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. + * Requires root/admin permissions. + * Requires ECC Mode to be enabled. + * + * Sets all of the specified ECC counters to 0, including both detailed and total counts. + * + * This operation takes effect immediately. + * + * See \ref nvmlMemoryErrorType_t for details on available counter types. + * + * @param device The identifier of the target device + * @param counterType Flag that indicates which type of errors should be cleared. + * + * @return + * - \ref NVML_SUCCESS if the error counts were cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see + * - nvmlDeviceGetDetailedEccErrors() + * - nvmlDeviceGetTotalEccErrors() + */ +nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); + +/** + * Set the driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * Requires root/admin permissions. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. + * + * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). + * This should only be done if the host is subsequently powered down and the display is detached from the device + * before the next reboot. + * + * This operation takes effect after the next reboot. + * + * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. + * + * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or + * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. + * + * See \ref nvmlDriverModel_t for details on available driver models. + * See \ref nvmlFlagDefault and \ref nvmlFlagForce + * + * @param device The identifier of the target device + * @param driverModel The target driver model + * @param flags Flags that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if the driver model has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); + +typedef enum nvmlClockLimitId_enum { + NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, + NVML_CLOCK_LIMIT_ID_TDP, + NVML_CLOCK_LIMIT_ID_UNLIMITED +} nvmlClockLimitId_t; + +/** + * Set clocks that device will lock to. + * + * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. + * Setting this will supercede application clock values and take effect regardless if a cuda app is running. + * See /ref nvmlDeviceSetApplicationsClocks + * + * Can be used as a setting to request constant performance. + * + * This can be called with a pair of integer clock frequencies in MHz, or a pair of /ref nvmlClockLimitId_t values. + * See the table below for valid combinations of these values. + * + * minGpuClock | maxGpuClock | Effect + * ------------+-------------+-------------------------------------------------- + * tdp | tdp | Lock clock to TDP + * unlimited | tdp | Upper bound is TDP but clock may drift below this + * tdp | unlimited | Lower bound is TDP but clock may boost above this + * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) + * + * If one arg takes one of these values, the other must be one of these values as + * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. + * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetGpuLockedClocks. + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minGpuClockMHz Requested minimum gpu clock in MHz + * @param maxGpuClockMHz Requested maximum gpu clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); + +/** + * Resets the gpu clock to the default value + * + * This is the gpu clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * @see nvmlDeviceSetGpuLockedClocks + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); + +/** + * Set memory clocks that device will lock to. + * + * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. + * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. + * See /ref nvmlDeviceSetApplicationsClocks + * + * Can be used as a setting to request constant performance. + * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetMemoryLockedClocks. + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minMemClockMHz Requested minimum memory clock in MHz + * @param maxMemClockMHz Requested maximum memory clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); + +/** + * Resets the memory clock to the default value + * + * This is the memory clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * @see nvmlDeviceSetMemoryLockedClocks + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); + +/** + * Set clocks that applications will lock to. + * + * Sets the clocks that compute and graphics applications will be running at. + * e.g. CUDA driver requests these clocks during context creation which means this property + * defines clocks at which CUDA applications will be running unless some overspec event + * occurs (e.g. over power, over thermal or external HW brake). + * + * Can be used as a setting to request constant performance. + * + * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * + * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call + * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting + * above the clock value being set. + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * for details on how to list available clocks combinations. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetApplicationsClocks. + * + * @param device The identifier of the target device + * @param memClockMHz Requested memory clock in MHz + * @param graphicsClockMHz Requested graphics clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); + +/** + * Retrieves the frequency monitor fault status for the device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root user. + * + * See \ref nvmlClkMonStatus_t for details on decoding the status output. + * + * @param device The identifier of the target device + * @param status Reference in which to return the clkmon fault status + * + * @return + * - \ref NVML_SUCCESS if \a status has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetClkMonStatus() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); + +/** + * Set new power limit of this device. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * @param device The identifier of the target device + * @param limit Power management limit in milliwatts to set + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPowerManagementLimitConstraints + * @see nvmlDeviceGetPowerManagementDefaultLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); + +/** + * Sets new GOM. See \a nvmlGpuOperationMode_t for details. + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. + * Requires root/admin permissions. + * + * Changing GOMs requires a reboot. + * The reboot requirement might be removed in the future. + * + * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when + * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. + * + * @param device The identifier of the target device + * @param mode Target GOM + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceGetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); + +/** + * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. + * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction + * to query the current restriction settings. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted The target restriction + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support + * the feature that api restrictions are being set for (E.G. Enabling/disabling auto + * boosted clocks is not supported by the device) + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Enables or disables per process accounting. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note This setting is not persistent and will default to disabled after driver unloads. + * Enable persistence mode to be sure the setting doesn't switch off to disabled. + * + * @note Enabling accounting mode has no negative impact on the GPU performance. + * + * @note Disabling accounting clears all accounting pids information. + * + * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. + * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceClearAccountingPids + * + * @param device The identifier of the target device + * @param mode The target accounting mode + * + * @return + * - \ref NVML_SUCCESS if the new mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Clears accounting information about all processes that have already terminated. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup NvLink NvLink Methods + * This chapter describes methods that NVML can perform on NVLINK enabled devices. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves the state of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that + * the link is active and NVML_FEATURE_DISABLED indicates it + * is inactive + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); + +/** + * Retrieves the version of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param version Requested NvLink version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); + +/** + * Retrieves the requested capability from the device's NvLink for the link specified + * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried + * The return value should be treated as a boolean. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is available + * + * @return + * - \ref NVML_SUCCESS if \a capResult has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, + nvmlNvLinkCapability_t capability, unsigned int *capResult); + +/** + * Retrieves the PCI information for the remote node on a NvLink link + * Note: pciSubSystemId is not filled in this function and is indeterminate + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param pci \a nvmlPciInfo_t of the remote node for the specified link + * + * @return + * - \ref NVML_SUCCESS if \a pci has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); + +/** + * Retrieves the specified error counter value + * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the NvLink counter to be queried + * @param counterValue Returned counter value + * + * @return + * - \ref NVML_SUCCESS if \a counter has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, + nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); + +/** + * Resets all error counters to zero + * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * + * @return + * - \ref NVML_SUCCESS if the reset is successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); + +/** + * Deprecated: Setting utilization counter control is no longer supported. + * + * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset + * of the counters if the reset parameter is non-zero. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set + * @param reset Resets the counters on set if non-zero + * + * @return + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control, unsigned int reset); + +/** + * Deprecated: Getting utilization counter control is no longer supported. + * + * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information + * + * @return + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control); + + +/** + * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. + * + * Retrieve the NVLINK utilization counter based on the current control for a specified counter. + * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl + * before reading the utilization counters as they have no default state + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be read (0 or 1). + * @param rxcounter Receive counter return value + * @param txcounter Transmit counter return value + * + * @return + * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, + unsigned long long *rxcounter, unsigned long long *txcounter); + +/** + * Deprecated: Freezing NVLINK utilization counters is no longer supported. + * + * Freeze the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be frozen (0 or 1). + * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters + * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters + * + * @return + * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, + unsigned int counter, nvmlEnableState_t freeze); + +/** + * Deprecated: Resetting NVLINK utilization counters is no longer supported. + * + * Reset the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be reset + * @param counter Specifies the counter that should be reset (0 or 1) + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); + +/** +* Get the NVLink device type of the remote device connected over the given link. +* +* @param device The device handle of the target GPU +* @param link The NVLink link index on the target GPU +* @param pNvLinkDeviceType Pointer in which the output remote device type is returned +* +* @return +* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or +* \a pNvLinkDeviceType is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is +* otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. + * @{ + */ +/***************************************************************************************************/ + +/** + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree + * + * For Fermi &tm; or newer fully supported devices. + * @param set Reference in which to return the event handle + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); + +/** + * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t + * + * For Fermi &tm; or newer fully supported devices. + * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) + * + * For Linux only. + * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes + * are registered in that case. + * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which add new event types + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); + +/** + * Returns information about events supported on device + * + * For Fermi &tm; or newer fully supported devices. + * + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. + * + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events + * + * @return + * - \ref NVML_SUCCESS if the eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); + +/** + * Waits on events and delivers events + * + * For Fermi &tm; or newer fully supported devices. + * + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) + * + * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. + * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error + * type is returned for all xid error events. + * + * On Linux, every xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles. + * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * + * @return + * - \ref NVML_SUCCESS if the data has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); + +/** + * Releases events in the set + * + * For Fermi &tm; or newer fully supported devices. + * + * @param set Reference to events to be released + * + * @return + * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlZPI Drain states + * This chapter describes methods that NVML can perform against each device to control their drain state + * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to + * power on/off GPUs, enable robust reset scenarios, etc. + * @{ + */ +/***************************************************************************************************/ + +/** + * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. + * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before + * this call is made. + * Must be called as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU drain state to be modified + * @param newState The drain state that should be entered, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); + +/** + * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining + * state. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU drain state to be queried + * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); + +/** + * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver + * as long as no other processes are attached. If other processes are attached, this call will return + * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the + * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called + * to initiate the draining state is if that process was using, and is still using, a GPU before the + * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled + * prior to this call. + * + * For long-running NVML processes please note that this will change the enumeration of current GPUs. + * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. + * Also, device handles after the removed GPU will not be valid and must be re-established. + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU to be removed + * @param gpuState Whether the GPU is to be removed, from the OS + * see \ref nvmlDetachGpuState_t + * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed + */ +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); + +/** + * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that + * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. + * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes + * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. + * + * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds + * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. + * + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device + * fields are used in this call. + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature + * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueQueries Field Value Queries + * This chapter describes NVML operations that are associated with retrieving Field Values from NVML + * @{ + */ +/***************************************************************************************************/ + +/** + * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. + * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs + * will be populated from a single call rather than making a driver call for each fieldId. + * + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be retrieved + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call + * + * @return + * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup vGPU Enums, Constants and Structs + * @{ + */ +/** @} */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup nvmlVirtualGpuQueries vGPU APIs + * This chapter describes operations that are associated with NVIDIA vGPU Software products. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to get the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a pVirtualMode is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); + +/** + * Queries if SR-IOV host operation is supported on a vGPU supported device. + * + * Checks whether SR-IOV host capability is supported by the device and the + * driver, and indicates device is in SR-IOV mode if both of these conditions + * are true. + * + * @param device The identifier of the target device + * @param pHostVgpuMode Reference in which to return the current vGPU mode + * + * @return + * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pVgpuMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. + * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); + +/** + * This method is used to set the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a pVirtualMode is set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. + * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); + +/** + * Retrieve the vGPU Software licensable features. + * + * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. + * + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned + * + * @return + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); + +/** + * Retrieve GSP firmware version. + * + * The caller passes in buffer via \a version and corresponding GSP firmware numbered version + * is returned with the same parameter in string format. + * + * @param device Device handle + * @param version The retrieved GSP firmware version + * + * @return + * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); + +/** + * Retrieve GSP firmware mode. + * + * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with + * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * + * @param device Device handle + * @param isEnabled Pointer to specify if GSP firmware is enabled + * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * + * @return + * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpu vGPU Management + * @{ + * + * This chapter describes APIs supporting NVIDIA vGPU. + */ +/***************************************************************************************************/ + +/** + * Retrieve the supported vGPU types on a physical GPU (device). + * + * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the currently creatable vGPU types on a physical GPU (device). + * + * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types + * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable + * list will be restricted to whatever vGPU type is already running on the device. + * + * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeClass Pointer to string array to return class in + * @param size Size of string + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); + +/** + * Retrieve the vGPU type name. + * + * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not + * exceed 64 characters in length (including the NUL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeName Pointer to buffer to return name + * @param size Size of buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); + +/** + * Retrieve the GPU Instance Profile ID for the given vGPU type ID. + * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is + * returned. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param gpuInstanceProfileId GPU Instance Profile ID + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); + +/** + * Retrieve the device ID of a vGPU type. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value + * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); + +/** + * Retrieve the vGPU framebuffer size in bytes. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param fbSize Pointer to framebuffer size in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); + +/** + * Retrieve count of vGPU's supported display heads. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param numDisplayHeads Pointer to number of display heads + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); + +/** + * Retrieve vGPU display head's maximum supported resolution. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param displayIndex Zero-based index of display head + * @param xdim Pointer to maximum number of pixels in X dimension + * @param ydim Pointer to maximum number of pixels in Y dimension + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex + * is out of range. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); + +/** + * Retrieve license requirements for a vGPU type + * + * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form + * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, + * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". + * + * The total length of the returned string will not exceed 128 characters, including the NUL terminator. + * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeLicenseString Pointer to buffer to return license info + * @param size Size of \a vgpuTypeLicenseString buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); + +/** + * Retrieve the static frame rate limit value of the vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param frameRateLimit Reference to return the frame rate limit value + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); + +/** + * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCount Pointer to get the max number of vGPU instances + * that can be created on a deicve for given vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, + * or \a vgpuInstanceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); + +/** + * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); + +/** + * Retrieve the active vGPU instances on a device. + * + * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The + * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances + * written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. + * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return + * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer which passes in the array size as well as get + * back the number of types + * @param vgpuInstances Pointer to array in which to return list of vGPU instances + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); + +/** + * Retrieve the VM ID associated with a vGPU instance. + * + * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vmId Pointer to caller-supplied buffer to hold VM ID + * @param size Size of buffer in bytes + * @param vmIdType Pointer to hold VM ID type + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); + +/** + * Retrieve the UUID of a vGPU instance. + * + * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); + +/** + * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. + * + * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version + * string will not exceed 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + * + * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is + * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the + * NVIDIA driver is loaded and initialized. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param version Caller-supplied buffer to return driver version string + * @param length Size of \a version buffer + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); + +/** + * Retrieve the framebuffer usage in bytes. + * + * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance The identifier of the target instance + * @param fbUsage Pointer to framebuffer usage in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); + +/** + * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2. + * + * Retrieve the current licensing state of the vGPU instance. + * + * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licensed Reference to return the licensing status + * + * @return + * - \ref NVML_SUCCESS if \a licensed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); + +/** + * Retrieve the vGPU type of a vGPU instance. + * + * Returns the vGPU type ID of vgpu assigned to the vGPU instance. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vgpuTypeId Reference to return the vgpuTypeId + * + * @return + * - \ref NVML_SUCCESS if \a vgpuTypeId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); + +/** + * Retrieve the frame rate limit set for the vGPU instance. + * + * Returns the value of the frame rate limit set for the vGPU instance + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param frameRateLimit Reference to return the frame rate limit + * + * @return + * - \ref NVML_SUCCESS if \a frameRateLimit has been set + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); + +/** + * Retrieve the current ECC mode of vGPU instance. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param eccMode Reference in which to return the current ECC mode + * + * @return + * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); + +/** + * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); + +/** + * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param encoderCapacity Unsigned int for the encoder capacity value + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); + +/** + * Retrieves the current encoder statistics of a vGPU Instance + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL + * or \a vgpuInstance is 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about all active encoder sessions on a vGPU Instance. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to caller supplied array size, and returns + * the number of sessions. + * @param sessionInfo Reference to caller supplied array in which the list + * of session information us returned. + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfo is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is + returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); + +/** +* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param vgpuInstance Identifier of the target vGPU instance +* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats +* +* @return +* - \ref NVML_SUCCESS if \a fbcStats is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats); + +/** +* Retrieves information about active frame buffer capture sessions on a vGPU Instance. +* +* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The +* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions +* written to the buffer. +* +* If the supplied buffer is not large enough to accomodate the active session array, the function returns +* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. +* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return +* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may +* be zero if there are no new frames captured since the session started. +* +* @param vgpuInstance Identifier of the target vGPU instance +* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. +* @param sessionInfo Reference in which to return the session information +* +* @return +* - \ref NVML_SUCCESS if \a sessionInfo is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL. +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); + +/** +* Retrieve the GPU Instance ID for the given vGPU Instance. +* The API will return a valid GPU Instance ID for MIG backed vGPU Instance, else INVALID_GPU_INSTANCE_ID is returned. +* +* For Kepler &tm; or newer fully supported devices. +* +* @param vgpuInstance Identifier of the target vGPU instance +* @param gpuInstanceId GPU Instance ID +* +* @return +* - \ref NVML_SUCCESS successful completion +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a gpuInstanceId is NULL. +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int *gpuInstanceId); + +/** +* Retrieves the PCI Id of the given vGPU Instance i.e. the PCI Id of the GPU as seen inside the VM. +* +* The vGPU PCI id is returned as "00000000:00:00.0" if NVIDIA driver is not installed on the vGPU instance. +* +* @param vgpuInstance Identifier of the target vGPU instance +* @param vgpuPciId Caller-supplied buffer to return vGPU PCI Id string +* @param length Size of the vgpuPciId buffer +* +* @return +* - \ref NVML_SUCCESS if vGPU PCI Id is sucessfully retrieved +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuPciId is NULL +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance +* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small, \a length is set to required length +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length); + +/** +* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure +* for the specific capabilities that can be queried. The return value in \a capResult should be treated as +* a boolean, with a non-zero value indicating that the capability is supported. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param vgpuTypeId Handle to vGPU type +* @param capability Specifies the \a nvmlVgpuCapability_t to be queried +* @param capResult A boolean for the queried capability indicating that feature is supported +* +* @return +* - \ref NVML_SUCCESS successful completion +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvml vGPU Migration + * This chapter describes operations that are associated with vGPU Migration. + * @{ + */ +/***************************************************************************************************/ + +/** + * Structure representing range of vGPU versions. + */ +typedef struct nvmlVgpuVersion_st +{ + unsigned int minVersion; //!< Minimum vGPU version. + unsigned int maxVersion; //!< Maximum vGPU version. +} nvmlVgpuVersion_t; + +/** + * vGPU metadata structure. + */ +typedef struct nvmlVgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields + char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host + unsigned int reserved[6]; //!< Reserved for internal use + unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld + unsigned int guestVgpuVersion; //!< vGPU version of guest driver + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuMetadata_t; + +/** + * Physical GPU metadata structure + */ +typedef struct nvmlVgpuPgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version + unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld + unsigned int reserved[5]; //!< Reserved for internal use + nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuPgpuMetadata_t; + +/** + * vGPU VM compatibility codes + */ +typedef enum nvmlVgpuVmCompatibility_enum +{ + NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable + NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) + NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) + NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) + NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 //!< vGPU is runnable from a live/paused (ACPI S0) +} nvmlVgpuVmCompatibility_t; + +/** + * vGPU-pGPU compatibility limit codes + */ +typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum +{ + NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited. + NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< ompatibility is limited by host driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. + NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 //!< Compatibility is limited by an undefined factor. +} nvmlVgpuPgpuCompatibilityLimitCode_t; + +/** + * vGPU-pGPU compatibility structure + */ +typedef struct nvmlVgpuPgpuCompatibility_st +{ + nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t + nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t +} nvmlVgpuPgpuCompatibility_t; + +/** + * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM + * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section + * containing internal state. + * + * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are + * dependent on information obtained from the guest VM, which may not yet have reached a state where that information + * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field. + * + * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide + * it to Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. + * + * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure + * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param vgpuInstance vGPU instance handle + * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written + * @param bufferSize Size of vgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); + +/** + * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about + * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section + * containing internal state. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata + * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); + +/** + * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a + * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the + * physical GPU. + * + * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The + * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility + * with the physical GPU is limited, a limit code indicates the factor limiting compability. + * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). + * + * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to + * boot a given vGPU or associated VM. + * + * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure + * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure + * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info + * + * @return + * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); + +/** + * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the + * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); + +/* + * Virtual GPU (vGPU) version + * + * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that + * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager + * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. + * + * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver + * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. + */ + +/** + * Query the ranges of supported vGPU versions. + * + * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. + * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. + * + * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: + * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. + * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. + * + * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written + * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written + * + * @return + * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. + * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. + */ +nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); + +/** + * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. + * + * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the + * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to + * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. + * + * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. + * + * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. + * + * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. + * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. + * 3. If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated + * vGPU version that is the maximum value in the overlapping range. + * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. + * + * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. + * + * @return + * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. + */ +nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUtil vGPU Utilization and Accounting + * This chapter describes operations that are associated with vGPU Utilization and Accounting. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves current utilization for vGPUs on a physical GPU (device). + * + * For Kepler &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running + * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer + * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values + * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is + * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, + nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); + +/** + * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on + * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the + * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size + * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is + * passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + unsigned int *vgpuProcessSamplesCount, + nvmlVgpuProcessUtilizationSample_t *utilizationSamples); +/** + * Queries the state of per process accounting mode on vGPU. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); + +/** + * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes + * returned can be in running or terminated state. + * + * For Maxwell &tm; or newer fully supported devices. + * + * To just query the maximum number of processes that can be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlVgpuInstanceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlVgpuInstanceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); + +/** + * Queries process's accounting stats. + * + * For Maxwell &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and + * can be queried during life time of the process or after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * or \a stats is not found + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Clears accounting information of the vGPU instance that have already terminated. + * + * For Maxwell &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications + * stats don't contribute to GPU utilization. + * + * @param vgpuInstance The identifier of the target vGPU instance + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); + +/** + * Query the license information of the vGPU instance. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licenseInfo Pointer to vGPU license information structure + * + * @return + * - \ref NVML_SUCCESS if information is successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licenseInfo is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlExcludedGpuQueries Excluded GPU Queries + * This chapter describes NVML operations that are associated with excluded GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** + * Excluded GPU device information + **/ +typedef struct nvmlExcludedDeviceInfo_st +{ + nvmlPciInfo_t pciInfo; //!< The PCI information for the excluded GPU + char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the excluded GPU +} nvmlExcludedDeviceInfo_t; + + /** + * Retrieves the number of excluded GPU devices in the system. + * + * For all products. + * + * @param deviceCount Reference in which to return the number of excluded devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + */ +nvmlReturn_t DECLDIR nvmlGetExcludedDeviceCount(unsigned int *deviceCount); + +/** + * Acquire the device information for an excluded GPU device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a deviceCount returned by + * \ref nvmlGetExcludedDeviceCount(). For example, if \a deviceCount is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * @param index The index of the target GPU, >= 0 and < \a deviceCount + * @param info Reference in which to return the device information + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL + * + * @see nvmlGetExcludedDeviceCount + */ +nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t *info); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management + * This chapter describes NVML operations that are associated with Multi Instance GPU management. + * @{ + */ +/***************************************************************************************************/ + +/** + * Disable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_DISABLE 0x0 + +/** + * Enable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_ENABLE 0x1 + +/** + * GPU instance profiles. + * + * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the + * detailed information about a GPU instance such as profile ID, engine counts. + */ +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_GPU_INSTANCE_PROFILE_8_SLICE 0x5 +#define NVML_GPU_INSTANCE_PROFILE_6_SLICE 0x6 +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8 +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 +#define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA + +typedef struct nvmlGpuInstancePlacement_st +{ + unsigned int start; //!< Index of first occupied memory slice + unsigned int size; //!< Number of memory slices occupied +} nvmlGpuInstancePlacement_t; + +/** + * GPU instance profile information. + */ +typedef struct nvmlGpuInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes +} nvmlGpuInstanceProfileInfo_t; + +/** + * GPU instance profile information (v2). + * + * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field + * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name + * field to the end. This structure is not backwards-compatible with + * \ref nvmlGpuInstanceProfileInfo_t. + */ +typedef struct nvmlGpuInstanceProfileInfo_v2_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlGpuInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. + */ +#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) + +typedef struct nvmlGpuInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + unsigned int id; //!< Unique instance ID within the device + unsigned int profileId; //!< Unique profile ID within the device + nvmlGpuInstancePlacement_t placement; //!< Placement for this instance +} nvmlGpuInstanceInfo_t; + +typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; + +/** + * Compute instance profiles. + * + * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the + * detailed information about a compute instance such as profile ID, engine counts + */ +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 +#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 +#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 + +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 + +typedef struct nvmlComputeInstancePlacement_st +{ + unsigned int start; //!< Index of first occupied compute slice + unsigned int size; //!< Number of compute slices occupied +} nvmlComputeInstancePlacement_t; + +/** + * Compute instance profile information. + */ +typedef struct nvmlComputeInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count +} nvmlComputeInstanceProfileInfo_t; + +/** + * Compute instance profile information (v2). + * + * Version 2 adds the \ref nvmlComputeInstanceProfileInfo_v2_t.version field + * to the start of the structure, and the \ref nvmlComputeInstanceProfileInfo_v2_t.name + * field to the end. This structure is not backwards-compatible with + * \ref nvmlComputeInstanceProfileInfo_t. + */ +typedef struct nvmlComputeInstanceProfileInfo_v2_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlComputeInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v2_t.version. + */ +#define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) + +typedef struct nvmlComputeInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance + unsigned int id; //!< Unique instance ID within the GPU instance + unsigned int profileId; //!< Unique profile ID within the GPU instance + nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} +} nvmlComputeInstanceInfo_t; + +typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; + +/** + * Set MIG mode for the device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root user. + * + * This mode determines whether a GPU instance can be created. + * + * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the + * device, such as minor number, might change. The caller of this API is expected to query such attributes again. + * + * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM + * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. + * + * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device + * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API + * is expected to idle the device and retry setting the \a mode. + * + * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref + * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. + * + * @param device The identifier of the target device + * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param activationStatus The activationStatus status + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device,\a mode or \a activationStatus are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); + +/** + * Get MIG mode for the device. + * + * For Ampere &tm; or newer fully supported devices. + * + * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the + * next activation trigger. + * + * @param device The identifier of the target device + * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); + +/** + * Get GPU instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_t *info); + +/** + * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned + * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. For example: + * \code + * nvmlGpuInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlGpuInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, + * profile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_v2_t *info); + +/** + * Get GPU instance placements. + * + * A placement represents the location of a GPU instance within a device. This API only returns all the possible + * placements for the given profile. + * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will + * fail if there is overlap with the already occupied memory slices. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placements Returns placements allowed for the profile. Can be NULL to discover number + * of allowed placements for this profile. If non-NULL must be large enough + * to accommodate the placements supported by the profile. + * @param count Returns number of allowed placemenets for the profile. + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstancePlacement_t *placements, + unsigned int *count); + +/** + * Get GPU instance profile capacity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, + unsigned int *count); + +/** + * Create GPU instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstance Returns the GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + */ +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstance); + +/** + * Create GPU instance with the specified placement. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placement The requested placement. See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 + * @param gpuInstance Returns the GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId, \a placement or \a gpuInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + */ +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, + const nvmlGpuInstancePlacement_t *placement, + nvmlGpuInstance_t *gpuInstance); +/** + * Destroy GPU instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes + * (e.g. CUDA application) or compute instances are active on the + * GPU instance. + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); + +/** + * Get GPU instances for given profile ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstances Returns pre-exiting GPU instances, the buffer must be large enough to + * accommodate the instances supported by the profile. + * See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count The count of returned GPU instances + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstances, unsigned int *count); + +/** + * Get GPU instances for given instance ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param id The GPU instance ID + * @param gpuInstance Returns GPU instance + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); + +/** + * Get GPU instance information. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The GPU instance handle + * @param info Return GPU instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); + +/** + * Get compute instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_t *info); + +/** + * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned + * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. For example: + * \code + * nvmlComputeInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlComputeInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, + * profile, + * engProfile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_v2_t *info); + +/** + * Get compute instance profile capacity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, unsigned int *count); + +/** + * Create compute instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstance Returns the compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstance); + +/** + * Destroy compute instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param computeInstance The compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if + * processes (e.g. CUDA application) are active on the compute instance. + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); + +/** + * Get compute instances for given profile ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstances Returns pre-exiting compute instances, the buffer must be large enough to + * accommodate the instances supported by the profile. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count The count of returned compute instances + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstances, unsigned int *count); + +/** + * Get compute instance for given instance ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param id The compute instance ID + * @param computeInstance Returns compute instance + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, + nvmlComputeInstance_t *computeInstance); + +/** + * Get compute instance information. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param computeInstance The compute instance handle + * @param info Return compute instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); + +/** + * Test if the given handle refers to a MIG device. + * + * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. + * These overloaded references can be used (with some restrictions) interchangeably + * with a GPU device handle to execute queries at a per-compute instance granularity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device NVML handle to test + * @param isMigDevice True when handle refers to a MIG device + * + * @return + * - \ref NVML_SUCCESS if \a device status was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); + +/** + * Get GPU instance ID for the given MIG device handle. + * + * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id GPU instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get compute instance ID for the given MIG device handle. + * + * Compute instance IDs are unique per GPU instance and remain valid until the compute instance + * is destroyed. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id Compute instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get the maximum number of MIG devices that can exist under a given parent NVML device. + * + * Returns zero if MIG is not supported or enabled. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target device handle + * @param count Count of MIG devices + * + * @return + * - \ref NVML_SUCCESS if \a count was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); + +/** + * Get MIG device handle for the given index under its parent NVML device. + * + * If the compute instance is destroyed either explicitly or by destroying, + * resetting or unbinding the parent GPU instance or the GPU device itself + * the MIG device handle would remain invalid and must be requested again + * using this API. Handles may be reused and their properties can change in + * the process. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Reference to the parent GPU device handle + * @param index Index of the MIG device + * @param migDevice Reference to the MIG device handle + * + * @return + * - \ref NVML_SUCCESS if \a migDevice handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, + nvmlDevice_t *migDevice); + +/** + * Get parent device handle from a MIG device handle. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param migDevice MIG device handle + * @param device Device handle + * + * @return + * - \ref NVML_SUCCESS if \a device handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); + +/** + * Get the type of the GPU Bus (PCIe, PCI, ...) + * + * @param device The identifier of the target device + * @param type The PCI Bus type + * + * return + * - \ref NVML_SUCCESS if the bus \a type is successfully retreived + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \device is invalid or \type is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); + +/** + * Retrieve performance monitor samples from the associated subdevice. + * + * @param device + * @param pDynamicPstatesInfo + * + * @return + * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); + +/** + * Sets the speed of a specified fan. + * + * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor + * the temperature and adjust the fan speed accordingly. + * If you set the fan speed too low you can burn your GPU! + * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. + * + * For all cuda-capable discrete products with fans that are Maxwell or Newer. + * + * device The identifier of the target device + * fan The index of the fan, starting at zero + * speed The target speed of the fan [0-100] in % of max speed + * + * return + * NVML_SUCCESS if the fan speed has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, + * or if the fan index doesn't reference an actual fan. + * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. + * NVML_ERROR_UNKNOWN if there was an unexpected error. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); + +/** + * Retrieve the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved GPCCLK VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[in] offset The GPCCLK VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve the MemClk (Memory Clock) VF offset value. + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved MemClk VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. + * @param[in] device The identifier of the target device + * @param[in] offset The MemClk VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve min and max clocks of some clock domain for a given PState + * + * @param device The identifier of the target device + * @param type Clock domain + * @param pstate PState to query + * @param minClockMHz Reference in which to return min clock frequency + * @param maxClockMHz Reference in which to return max clock frequency + * + * @return + * - \ref NVML_SUCCESS if everything worked + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both + * \a minClockMHz and \a maxClockMHz are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, + unsigned int * minClockMHz, unsigned int * maxClockMHz); + +/** + * Get all supported Performance States (P-States) for the device. + * + * The returned array would contain a contiguous list of valid P-States supported by + * the device. If the number of supported P-States is fewer than the size of the array + * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * + * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. + * + * @param device The identifier of the target device + * @param pstates Container to return the list of performance states + * supported by device + * @param size Size of the supplied \a pstates array in bytes + * + * @return + * - \ref NVML_SUCCESS if \a pstates array has been retrieved + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the the container supplied was not large enough to + * hold the resulting list + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, + nvmlPstates_t *pstates, unsigned int size); + +/** + * Retrieve the GPCCLK min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved GPCCLK VF min offset value + * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Retrieve the MemClk (Memory Clock) min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved MemClk VF min offset value + * @param[out] maxOffset The retrieved MemClk VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Get Conf Computing System capabilities. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param capabilities System CC capabilities + * + * @return + * - \ref NVML_SUCCESS if \a capabilities were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); + +/** + * Get Conf Computing System State. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param state System CC State + * + * @return + * - \ref NVML_SUCCESS if \a state were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); + +/** + * Get Conf Computing Protected and Unprotected Memory Sizes. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device handle + * @param memInfo Protected/Unprotected Memory sizes + * + * @return + * - \ref NVML_SUCCESS if \a memInfo were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); + +/** + * Set Conf Computing Protected Memory Size. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device Handle + * @param sizeKiB Protected Memory size to be set in KiB + * + * @return + * - \ref NVML_SUCCESS if \a sizeKiB successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeProtectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); + +/** + * Set Conf Computing GPUs ready state. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); + +/** + * Get Conf Computing GPUs ready state. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param isAcceptingWork Returns GPU current work accepting state, + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); + +/** + * Get Conf Computing protected memory usage. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param memory Reference in which to return the memory information + * + * @return + * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); + +/** + * Get Conf Computing Gpu certificate details. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param gpuCert Reference in which to return the gpu certificate information + * + * @return + * - \ref NVML_SUCCESS if \a gpu certificate info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, + nvmlConfComputeGpuCertificate_t *gpuCert); + +/** + * Get Conf Computing Gpu attestation report. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param gpuAtstReport Reference in which to return the gpu attestation report + * + * @return + * - \ref NVML_SUCCESS if \a gpu attestation report has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, + nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup GPM NVML GPM + * @{ + */ +/***************************************************************************************************/ +/** @defgroup nvmlGpmEnums GPM Enums + * @{ + */ +/***************************************************************************************************/ + +/* GPM Metric Identifiers */ +typedef enum +{ + NVML_GPM_METRIC_GRAPHICS_UTIL = 1, /* Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_UTIL = 2, /* Percentage of SMs that were busy. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_OCCUPANCY = 3, /* Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_INTEGER_UTIL = 4, /* Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, /* Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, /* Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, /* Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, /* Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DRAM_BW_UTIL = 10, /* Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP64_UTIL = 11, /* Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP32_UTIL = 12, /* Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP16_UTIL = 13, /* Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, /* PCIe traffic from this GPU in MiB/sec */ + NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, /* PCIe traffic to this GPU in MiB/sec */ + NVML_GPM_METRIC_NVDEC_0_UTIL = 30, /* Percent utilization of NVDEC 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_1_UTIL = 31, /* Percent utilization of NVDEC 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_2_UTIL = 32, /* Percent utilization of NVDEC 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_3_UTIL = 33, /* Percent utilization of NVDEC 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_4_UTIL = 34, /* Percent utilization of NVDEC 4. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_5_UTIL = 35, /* Percent utilization of NVDEC 5. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_6_UTIL = 36, /* Percent utilization of NVDEC 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_7_UTIL = 37, /* Percent utilization of NVDEC 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_0_UTIL = 40, /* Percent utilization of NVJPG 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_1_UTIL = 41, /* Percent utilization of NVJPG 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_2_UTIL = 42, /* Percent utilization of NVJPG 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_3_UTIL = 43, /* Percent utilization of NVJPG 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_4_UTIL = 44, /* Percent utilization of NVJPG 4. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_5_UTIL = 45, /* Percent utilization of NVJPG 5. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_6_UTIL = 46, /* Percent utilization of NVJPG 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_7_UTIL = 47, /* Percent utilization of NVJPG 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVOFA_0_UTIL = 50, /* Percent utilization of NVOFA 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, /* NvLink read bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, /* NvLink write bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, /* NvLink read bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, /* NvLink write bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, /* NvLink read bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, /* NvLink write bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, /* NvLink read bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, /* NvLink write bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, /* NvLink read bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, /* NvLink write bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, /* NvLink read bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, /* NvLink write bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, /* NvLink read bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, /* NvLink write bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, /* NvLink read bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, /* NvLink write bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, /* NvLink read bandwidth for link 7 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, /* NvLink write bandwidth for link 7 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, /* NvLink read bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, /* NvLink write bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, /* NvLink read bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, /* NvLink write bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, /* NvLink read bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, /* NvLink write bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, /* NvLink read bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, /* NvLink write bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, /* NvLink read bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, /* NvLink write bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, /* NvLink read bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, /* NvLink write bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, /* NvLink read bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, /* NvLink write bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, /* NvLink read bandwidth for link 15 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, /* NvLink write bandwidth for link 15 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, /* NvLink read bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, /* NvLink write bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, /* NvLink read bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, /* NvLink write bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_MAX = 98, /* Maximum value above +1. Note that changing this + should also change NVML_GPM_METRICS_GET_VERSION + due to struct size change */ +} nvmlGpmMetricId_t; + +/** @} */ // @defgroup nvmlGpmEnums + + +/***************************************************************************************************/ +/** @defgroup nvmlGpmStructs GPM Structs + * @{ + */ +/***************************************************************************************************/ + +/* Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc() + Free this with nvmlGpmSampleFree() */ +typedef struct nvmlGpmSample_st* nvmlGpmSample_t; + +typedef struct +{ + unsigned int metricId; /* IN: NVML_GPM_METRIC_? #define of which metric to retrieve */ + nvmlReturn_t nvmlReturn; /* OUT: Status of this metric. If this is nonzero, then value is not valid */ + double value; /* OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) */ + struct + { + char *shortName; + char *longName; + char *unit; + } metricInfo; /* OUT: Metric name and unit. Those can be NULL if not defined */ +} nvmlGpmMetric_t; + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_METRICS_GET_VERSION */ + unsigned int numMetrics; /* IN: How many metrics to retrieve in metrics[] */ + nvmlGpmSample_t sample1; /* IN: Sample buffer */ + nvmlGpmSample_t sample2; /* IN: Sample buffer */ + nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; /* IN/OUT: Array of metrics. Set metricId on call. + see nvmlReturn and value on return */ +} nvmlGpmMetricsGet_t; + +#define NVML_GPM_METRICS_GET_VERSION 1 + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_SUPPORT_VERSION */ + unsigned int isSupportedDevice; /* OUT: Indicates device support */ +} nvmlGpmSupport_t; + +#define NVML_GPM_SUPPORT_VERSION 1 + +/** @} */ // @defgroup nvmlGPMStructs + +/***************************************************************************************************/ +/** @defgroup nvmlGpmFunctions GPM Functions + * @{ + */ +/***************************************************************************************************/ + +/** + * Calculate GPM metrics from two samples. + * + * + * @param metricsGet IN/OUT: populated nvmlGpmMetricsGet_t struct + * + * %HOPPER_OR_NEWER% + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + + +/** + * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * + * %HOPPER_OR_NEWER% + * + * @param gpmSample Sample to free + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + */ +nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); + + +/** + * Allocate a sample buffer to be used with NVML GPM . You will need to allocate + * at least two of these buffers to use with the NVML GPM feature + * + * %HOPPER_OR_NEWER% + * + * @param gpmSample Where the allocated sample will be stored + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + * - \ref NVML_ERROR_MEMORY if system memory is insufficient + */ +nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); + +/** + * Read a sample of GPM metrics into the provided \a gpmSample buffer. After + * two samples are gathered, you can call nvmlGpmMetricGet on those samples to + * retrive metrics + * + * %HOPPER_OR_NEWER% + * + * @param device Device to get samples for + * @param gpmSample Buffer to read samples into + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample); + +/** + * Indicate whether the supplied device supports GPM + * + * @param device NVML device to query for + * @param gpmSupport Structure to indicate GPM support. Indicates + * GPM support per system for the supplied device + * + * @return + * - NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum if there is an error in processing the query + */ + +/** + * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance. + * + * After two samples are gathered, you can call nvmlGpmMetricGet on those + * samples to retrive metrics + * + * %HOPPER_OR_NEWER% + * + * @param device Device to get samples for + * @param gpuInstanceId MIG GPU Instance ID + * @param gpmSample Buffer to read samples into + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample); + +nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + +/** @} */ // @defgroup nvmlGpmFunctions +/** @} */ // @defgroup GPM + +/** + * NVML API versioning support + */ + +#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS +nvmlReturn_t DECLDIR nvmlInit(void); +nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); +nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); +#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS + +#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) +// We don't define APIs to run new versions if this guard is present so there is +// no need to undef +#elif defined(__NVML_API_VERSION_INTERNAL) +#undef nvmlDeviceGetGraphicsRunningProcesses +#undef nvmlDeviceGetComputeRunningProcesses +#undef nvmlDeviceGetMPSComputeRunningProcesses +#undef nvmlDeviceGetAttributes +#undef nvmlComputeInstanceGetInfo +#undef nvmlEventSetWait +#undef nvmlDeviceGetGridLicensableFeatures +#undef nvmlDeviceRemoveGpu +#undef nvmlDeviceGetNvLinkRemotePciInfo +#undef nvmlDeviceGetPciInfo +#undef nvmlDeviceGetCount +#undef nvmlDeviceGetHandleByIndex +#undef nvmlDeviceGetHandleByPciBusId +#undef nvmlInit +#undef nvmlBlacklistDeviceInfo_t +#undef nvmlGetBlacklistDeviceCount +#undef nvmlGetBlacklistDeviceInfoByIndex +#undef nvmlDeviceGetGpuInstancePossiblePlacements +#undef nvmlVgpuInstanceGetLicenseInfo +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/pkg/dcgm/policy.go b/pkg/dcgm/policy.go index 692f480..3539180 100644 --- a/pkg/dcgm/policy.go +++ b/pkg/dcgm/policy.go @@ -274,16 +274,20 @@ func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList // C union types are represented as a Go byte array binary.LittleEndian.PutUint32(policy.parms[key].val[:], conditionParam.value) } + var statusHandle C.dcgmStatus_t + result := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, statusHandle) if err = errorString(result); err != nil { return fmt.Errorf("Error setting policies: %s", err) } + log.Println("Policy successfully set.") + return } -func registerPolicy(gpuId uint, typ ...policyCondition) (violation chan PolicyViolation, err error) { +func registerPolicy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { // init policy globals for internal API makePolicyChannels() makePolicyParmsMap() @@ -291,10 +295,11 @@ func registerPolicy(gpuId uint, typ ...policyCondition) (violation chan PolicyVi name := fmt.Sprintf("policy%d", rand.Uint64()) groupId, err := CreateGroup(name) if err != nil { - return + return nil, err } + if err = AddToGroup(groupId, gpuId); err != nil { - return + return nil, err } // make a list of all callback channels @@ -337,56 +342,28 @@ func registerPolicy(gpuId uint, typ ...policyCondition) (violation chan PolicyVi } if err = setPolicy(groupId, condition, paramKeys); err != nil { - return + return nil, err } result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), C.fpRecvUpdates(C.violationNotify)) if err = errorString(result); err != nil { - return violation, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} } log.Println("Listening for violations...") - // create a publisher - publisher := newPublisher() - _ = publisher.add() - _ = publisher.add() - - // broadcast - go publisher.broadcast() - - go func() { - for { - select { - case dbe := <-callbacks["dbe"]: - publisher.send(dbe) - case pcie := <-callbacks["pcie"]: - publisher.send(pcie) - case maxrtpg := <-callbacks["maxrtpg"]: - publisher.send(maxrtpg) - case thermal := <-callbacks["thermal"]: - publisher.send(thermal) - case power := <-callbacks["power"]: - publisher.send(power) - case nvlink := <-callbacks["nvlink"]: - publisher.send(nvlink) - case xid := <-callbacks["xid"]: - publisher.send(xid) - } - } - }() - // merge - violation = make(chan PolicyViolation, len(channels)) + violation := make(chan PolicyViolation, len(channels)) go func() { for _, c := range channels { val := <-c violation <- val } + DestroyGroup(groupId) close(violation) }() - _ = DestroyGroup(groupId) - return + + return violation, err } func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { diff --git a/pkg/dcgm/policy_test.go b/pkg/dcgm/policy_test.go new file mode 100644 index 0000000..d905462 --- /dev/null +++ b/pkg/dcgm/policy_test.go @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPolicyErrors(t *testing.T) { + type testCase struct { + policy policyCondition + injectError func(gpu uint) error + assert func(cb PolicyViolation) + } + tests := []testCase{ + { + policy: DbePolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_ECC_DBE_VOL_DEV", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_ECC_DBE_VOL_DEV, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(1), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, DbePolicy, cb.Condition) + require.IsType(t, dbePolicyCondition{}, cb.Data) + policyCondition := cb.Data.(dbePolicyCondition) + assert.Equal(t, uint(1), policyCondition.NumErrors) + assert.Equal(t, "Device", policyCondition.Location) + }, + }, + { + policy: PowerPolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_POWER_USAGE", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_POWER_USAGE, + DCGM_FT_DOUBLE, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + float64(300.0), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, PowerPolicy, cb.Condition) + require.IsType(t, powerPolicyCondition{}, cb.Data) + policyCondition := cb.Data.(powerPolicyCondition) + assert.Equal(t, uint(300), policyCondition.PowerViolation) + }, + }, + { + policy: PCIePolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_POWER_USAGE", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(1), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, PCIePolicy, cb.Condition) + require.IsType(t, pciPolicyCondition{}, cb.Data) + pciPolicyCondition := cb.Data.(pciPolicyCondition) + assert.Equal(t, uint(1), pciPolicyCondition.ReplayCounter) + }, + }, + { + policy: MaxRtPgPolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_RETIRED_DBE", gpu) + err := InjectFieldValue(gpu, + DCGM_FI_DEV_RETIRED_DBE, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(10), + ) + if err == nil { + //inject a SBE too so that the health check code gets past its internal checks + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_RETIRED_SBE", gpu) + err = InjectFieldValue(gpu, + DCGM_FI_DEV_RETIRED_SBE, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(10), + ) + } + return err + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, MaxRtPgPolicy, cb.Condition) + require.IsType(t, retiredPagesPolicyCondition{}, cb.Data) + retiredPagesPolicyCondition := cb.Data.(retiredPagesPolicyCondition) + assert.Equal(t, uint(10), retiredPagesPolicyCondition.DbePages) + }, + }, + { + policy: ThermalPolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_GPU_TEMP", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_GPU_TEMP, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(101), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, ThermalPolicy, cb.Condition) + require.IsType(t, thermalPolicyCondition{}, cb.Data) + thermalPolicyCondition := cb.Data.(thermalPolicyCondition) + assert.Equal(t, uint(101), thermalPolicyCondition.ThermalViolation) + }, + }, + { + policy: NvlinkPolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(1), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, NvlinkPolicy, cb.Condition) + require.IsType(t, nvlinkPolicyCondition{}, cb.Data) + nvlinkPolicyCondition := cb.Data.(nvlinkPolicyCondition) + assert.Equal(t, uint(1), nvlinkPolicyCondition.Counter) + }, + }, + { + policy: XidPolicy, + injectError: func(gpu uint) error { + t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) + return InjectFieldValue(gpu, + DCGM_FI_DEV_XID_ERRORS, + DCGM_FT_INT64, + 0, + time.Now().Add(60*time.Second).UnixMicro(), + int64(16), + ) + }, + assert: func(cb PolicyViolation) { + require.NotNil(t, cb) + assert.Equal(t, XidPolicy, cb.Condition) + require.IsType(t, xidPolicyCondition{}, cb.Data) + xidPolicyCondition := cb.Data.(xidPolicyCondition) + assert.Equal(t, uint(16), xidPolicyCondition.ErrNum) + }, + }, + } + for _, tc := range tests { + t.Run(string(tc.policy), func(t *testing.T) { + cleanup, err := Init(Embedded) + require.NoError(t, err) + + defer cleanup() + + numGPUs, err := GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", MAX_NUM_DEVICES) + } + + entityList := []MigHierarchyInfo{ + { + Entity: GroupEntityPair{EntityGroupId: FE_GPU}, + }, + } + + gpuIDs, err := CreateFakeEntities(entityList) + require.NoError(t, err) + + gpu := gpuIDs[0] + + callback, err := Policy(gpu, tc.policy) + require.NoError(t, err) + + err = tc.injectError(gpu) + require.NoError(t, err) + + select { + case callbackData := <-callback: + require.NotNil(t, callbackData) + tc.assert(callbackData) + break + case <-time.After(20 * time.Second): + require.Fail(t, "policy callback never happened") + } + }) + } +} diff --git a/pkg/dcgm/structs.go b/pkg/dcgm/structs.go new file mode 100644 index 0000000..5b075ef --- /dev/null +++ b/pkg/dcgm/structs.go @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +type MigProfile int + +const ( + MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */ + MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */ + MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */ + MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */ + MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */ + MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */ + MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */ + MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */ + MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */ + MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */ + MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */ + MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */ + MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */ + MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< compute instance slice 3 */ + MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/ + MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */ + MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */ + MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */ + MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */ +) From a2a3f9fa3d1ddb60f512dfdd727b12602e829786 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Fri, 29 Dec 2023 11:17:37 -0600 Subject: [PATCH 2/4] Data types were modified Signed-off-by: Vadym Fedorov --- pkg/dcgm/internal.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/dcgm/internal.go b/pkg/dcgm/internal.go index 01b3ac4..8a1f2ff 100644 --- a/pkg/dcgm/internal.go +++ b/pkg/dcgm/internal.go @@ -71,7 +71,7 @@ func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error) { return gpuIDs, nil } -func InjectFieldValue(gpu uint, fieldID int, fieldType uint, status int, ts int64, value interface{}) error { +func InjectFieldValue(gpu uint, fieldID uint, fieldType uint, status int, ts int64, value interface{}) error { field := C.dcgmInjectFieldValue_t{ version: C.dcgmInjectFieldValue_version1, fieldId: C.ushort(fieldID), From f536751bcb9138a0e5b93e7a3b6af721554e007e Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Fri, 29 Dec 2023 11:41:12 -0600 Subject: [PATCH 3/4] Removed vscode launch.json Signed-off-by: Vadym Fedorov --- .vscode/launch.json | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 604aead..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Launch test function", - "type": "go", - "request": "launch", - "mode": "test", - "program": "${workspaceFolder}/${relativeFileDirname}", - "args": [ - "-test.run", - "^TestPolicyForDoubleBitErrors$" - ], - "env": { - "GOEXPERIMENT": "cgocheck2" - }, - "showLog": true - } - - ] -} \ No newline at end of file From 9e6c9c4d7ea0c7bbc660c5de3412f27239ce056f Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Fri, 29 Dec 2023 11:48:22 -0600 Subject: [PATCH 4/4] Missing header file was added Signed-off-by: Vadym Fedorov --- pkg/dcgm/dcgm_api_export.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 pkg/dcgm/dcgm_api_export.h diff --git a/pkg/dcgm/dcgm_api_export.h b/pkg/dcgm/dcgm_api_export.h new file mode 100644 index 0000000..52f9e0d --- /dev/null +++ b/pkg/dcgm/dcgm_api_export.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DCGM_DCGM_API_EXPORT_H +#define DCGM_DCGM_API_EXPORT_H + +#undef DCGM_PUBLIC_API +#undef DCGM_PRIVATE_API + +#if defined(DCGM_API_EXPORT) +#define DCGM_PUBLIC_API __attribute((visibility("default"))) +#else +#define DCGM_PUBLIC_API +#if defined(ERROR_IF_NOT_PUBLIC) +#error(Should be public) +#endif +#endif + +#define DCGM_PRIVATE_API __attribute((visibility("hidden"))) + + +#endif // DCGM_DCGM_API_EXPORT_H