Update to DCGM 4.0.0 (#77)

* Update go-dcgm bindings for DCGM 4.0 * Update to latest DCGM 4.0 headers --------- Signed-off-by: Douglas Wightman <[email protected]>
NVIDIA · Jan 6, 2025 · 850266c · 850266c
1 parent 85ceb31
commit 850266c
Show file tree

Hide file tree

Showing 17 changed files with 7,378 additions and 3,903 deletions.
diff --git a/Makefile b/Makefile
@@ -28,6 +28,7 @@ binary:
 	cd samples/processInfo; go build
 	cd samples/restApi; go build
 	cd samples/topology; go build
+	cd samples/diag; go build
 
 test-main:
 	go test -race ./tests
@@ -46,4 +47,4 @@ clean:
 	rm -f samples/topology/topology
 
 lint:
-	golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT)  --new-from-rev=HEAD~1 --fix
+	golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT)  --new-from-rev=HEAD~1 --fix
diff --git a/pkg/dcgm/admin.go b/pkg/dcgm/admin.go
@@ -60,7 +60,7 @@ var (
 
 func initDcgm(m mode, args ...string) (err error) {
 	const (
-		dcgmLib = "libdcgm.so"
+		dcgmLib = "libdcgm.so.4"
 	)
 	lib := C.CString(dcgmLib)
 	defer freeCString(lib)

diff --git a/pkg/dcgm/const.go b/pkg/dcgm/const.go
diff --git a/pkg/dcgm/dcgm_agent.h b/pkg/dcgm/dcgm_agent.h
@@ -17,7 +17,7 @@
 #ifndef DCGM_AGENT_H
 #define DCGM_AGENT_H
 
-#define DCGM_PUBLIC_API
+#include "dcgm_api_export.h"
 #include "dcgm_structs.h"
 
 #ifdef __cplusplus
@@ -403,13 +403,14 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle
  *        - \ref DCGM_ST_NOT_SUPPORTED     if the given entityGroup does not support enumeration.
  *        - \ref DCGM_ST_BADPARAM          if any parameter is invalid
  */
-dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v3 *linkStatus);
+dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v4 *linkStatus);
 
 
 /**
  * List supported CPUs and their cores present on the system
  *
- * This and other CPU APIs only support datacenter NVIDIA CPUs
+ * This and other CPU APIs only support datacenter NVIDIA CPUs.  Use \ref dcgmGetCpuHierarchy_v2 to
+ * get additional CPU information.
  *
  * @param dcgmHandle   IN: DCGM Handle
  * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated
@@ -422,6 +423,22 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dc
  */
 dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v1 *cpuHierarchy);
 
+/**
+ * List supported CPUs and their cores present on the system
+ *
+ * This and other CPU APIs only support datacenter NVIDIA CPUs.
+ *
+ * @param dcgmHandle   IN: DCGM Handle
+ * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated
+ *
+ * @return
+ *        - \ref DCGM_ST_OK                if the call was successful.
+ *        - \ref DCGM_ST_NOT_SUPPORTED     if the device is unsupported
+ *        - \ref DCGM_ST_MODULE_NOT_LOADED if the sysmon module could not be loaded
+ *        - \ref DCGM_ST_BADPARAM          if any parameter is invalid
+ */
+dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy_v2(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v2 *cpuHierarchy);
+
 /** @} */
 
 /***************************************************************************************************/
@@ -1544,23 +1561,21 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyGet(dcgmHandle_t pDcgmHandle,
  *                               \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs.
  * @param condition          IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for
  *                               which to register a callback function
- * @param beginCallback      IN: A reference to a function that should be called should a violation occur.
+ * @param callback           IN: A reference to a function that should be called should a violation occur.
  *                               This function will be called prior to any actions specified by the policy are taken.
- * @param finishCallback     IN: A reference to a function that should be called should a violation occur.
- *                           This function will be called after any action specified by the policy are completed.
+ * @param userData           IN: User data pointer to pass to the userData field of callback
  *
  * @return
  *        - \ref DCGM_ST_OK                   if the call was successful
- *        - \ref DCGM_ST_BADPARAM             if \a groupId, \a condition, is invalid, \a beginCallback, or
- *                                            \a finishCallback is NULL
+ *        - \ref DCGM_ST_BADPARAM             if \a groupId or \a condition is invalid, or \a callback is NULL
  *        - \ref DCGM_ST_NOT_SUPPORTED        if any unsupported GPUs are part of the GPU group specified in groupId
  *
  */
-dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle,
-                                                dcgmGpuGrp_t groupId,
-                                                dcgmPolicyCondition_t condition,
-                                                fpRecvUpdates beginCallback,
-                                                fpRecvUpdates finishCallback);
+dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister_v2(dcgmHandle_t pDcgmHandle,
+                                                   dcgmGpuGrp_t groupId,
+                                                   dcgmPolicyCondition_t condition,
+                                                   fpRecvUpdates callback,
+                                                   uint64_t userData);
 
 /**
  * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t).
@@ -1575,7 +1590,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle,
  *
  * @return
  *        - \ref DCGM_ST_OK                   if the call was successful
- *        - \ref DCGM_ST_BADPARAM             if \a groupId, \a condition, is invalid or \a callback is NULL
+ *        - \ref DCGM_ST_BADPARAM             if \a groupId, \a condition, is invalid
+ *        - \ref DCGM_ST_IN_USE               if a callback from policy registration is in progress
  *
  */
 dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle,
@@ -1617,7 +1633,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle,
 dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
                                                 dcgmGpuGrp_t groupId,
                                                 dcgmPolicyValidation_t validate,
-                                                dcgmDiagResponse_t *response);
+                                                dcgmDiagResponse_v11 *response);
 
 /**
  * Inform the action manager to perform a manual validation of a group of GPUs on the system
@@ -1628,6 +1644,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
  *                               group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform
  *                               operation on all the GPUs.
  * @param response          OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details.
+ *                               Note: It is the caller's responsibility to make sure the response is zero-initialized,
+ *                                     except for the version field.
  *
  * @return
  *        - \ref DCGM_ST_OK                   if the call was successful
@@ -1639,8 +1657,9 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
  *                                            currently not allowed.
  */
 dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle,
-                                                   dcgmRunDiag_v7 *drd,
-                                                   dcgmDiagResponse_t *response);
+                                                   dcgmRunDiag_v9 *drd,
+                                                   dcgmDiagResponse_v11 *response);
+
 
 /**
  * Run a diagnostic on a group of GPUs
@@ -1667,7 +1686,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle,
 dcgmReturn_t DCGM_PUBLIC_API dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle,
                                                dcgmGpuGrp_t groupId,
                                                dcgmDiagnosticLevel_t diagLevel,
-                                               dcgmDiagResponse_t *diagResponse);
+                                               dcgmDiagResponse_v11 *diagResponse);
 
 /** @} */ // Closing for DCGMAPI_PO_MI
 
@@ -1697,6 +1716,27 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle);
 
 /** @} */ // Closing for DCGMAPI_Admin_ExecCtrl
 
+/**
+ * Gets device workload power profile information and status.
+ *
+ * @param pDcgmHandle             IN: DCGM Handle
+ * @param gpuId                   IN: GPU Id for which workload power profile information should be fetched
+ * @param profilesInfo           OUT: Information about each of the supported workload power profiles available on this
+ *                                    device
+ * @param profileStatus          OUT: Currently active, requested, and enforced workload power profiles on this device
+ *
+ * @return
+ *        - \ref DCGM_ST_OK                   if the call was successful.
+ *        - \ref DCGM_ST_BADPARAM             if \a gpuId, \a profilesInfo, or \a profileStatus were not valid.
+ *        - \ref DCGM_ST_VER_MISMATCH         if profilesInfo or profileStatus were not set to the correct versions.
+ *
+ */
+dcgmReturn_t DCGM_PUBLIC_API
+dcgmGetDeviceWorkloadPowerProfileInfo(dcgmHandle_t pDcgmHandle,
+                                      unsigned int gpuId,
+                                      dcgmWorkloadPowerProfileProfilesInfo_v1 *profilesInfo,
+                                      dcgmDeviceWorkloadPowerProfilesStatus_v1 *profileStatus);
+
 /***************************************************************************************************/
 /** @defgroup DCGMAPI_Topo Topology
  *  @{
@@ -1893,7 +1933,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg
  * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t
  *
  * See \ref dcgmGroupCreate for details on creating a GPU group
- * See \ref dcgmProfWatchFields to actually watch a metric group
+ * See \ref dcgmWatchFields to actually watch the underlying profiling fields
  *
  * @param pDcgmHandle        IN: DCGM Handle
  * @param metricGroups   IN/OUT: Metric groups supported for metricGroups->groupId.<br>
@@ -1909,46 +1949,6 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg
 dcgmReturn_t DCGM_PUBLIC_API dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle,
                                                               dcgmProfGetMetricGroups_t *metricGroups);
 
-/**
- * Request that DCGM start recording updates for a given list of profiling field IDs.
- *
- * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on
- * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields,
- * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues.
- *
- * @param pDcgmHandle        IN: DCGM Handle
- * @param watchFields        IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1
- *                               for details of what should be put in each struct member. watchFields->version should be
- *                               set to dcgmProfWatchFields_version upon calling.
- *
- * @return
- *        - \ref DCGM_ST_OK                     if the call was successful
- *        - \ref DCGM_ST_BADPARAM               if a parameter is invalid
- *        - \ref DCGM_ST_NOT_SUPPORTED          if profiling metric group metricGroupTag is not supported for the given
- *                                              GPU group.
- *        - \ref DCGM_ST_GROUP_INCOMPATIBLE     if groupId's GPUs are not identical GPUs. Profiling metrics are only
- *                                              support for homogenous groups of GPUs.
- *        - \ref DCGM_ST_PROFILING_MULTI_PASS   if any of the metric groups could not be watched concurrently due to
- *                                              requiring the hardware to gather them with multiple passes
- *
- */
-dcgmReturn_t DCGM_PUBLIC_API dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields);
-
-/**
- * Request that DCGM stop recording updates for all profiling field IDs for all GPUs
- *
- * @param pDcgmHandle        IN: DCGM Handle
- * @param unwatchFields      IN: Details of which metric groups to unwatch for which GPUs. See \ref
- *                               dcgmProfUnwatchFields_v1 for details of what should be put in each struct member.
- *                               unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling.
- *
- * @return
- *        - \ref DCGM_ST_OK                   if the call was successful
- *        - \ref DCGM_ST_BADPARAM             if a parameter is invalid
- *
- */
-dcgmReturn_t DCGM_PUBLIC_API dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields);
-
 /**
  * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields
  * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute.