Skip to content

Commit

Permalink
Update to DCGM 4.0.0 (#77)
Browse files Browse the repository at this point in the history
* Update go-dcgm bindings for DCGM 4.0
* Update to latest DCGM 4.0 headers
---------

Signed-off-by: Douglas Wightman <[email protected]>
  • Loading branch information
glowkey authored Jan 6, 2025
1 parent 85ceb31 commit 850266c
Show file tree
Hide file tree
Showing 17 changed files with 7,378 additions and 3,903 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ binary:
cd samples/processInfo; go build
cd samples/restApi; go build
cd samples/topology; go build
cd samples/diag; go build

test-main:
go test -race ./tests
Expand All @@ -46,4 +47,4 @@ clean:
rm -f samples/topology/topology

lint:
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
2 changes: 1 addition & 1 deletion pkg/dcgm/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ var (

func initDcgm(m mode, args ...string) (err error) {
const (
dcgmLib = "libdcgm.so"
dcgmLib = "libdcgm.so.4"
)
lib := C.CString(dcgmLib)
defer freeCString(lib)
Expand Down
1,790 changes: 953 additions & 837 deletions pkg/dcgm/const.go

Large diffs are not rendered by default.

118 changes: 59 additions & 59 deletions pkg/dcgm/dcgm_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#ifndef DCGM_AGENT_H
#define DCGM_AGENT_H

#define DCGM_PUBLIC_API
#include "dcgm_api_export.h"
#include "dcgm_structs.h"

#ifdef __cplusplus
Expand Down Expand Up @@ -403,13 +403,14 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle
* - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration.
* - \ref DCGM_ST_BADPARAM if any parameter is invalid
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v3 *linkStatus);
dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v4 *linkStatus);


/**
* List supported CPUs and their cores present on the system
*
* This and other CPU APIs only support datacenter NVIDIA CPUs
* This and other CPU APIs only support datacenter NVIDIA CPUs. Use \ref dcgmGetCpuHierarchy_v2 to
* get additional CPU information.
*
* @param dcgmHandle IN: DCGM Handle
* @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated
Expand All @@ -422,6 +423,22 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dc
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v1 *cpuHierarchy);

/**
* List supported CPUs and their cores present on the system (v2 hierarchy structure)
*
* This is the v2 variant of \ref dcgmGetCpuHierarchy: it fills a \ref dcgmCpuHierarchy_v2 structure,
* which provides additional CPU information.
*
* This and other CPU APIs only support datacenter NVIDIA CPUs.
*
* @param dcgmHandle IN: DCGM Handle
* @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated
*
* @return
* - \ref DCGM_ST_OK if the call was successful.
* - \ref DCGM_ST_NOT_SUPPORTED if the device is unsupported
* - \ref DCGM_ST_MODULE_NOT_LOADED if the sysmon module could not be loaded
* - \ref DCGM_ST_BADPARAM if any parameter is invalid
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy_v2(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v2 *cpuHierarchy);

/** @} */

/***************************************************************************************************/
Expand Down Expand Up @@ -1544,23 +1561,21 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyGet(dcgmHandle_t pDcgmHandle,
* \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs.
* @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for
* which to register a callback function
* @param beginCallback IN: A reference to a function that should be called should a violation occur.
* @param callback IN: A reference to a function that should be called should a violation occur.
* This function will be called before any actions specified by the policy are taken.
* @param finishCallback IN: A reference to a function that should be called should a violation occur.
* This function will be called after any action specified by the policy are completed.
* @param userData IN: User data pointer to pass to the userData field of callback
*
* @return
* - \ref DCGM_ST_OK if the call was successful
* - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or
* \a finishCallback is NULL
* - \ref DCGM_ST_BADPARAM if \a groupId or \a condition is invalid, or \a callback is NULL
* - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId
*
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle,
dcgmGpuGrp_t groupId,
dcgmPolicyCondition_t condition,
fpRecvUpdates beginCallback,
fpRecvUpdates finishCallback);
dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister_v2(dcgmHandle_t pDcgmHandle,
dcgmGpuGrp_t groupId,
dcgmPolicyCondition_t condition,
fpRecvUpdates callback,
uint64_t userData);

/**
* Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t).
Expand All @@ -1575,7 +1590,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle,
*
* @return
* - \ref DCGM_ST_OK if the call was successful
* - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL
* - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid
* - \ref DCGM_ST_IN_USE if a callback from policy registration is in progress
*
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle,
Expand Down Expand Up @@ -1617,7 +1633,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle,
dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
dcgmGpuGrp_t groupId,
dcgmPolicyValidation_t validate,
dcgmDiagResponse_t *response);
dcgmDiagResponse_v11 *response);

/**
* Inform the action manager to perform a manual validation of a group of GPUs on the system
Expand All @@ -1628,6 +1644,8 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
* group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform
* operation on all the GPUs.
* @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details.
* Note: It is the caller's responsibility to ensure the response is zero-initialized,
* except for the version field.
*
* @return
* - \ref DCGM_ST_OK if the call was successful
Expand All @@ -1639,8 +1657,9 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle,
* currently not allowed.
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle,
dcgmRunDiag_v7 *drd,
dcgmDiagResponse_t *response);
dcgmRunDiag_v9 *drd,
dcgmDiagResponse_v11 *response);


/**
* Run a diagnostic on a group of GPUs
Expand All @@ -1667,7 +1686,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle,
dcgmReturn_t DCGM_PUBLIC_API dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle,
dcgmGpuGrp_t groupId,
dcgmDiagnosticLevel_t diagLevel,
dcgmDiagResponse_t *diagResponse);
dcgmDiagResponse_v11 *diagResponse);

/** @} */ // Closing for DCGMAPI_PO_MI

Expand Down Expand Up @@ -1697,6 +1716,27 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle);

/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl

/**
* Gets device workload power profile information and status.
*
* @param pDcgmHandle IN: DCGM Handle
* @param gpuId IN: GPU Id for which workload power profile information should be fetched
* @param profilesInfo OUT: Information about each of the supported workload power profiles available on this
* device
* @param profileStatus OUT: Currently active, requested, and enforced workload power profiles on this device
*
* @return
* - \ref DCGM_ST_OK if the call was successful.
* - \ref DCGM_ST_BADPARAM if \a gpuId, \a profilesInfo, or \a profileStatus were not valid.
* - \ref DCGM_ST_VER_MISMATCH if \a profilesInfo or \a profileStatus were not set to the correct versions.
*
*/
dcgmReturn_t DCGM_PUBLIC_API
dcgmGetDeviceWorkloadPowerProfileInfo(dcgmHandle_t pDcgmHandle,
unsigned int gpuId,
dcgmWorkloadPowerProfileProfilesInfo_v1 *profilesInfo,
dcgmDeviceWorkloadPowerProfilesStatus_v1 *profileStatus);

/***************************************************************************************************/
/** @defgroup DCGMAPI_Topo Topology
* @{
Expand Down Expand Up @@ -1893,7 +1933,7 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg
* Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t
*
* See \ref dcgmGroupCreate for details on creating a GPU group
* See \ref dcgmProfWatchFields to actually watch a metric group
* See \ref dcgmWatchFields to actually watch the underlying profiling fields
*
* @param pDcgmHandle IN: DCGM Handle
* @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.<br>
Expand All @@ -1909,46 +1949,6 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcg
dcgmReturn_t DCGM_PUBLIC_API dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle,
dcgmProfGetMetricGroups_t *metricGroups);

/**
* Request that DCGM start recording updates for a given list of profiling field IDs.
*
* Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on
* the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields,
* \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues.
*
* @param pDcgmHandle IN: DCGM Handle
* @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1
* for details of what should be put in each struct member. watchFields->version should be
* set to dcgmProfWatchFields_version upon calling.
*
* @return
* - \ref DCGM_ST_OK if the call was successful
* - \ref DCGM_ST_BADPARAM if a parameter is invalid
* - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given
* GPU group.
* - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only
* support for homogenous groups of GPUs.
* - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to
* requiring the hardware to gather them with multiple passes
*
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields);

/**
* Request that DCGM stop recording updates for all profiling field IDs for all GPUs
*
* @param pDcgmHandle IN: DCGM Handle
* @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref
* dcgmProfUnwatchFields_v1 for details of what should be put in each struct member.
* unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling.
*
* @return
* - \ref DCGM_ST_OK if the call was successful
* - \ref DCGM_ST_BADPARAM if a parameter is invalid
*
*/
dcgmReturn_t DCGM_PUBLIC_API dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields);

/**
* Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields
* from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute.
Expand Down
Loading

0 comments on commit 850266c

Please sign in to comment.