Skip to content

Commit

Permalink
hmem/cuda: Fallback to CUDA calls when NVML call fails.
Browse files Browse the repository at this point in the history
The NVML calls to verify nvidia devices are nice-to-have
features that allow a cheaper check and earlier exit when
cuda devices are not present. When an NVML call fails, we
shouldn't fail the whole cuda hmem init but should fall
back to the normal cuda api calls.

It is observed that users may use an NVML library that
only contains stub functions for build purposes.

Signed-off-by: Shi Jin <[email protected]>
  • Loading branch information
shijin-aws committed Oct 27, 2023
1 parent 2786c8d commit 9928e6f
Showing 1 changed file with 24 additions and 9 deletions.
33 changes: 24 additions & 9 deletions src/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -493,46 +493,61 @@ static int cuda_hmem_verify_devices(void)
nvmlReturn_t nvml_ret;
cudaError_t cuda_ret;
unsigned int nvml_device_count = 0;
bool nvml_succeeded = false;

/* Check w/ nvmlDeviceGetCount_v2() first, to avoid more expensive
* call to cudaGetDeviceCount() when possible.
*/

/* Check for NVIDIA devices, if NVML library is dlopen-ed*/
/*
* Check for NVIDIA devices on a best-effort basis if the NVML library is dlopen-ed.
* Fall back to the CUDA API calls if any NVML call fails.
*/
if (cuda_attr.nvml_handle) {

/* Make certain that the NVML routines are initialized */
nvml_ret = ofi_nvmlInit_v2();
if (nvml_ret != NVML_SUCCESS)
return -FI_ENOSYS;
if (nvml_ret != NVML_SUCCESS) {
FI_WARN(&core_prov, FI_LOG_CORE,
"nvmInit_v2 failed! ret: %d\n", nvml_ret);
goto cuda_api_call;
}

/* Verify NVIDIA devices are present on the host. */
nvml_ret = ofi_nvmlDeviceGetCount_v2(&nvml_device_count);
if (nvml_ret != NVML_SUCCESS) {
FI_WARN(&core_prov, FI_LOG_CORE,
"nvmlDeviceGetCount_v2 failed! ret: %d\n", nvml_ret);
ofi_nvmlShutdown();
return -FI_ENOSYS;
goto cuda_api_call;
}

/* Make certain that the NVML routines get shutdown */
/* Note: nvmlInit / Shutdown calls are refcounted, so no harm in
* calling nvmlShutdown here, if the user has called nvmlInit.
*/
nvml_ret = ofi_nvmlShutdown();
if (nvml_ret != NVML_SUCCESS)
return -FI_ENOSYS;
if (nvml_ret != NVML_SUCCESS) {
FI_WARN(&core_prov, FI_LOG_CORE,
"nvmlShutdown failed! ret: %d\n", nvml_ret);
goto cuda_api_call;
}

FI_INFO(&core_prov, FI_LOG_CORE,
"Number of NVIDIA devices detected: %u\n",
nvml_device_count);

nvml_succeeded = true;
} else {
FI_INFO(&core_prov, FI_LOG_CORE,
"Skipping check for NVIDIA devices with NVML routines\n");
}

/* If NVIDIA devices are present, now perform more expensive check
* for actual GPUs.
cuda_api_call:
/* If NVIDIA devices are present, or the NVML setup fails,
* perform more expensive check for actual GPUs.
*/
if (!cuda_attr.nvml_handle || nvml_device_count > 0) {
if (!nvml_succeeded || nvml_device_count > 0) {
/* Verify CUDA compute-capable devices are present on the host. */
cuda_ret = ofi_cudaGetDeviceCount(&cuda_attr.device_count);
switch (cuda_ret) {
Expand Down

0 comments on commit 9928e6f

Please sign in to comment.