Merge pull request ICLDisco#644 from abouteiller/bugfix/no-gpu-found

Consolidated error handling when GPU-only tests execute on CPU-only systems
bosilca · May 24, 2024 · a5f49ab · a5f49ab
2 parents 1fdfded + 5ff246a
commit a5f49ab
Show file tree

Hide file tree

Showing 10 changed files with 42 additions and 41 deletions.
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -70,6 +70,8 @@ function(parsec_addtest_cmd target)
   # restrict memory use for oversubscribed runners
   set_tests_properties(${target} PROPERTIES ENVIRONMENT
     "PARSEC_MCA_device_cuda_enabled=0;PARSEC_MCA_device_hip_enabled=0;PARSEC_MCA_device_level_zero_enabled=0;PARSEC_MCA_device_cuda_memory_use=10;PARSEC_MCA_device_hip_memory_use=10;PARSEC_MCA_device_level_zero_memory_use=10")
+  # skip tests that fail because the device is not available
+  set_tests_properties(${target} PROPERTIES SKIP_RETURN_CODE 10) # 10 is -PARSEC_ERR_DEVICE, positive 7bit return codes are more portable
 endfunction(parsec_addtest_cmd)
 
 check_function_exists(erand48 PARSEC_HAVE_ERAND48)

diff --git a/tests/dsl/dtd/dtd_test_task_insertion.c b/tests/dsl/dtd/dtd_test_task_insertion.c
@@ -94,11 +94,12 @@ int main(int argc, char ** argv)
 #endif
 
     int m, n;
-    int no_of_tasks = 500000;
+    int no_of_tasks = 50000;
     int amount_of_work[3] = {100, 1000, 10000};
     parsec_taskpool_t *dtd_tp;
 
     parsec = parsec_init( cores, &argc, &argv );
+    cores = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_CORES);
 
     dtd_tp = parsec_dtd_taskpool_new();
 

diff --git a/tests/runtime/cuda/nvlink_main.c b/tests/runtime/cuda/nvlink_main.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2019-2020 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  */
@@ -34,6 +34,14 @@ int main(int argc, char *argv[])
 
     parsec = parsec_init(-1, &argc, &argv);
 
+    /* can the test run? */
+    int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb_gpus >= 0);
+    if(nb_gpus == 0) {
+        parsec_warning("This test can only run if at least one GPU device is present");
+        exit(-PARSEC_ERR_DEVICE);
+    }
+
     tp = testing_nvlink_New(parsec, 10, 512);
     if( NULL != tp ) {
         parsec_context_add_taskpool(parsec, tp);

diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c
@@ -1,6 +1,5 @@
-
 /**
- * Copyright (c) 2019-2021 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
@@ -103,20 +102,8 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
     parsec_matrix_block_cyclic_t *userM;
 
     /** Find all CUDA devices */
-    nb = 0;
-    for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
-        parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
-            nb++;
-        }
-    }
-    if(nb == 0) {
-        char hostname[256];
-        gethostname(hostname, 256);
-        fprintf(stderr, "This test requires at least one CUDA device per node -- no CUDA device found on rank %d on %s\n",
-                ctx->my_rank, hostname);
-        return NULL;
-    }
+    nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb >= 0);
     dev_index = (int*)malloc(nb * sizeof(int));
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
@@ -156,7 +143,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
 
     /* GEMM1 tasks will create one data copy per GPU, and work on those.
      * see nvlink.jdf:MAKE_C tasks */
-    
+
     /* userM is a user-managed matrix: the user creates the data copies
      * only on the GPU they want the GEMM2 to run. To simplify the code,
      * we use parsec_matrix_block_cyclic that requires to also have a CPU data
@@ -208,14 +195,14 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
             g++;
         }
     }
-    
+
     testing_handle = parsec_nvlink_new(dcA, userM, ctx->nb_nodes, CuHI, nb, dev_index);
 
     parsec_add2arena( &testing_handle->arenas_datatypes[PARSEC_nvlink_DEFAULT_ADT_IDX],
                              parsec_datatype_double_complex_t,
                              PARSEC_MATRIX_FULL, 1, mb, mb, mb,
                              PARSEC_ARENA_ALIGNMENT_SSE, -1 );
-    
+
     return &testing_handle->super;
 }
 
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
@@ -1,6 +1,6 @@
 extern "C" %{
 /*
- * Copyright (c) 2019-2023 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
@@ -162,7 +162,7 @@ BODY [type=CUDA
                          lbeta,  (double*)A, ldam );
     status = cublasGetError();
     PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -203,7 +203,7 @@ BODY [type=CUDA
                          lbeta,  (double*)B, ldbm );
     status = cublasGetError();
     PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 
 }
 END

diff --git a/tests/runtime/cuda/stage_main.c b/tests/runtime/cuda/stage_main.c
@@ -14,7 +14,6 @@ int main(int argc, char *argv[])
 {
     parsec_context_t *parsec = NULL;
     parsec_taskpool_t *tp;
-    int i;
     int size = 1;
     int rank = 0;
     int M;
@@ -41,7 +40,15 @@ int main(int argc, char *argv[])
         exit(-1);
     }
 
+    /* can the test run? */
     assert(size == 1);
+    int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb_gpus >= 0);
+    if(nb_gpus == 0) {
+        parsec_warning("This test can only run if at least one GPU device is present");
+        printf("TEST SKIPPED\n");
+        exit(-PARSEC_ERR_DEVICE);
+    }
 
     /* Test: comparing results when:
         - tile matrix transferred to GPU with default stage_in/stage_out
@@ -87,9 +94,9 @@ int main(int argc, char *argv[])
         parsec_taskpool_free(tp);
     }
 
-    if(ret!= 0){
-        printf("TEST FAILED\n");
-    }else{
+    if( ret != 0) {
+        printf("TEST FAILED (%d errors)\n", ret);
+    } else {
         printf("TEST PASSED\n");
     }
 
@@ -98,5 +105,5 @@ int main(int argc, char *argv[])
     MPI_Finalize();
 #endif /* DISTRIBUTED */
 
-    return ret;
+    return (0 == ret)? EXIT_SUCCESS: EXIT_FAILURE;
 }
diff --git a/tests/runtime/cuda/stress.jdf b/tests/runtime/cuda/stress.jdf
@@ -125,7 +125,7 @@ r = 0 .. NP-1
 // Parameters
 READ A <- (g == 0) ? A READ_A(m, r) : A GEMM(m, g-1, r)
        -> ((g + 1) < NGPUs)         ? A GEMM(m, g+1, r)
-READ B <- A READ_A( (m+g) % descA->super.mt, r)
+READ B <- A READ_A(m, r)
 RW   C <- (m == 0) ? C MAKE_C(g, r) : C GEMM(m-1, g, r)
        -> ((m + 1) < (descA->super.mt)) ? C GEMM(m+1, g, r)
                                     : C DISCARD_C(g, r)

diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c
@@ -28,7 +28,7 @@ int main(int argc, char *argv[])
 
     parsec = parsec_init(-1, &argc, &argv);
 
-    tp = testing_stress_New(parsec, 4000, 1024);
+    tp = testing_stress_New(parsec, 80, 1024);
     if( NULL != tp ) {
         parsec_context_add_taskpool(parsec, tp);
         parsec_context_start(parsec);

diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c
@@ -15,6 +15,7 @@ static void __parsec_stress_destructor( parsec_taskpool_t *tp )
     dcA = stress_taskpool->_g_descA;
     parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)stress_taskpool->_g_descA );
     free(dcA);
+    free(stress_taskpool->_g_cuda_device_index);
 }
 
 PARSEC_OBJ_CLASS_INSTANCE(parsec_stress_taskpool_t, parsec_taskpool_t,
@@ -27,19 +28,14 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
     parsec_matrix_block_cyclic_t *dcA;
 
     /** Find all CUDA devices */
-    nb = 0;
-    for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
-        parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
-            nb++;
-        }
-    }
+    nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb >= 0);
     if(nb == 0) {
         /* We just simulate a run on CPUs, with an arbitrary number of pseudo-GPUs */
         nb = 8;
         dev_index = (int*)malloc(nb * sizeof(int));
         memset(dev_index, -1, nb*sizeof(int));
-        fprintf(stderr, "Simulating %d GPUs for sanity checking in stress test\n", nb);
+        parsec_warning("Simulating %d GPUs for sanity checking in stress test\n", nb);
     } else {
         dev_index = (int*)malloc(nb * sizeof(int));
         nb = 0;

diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c
@@ -159,7 +159,7 @@ int main(int argc, char *argv[])
 
     /* Check result */
     if( 0 == rank && info != 0 ) {
-        fprintf(stderr, "Result is Wrong !!!\n");
+        fprintf(stderr, "Result is Wrong (info %d) !!!\n", info);
     }
 
     parsec_data_free(dcA.mat);
@@ -172,5 +172,5 @@ int main(int argc, char *argv[])
     MPI_Finalize();
 #endif
 
-    return info;
+    return (0 == info)? EXIT_SUCCESS: EXIT_FAILURE;
 }