From 97011eab160f424d1728899096d209d02ae5cfb2 Mon Sep 17 00:00:00 2001
From: Zach Atkins <Zach.Atkins@colorado.edu>
Date: Tue, 14 Jan 2025 14:09:06 -0800
Subject: [PATCH] Fix weight kernel block sizing for GPU shared non-tensor basis

---
 backends/cuda-shared/ceed-cuda-shared-basis.c |  9 ++++++---
 backends/hip-shared/ceed-hip-shared-basis.c   | 15 ++++++++-------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 5991559cd3..599ece636d 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -486,18 +486,21 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedInt Q;
+      CeedInt P, Q;
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
         // avoid >512 total threads
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1));
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
-        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args));
+        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args));
       }
     } break;
     case CEED_EVAL_NONE: /* handled separately below */
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 144af79c2f..1c18099a82 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -550,19 +550,20 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
       }
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedInt Q;
-      CeedInt block_size = data->block_sizes[2];
+      CeedInt P, Q;
 
       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
-      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+      CeedInt thread        = CeedIntMax(Q, P);
+      void   *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       {
-        const CeedInt opt_elems       = block_size / Q;
-        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
-        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q, elems_per_block, 1, weight_args));
+        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
       }
     } break;
     case CEED_EVAL_NONE: /* handled separately below */