From 97011eab160f424d1728899096d209d02ae5cfb2 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Tue, 14 Jan 2025 14:09:06 -0800 Subject: [PATCH] Fix issue in block sizing for GPU shared basis --- backends/cuda-shared/ceed-cuda-shared-basis.c | 9 ++++++--- backends/hip-shared/ceed-hip-shared-basis.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 5991559cd3..599ece636d 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -486,18 +486,21 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a } } break; case CEED_EVAL_WEIGHT: { - CeedInt Q; + CeedInt P, Q; CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); + CeedInt thread = CeedIntMax(Q, P); + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; { // avoid >512 total threads - CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1)); + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); - CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args)); + CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args)); } } break; case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 144af79c2f..1c18099a82 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -550,19 +550,20 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad } } break; case CEED_EVAL_WEIGHT: { - CeedInt Q; - CeedInt block_size = data->block_sizes[2]; + CeedInt P, Q; CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); + CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); - void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + CeedInt thread = CeedIntMax(Q, P); + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; { - const CeedInt opt_elems = block_size / Q; - const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); - CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q, elems_per_block, 1, weight_args)); + CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args)); } } break; case CEED_EVAL_NONE: /* handled separately below */