Skip to content

Commit

Permalink
Merge pull request #1727 from CEED/zach/hip-nontensor-fix
Browse files: browse the repository at this point in the history
Fix issue in block sizing for GPU shared basis
  • Loading branch information
jeremylt authored Jan 14, 2025
2 parents 2c98d61 + 97011ea commit 79881bb
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
9 changes: 6 additions & 3 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -486,18 +486,21 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
}
} break;
case CEED_EVAL_WEIGHT: {
CeedInt Q;
CeedInt P, Q;

CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);

void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

{
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / Q, 1));
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, Q, elems_per_block, 1, weight_args));
CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args));
}
} break;
case CEED_EVAL_NONE: /* handled separately below */
Expand Down
15 changes: 8 additions & 7 deletions backends/hip-shared/ceed-hip-shared-basis.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -550,19 +550,20 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
}
} break;
case CEED_EVAL_WEIGHT: {
CeedInt Q;
CeedInt block_size = data->block_sizes[2];
CeedInt P, Q;

CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
CeedInt thread = CeedIntMax(Q, P);
void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

{
const CeedInt opt_elems = block_size / Q;
const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q, elems_per_block, 1, weight_args));
CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
}
} break;
case CEED_EVAL_NONE: /* handled separately below */
Expand Down

0 comments on commit 79881bb

Please sign in to comment.