Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Dec 12, 2024
1 parent f500522 commit 66c3c7c
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 23 deletions.
41 changes: 22 additions & 19 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,

// Restriction was already done for previous input
code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
if (eval_mode == CEED_EVAL_NONE) {
// No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
Expand Down Expand Up @@ -429,32 +429,34 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
break;
case CEED_EVAL_INTERP:
if (is_at_points) {
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
} else {
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
}
break;
case CEED_EVAL_GRAD:
if (is_at_points) {
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
} else if (use_3d_slices) {
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
} else {
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
}
break;
case CEED_EVAL_WEIGHT: {
if (!is_at_points) {
if (is_at_points) {
code << " // Nothing to do AtPoints\n";
} else {
CeedBasis_Cuda_shared *basis_data;

code << " CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
Expand Down Expand Up @@ -533,15 +535,10 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
switch (eval_mode) {
case CEED_EVAL_NONE:
if (is_at_points) {
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
} else {
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
}
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
break;
case CEED_EVAL_INTERP:
if (is_at_points) {
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
// Accumulator for point data
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
Expand All @@ -553,9 +550,8 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
break;
case CEED_EVAL_GRAD:
if (is_at_points) {
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
// Accumulator for point data
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n";
code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n";
code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
code << " r_c" << var_suffix << "[i] = 0.0;\n";
code << " }\n";
Expand Down Expand Up @@ -604,7 +600,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
case CEED_EVAL_NONE:
code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
code << " ReadPoint<num_comp" << var_suffix << ", max_num_points>(data, elem, p, max_num_points, 1, max_num_points, 0"
<< ", r_c" << var_suffix << ", r_s" << var_suffix << ");\n";
<< ", r_e" << var_suffix << ", r_s" << var_suffix << ");\n";
break;
case CEED_EVAL_INTERP:
code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
Expand Down Expand Up @@ -796,7 +792,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C

if (is_at_points) {
// Map back to coefficients
code << " // -- Output fields\n";
code << "\n // -- Output fields\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
std::string var_suffix = "_out_" + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
Expand All @@ -808,13 +804,19 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
switch (eval_mode) {
case CEED_EVAL_NONE:
code << " WritePoint<num_comp" << var_suffix << ", max_num_points>(data, elem, p, max_num_points, 1, max_num_points, 0"
<< ", r_s" << var_suffix << ", r_c" << var_suffix << ");\n";
<< ", r_s" << var_suffix << ", r_q" << var_suffix << ");\n";
break;
case CEED_EVAL_INTERP:
code << " if (i > points.num_per_elem[elem]) {\n";
code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
code << " }\n";
code << " InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
<< var_suffix << ", r_x, r_c" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
code << " if (i > points.num_per_elem[elem]) {\n";
code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
code << " }\n";
code << " GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
<< var_suffix << ", r_x, r_c" << var_suffix << ");\n";
break;
Expand All @@ -829,7 +831,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
}
} else if (use_3d_slices) {
// Copy or apply transpose grad, if needed
code << " // -- Output fields\n";
code << "\n // -- Output fields\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
std::string var_suffix = "_out_" + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
Expand Down Expand Up @@ -986,7 +988,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << "// d_[in,out]_i: CeedVector device array\n";
code << "// r_[in,out]_e_i: Element vector register\n";
code << "// r_[in,out]_q_i: Quadrature space vector register\n";
code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n";
code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
code << "// \n";
code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
Expand Down
2 changes: 1 addition & 1 deletion backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
CeedEvalMode eval_mode = CEED_EVAL_NONE;
CeedElemRestriction elem_rstr;

// Setup output arays
// Setup output arrays
code << "\n // -- Output field setup\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
std::string var_suffix = "_out_" + std::to_string(i);
Expand Down
6 changes: 3 additions & 3 deletions include/ceed/jit-source/cuda/cuda-gen-templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt
// E-vector -> L-vector, AtPoints
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsAtPoints1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
inline __device__ void writeLVecAtPoints1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v,
CeedScalar *__restrict__ d_v) {
if (data.t_id_x < P_1d) {
Expand Down Expand Up @@ -184,7 +184,7 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt
// E-vector -> L-vector, AtPoints
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsAtPoints2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
inline __device__ void WriteLVecAtPoints2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v,
CeedScalar *__restrict__ d_v) {
if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
Expand Down Expand Up @@ -293,7 +293,7 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
// E-vector -> L-vector, AtPoints
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsAtPoints3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
inline __device__ void WriteLVecAtPoints3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v,
CeedScalar *__restrict__ d_v) {
if (data.t_id_x < P_1d && data.t_id_y < P_1d)
Expand Down

0 comments on commit 66c3c7c

Please sign in to comment.