From a0b73d41345167f0d3a94e20a50dbb72c671624a Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 20 Oct 2023 11:19:49 -0700 Subject: [PATCH 1/2] Initial commit to optimize magma backend transpose basis application --- .../jit-source/magma/magma-basis-grad-1d.h | 10 +-- .../jit-source/magma/magma-basis-grad-2d.h | 28 +++---- .../jit-source/magma/magma-basis-grad-3d.h | 32 +++---- .../jit-source/magma/magma-basis-interp-1d.h | 10 +-- .../jit-source/magma/magma-basis-interp-2d.h | 24 +++--- .../jit-source/magma/magma-basis-interp-3d.h | 20 ++--- .../jit-source/magma/magma-basis-weight-1d.h | 4 +- .../jit-source/magma/magma-basis-weight-2d.h | 4 +- .../jit-source/magma/magma-basis-weight-3d.h | 4 +- .../jit-source/magma/magma-common-tensor.h | 83 ++++++++++--------- 10 files changed, 111 insertions(+), 108 deletions(-) diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h index 89a112115e..a338be2055 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -15,7 +15,7 @@ // macros to abstract access of shared memory and reg. file #define sT(i, j) sT[(j)*P + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // grad basis action (1D) template static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { @@ -40,7 +40,7 @@ static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_ } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -72,7 +72,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); + read_T_notrans_gm2sm(tx, dTgrad, sT); } // read U @@ -86,7 +86,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ write_1d(sV, dV, cstrdV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -118,7 +118,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); + read_T_trans_gm2sm(tx, dTgrad, sT); } // read U diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h index 042e41b046..64397f431b 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -16,7 +16,7 @@ #define sT(i, j) sT[(j)*P + (i)] #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // grad basis action (2D) // This function is called two times at a higher level for 2D // DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] @@ -76,7 +76,7 @@ static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const } // loop over NUM_COMP } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -105,8 +105,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + read_T_notrans_gm2sm(tx, dinterp1d, sTinterp); + read_T_notrans_gm2sm(tx, dgrad1d, sTgrad); } // No need to read V ( required only in transposed grad ) @@ -114,22 +114,22 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 0) */ magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 1) */ magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); + write_V_2d(dV + (1 * dstrdV), cstrdV, rV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -158,32 +158,32 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); } __syncthreads(); /* read V (since this is transposed mode -- idim = 0 for dV, i_DIM = 0 for rV) */ const CeedScalar beta = 1.0; - readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + read_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ // write V - writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); } #endif // CEED_MAGMA_BASIS_GRAD_2D_H diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h index 063ee7bc0d..6a56d47c66 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -17,7 +17,7 @@ #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] #define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // grad basis action (3D) // This function is called three times at a higher level for 3D // DIM_U -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q] @@ -98,7 +98,7 @@ static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const } // loop over NUM_COMP } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -127,8 +127,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + read_T_notrans_gm2sm(tx, dinterp1d, sTinterp); + read_T_notrans_gm2sm(tx, dgrad1d, sTgrad); } __syncthreads(); @@ -137,28 +137,28 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 0) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 1) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); + write_V_3d(dV + (1 * dstrdV), cstrdV, rV, tx); /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 2) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); + write_V_3d(dV + (2 * dstrdV), cstrdV, rV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { @@ -187,38 +187,38 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + read_T_trans_gm2sm(tx, dinterp1d, sTinterp); + read_T_trans_gm2sm(tx, dgrad1d, sTgrad); } __syncthreads(); // read V (since this is transposed mode) const CeedScalar beta = 1.0; - readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + read_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ - readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + read_U_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ // write V - writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); } #endif // CEED_MAGMA_BASIS_GRAD_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h index 074efd94b6..0c49fa3696 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -15,7 +15,7 @@ // macros to abstract access of shared memory and reg. file #define sT(i, j) sT[(j)*P + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // interp basis action (1D) template static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { @@ -40,7 +40,7 @@ static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_tran } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -72,7 +72,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_notrans_gm2sm(tx, dT, sT); } // read U @@ -86,7 +86,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ write_1d(sV, dV, cstrdV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -118,7 +118,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_trans_gm2sm(tx, dT, sT); } // read U diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h index bb3475df51..ece85a26fe 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -16,7 +16,7 @@ #define sT(i, j) sT[(j)*P + (i)] #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // interp basis action (2D) template static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], @@ -65,7 +65,7 @@ static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_tran } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -93,21 +93,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_notrans_gm2sm(tx, dT, sT); } // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); + read_U_2d(dU, cstrdU, rU, sTmp, tx); - // no sync needed here -- readU_2d already syncs at the end + // no sync needed here -- read_U_2d already syncs at the end magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V - writeV_2d(dV, cstrdV, rV, tx); + write_V_2d(dV, cstrdV, rV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -135,21 +135,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_trans_gm2sm(tx, dT, sT); } // read V - readV_2d(dV, cstrdV, rV, tx); + read_V_2d(dV, cstrdV, rV, tx); // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); + read_U_2d(dU, cstrdU, rU, sTmp, tx); - // no sync needed here -- readU_2d already syncs at the end + // no sync needed here -- read_U_2d already syncs at the end magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V - writeV_2d(dV, cstrdV, rV, tx); + write_V_2d(dV, cstrdV, rV, tx); } #endif // CEED_MAGMA_BASIS_INTERP_2D_H diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h index 8f2fd3985e..9c5020c3ff 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h @@ -16,7 +16,7 @@ #define sT(i, j) sT[(j)*P + (i)] #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // interp basis action (3D) template static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], @@ -93,7 +93,7 @@ static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_tran } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -121,21 +121,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_notrans_gm2sm(tx, dT, sT); } // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); + read_U_3d(dU, cstrdU, rU, sTmp, tx); // there is a sync at the end of this function magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V - writeV_3d(dV, cstrdV, rV, tx); + write_V_3d(dV, cstrdV, rV, tx); } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) { @@ -163,21 +163,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA // read T if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); + read_T_trans_gm2sm(tx, dT, sT); } // read V - readV_3d(dV, cstrdV, rV, tx); + read_V_3d(dV, cstrdV, rV, tx); // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); + read_U_3d(dU, cstrdU, rU, sTmp, tx); // there is a sync at the end of this function magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V - writeV_3d(dV, cstrdV, rV, tx); + write_V_3d(dV, cstrdV, rV, tx); } #endif // CEED_MAGMA_BASIS_INTERP_3D_H diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h index 38a058ea67..879ad73fcd 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h @@ -12,7 +12,7 @@ #include "magma-common-tensor.h" -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // weight basis action -- 1D template static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) { @@ -24,7 +24,7 @@ static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *s } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__ void magma_weight_1d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h index 721b50f953..21b0d1c0c7 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h @@ -12,7 +12,7 @@ #include "magma-common-tensor.h" -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // weight basis action -- 2D template static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { @@ -32,7 +32,7 @@ static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_2D)) __global__ void magma_weight_2d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h index 835bca44cd..1233bb93d5 100644 --- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h @@ -12,7 +12,7 @@ #include "magma-common-tensor.h" -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // weight basis action -- 3D template static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) { @@ -33,7 +33,7 @@ static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q *BASIS_Q, MAGMA_MAXTHREADS_3D)) __global__ void magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h index 1ca3f52758..6ea70f967e 100644 --- a/include/ceed/jit-source/magma/magma-common-tensor.h +++ b/include/ceed/jit-source/magma/magma-common-tensor.h @@ -12,7 +12,7 @@ #include "magma-common-defs.h" -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components // the devptr is assumed to point directly to the element // must sync after call @@ -25,7 +25,7 @@ static __device__ __inline__ void read_1d(const T *devptr, const int compstride, } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // write V of a 1D element into global memory from sV[][] -- for all components // the devptr is assumed to point directly to the element template @@ -37,16 +37,16 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // read U of a 2D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride // register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] // i_DIM specifies which dimension is being read into in rU -// rU_SIZE can be different from P (e.g. MAXP_Q) +// rU_SIZE can be different from P (e.g. max(P, Q)) // sTmp is a shared memory workspace of size P^2 template -static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { - // read U as a batch P of (1 x P_) vectors +static __device__ __inline__ void read_U_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { + // read U as a batch P of (1 x P) vectors // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory // ... @@ -74,14 +74,14 @@ static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // read V of a 2D element into registers rV[][][] -- for all components of a single dim // dV is assumed to be offset by elem-stride and dim-stride // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] // i_DIM specifies which dimension is being read into in rV -// rV_SIZE can be different from P (e.g. MAXP_Q) +// rV_SIZE can be different from P (e.g. max(P, Q)) template -static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { +static __device__ __inline__ void read_V_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { if (tx < Q) { for (int comp = 0; comp < NUM_COMP; comp++) { for (int j = 0; j < Q; j++) { @@ -91,15 +91,14 @@ static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // write V of a 2D element from registers rV[][][] to global memory -- for all components of a single dim // dV is assumed to be offset by elem-stride and dim-stride // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] -// i_DIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rV_SIZE can be different from P (e.g. MAXP_Q) +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) template -static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { +static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { if (tx < Q) { for (int comp = 0; comp < NUM_COMP; comp++) { for (int j = 0; j < Q; j++) { @@ -109,15 +108,15 @@ static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DI } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // read U of a 3D element into registers rU[][][] -- for all components of a single dim // dU is assumed to be offset by elem-stride and dim-stride // register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE] // i_DIM specifies which dimension is being read into in rU -// rU_SIZE can be different from P (e.g. MAXP_Q) +// rU_SIZE can be different from P (e.g. max(P, Q)) // sTmp is a shared memory workspace of size P^3 template -static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { +static __device__ __inline__ void read_U_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) { // read U as a batch P^2 of (1 x P_) vectors // vec 0 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory // vec 1 : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory @@ -146,14 +145,14 @@ static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // read V of a 3D element into registers rV[][][] -- for all components of a single dim // dV is assumed to be offset by elem-stride and dim-stride // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] // i_DIM specifies which dimension is being read into in rV -// rV_SIZE can be different from P (e.g. MAXP_Q) +// rV_SIZE can be different from P (e.g. max(P, Q)) template -static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { +static __device__ __inline__ void read_V_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { if (tx < Q * Q) { for (int comp = 0; comp < NUM_COMP; comp++) { for (int j = 0; j < Q; j++) { @@ -163,15 +162,14 @@ static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T } } -////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// // write V of a 3D element from registers rV[][][] to global memory -- for all components of a single dim // dV is assumed to point directly to the element (i.e. already offset by elem-stride) // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE] -// i_DIM specifies which dimension is being read from in rV -// idim specifies which dimension is being written to in dV -// rV_SIZE can be different from P (e.g. MAXP_Q) +// i_DIM specifies which dimension is being written to in dV +// rV_SIZE can be different from P (e.g. max(P, Q)) template -static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { +static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) { if (tx < (Q * Q)) { for (int comp = 0; comp < NUM_COMP; comp++) { for (int j = 0; j < Q; j++) { @@ -181,24 +179,29 @@ static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DI } } -////////////////////////////////////////////////////////////////////////////////////////// -// reads T into shared memory +//////////////////////////////////////////////////////////////////////////////// +// reads T (no-trans) into shared memory +// T is B x J // must sync after call template -static __device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) { - if (transT == MagmaNoTrans) { - // T is B x J - if (tx < B) { - for (int i = 0; i < J; i++) { - sT[i * B + tx] = dT[i * B + tx]; - } +static __device__ __inline__ void read_T_notrans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) { + if (tx < B) { + for (int i = 0; i < J; i++) { + sT[i * B + tx] = dT[i * B + tx]; } - } else { - // T is J x B - if (tx < J) { - for (int i = 0; i < B; i++) { - sT[tx * B + i] = dT[i * J + tx]; - } + } + // must sync after call +} + +//////////////////////////////////////////////////////////////////////////////// +// reads T (trans) into shared memory +// T is J x B +// must sync after call +template +static __device__ __inline__ void read_T_trans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) { + if (tx < J) { + for (int i = 0; i < B; i++) { + sT[tx * B + i] = dT[i * J + tx]; } } // must sync after call From 3e5ab5d1e8af7236d5effd483f3326b502c4c840 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 20 Oct 2023 11:20:08 -0700 Subject: [PATCH 2/2] Remove need to zero out V vector before applying basis transpose for magma backends --- backends/magma/ceed-magma-basis.c | 26 -------- .../jit-source/magma/magma-basis-grad-1d.h | 26 ++++---- .../jit-source/magma/magma-basis-grad-2d.h | 55 +++++++++-------- .../jit-source/magma/magma-basis-grad-3d.h | 59 +++++++++++-------- .../jit-source/magma/magma-basis-interp-1d.h | 26 ++++---- .../jit-source/magma/magma-basis-interp-2d.h | 25 ++++---- .../jit-source/magma/magma-basis-interp-3d.h | 25 ++++---- .../ceed/jit-source/magma/magma-common-defs.h | 2 - 8 files changed, 103 insertions(+), 141 deletions(-) diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 3f0cd2f640..2122342be8 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -54,19 +54,6 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - // Clear v for transpose operation - if (t_mode == CEED_TRANSPOSE) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue); - } else { - magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue); - } - ceed_magma_queue_sync(data->queue); - } - // Apply basis operation switch (e_mode) { case CEED_EVAL_INTERP: { @@ -289,19 +276,6 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - // Clear v for transpose operation - if (t_mode == CEED_TRANSPOSE) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue); - } else { - magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue); - } - ceed_magma_queue_sync(data->queue); - } - // Apply basis operation if (e_mode != CEED_EVAL_WEIGHT) { const CeedScalar *d_b = NULL; diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h index a338be2055..21572c5c3a 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h @@ -18,7 +18,7 @@ //////////////////////////////////////////////////////////////////////////////// // grad basis action (1D) template -static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { +static __device__ __inline__ void magma_grad_1d_device(const T *sT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { // Assumptions // 1. 1D threads of size max(P,Q) // 2. sU[i] is 1xP: in shared memory @@ -28,10 +28,9 @@ static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_ // 6. Must sync before and after call // 7. Note that the layout for U and V is different from 2D/3D problem - T rv; if (tx < Q) { for (int comp = 0; comp < NUM_COMP; comp++) { - rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0; + T rv = 0.0; for (int i = 0; i < P; i++) { rv += sU[comp][i] * sT(i, tx); } @@ -46,10 +45,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -79,7 +77,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ read_1d(dU, cstrdU, sU, tx); __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); + magma_grad_1d_device(sT, sU, sV, tx); __syncthreads(); // write V @@ -92,10 +90,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -124,11 +121,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read U read_1d(dU, cstrdU, sU, tx); - // read V - read_1d(dV, cstrdV, sV, tx); - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); + magma_grad_1d_device(sT, sU, sV, tx); __syncthreads(); // write V diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h index 64397f431b..8ed41718c0 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h @@ -16,6 +16,21 @@ #define sT(i, j) sT[(j)*P + (i)] #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] +//////////////////////////////////////////////////////////////////////////////// +// Helper function to add or set into V +template +struct magma_grad_2d_device_accumulate; + +template +struct magma_grad_2d_device_accumulate { + static __device__ __inline__ void op(T &rV, const T &rTmp) { rV += rTmp; } +}; + +template +struct magma_grad_2d_device_accumulate { + static __device__ __inline__ void op(T &rV, const T &rTmp) { rV = rTmp; } +}; + //////////////////////////////////////////////////////////////////////////////// // grad basis action (2D) // This function is called two times at a higher level for 2D @@ -24,10 +39,9 @@ // i_DIM -- the index of the outermost loop over dimensions in grad // i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) // i_DIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template +template static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], - T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) { // Assumptions // 0. This device routine applies grad for one dim only (i_DIM), so it should be called twice for 2D // 1. 1D threads of size max(P,Q) @@ -68,8 +82,7 @@ static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const for (int i = 0; i < P; i++) { rTmp += sTmp(tx, i, sld) * sT(i, j); } - rV[i_DIM_V][comp][j] *= beta; - rV[i_DIM_V][comp][j] += rTmp; + magma_grad_2d_device_accumulate::op(rV[i_DIM_V][comp][j], rTmp); } } __syncthreads(); @@ -82,10 +95,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -109,22 +121,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ read_T_notrans_gm2sm(tx, dgrad1d, sTgrad); } - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, + sTmp); /* there is a sync at the end of magma_grad_2d_device */ write_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 1) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, + sTmp); /* there is a sync at the end of magma_grad_2d_device */ write_V_2d(dV + (1 * dstrdV), cstrdV, rV, tx); } @@ -135,10 +146,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -163,23 +173,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ } __syncthreads(); - /* read V (since this is transposed mode -- - idim = 0 for dV, i_DIM = 0 for rV) */ - const CeedScalar beta = 1.0; - read_V_2d(dV + (0 * dstrdV), cstrdV, rV, tx); - /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ - magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_2d_device */ // write V diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h index 6a56d47c66..632fe1e5c9 100644 --- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h @@ -17,6 +17,21 @@ #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] #define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] +//////////////////////////////////////////////////////////////////////////////// +// Helper function to add or set into V +template +struct magma_grad_3d_device_accumulate; + +template +struct magma_grad_3d_device_accumulate { + static __device__ __inline__ void op(T &rV, const T &rTmp) { rV += rTmp; } +}; + +template +struct magma_grad_3d_device_accumulate { + static __device__ __inline__ void op(T &rV, const T &rTmp) { rV = rTmp; } +}; + //////////////////////////////////////////////////////////////////////////////// // grad basis action (3D) // This function is called three times at a higher level for 3D @@ -25,10 +40,9 @@ // i_DIM -- the index of the outermost loop over dimensions in grad // i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) // i_DIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) -// the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template +template static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE], - T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) { + T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) { // Assumptions // 0. This device routine applies grad for one dim only (i_DIM), so it should be thrice for 3D // 1. 1D threads of size max(P,Q)^2 @@ -90,8 +104,7 @@ static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const for (int i = 0; i < P; i++) { rTmp += sTmp(tx, i, sld) * sT(i, j); } - rV[i_DIM_V][comp][j] *= beta; - rV[i_DIM_V][comp][j] += rTmp; + magma_grad_3d_device_accumulate::op(rV[i_DIM_V][comp][j], rTmp); } } __syncthreads(); @@ -104,10 +117,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -132,28 +144,28 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA } __syncthreads(); - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, + sTmp); /* there is a sync at the end of magma_grad_3d_device */ write_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 1) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, + sTmp); /* there is a sync at the end of magma_grad_3d_device */ write_V_3d(dV + (1 * dstrdV), cstrdV, rV, tx); /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) -- output from rV[0][][] into dV (idim = 2) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, + sTmp); /* there is a sync at the end of magma_grad_3d_device */ write_V_3d(dV + (2 * dstrdV), cstrdV, rV, tx); } @@ -164,10 +176,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -192,29 +203,25 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA } __syncthreads(); - // read V (since this is transposed mode) - const CeedScalar beta = 1.0; - read_V_3d(dV + (0 * dstrdV), cstrdV, rV, tx); - /* read U (idim = 0 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ /* read U (idim = 1 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ /* read U (idim = 2 for dU, i_DIM = 0 for rU) -- there is a sync at the end of this function */ read_U_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */ - magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp); /* there is a sync at the end of magma_grad_3d_device */ // write V diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h index 0c49fa3696..9b6787d24d 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h @@ -18,7 +18,7 @@ //////////////////////////////////////////////////////////////////////////////// // interp basis action (1D) template -static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { +static __device__ __inline__ void magma_interp_1d_device(const T *sT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) { // Assumptions // 1. 1D threads of size max(P,Q) // 2. sU[i] is 1xP: in shared memory @@ -28,10 +28,9 @@ static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_tran // 6. Must sync before and after call // 7. Note that the layout for U and V is different from 2D/3D problem - T rv; if (tx < Q) { for (int comp = 0; comp < NUM_COMP; comp++) { - rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0; + T rv = 0.0; for (int i = 0; i < P; i++) { rv += sU[comp][i] * sT(i, tx); // sT[tx * P + i]; } @@ -46,10 +45,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -79,7 +77,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ read_1d(dU, cstrdU, sU, tx); __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); + magma_interp_1d_device(sT, sU, sV, tx); __syncthreads(); // write V @@ -92,10 +90,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -124,11 +121,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ // read U read_1d(dU, cstrdU, sU, tx); - // read V - read_1d(dV, cstrdV, sV, tx); - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); + magma_interp_1d_device(sT, sU, sV, tx); __syncthreads(); // write V diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h index ece85a26fe..206f5cc32f 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h @@ -19,8 +19,8 @@ //////////////////////////////////////////////////////////////////////////////// // interp basis action (2D) template -static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], - T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) { +static __device__ __inline__ void magma_interp_2d_device(const T *sT, T rU[DIM_U][NUM_COMP][rU_SIZE], T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, + T rTmp, T *swork) { // Assumptions // 1. 1D threads of size max(P,Q) // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) @@ -71,10 +71,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -100,7 +99,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ read_U_2d(dU, cstrdU, rU, sTmp, tx); // no sync needed here -- read_U_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + magma_interp_2d_device(sT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V @@ -113,10 +112,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -138,14 +136,11 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_ read_T_trans_gm2sm(tx, dT, sT); } - // read V - read_V_2d(dV, cstrdV, rV, tx); - // read U -- there is a sync at the end of this function read_U_2d(dU, cstrdU, rU, sTmp, tx); // no sync needed here -- read_U_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + magma_interp_2d_device(sT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h index 9c5020c3ff..2d6fa106ca 100644 --- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h +++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h @@ -19,8 +19,8 @@ //////////////////////////////////////////////////////////////////////////////// // interp basis action (3D) template -static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE], - T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp[Q], T *swork) { +static __device__ __inline__ void magma_interp_3d_device(const T *sT, T rU[DIM_U][NUM_COMP][rU_SIZE], T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, + T rTmp[Q], T *swork) { // Assumptions // 1. 1D threads of size max(P,Q)^2 // 2. input: rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread) @@ -99,10 +99,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -128,7 +127,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA read_U_3d(dU, cstrdU, rU, sTmp, tx); // there is a sync at the end of this function - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + magma_interp_3d_device(sT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V @@ -141,10 +140,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA const int cstrdV, const int nelem) { MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; if (elem_id >= nelem) return; @@ -166,14 +164,11 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA read_T_trans_gm2sm(tx, dT, sT); } - // read V - read_V_3d(dV, cstrdV, rV, tx); - // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0) read_U_3d(dU, cstrdU, rU, sTmp, tx); // there is a sync at the end of this function - magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + magma_interp_3d_device(sT, rU, rV, tx, rTmp, sTmp); __syncthreads(); // write V diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h index 30c68dfb1e..0cb7c400cf 100644 --- a/include/ceed/jit-source/magma/magma-common-defs.h +++ b/include/ceed/jit-source/magma/magma-common-defs.h @@ -22,6 +22,4 @@ // Define macro for computing the total threads in a block for use with __launch_bounds__() #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) -typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t; - #endif // CEED_MAGMA_COMMON_DEFS_H