From a0b73d41345167f0d3a94e20a50dbb72c671624a Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Fri, 20 Oct 2023 11:19:49 -0700
Subject: [PATCH 1/2] Initial commit to optimize magma backend transpose basis
 application

---
 .../jit-source/magma/magma-basis-grad-1d.h    | 10 +--
 .../jit-source/magma/magma-basis-grad-2d.h    | 28 +++----
 .../jit-source/magma/magma-basis-grad-3d.h    | 32 +++----
 .../jit-source/magma/magma-basis-interp-1d.h  | 10 +--
 .../jit-source/magma/magma-basis-interp-2d.h  | 24 +++---
 .../jit-source/magma/magma-basis-interp-3d.h  | 20 ++---
 .../jit-source/magma/magma-basis-weight-1d.h  |  4 +-
 .../jit-source/magma/magma-basis-weight-2d.h  |  4 +-
 .../jit-source/magma/magma-basis-weight-3d.h  |  4 +-
 .../jit-source/magma/magma-common-tensor.h    | 83 ++++++++++---------
 10 files changed, 111 insertions(+), 108 deletions(-)
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index 89a112115e..a338be2055 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -15,7 +15,7 @@
 // macros to abstract access of shared memory and reg. file
 #define sT(i, j) sT[(j)*P + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // grad basis action (1D)
 template <typename T, int DIM, int NUM_COMP, int P, int Q>
 static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
@@ -40,7 +40,7 @@ static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_gradn_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -72,7 +72,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dTgrad, sT);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dTgrad, sT);
   }
 
   // read U
@@ -86,7 +86,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   write_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_gradt_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -118,7 +118,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dTgrad, sT);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dTgrad, sT);
   }
 
   // read U
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index 042e41b046..64397f431b 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -16,7 +16,7 @@
 #define sT(i, j) sT[(j)*P + (i)]
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // grad basis action (2D)
 // This function is called two times at a higher level for 2D
 // DIM_U   -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q]
@@ -76,7 +76,7 @@ static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const
   }  // loop over NUM_COMP
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
     void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -105,8 +105,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dinterp1d, sTinterp);
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dgrad1d, sTgrad);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dinterp1d, sTinterp);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dgrad1d, sTgrad);
   }
 
   // No need to read V ( required only in transposed grad )
@@ -114,22 +114,22 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
 
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 0) */
   magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
-  writeV_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  write_V_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) --
   output from rV[0][][] into dV (idim = 1) */
   magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
-  writeV_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
+  write_V_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
     void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -158,32 +158,32 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dinterp1d, sTinterp);
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dgrad1d, sTgrad);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
   }
   __syncthreads();
 
   /* read V (since this is transposed mode --
      idim = 0 for dV, i_DIM = 0 for rV) */
   const CeedScalar beta = 1.0;
-  readV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  read_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
   magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
 
   /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
   magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
 
   // write V
-  writeV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
 
 #endif  // CEED_MAGMA_BASIS_GRAD_2D_H
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index 063ee7bc0d..6a56d47c66 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -17,7 +17,7 @@
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 #define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // grad basis action (3D)
 // This function is called three times at a higher level for 3D
 // DIM_U   -- for the size of rU[DIM_U * NUM_COMP * MAX_P_Q]
@@ -98,7 +98,7 @@ static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const
   }  // loop over NUM_COMP
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
     void magma_gradn_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -127,8 +127,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dinterp1d, sTinterp);
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dgrad1d, sTgrad);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dinterp1d, sTinterp);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dgrad1d, sTgrad);
   }
   __syncthreads();
 
@@ -137,28 +137,28 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
 
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
 
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 0) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
-  writeV_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 1) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
-  writeV_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
 
   /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 2) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 2, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
-  writeV_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (2 * dstrdV), cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (2 * dstrdV), cstrdV, rV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
     void magma_gradt_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
@@ -187,38 +187,38 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dinterp1d, sTinterp);
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dgrad1d, sTgrad);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
   }
   __syncthreads();
 
   // read V (since this is transposed mode)
   const CeedScalar beta = 1.0;
-  readV_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  read_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   /* read U (idim = 2 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
-  readU_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */
   magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 2, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   // write V
-  writeV_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
 
 #endif  // CEED_MAGMA_BASIS_GRAD_3D_H
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index 074efd94b6..0c49fa3696 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -15,7 +15,7 @@
 // macros to abstract access of shared memory and reg. file
 #define sT(i, j) sT[(j)*P + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // interp basis action (1D)
 template <typename T, int DIM, int NUM_COMP, int P, int Q>
 static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
@@ -40,7 +40,7 @@ static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_tran
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_interpn_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -72,7 +72,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dT, sT);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dT, sT);
   }
 
   // read U
@@ -86,7 +86,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   write_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_interpt_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -118,7 +118,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dT, sT);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
   }
 
   // read U
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index bb3475df51..ece85a26fe 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -16,7 +16,7 @@
 #define sT(i, j) sT[(j)*P + (i)]
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // interp basis action (2D)
 template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
 static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
@@ -65,7 +65,7 @@ static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_tran
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
     void magma_interpn_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -93,21 +93,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dT, sT);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dT, sT);
   }
 
   // read U -- there is a sync at the end of this function
-  readU_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
+  read_U_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
 
-  // no sync needed here -- readU_2d already syncs at the end
+  // no sync needed here -- read_U_2d already syncs at the end
   magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
-  writeV_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
+  write_V_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
     void magma_interpt_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -135,21 +135,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dT, sT);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
   }
 
   // read V
-  readV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+  read_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 
   // read U -- there is a sync at the end of this function
-  readU_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
 
-  // no sync needed here -- readU_2d already syncs at the end
+  // no sync needed here -- read_U_2d already syncs at the end
   magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
-  writeV_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+  write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
 
 #endif  // CEED_MAGMA_BASIS_INTERP_2D_H
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 8f2fd3985e..9c5020c3ff 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -16,7 +16,7 @@
 #define sT(i, j) sT[(j)*P + (i)]
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // interp basis action (3D)
 template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
 static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
@@ -93,7 +93,7 @@ static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_tran
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
     void magma_interpn_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -121,21 +121,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_P, BASIS_Q>(tx, transT, dT, sT);
+    read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dT, sT);
   }
 
   // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0)
-  readU_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
   // there is a sync at the end of this function
 
   magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
-  writeV_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV, cstrdV, rV, tx);
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
     void magma_interpt_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
                                  const int cstrdV, const int nelem) {
@@ -163,21 +163,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
 
   // read T
   if (ty == 0) {
-    dread_T_gm2sm<BASIS_Q, BASIS_P>(tx, transT, dT, sT);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
   }
 
   // read V
-  readV_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+  read_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 
   // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0)
-  readU_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
   // there is a sync at the end of this function
 
   magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
-  writeV_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+  write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
 
 #endif  // CEED_MAGMA_BASIS_INTERP_3D_H
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 38a058ea67..879ad73fcd 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -12,7 +12,7 @@
 
 #include "magma-common-tensor.h"
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // weight basis action -- 1D
 template <typename T, int Q>
 static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) {
@@ -24,7 +24,7 @@ static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *s
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_weight_1d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
index 721b50f953..21b0d1c0c7 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
@@ -12,7 +12,7 @@
 
 #include "magma-common-tensor.h"
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // weight basis action -- 2D
 template <typename T, int DIM, int NUM_COMP, int Q, int i_DIM, int i_COMP>
 static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) {
@@ -32,7 +32,7 @@ static __device__ __inline__ void magma_weight_2d_device(const T *sTweight, T rV
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_2D)) __global__
     void magma_weight_2d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
index 835bca44cd..1233bb93d5 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
@@ -12,7 +12,7 @@
 
 #include "magma-common-tensor.h"
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // weight basis action -- 3D
 template <typename T, int DIM, int NUM_COMP, int Q, int i_DIM, int i_COMP>
 static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV[DIM][NUM_COMP][Q], const int tx) {
@@ -33,7 +33,7 @@ static __device__ __inline__ void magma_weight_3d_device(const T *sTweight, T rV
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q *BASIS_Q, MAGMA_MAXTHREADS_3D)) __global__
     void magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h
index 1ca3f52758..6ea70f967e 100644
--- a/include/ceed/jit-source/magma/magma-common-tensor.h
+++ b/include/ceed/jit-source/magma/magma-common-tensor.h
@@ -12,7 +12,7 @@
 
 #include "magma-common-defs.h"
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // read U or V of a 1D element into shared memory sU[][] or sV[][] --  for all components
 // the devptr is assumed to point directly to the element
 // must sync after call
@@ -25,7 +25,7 @@ static __device__ __inline__ void read_1d(const T *devptr, const int compstride,
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // write V of a 1D element into global memory from sV[][] --  for all components
 // the devptr is assumed to point directly to the element
 template <typename T, int LENGTH, int NUM_COMP>
@@ -37,16 +37,16 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // read U of a 2D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
 // register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE]
 // i_DIM specifies which dimension is being read into in rU
-// rU_SIZE can be different from P (e.g. MAXP_Q)
+// rU_SIZE can be different from P (e.g. max(P, Q))
 // sTmp is a shared memory workspace of size P^2
 template <typename T, int P, int DIM_U, int NUM_COMP, int rU_SIZE, int i_DIM>
-static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
-  // read U as a batch P of (1 x P_) vectors
+static __device__ __inline__ void read_U_2d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
+  // read U as a batch P of (1 x P) vectors
   // vec 0  : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
   // vec 1  : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
   // ...
@@ -74,14 +74,14 @@ static __device__ __inline__ void readU_2d(const T *dU, const int compstride, T
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // read V of a 2D element into registers rV[][][] --  for all components of a single dim
 // dV is assumed to be offset by elem-stride and dim-stride
 // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
 // i_DIM specifies which dimension is being read into in rV
-// rV_SIZE can be different from P (e.g. MAXP_Q)
+// rV_SIZE can be different from P (e.g. max(P, Q))
 template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
-static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+static __device__ __inline__ void read_V_2d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
   if (tx < Q) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
       for (int j = 0; j < Q; j++) {
@@ -91,15 +91,14 @@ static __device__ __inline__ void readV_2d(const T *dV, const int compstride, T
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // write V of a 2D element from registers rV[][][] to global memory --  for all components of a single dim
 // dV is assumed to be offset by elem-stride and dim-stride
 // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
-// i_DIM specifies which dimension is being read from in rV
-// idim specifies which dimension is being written to in dV
-// rV_SIZE can be different from P (e.g. MAXP_Q)
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
 template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
-static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
   if (tx < Q) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
       for (int j = 0; j < Q; j++) {
@@ -109,15 +108,15 @@ static __device__ __inline__ void writeV_2d(T *dV, const int compstride, T rV[DI
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // read U of a 3D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
 // register is assumed to be rU[DIM_U][NUM_COMP][rU_SIZE]
 // i_DIM specifies which dimension is being read into in rU
-// rU_SIZE can be different from P (e.g. MAXP_Q)
+// rU_SIZE can be different from P (e.g. max(P, Q))
 // sTmp is a shared memory workspace of size P^3
 template <typename T, int P, int DIM_U, int NUM_COMP, int rU_SIZE, int i_DIM>
-static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
+static __device__ __inline__ void read_U_3d(const T *dU, const int compstride, T rU[DIM_U][NUM_COMP][rU_SIZE], T *sTmp, const int tx) {
   // read U as a batch P^2 of (1 x P_) vectors
   // vec 0    : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
   // vec 1    : [u0, u1, u2, ... u_(P-1)] -- contiguous in memory
@@ -146,14 +145,14 @@ static __device__ __inline__ void readU_3d(const T *dU, const int compstride, T
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // read V of a 3D element into registers rV[][][] --  for all components of a single dim
 // dV is assumed to be offset by elem-stride and dim-stride
 // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
 // i_DIM specifies which dimension is being read into in rV
-// rV_SIZE can be different from P (e.g. MAXP_Q)
+// rV_SIZE can be different from P (e.g. max(P, Q))
 template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
-static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+static __device__ __inline__ void read_V_3d(const T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
   if (tx < Q * Q) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
       for (int j = 0; j < Q; j++) {
@@ -163,15 +162,14 @@ static __device__ __inline__ void readV_3d(const T *dV, const int compstride, T
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // write V of a 3D element from registers rV[][][] to global memory --  for all components of a single dim
 // dV is assumed to point directly to the element (i.e. already offset by elem-stride)
 // register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
-// i_DIM specifies which dimension is being read from in rV
-// idim specifies which dimension is being written to in dV
-// rV_SIZE can be different from P (e.g. MAXP_Q)
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
 template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
-static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
   if (tx < (Q * Q)) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
       for (int j = 0; j < Q; j++) {
@@ -181,24 +179,29 @@ static __device__ __inline__ void writeV_3d(T *dV, const int compstride, T rV[DI
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////
-// reads T into shared memory
+////////////////////////////////////////////////////////////////////////////////
+// reads T (no-trans) into shared memory
+// T is B x J
 // must sync after call
 template <int B, int J>
-static __device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar *dT, CeedScalar *sT) {
-  if (transT == MagmaNoTrans) {
-    // T is B x J
-    if (tx < B) {
-      for (int i = 0; i < J; i++) {
-        sT[i * B + tx] = dT[i * B + tx];
-      }
+static __device__ __inline__ void read_T_notrans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) {
+  if (tx < B) {
+    for (int i = 0; i < J; i++) {
+      sT[i * B + tx] = dT[i * B + tx];
     }
-  } else {
-    // T is J x B
-    if (tx < J) {
-      for (int i = 0; i < B; i++) {
-        sT[tx * B + i] = dT[i * J + tx];
-      }
+  }
+  // must sync after call
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reads T (trans) into shared memory
+// T is J x B
+// must sync after call
+template <int B, int J>
+static __device__ __inline__ void read_T_trans_gm2sm(const int tx, const CeedScalar *dT, CeedScalar *sT) {
+  if (tx < J) {
+    for (int i = 0; i < B; i++) {
+      sT[tx * B + i] = dT[i * J + tx];
     }
   }
   // must sync after call

From 3e5ab5d1e8af7236d5effd483f3326b502c4c840 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg <sjg@amazon.com>
Date: Fri, 20 Oct 2023 11:20:08 -0700
Subject: [PATCH 2/2] Remove need to zero out V vector before applying basis
 transpose for magma backends

---
 backends/magma/ceed-magma-basis.c             | 26 --------
 .../jit-source/magma/magma-basis-grad-1d.h    | 26 ++++----
 .../jit-source/magma/magma-basis-grad-2d.h    | 55 +++++++++--------
 .../jit-source/magma/magma-basis-grad-3d.h    | 59 +++++++++++--------
 .../jit-source/magma/magma-basis-interp-1d.h  | 26 ++++----
 .../jit-source/magma/magma-basis-interp-2d.h  | 25 ++++----
 .../jit-source/magma/magma-basis-interp-3d.h  | 25 ++++----
 .../ceed/jit-source/magma/magma-common-defs.h |  2 -
 8 files changed, 103 insertions(+), 141 deletions(-)

diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3f0cd2f640..2122342be8 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -54,19 +54,6 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
-  // Clear v for transpose operation
-  if (t_mode == CEED_TRANSPOSE) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
-    } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
-    }
-    ceed_magma_queue_sync(data->queue);
-  }
-
   // Apply basis operation
   switch (e_mode) {
     case CEED_EVAL_INTERP: {
@@ -289,19 +276,6 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
-  // Clear v for transpose operation
-  if (t_mode == CEED_TRANSPOSE) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-      magmablas_slaset(MagmaFull, length, 1, 0.0, 0.0, (float *)d_v, length, data->queue);
-    } else {
-      magmablas_dlaset(MagmaFull, length, 1, 0.0, 0.0, (double *)d_v, length, data->queue);
-    }
-    ceed_magma_queue_sync(data->queue);
-  }
-
   // Apply basis operation
   if (e_mode != CEED_EVAL_WEIGHT) {
     const CeedScalar *d_b = NULL;
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index a338be2055..21572c5c3a 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -18,7 +18,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // grad basis action (1D)
 template <typename T, int DIM, int NUM_COMP, int P, int Q>
-static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
+static __device__ __inline__ void magma_grad_1d_device(const T *sT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
   // Assumptions
   // 1. 1D threads of size max(P,Q)
   // 2. sU[i] is 1xP: in shared memory
@@ -28,10 +28,9 @@ static __device__ __inline__ void magma_grad_1d_device(const T *sT, magma_trans_
   // 6. Must sync before and after call
   // 7. Note that the layout for U and V is different from 2D/3D problem
 
-  T rv;
   if (tx < Q) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
-      rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0;
+      T rv = 0.0;
       for (int i = 0; i < P; i++) {
         rv += sU[comp][i] * sT(i, tx);
       }
@@ -46,10 +45,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -79,7 +77,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   read_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
 
   __syncthreads();
-  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_P, BASIS_Q>(sT, transT, sU, sV, tx);
+  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_P, BASIS_Q>(sT, sU, sV, tx);
   __syncthreads();
 
   // write V
@@ -92,10 +90,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -124,11 +121,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // read U
   read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
 
-  // read V
-  read_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(dV, cstrdV, sV, tx);
-
   __syncthreads();
-  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, transT, sU, sV, tx);
+  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
   __syncthreads();
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index 64397f431b..8ed41718c0 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -16,6 +16,21 @@
 #define sT(i, j) sT[(j)*P + (i)]
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 
+////////////////////////////////////////////////////////////////////////////////
+// Helper function to add or set into V
+template <typename T, bool Add>
+struct magma_grad_2d_device_accumulate;
+
+template <typename T>
+struct magma_grad_2d_device_accumulate<T, true> {
+  static __device__ __inline__ void op(T &rV, const T &rTmp) { rV += rTmp; }
+};
+
+template <typename T>
+struct magma_grad_2d_device_accumulate<T, false> {
+  static __device__ __inline__ void op(T &rV, const T &rTmp) { rV = rTmp; }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // grad basis action (2D)
 // This function is called two times at a higher level for 2D
@@ -24,10 +39,9 @@
 // i_DIM   -- the index of the outermost loop over dimensions in grad
 // i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans)
 // i_DIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans)
-// the scalar beta is used to specify whether to accumulate to rV, or overwrite it
-template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE, int i_DIM, int i_DIM_U, int i_DIM_V>
+template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE, int i_DIM, int i_DIM_U, int i_DIM_V, bool ADD>
 static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE],
-                                                       T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) {
+                                                       T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) {
   // Assumptions
   // 0. This device routine applies grad for one dim only (i_DIM), so it should be called twice for 2D
   // 1. 1D threads of size max(P,Q)
@@ -68,8 +82,7 @@ static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const
         for (int i = 0; i < P; i++) {
           rTmp += sTmp(tx, i, sld) * sT(i, j);
         }
-        rV[i_DIM_V][comp][j] *= beta;
-        rV[i_DIM_V][comp][j] += rTmp;
+        magma_grad_2d_device_accumulate<T, ADD>::op(rV[i_DIM_V][comp][j], rTmp);
       }
     }
     __syncthreads();
@@ -82,10 +95,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -109,22 +121,21 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
     read_T_notrans_gm2sm<BASIS_P, BASIS_Q>(tx, dgrad1d, sTgrad);
   }
 
-  // No need to read V ( required only in transposed grad )
-  const CeedScalar beta = 0.0;
-
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
 
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 0) */
-  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0, false>(sTinterp, sTgrad, rU, rV, tx, rTmp,
+                                                                                                             sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
   write_V_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) --
   output from rV[0][][] into dV (idim = 1) */
-  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0, false>(sTinterp, sTgrad, rU, rV, tx, rTmp,
+                                                                                                             sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
   write_V_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
 }
@@ -135,10 +146,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -163,23 +173,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   }
   __syncthreads();
 
-  /* read V (since this is transposed mode --
-     idim = 0 for dV, i_DIM = 0 for rV) */
-  const CeedScalar beta = 1.0;
-  read_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
-
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
-  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
 
   /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
-  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_2d_device */
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index 6a56d47c66..632fe1e5c9 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -17,6 +17,21 @@
 #define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)]
 #define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)]
 
+////////////////////////////////////////////////////////////////////////////////
+// Helper function to add or set into V
+template <typename T, bool Add>
+struct magma_grad_3d_device_accumulate;
+
+template <typename T>
+struct magma_grad_3d_device_accumulate<T, true> {
+  static __device__ __inline__ void op(T &rV, const T &rTmp) { rV += rTmp; }
+};
+
+template <typename T>
+struct magma_grad_3d_device_accumulate<T, false> {
+  static __device__ __inline__ void op(T &rV, const T &rTmp) { rV = rTmp; }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // grad basis action (3D)
 // This function is called three times at a higher level for 3D
@@ -25,10 +40,9 @@
 // i_DIM   -- the index of the outermost loop over dimensions in grad
 // i_DIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans)
 // i_DIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans)
-// the scalar beta is used to specify whether to accumulate to rV, or overwrite it
-template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE, int i_DIM, int i_DIM_U, int i_DIM_V>
+template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE, int i_DIM, int i_DIM_U, int i_DIM_V, bool ADD>
 static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NUM_COMP][rU_SIZE],
-                                                       T rV[DIM_V][NUM_COMP][rV_SIZE], T beta, const int tx, T rTmp, T *swork) {
+                                                       T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) {
   // Assumptions
   // 0. This device routine applies grad for one dim only (i_DIM), so it should be thrice for 3D
   // 1. 1D threads of size max(P,Q)^2
@@ -90,8 +104,7 @@ static __device__ __inline__ void magma_grad_3d_device(const T *sTinterp, const
         for (int i = 0; i < P; i++) {
           rTmp += sTmp(tx, i, sld) * sT(i, j);
         }
-        rV[i_DIM_V][comp][j] *= beta;
-        rV[i_DIM_V][comp][j] += rTmp;
+        magma_grad_3d_device_accumulate<T, ADD>::op(rV[i_DIM_V][comp][j], rTmp);
       }
     }
     __syncthreads();
@@ -104,10 +117,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -132,28 +144,28 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   }
   __syncthreads();
 
-  // No need to read V ( required only in transposed grad )
-  const CeedScalar beta = 0.0;
-
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
 
   /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 0) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 0, 0, 0, false>(sTinterp, sTgrad, rU, rV, tx, rTmp,
+                                                                                                             sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
   write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 
   /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 1) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 1, 0, 0, false>(sTinterp, sTgrad, rU, rV, tx, rTmp,
+                                                                                                             sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
   write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (1 * dstrdV), cstrdV, rV, tx);
 
   /* third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) --
      output from rV[0][][] into dV (idim = 2) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 2, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q, 2, 0, 0, false>(sTinterp, sTgrad, rU, rV, tx, rTmp,
+                                                                                                             sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
   write_V_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dV + (2 * dstrdV), cstrdV, rV, tx);
 }
@@ -164,10 +176,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -192,29 +203,25 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   }
   __syncthreads();
 
-  // read V (since this is transposed mode)
-  const CeedScalar beta = 1.0;
-  read_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
-
   /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   /* read U (idim = 2 for dU, i_DIM = 0 for rU) --
      there is a sync at the end of this function */
   read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx);
   /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */
-  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 2, 0, 0>(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp);
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 2, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
   /* there is a sync at the end of magma_grad_3d_device */
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index 0c49fa3696..9b6787d24d 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -18,7 +18,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // interp basis action (1D)
 template <typename T, int DIM, int NUM_COMP, int P, int Q>
-static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_trans_t transT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
+static __device__ __inline__ void magma_interp_1d_device(const T *sT, T *sU[NUM_COMP], T *sV[NUM_COMP], const int tx) {
   // Assumptions
   // 1. 1D threads of size max(P,Q)
   // 2. sU[i] is 1xP: in shared memory
@@ -28,10 +28,9 @@ static __device__ __inline__ void magma_interp_1d_device(const T *sT, magma_tran
   // 6. Must sync before and after call
   // 7. Note that the layout for U and V is different from 2D/3D problem
 
-  T rv;
   if (tx < Q) {
     for (int comp = 0; comp < NUM_COMP; comp++) {
-      rv = (transT == MagmaTrans) ? sV[comp][tx] : 0.0;
+      T rv = 0.0;
       for (int i = 0; i < P; i++) {
         rv += sU[comp][i] * sT(i, tx);  // sT[tx * P + i];
       }
@@ -46,10 +45,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -79,7 +77,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   read_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
 
   __syncthreads();
-  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_P, BASIS_Q>(sT, transT, sU, sV, tx);
+  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_P, BASIS_Q>(sT, sU, sV, tx);
   __syncthreads();
 
   // write V
@@ -92,10 +90,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -124,11 +121,8 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // read U
   read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
 
-  // read V
-  read_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(dV, cstrdV, sV, tx);
-
   __syncthreads();
-  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, transT, sU, sV, tx);
+  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
   __syncthreads();
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index ece85a26fe..206f5cc32f 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -19,8 +19,8 @@
 ////////////////////////////////////////////////////////////////////////////////
 // interp basis action (2D)
 template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
-static __device__ __inline__ void magma_interp_2d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
-                                                         T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp, T *swork) {
+static __device__ __inline__ void magma_interp_2d_device(const T *sT, T rU[DIM_U][NUM_COMP][rU_SIZE], T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx,
+                                                         T rTmp, T *swork) {
   // Assumptions
   // 1. 1D threads of size max(P,Q)
   // 2. input:  rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread)
@@ -71,10 +71,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -100,7 +99,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   read_U_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
 
   // no sync needed here -- read_U_2d already syncs at the end
-  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
@@ -113,10 +112,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -138,14 +136,11 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
     read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
   }
 
-  // read V
-  read_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
-
   // read U -- there is a sync at the end of this function
   read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
 
   // no sync needed here -- read_U_2d already syncs at the end
-  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 9c5020c3ff..2d6fa106ca 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -19,8 +19,8 @@
 ////////////////////////////////////////////////////////////////////////////////
 // interp basis action (3D)
 template <typename T, int DIM_U, int DIM_V, int NUM_COMP, int P, int Q, int rU_SIZE, int rV_SIZE>
-static __device__ __inline__ void magma_interp_3d_device(const T *sT, magma_trans_t transT, T rU[DIM_U][NUM_COMP][rU_SIZE],
-                                                         T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx, T rTmp[Q], T *swork) {
+static __device__ __inline__ void magma_interp_3d_device(const T *sT, T rU[DIM_U][NUM_COMP][rU_SIZE], T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx,
+                                                         T rTmp[Q], T *swork) {
   // Assumptions
   // 1. 1D threads of size max(P,Q)^2
   // 2. input:  rU[DIM_U x NUM_COMP x rU_SIZE] in registers (per thread)
@@ -99,10 +99,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaNoTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -128,7 +127,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   read_U_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dU, cstrdU, rU, sTmp, tx);
   // there is a sync at the end of this function
 
-  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_P, BASIS_Q>(sT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
@@ -141,10 +140,9 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
                                  const int cstrdV, const int nelem) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
 
-  const int     tx      = threadIdx.x;
-  const int     ty      = threadIdx.y;
-  const int     elem_id = (blockIdx.x * blockDim.y) + ty;
-  magma_trans_t transT  = MagmaTrans;
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
 
   if (elem_id >= nelem) return;
 
@@ -166,14 +164,11 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
     read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
   }
 
-  // read V
-  read_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
-
   // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0)
   read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
   // there is a sync at the end of this function
 
-  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, transT, rU, rV, tx, rTmp, sTmp);
+  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
   __syncthreads();
 
   // write V
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index 30c68dfb1e..0cb7c400cf 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -22,6 +22,4 @@
 // Define macro for computing the total threads in a block for use with __launch_bounds__()
 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
 
-typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t;
-
 #endif  // CEED_MAGMA_COMMON_DEFS_H