diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 7023544b1b..10806a0ff0 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -268,7 +268,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
                                          CeedVector v) {
   Ceed                      ceed;
   Ceed_Magma               *data;
-  CeedInt                   dim, num_comp, num_nodes, num_qpts, P, Q, N;
+  CeedInt                   num_comp, q_comp, num_nodes, num_qpts, P, Q, N;
   const CeedScalar         *d_u;
   CeedScalar               *d_v;
   CeedBasisNonTensor_Magma *impl;
@@ -276,8 +276,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
   CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp));
   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
   P = num_nodes;
@@ -304,7 +304,30 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed

   // Apply basis operation
   if (e_mode != CEED_EVAL_WEIGHT) {
-    if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
+    const CeedScalar *d_b = NULL;
+    switch (e_mode) {
+      case CEED_EVAL_INTERP:
+        d_b = impl->d_interp;
+        break;
+      case CEED_EVAL_GRAD:
+        d_b = impl->d_grad;
+        break;
+      case CEED_EVAL_DIV:
+        d_b = impl->d_div;
+        break;
+      case CEED_EVAL_CURL:
+        d_b = impl->d_curl;
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT does not make sense in this context");
+      case CEED_EVAL_NONE:
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+      // LCOV_EXCL_STOP
+    }
+
+    // Apply basis operation
+    if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
       CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
       CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff;
       CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
@@ -319,92 +342,85 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed

       // Compile kernels for N as needed
       if (!impl->NB_interp[iN]) {
+        CeedFESpace fe_space;
+        CeedInt     q_comp_interp, q_comp_deriv;
         Ceed        ceed_delegate;
-        char       *interp_kernel_path, *grad_kernel_path, *basis_kernel_source;
+        char       *basis_kernel_path, *basis_kernel_source;
         magma_int_t arch = magma_getdevice_arch();

         // Tuning parameters for NB
-        impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', 1, P, Q, n_array[iN]);
-        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', 1, P, Q, n_array[iN]);
-        impl->NB_grad[iN]     = nontensor_rtc_get_nb(arch, 'n', dim, P, Q, n_array[iN]);
-        impl->NB_grad_t[iN]   = nontensor_rtc_get_nb(arch, 't', dim, P, Q, n_array[iN]);
+        CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space));
+        CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+        switch (fe_space) {
+          case CEED_FE_SPACE_H1:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv));
+            break;
+          case CEED_FE_SPACE_HDIV:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv));
+            break;
+          case CEED_FE_SPACE_HCURL:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv));
+            break;
+        }
+        impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]);
+        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]);
+        impl->NB_deriv[iN]    = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]);
+        impl->NB_deriv_t[iN]  = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]);

         // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
         CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));

         // Compile kernels
-        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-nontensor.h", &interp_kernel_path));
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path));
         CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-        CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
-        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-grad-nontensor.h", &grad_kernel_path));
-        CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+        CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
         CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
-        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 7, "BASIS_DIM", dim, "BASIS_P", P, "BASIS_Q",
-                                         Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_GRAD_N",
-                                         impl->NB_grad[iN], "BASIS_NB_GRAD_T", impl->NB_grad_t[iN]));
+        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp,
+                                         "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN],
+                                         "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T",
+                                         impl->NB_deriv_t[iN]));
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
-        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_n", &impl->Grad[iN]));
-        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_t", &impl->GradTranspose[iN]));
-        CeedCallBackend(CeedFree(&interp_kernel_path));
-        CeedCallBackend(CeedFree(&grad_kernel_path));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
+        CeedCallBackend(CeedFree(&basis_kernel_path));
         CeedCallBackend(CeedFree(&basis_kernel_source));
       }
-
-      // Apply basis operation
-      CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      CeedMagmaFunction Kernel;
+      CeedInt           NB;
       if (e_mode == CEED_EVAL_INTERP) {
-        CeedInt NB           = (t_mode == CEED_TRANSPOSE) ? impl->NB_interp_t[iN] : impl->NB_interp[iN];
-        CeedInt grid         = CeedDivUpInt(N, NB * num_t_col);
-        CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
-        CeedInt shared_mem   = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
-        void   *args[]       = {&N, &impl->d_interp, &P, &d_u, &K, &d_v, &M};
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->InterpTranspose[iN];
+          NB     = impl->NB_interp_t[iN];
         } else {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->Interp[iN];
+          NB     = impl->NB_interp[iN];
         }
-      } else if (e_mode == CEED_EVAL_GRAD) {
-        CeedInt NB         = (t_mode == CEED_TRANSPOSE) ? impl->NB_grad_t[iN] : impl->NB_grad[iN];
-        CeedInt grid       = CeedDivUpInt(N, NB * num_t_col);
-        CeedInt shared_mem = num_t_col * K * NB * sizeof(CeedScalar) + ((t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar));
-        void   *args[]     = {&N, &impl->d_grad, &P, &d_u, &K, &d_v, &M};
-
+      } else {
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->DerivTranspose[iN];
+          NB     = impl->NB_deriv_t[iN];
         } else {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->Deriv[iN];
+          NB     = impl->NB_deriv[iN];
         }
-      } else {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
-        // LCOV_EXCL_STOP
       }
+      CeedInt num_t_col    = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      CeedInt grid         = CeedDivUpInt(N, num_t_col * NB);
+      CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : P * Q * sizeof(CeedScalar);
+      CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
+      CeedInt shared_mem   = (t_mode == CEED_TRANSPOSE || q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
+      void   *args[]       = {&N, &d_b, &d_u, &d_v};
+
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args));
     } else {
-      if (e_mode == CEED_EVAL_INTERP) {
+      for (CeedInt d = 0; d < q_comp; d++) {
         if (t_mode == CEED_TRANSPOSE) {
-          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_interp, P, d_u, Q, 0.0, d_v, P, data->queue);
+          const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
         } else {
-          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_interp, P, d_u, P, 0.0, d_v, Q, data->queue);
+          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
         }
-      } else if (e_mode == CEED_EVAL_GRAD) {
-        if (t_mode == CEED_TRANSPOSE) {
-          for (int d = 0; d < dim; d++) {
-            const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
-            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_grad + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P,
-                                 data->queue);
-          }
-        } else {
-          for (int d = 0; d < dim; d++) {
-            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_grad + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
-          }
-        }
-      } else {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
-        // LCOV_EXCL_STOP
       }
     }
   } else {
@@ -412,7 +428,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
-    void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v, &Q};
+    void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};

     CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
   }
@@ -474,6 +490,8 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   }
   CeedCallBackend(magma_free(impl->d_interp));
   CeedCallBackend(magma_free(impl->d_grad));
+  CeedCallBackend(magma_free(impl->d_div));
+  CeedCallBackend(magma_free(impl->d_curl));
   CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
@@ -590,10 +608,126 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   // Copy basis data to GPU
   CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
   magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * sizeof(interp[0])));
-  magma_setvector(num_qpts * num_nodes, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * dim * sizeof(grad[0])));
-  magma_setvector(num_qpts * num_nodes * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (grad) {
+    CeedInt q_comp_grad;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+    CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
+  }
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor H(div)
+//------------------------------------------------------------------------------
+int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                              const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                      ceed, ceed_delegate;
+  Ceed_Magma               *data;
+  char                     *weight_kernel_path, *basis_kernel_source;
+  CeedBasisNonTensor_Magma *impl;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (div) {
+    CeedInt q_comp_div;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
+    CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue);
+  }
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor H(curl)
+//------------------------------------------------------------------------------
+int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                      ceed, ceed_delegate;
+  Ceed_Magma               *data;
+  char                     *weight_kernel_path, *basis_kernel_source;
+  CeedBasisNonTensor_Magma *impl;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (curl) {
+    CeedInt q_comp_curl;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
+    CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_qpts * num_nodes * q_comp_curl * sizeof(curl[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue);
+  }

   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index 92fc1b3d8d..8abaf83882 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -39,6 +39,8 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma));
   return CEED_ERROR_SUCCESS;
 }
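For orientation, here is a minimal host-side sketch of what now reaches this backend: an H(div) basis created through the public libCEED API, which the "BasisCreateHdiv" registration above routes to CeedBasisCreateHdiv_Magma. The resource string assumes a CUDA build of the MAGMA backend, and the quad sizes (4 nodes, 4 quadrature points) and zero-filled tables are placeholders, not meaningful basis data.

#include <ceed.h>

int main(void) {
  Ceed       ceed;
  CeedBasis  basis;
  // Placeholder tables: interp is q_comp (= dim = 2) x num_qpts x num_nodes,
  // div is num_qpts x num_nodes, q_ref is dim x num_qpts, q_weight is num_qpts.
  CeedScalar interp[2 * 4 * 4] = {0}, div[4 * 4] = {0}, q_ref[2 * 4] = {0}, q_weight[4] = {0};

  CeedInit("/gpu/cuda/magma", &ceed);
  CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, 4, 4, interp, div, q_ref, q_weight, &basis);
  // Subsequent CeedBasisApply calls with CEED_EVAL_INTERP or CEED_EVAL_DIV
  // dispatch to CeedBasisApplyNonTensor_Magma above.
  CeedBasisDestroy(&basis);
  CeedDestroy(&ceed);
  return 0;
}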
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index b5495a61bb..ae5ea39093 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -60,21 +60,26 @@ typedef struct {
   CeedMagmaModule   module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction Grad[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction GradTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Weight;
   CeedInt           NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt           NB_grad[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedInt           NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedScalar       *d_interp;
   CeedScalar       *d_grad;
+  CeedScalar       *d_div;
+  CeedScalar       *d_curl;
   CeedScalar       *d_q_weight;
 } CeedBasisNonTensor_Magma;

 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                               const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
-
 CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                                         const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+CEED_INTERN int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                          const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+CEED_INTERN int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                           const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);

 CEED_INTERN magma_int_t magma_isdevptr(const void *);
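The struct above sizes d_interp, d_grad, d_div, and d_curl by the number of quadrature components rather than by the dimension. As a loudly hedged reference, the helper below encodes my reading of the CeedBasisGetNumQuadratureComponents convention; it is illustrative only (a hypothetical name, not library code), and the backend always queries the library rather than computing this itself.

#include <ceed.h>

// Hypothetical helper: expected quadrature-component counts per FE space and
// eval mode, per my reading of the libCEED convention.
static CeedInt q_comp_guess(CeedFESpace fe_space, CeedEvalMode e_mode, CeedInt dim) {
  switch (e_mode) {
    case CEED_EVAL_INTERP: return fe_space == CEED_FE_SPACE_H1 ? 1 : dim;  // vector-valued for H(div)/H(curl)
    case CEED_EVAL_GRAD:   return dim;                                     // H1 gradient
    case CEED_EVAL_DIV:    return 1;                                      // divergence is scalar
    case CEED_EVAL_CURL:   return dim == 3 ? 3 : 1;                       // scalar curl in 2D
    default:               return 1;
  }
}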
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h
deleted file mode 100644
index 95bf548fc3..0000000000
--- a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-/// @file
-/// Internal header for Magma non-tensor basis gradient
-#ifndef CEED_MAGMA_GRAD_NONTENSOR_H
-#define CEED_MAGMA_GRAD_NONTENSOR_H
-
-#include "magma-common-nontensor.h"
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_grad_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_N);
-  const int myn     = min(BASIS_NB_GRAD_N, n - id * BASIS_NB_GRAD_N);
-
-  dB += id * BASIS_NB_GRAD_N * lddb;
-  dC += id * BASIS_NB_GRAD_N * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   slda = BASIS_P;
-  const int   sldb = BASIS_P;
-  CeedScalar *sA   = (CeedScalar *)shared_data;
-  CeedScalar *sB   = sA + BASIS_Q * BASIS_P;
-  sB += ty * sldb * BASIS_NB_GRAD_N;
-
-  // read B once for all C's
-  if (id < nblocks) {
-    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  }
-
-  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
-  for (int d = 0; d < BASIS_DIM; d++) {
-    // read A (BASIS_P x BASIS_Q) using all threads
-    CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO};
-    __syncthreads();
-    read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA);
-
-    // init rC
-    CeedScalar rC[BASIS_NB_GRAD_N] = {MAGMA_D_ZERO};
-    if (id < nblocks) {
-      mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-    }
-
-    // write C
-    if (id < nblocks) {
-      write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-    }
-
-    dA += BASIS_Q * BASIS_P;
-    dC += BASIS_Q * n;
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_grad_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_T);
-  const int myn     = min(BASIS_NB_GRAD_T, n - id * BASIS_NB_GRAD_T);
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  dB += id * BASIS_NB_GRAD_T * lddb;
-  dC += id * BASIS_NB_GRAD_T * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   sldb = BASIS_Q;
-  CeedScalar *sB   = (CeedScalar *)shared_data;
-  sB += ty * sldb * BASIS_NB_GRAD_T;
-
-  // init rA, rC
-  CeedScalar rA[BASIS_Q]         = {MAGMA_D_ZERO};
-  CeedScalar rC[BASIS_NB_GRAD_T] = {MAGMA_D_ZERO};
-
-  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
-  for (int d = 0; d < BASIS_DIM; d++) {
-    // read A
-    read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA);
-
-    // read B
-    __syncthreads();
-    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-    __syncthreads();
-
-    addmul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-    dA += BASIS_P * BASIS_Q;
-    dB += BASIS_Q * n;
-  }
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-#endif  // CEED_MAGMA_GRAD_NONTENSOR_H
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
new file mode 100644
index 0000000000..e89c6ade4f
--- /dev/null
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for MAGMA non-tensor basis interpolation
+#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
+#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
+
+#include "magma-common-nontensor.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB, int Q_COMP>
+static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                 CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * P * NB;
+  dC += id * Q * NB;
+
+  // A is P x Q
+  CeedScalar *sB = shared_data + ty * P * NB;
+  CeedScalar *sA = shared_data + blockDim.y * P * NB;
+
+  // read B once for all C's
+  if (id < nblocks) {
+    read_B_g2s_1D_nosync(tx, myn, dB, sB);
+  }
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // init rA, rC
+    CeedScalar rA[P], rC[NB];
+
+    // read A using all threads
+    read_A_trans_g2r_1D_nosync(tx, ty, dA, sA, rA);
+
+    if (id < nblocks) {
+      mul_rAsBrC_1D_nosync(rA, sB, rC);
+
+      // write C
+      write_C_r2g_1D_nosync(tx, myn, rC, dC);
+    }
+
+    dA += Q * P;
+    dC += Q * n;
+
+    __syncthreads();
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                  CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * P * NB;
+  dC += id * Q * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * P * NB;
+
+  // init rA, rC
+  CeedScalar rA[P], rC[NB];
+
+  // read A using all threads
+  read_A_trans_g2r_1D_nosync(tx, ty, dA, sA, rA);
+  __syncthreads();
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  // read B once for all C's
+  read_B_g2s_1D_nosync(tx, myn, dB, sB);
+  __syncthreads();
+
+  mul_rAsBrC_1D_nosync(rA, sB, rC);
+
+  // write C
+  write_C_r2g_1D_nosync(tx, myn, rC, dC);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB, int Q_COMP>
+static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                 CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  // init rC
+  CeedScalar rC[NB] = {0.0};
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // init rA
+    CeedScalar rA[Q];
+
+    // read A
+    read_A_notrans_g2r_1D_nosync(tx, dA, rA);
+
+    // read B
+    read_B_g2s_1D_nosync(tx, myn, dB, sB);
+    __syncthreads();
+
+    addmul_rAsBrC_1D_nosync(rA, sB, rC);
+
+    dA += P * Q;
+    dB += Q * n;
+
+    __syncthreads();
+  }
+
+  // write C
+  write_C_r2g_1D_nosync(tx, myn, rC, dC);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  if (BASIS_Q_COMP_INTERP == 1) {
+    magma_basis_nontensor_device_n1(n, dA, dB, dC, (CeedScalar *)shared_data);
+  } else {
+    magma_basis_nontensor_device_n(n, dA, dB, dC, (CeedScalar *)shared_data);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  if (BASIS_Q_COMP_DERIV == 1) {
+    magma_basis_nontensor_device_n1(n, dA, dB, dC, (CeedScalar *)shared_data);
+  } else {
+    magma_basis_nontensor_device_n(n, dA, dB, dC, (CeedScalar *)shared_data);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data);
+}
+
+#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
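Since the new kernels take only (n, dA, dB, dC) and bake P, Q, and NB in at JiT time, the host must size dynamic shared memory to match the sA/sB layout above. Below is a minimal sketch mirroring the shared_mem computation in CeedBasisApplyNonTensor_Magma; the helper name is illustrative, double stands in for CeedScalar, and num_t_col corresponds to MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D) on the host.

#include <stddef.h>

// Transpose kernels and multi-component non-transpose kernels keep sA and sB
// live at the same time; the single-component non-transpose kernel (device_n1)
// reuses one region for both, so only the larger of the two is needed.
static size_t nontensor_shared_bytes(int P, int Q, int NB, int num_t_col, int is_transpose, int q_comp) {
  size_t bytes_A = is_transpose ? 0 : (size_t)P * Q * sizeof(double);  // sA: the full P x Q matrix
  size_t K       = is_transpose ? (size_t)Q : (size_t)P;
  size_t bytes_B = (size_t)num_t_col * K * NB * sizeof(double);       // sB: one N-tile per ty slice
  if (is_transpose || q_comp > 1) return bytes_A + bytes_B;
  return bytes_A > bytes_B ? bytes_A : bytes_B;
}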
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h
deleted file mode 100644
index 956d69392a..0000000000
--- a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-/// @file
-/// Internal header for MAGMA non-tensor basis interpolation
-#ifndef CEED_MAGMA_INTERP_NONTENSOR_H
-#define CEED_MAGMA_INTERP_NONTENSOR_H
-
-#include "magma-common-nontensor.h"
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_interp_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_N);
-  const int myn     = min(BASIS_NB_INTERP_N, n - id * BASIS_NB_INTERP_N);
-
-  dB += id * BASIS_NB_INTERP_N * lddb;
-  dC += id * BASIS_NB_INTERP_N * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   slda = BASIS_P;
-  const int   sldb = BASIS_P;
-  CeedScalar *sA   = (CeedScalar *)shared_data;
-  CeedScalar *sB   = sA;
-  sB += ty * sldb * BASIS_NB_INTERP_N;
-
-  // read A using all threads
-  CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO};
-  read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA);
-  __syncthreads();
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  // init rC
-  CeedScalar rC[BASIS_NB_INTERP_N] = {MAGMA_D_ZERO};
-
-  // read B
-  read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  __syncthreads();
-
-  mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_interp_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_T);
-  const int myn     = min(BASIS_NB_INTERP_T, n - id * BASIS_NB_INTERP_T);
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  dB += id * BASIS_NB_INTERP_T * lddb;
-  dC += id * BASIS_NB_INTERP_T * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   sldb = BASIS_Q;
-  CeedScalar *sB   = (CeedScalar *)shared_data;
-  sB += ty * sldb * BASIS_NB_INTERP_T;
-
-  // init rC
-  CeedScalar rC[BASIS_NB_INTERP_T] = {MAGMA_D_ZERO};
-
-  // read A
-  CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO};
-  read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA);
-
-  // read B
-  read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  __syncthreads();
-
-  mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-#endif  // CEED_MAGMA_INTERP_NONTENSOR_H
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 9f77dd4aca..38a058ea67 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -18,7 +18,7 @@ template <typename T, int Q>
 static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) {
   // Assumptions
   // 1. 1D thread configuration of size Q
-  // 2. The output sV is in shared memory -- size 1xQ
+  // 2. The output sV is in shared memory -- size Q
   if (tx < Q) {
     sV[tx] = sTweight[tx];
   }
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index f6b51380cf..b1f2b2881b 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -14,7 +14,7 @@

 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_weight_nontensor(int n, const CeedScalar *dqweight, CeedScalar *dV, int lddv) {
+    void magma_weight_nontensor(int n, const CeedScalar *__restrict__ dqweight, CeedScalar *__restrict__ dV) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

   const int tx = threadIdx.x;
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) _
   // terminate threads with no work
   if (id >= n) return;

-  dV += id * lddv;
+  dV += id * BASIS_Q;

   // shared memory pointers
   CeedScalar *sqweight = (CeedScalar *)shared_data;
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index 24684be85e..30c68dfb1e 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -10,20 +10,7 @@
 #ifndef CEED_MAGMA_COMMON_DEFS_H
 #define CEED_MAGMA_COMMON_DEFS_H

-#ifdef CEED_MAGMA_USE_HIP
-#define MAGMA_DEVICE_SHARED(type, name) HIP_DYNAMIC_SHARED(type, name)
-#else
 #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[];
-#endif
-
-typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t;
-
-#define MAGMA_D_ZERO 0.0
-#define MAGMA_D_ONE 1.0
-
-#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
-#define MAGMA_ROUNDUP(A, B) MAGMA_CEILDIV((A), (B)) * (B)
-#define MAGMA_MAX(A, B) ((A) > (B) ? (A) : (B))

 #define MAGMA_MAXTHREADS_1D 128
 #define MAGMA_MAXTHREADS_2D 128
@@ -35,4 +22,6 @@ typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma
 // Define macro for computing the total threads in a block for use with __launch_bounds__()
 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))

+typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t;
+
 #endif  // CEED_MAGMA_COMMON_DEFS_H
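With lddv gone, the weight kernel's output stride is the compile-time BASIS_Q. A CPU reference for what magma_weight_nontensor computes, with double standing in for CeedScalar:

// Reference semantics of magma_weight_nontensor: v is num_elem x Q with
// stride Q, and every element receives a copy of the quadrature weights.
static void weight_nontensor_ref(int num_elem, int Q, const double *q_weight, double *v) {
  for (int e = 0; e < num_elem; e++) {
    for (int i = 0; i < Q; i++) v[e * Q + i] = q_weight[i];
  }
}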
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 0e1bbb007b..38ca314c04 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -18,36 +18,35 @@
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int Q>
-static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q]) {
+static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, T rA[Q]) {
 #pragma unroll
-  for (int j = 0; j < Q; j++) {
-    rA[j] = dA[j * ldda + tx];
+  for (int i = 0; i < Q; i++) {
+    rA[i] = dA[i * P + tx];
   }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // read A (trans) from global to reg.
 // A is (P x Q)
-// 1D thread config. with (P x 1) threads
+// 2D thread config. with (P x BY) threads
 // no sync at the end of the function
-template <typename T, int P, int Q>
-static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q]) {
-  const int nTH = MAGMA_BASIS_BOUNDS(P, MAGMA_MAXTHREADS_1D);
+template <typename T, int P, int Q, int BY>
+static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, T *sA, T rA[Q]) {
   const int tid = ty * blockDim.x + tx;
   int       i;

 #pragma unroll
-  for (i = 0; i < (Q * P) - nTH; i += nTH) {
+  for (i = 0; i < P * Q - P * BY; i += P * BY) {
     sA[i + tid] = dA[i + tid];
   }
-  if (tid < ((Q * P) - i)) {
+  if (i + tid < P * Q) {
     sA[i + tid] = dA[i + tid];
   }
   __syncthreads();

 #pragma unroll
   for (int j = 0; j < Q; j++) {
-    rA[j] = sA[tx * slda + j];
+    rA[j] = sA[tx * Q + j];
   }
 }

@@ -57,22 +56,21 @@ static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const
 // read B from global to shared.
 // B is (Q x NB)
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int Q, int NB>
-static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, int lddb, T *sB, int sldb) {
+static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, T *sB) {
+  int i;
+
   if (n != NB) {
-    for (int i = 0; i < (Q * n) - P; i += P) {
+    for (i = 0; i < Q * n - P; i += P) {
       sB[i + tx] = dB[i + tx];
     }
   } else {
 #pragma unroll
-    for (int i = 0; i < (Q * NB) - P; i += P) {
+    for (i = 0; i < Q * NB - P; i += P) {
       sB[i + tx] = dB[i + tx];
     }
   }
-
-  // cleanup for B
-  const int stride = MAGMA_ROUNDUP(Q * n - P, P);
-  if (tx < (Q * n) - stride) {
-    sB[stride + tx] = dB[stride + tx];
+  if (i + tx < Q * n) {
+    sB[i + tx] = dB[i + tx];
   }
 }
@@ -82,18 +80,15 @@ static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n
 // write C from reg. to global.
 // C is (P x NB)
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int NB>
-static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC, int lddc) {
+static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
   if (n != NB) {
-#pragma unroll
-    for (int j = 0; j < NB; j++) {
-      if (j < n) {
-        dC[j * lddc + tx] = rC[j];
-      }
+    for (int i = 0; i < n; i++) {
+      dC[i * P + tx] = rC[i];
     }
   } else {
 #pragma unroll
-    for (int j = 0; j < NB; j++) {
-      dC[j * lddc + tx] = rC[j];
+    for (int i = 0; i < NB; i++) {
+      dC[i * P + tx] = rC[i];
     }
   }
 }
@@ -105,18 +100,19 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int
-static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) {
+static __device__ __inline__ void mul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
   T rB[Q];
+
 #pragma unroll
   for (int i = 0; i < NB; i++) {
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rB[k] = sB[i * sldb + k];
+    for (int j = 0; j < Q; j++) {
+      rB[j] = sB[i * Q + j];
     }
     rC[i] = 0.0;
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rC[i] += rA[k] * rB[k];
+    for (int j = 0; j < Q; j++) {
+      rC[i] += rA[j] * rB[j];
     }
   }
 }
@@ -128,17 +124,18 @@ static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T
-static __device__ __inline__ void addmul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) {
+static __device__ __inline__ void addmul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
   T rB[Q];
+
 #pragma unroll
   for (int i = 0; i < NB; i++) {
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rB[k] = sB[i * sldb + k];
+    for (int j = 0; j < Q; j++) {
+      rB[j] = sB[i * Q + j];
     }
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rC[i] += rA[k] * rB[k];
+    for (int j = 0; j < Q; j++) {
+      rC[i] += rA[j] * rB[j];
     }
   }
 }
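Taken together, the rewritten helpers implement the same per-component products as the magma_gemm_nontensor fallback in ceed-magma-basis.c. A scalar CPU reference follows, column-major with each component's P x Q matrix stored contiguously; double stands in for CeedScalar and the function name is illustrative.

// Non-transpose: V_d = B_d^T U for d = 0..q_comp-1 (each output component is Q x N).
// Transpose:     V = sum over d of B_d U_d (accumulated into a single P x N output).
static void basis_nontensor_ref(int transpose, int q_comp, int P, int Q, int N, const double *B, const double *U, double *V) {
  if (!transpose) {
    for (int d = 0; d < q_comp; d++)
      for (int j = 0; j < N; j++)
        for (int i = 0; i < Q; i++) {
          double sum = 0.0;
          for (int k = 0; k < P; k++) sum += B[d * P * Q + i * P + k] * U[j * P + k];
          V[d * N * Q + j * Q + i] = sum;
        }
  } else {
    for (int j = 0; j < N; j++)
      for (int i = 0; i < P; i++) V[j * P + i] = 0.0;
    for (int d = 0; d < q_comp; d++)
      for (int j = 0; j < N; j++)
        for (int i = 0; i < P; i++) {
          double sum = 0.0;
          for (int k = 0; k < Q; k++) sum += B[d * P * Q + k * P + i] * U[d * N * Q + j * Q + k];
          V[j * P + i] += sum;
        }
  }
}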