From 7ab670478399ae1b2fa9fefefa2aa4d9e1c19402 Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Tue, 31 May 2022 10:05:43 -0600
Subject: [PATCH 01/11] Initial update to SNAP routine

---
 src/force_types/force_snap_neigh.h      | 175 +++++++++++++++++++
 src/force_types/force_snap_neigh_impl.h | 214 +++++++++++++++++++++++-
 2 files changed, 386 insertions(+), 3 deletions(-)
diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h
index d6f4e1e..2ddb01b 100644
--- a/src/force_types/force_snap_neigh.h
+++ b/src/force_types/force_snap_neigh.h
@@ -77,6 +77,43 @@
 template<class NeighborClass>
 class ForceSNAP : public Force {
 public:
+  // lammps splice
+// Routines for both the CPU and GPU backend
+template<int NEIGHFLAG, int EVFLAG>
+struct TagComputeForce{};
+
+
+// GPU backend only
+struct TagComputeNeigh{};
+struct TagComputeCayleyKlein{};
+struct TagPreUi{};
+struct TagComputeUiSmall{}; // more parallelism, more divergence
+struct TagComputeUiLarge{}; // less parallelism, no divergence
+struct TagTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagComputeZi{};
+struct TagBeta{};
+struct TagComputeBi{};
+struct TagTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagComputeYi{};
+struct TagComputeYiWithZlist{};
+template<int dir>
+struct TagComputeFusedDeidrjSmall{}; // more parallelism, more divergence
+template<int dir>
+struct TagComputeFusedDeidrjLarge{}; // less parallelism, no divergence
+
+// CPU backend only
+struct TagComputeNeighCPU{};
+struct TagPreUiCPU{};
+struct TagComputeUiCPU{};
+struct TagTransformUiCPU{};
+struct TagComputeZiCPU{};
+struct TagBetaCPU{};
+struct TagComputeBiCPU{};
+struct TagZeroYiCPU{};
+struct TagComputeYiCPU{};
+struct TagComputeDuidrjCPU{};
+struct TagComputeDeidrjCPU{};
+
   ForceSNAP(char** args, System* system, bool half_neigh_);
   ~ForceSNAP();
   
@@ -84,6 +121,89 @@ class ForceSNAP : public Force {
   void compute(System* system, Binning* binning, Neighbor* neighbor );
 
   const char* name() {return "ForceSNAP";}
+// lammps splice
+/*  template<int NEIGHFLAG, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagBetaCPU,const int& ii) const;
+*/
+
+  // GPU backend only
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy<TagComputeNeigh>::member_type& team) const;
+
+/*
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagBeta, const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  template<int dir>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagComputeFusedDeidrjSmall<dir> >::member_type& team) const;
+
+  template<int dir>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagComputeFusedDeidrjLarge<dir> >::member_type& team) const;
+
+  // CPU backend only
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeNeighCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPreUiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagTransformUiCPU, const int j, const int iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeZiCPU,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeBiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeYiCPU,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeDuidrjCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeDeidrjCPU>::member_type& team) const;
+*/
 
 protected:
   System* system;
@@ -99,6 +219,9 @@ class ForceSNAP : public Force {
   SNA sna;
 
   int nmax;
+   
+  int chunk_size,chunk_offset;
+  int host_flag;
 
   // How much parallelism to use within an interaction
   int vector_length;
@@ -178,8 +301,60 @@ class ForceSNAP : public Force {
   KOKKOS_INLINE_FUNCTION
   void operator() (const Kokkos::TeamPolicy<>::member_type& team) const;
 
+ // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+// Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_transform_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(double) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_transform_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2;
+#endif
+
+
+// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <int num_tiles, class TagPairSNAP>
+  //using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagPairSNAP>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <int num_teams, class TagPairSNAP>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Kokkos::LaunchBounds<vector_length * num_teams>, TagPairSNAP>;
+
+
+
+
+
 };
 
+
+
+
 #define FORCE_MODULES_EXTERNAL_TEMPLATE
 #define FORCETYPE_DECLARE_TEMPLATE_MACRO(NeighType) ForceSNAP<NeighType>
 #include<modules_neighbor.h>
diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h
index eb89ed3..de6ba6e 100644
--- a/src/force_types/force_snap_neigh_impl.h
+++ b/src/force_types/force_snap_neigh_impl.h
@@ -53,6 +53,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include "force_snap_neigh.h"
+// Headers added by AJP to fix compile issues
+//#include "pair_snap_kokkos.h"
+//#include "neighbor_kokkos.h"
+
 
 #define MAXLINE 1024
 #define MAXWORD 3
@@ -70,6 +74,14 @@
 //static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */
 
+// AJP, Stan, For SNAP update:
+
+//int num_teams;
+//int vector_length_;
+//static constexpr int vector_length = vector_length_;
+//template <class Device, int num_teams, class Tag>
+//using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, Tag>;
+
 template<class NeighborClass>
 ForceSNAP<NeighborClass>::ForceSNAP(char** args, System* system_, bool half_neigh_):Force(args,system_,half_neigh_)
 {
@@ -156,6 +168,8 @@ struct FindMaxNumNeighs {
    This version is a straightforward implementation
    ---------------------------------------------------------------------- */
 
+// Functor that will be replaced by LAMMPS version
+
 template<class NeighborClass>
 void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbor* neighbor_)
 {
@@ -179,9 +193,9 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
     if(max_neighs<num_neighs) max_neighs = num_neighs;
   }*/
   Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",nlocal, FindMaxNumNeighs<t_neigh_list>(neigh_list), Kokkos::Max<int>(max_neighs));
-
+  // snaKK (line 220 in lammps)
   sna.nmax = max_neighs;
-
+  // Put LAMMPS code starting here
   T_INT team_scratch_size = sna.size_team_scratch_arrays();
   T_INT thread_scratch_size = sna.size_thread_scratch_arrays();
 
@@ -189,18 +203,33 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
   int vector_length = 8;
   int team_size_max = Kokkos::TeamPolicy<>(nlocal,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
 #ifdef EMD_ENABLE_GPU
+  // Same as 207 in lammps
   int team_size = 20;//max_neighs;
   if(team_size*vector_length > team_size_max)
     team_size = team_size_max/vector_length;
 #else
   int team_size = 1;
 #endif
-  Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length);
+      //ComputeNeigh -- lammps
+      {
+        // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
+        int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
+
+        SnapAoSoATeamPolicy<team_size_compute_neigh, TagComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+        policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+      }    
+
 
+
+/*
+  Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length);
+  // Replaced in LAMMPS by a lot of functors
   Kokkos::parallel_for("ForceSNAP::compute",policy
       .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
       .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
     ,*this);
+*/
 //static int step =0;
 //step++;
 //if(step%10==0)
@@ -586,6 +615,174 @@ void ForceSNAP<NeighborClass>::read_files(char *coefffilename, char *paramfilena
   delete[] found;
 }
 
+/*
+template<class NeighborClass>
+KOKKOS_INLINE_FUNCTION
+void ForceSNAP<NeighborClass>::operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy<TagComputeNeighCPU>::member_type& team) const {
+
+
+  int ii = team.league_rank();
+  const int i = d_ilist[ii + chunk_offset];
+  // TODO: snaKK is used in lammps
+  SNAKokkos my_sna = snaKK;
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  const int num_neighs = d_numneigh[i];
+
+  // rij[][3] = displacements between atom I and those neighbors
+  // inside = indices of neighbors of I within cutoff
+  // wj = weights for neighbors of I within cutoff
+  // rcutij = cutoffs for neighbors of I within cutoff
+  // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+  int ninside = 0;
+  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,num_neighs),
+      [&] (const int jj, int& count) {
+    Kokkos::single(Kokkos::PerThread(team), [&] () {
+      T_INT j = d_neighbors(i,jj);
+      const F_FLOAT dx = x(j,0) - xtmp;
+      const F_FLOAT dy = x(j,1) - ytmp;
+      const F_FLOAT dz = x(j,2) - ztmp;
+
+      const int jtype = type(j);
+      const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+      if (rsq < rnd_cutsq(itype,jtype))
+       count++;
+    });
+  },ninside);
+
+  d_ninside(ii) = ninside;
+
+  if (team.team_rank() == 0)
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs),
+      [&] (const int jj, int& offset, bool final) {
+  //for (int jj = 0; jj < num_neighs; jj++) {
+    T_INT j = d_neighbors(i,jj);
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    const int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const int elem_j = d_map[jtype];
+
+    if (rsq < rnd_cutsq(itype,jtype)) {
+      if (final) {
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[elem_j]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[elem_j])*rcutfac);
+        my_sna.inside(ii,offset) = j;
+        if (chemflag)
+          my_sna.element(ii,offset) = elem_j;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+}
+*/
+// lammps splice
+
+template<class NeighborClass>
+KOKKOS_INLINE_FUNCTION
+void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy<TagComputeNeigh>::member_type& team) const {
+
+  SNAKokkos my_sna = snaKK;
+
+  // extract atom number
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+  if (ii >= chunk_size) return;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // Load various info about myself up front
+  const int i = d_ilist[ii + chunk_offset];
+  const F_FLOAT xtmp = x(i,0);
+  const F_FLOAT ytmp = x(i,1);
+  const F_FLOAT ztmp = x(i,2);
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  const int num_neighs = d_numneigh[i];
+
+  // rij[][3] = displacements between atom I and those neighbors
+  // inside = indices of neighbors of I within cutoff
+  // wj = weights for neighbors of I within cutoff
+  // rcutij = cutoffs for neighbors of I within cutoff
+  // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs),
+    [&] (const int jj, int& count) {
+    T_INT j = d_neighbors(i,jj);
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    if (rsq >= rnd_cutsq(itype,jtype)) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    type_cache[jj] = jtype;
+
+    if (jtype >= 0)
+     count++;
+  }, ninside);
+
+  d_ninside(ii) = ninside;
+
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs),
+    [&] (const int jj, int& offset, bool final) {
+
+    const int jtype = type_cache[jj];
+
+    if (jtype >= 0) {
+      if (final) {
+        T_INT j = d_neighbors(i,jj);
+        const F_FLOAT dx = x(j,0) - xtmp;
+        const F_FLOAT dy = x(j,1) - ytmp;
+        const F_FLOAT dz = x(j,2) - ztmp;
+        const int elem_j = d_map[jtype];
+        my_sna.rij(ii,offset,0) = dx;
+        my_sna.rij(ii,offset,1) = dy;
+        my_sna.rij(ii,offset,2) = dz;
+        my_sna.wj(ii,offset) = d_wjelem[elem_j];
+        my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
+        my_sna.inside(ii,offset) = j;
+        if (chemflag)
+          my_sna.element(ii,offset) = elem_j;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+}
+
+
+/*
+// This is multiple smaller functors in lammps
 template<class NeighborClass>
 KOKKOS_INLINE_FUNCTION
 void ForceSNAP<NeighborClass>::operator() (const Kokkos::TeamPolicy<>::member_type& team) const {
@@ -723,3 +920,14 @@ void ForceSNAP<NeighborClass>::operator() (const Kokkos::TeamPolicy<>::member_ty
   });
   //t5 += timer.seconds(); timer.reset();
 }
+*/
+
+
+template<class NeighborClass>
+template<typename scratch_type>
+int ForceSNAP<NeighborClass>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+

From 12c2fe2fd4357cefcfff3a12342927350c9afb86 Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Tue, 31 May 2022 10:42:58 -0600
Subject: [PATCH 02/11] Resolve compile errors in
 src/force_types/force_snap_neigh.h

---
 src/force_types/force_snap_neigh.h | 6 ++----
 src/types.h                        | 6 ++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h
index 2ddb01b..c342f10 100644
--- a/src/force_types/force_snap_neigh.h
+++ b/src/force_types/force_snap_neigh.h
@@ -223,8 +223,6 @@ struct TagComputeDeidrjCPU{};
   int chunk_size,chunk_offset;
   int host_flag;
 
-  // How much parallelism to use within an interaction
-  int vector_length;
   // How many interactions can be run concurrently
   int concurrent_interactions;
 
@@ -331,13 +329,13 @@ struct TagComputeDeidrjCPU{};
   static constexpr int tile_size_compute_yi = 8;
   static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2;
 #endif
-
+static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN;
 
 // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
   // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
   // and reduces the verbosity of the LaunchBound by hiding the explicit
   // multiplication by vector_length
-  template <int num_tiles, class TagPairSNAP>
+  //template <int num_tiles, class TagPairSNAP>
   //using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagPairSNAP>;
 
   // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
diff --git a/src/types.h b/src/types.h
index 4d70c17..c72e631 100644
--- a/src/types.h
+++ b/src/types.h
@@ -183,5 +183,11 @@ t_scalar3<Scalar> operator *
 #define EMD_ENABLE_GPU
 #endif
 
+// Needed to handle vector_length template parameter
+#ifdef EMD_ENABLE_GPU
+#define SNAP_KOKKOS_DEVICE_VECLEN 32
+#else
+#define SNAP_KOKKOS_DEVICE_VECLEN 1
 #endif
 
+#endif // Who is my partner?

From 9d2f47d5232107f420518b5d9dff5e1a965aae62 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 13 Jul 2022 14:22:13 -0600
Subject: [PATCH 03/11] Work on sna class

---
 src/force_types/sna.h        |  317 +++-
 src/force_types/sna_impl.hpp | 2681 ++++++++++++++++++++++++----------
 2 files changed, 2169 insertions(+), 829 deletions(-)

diff --git a/src/force_types/sna.h b/src/force_types/sna.h
index fc4c58f..f174ba6 100644
--- a/src/force_types/sna.h
+++ b/src/force_types/sna.h
@@ -61,60 +61,179 @@
 #include <Kokkos_Core.hpp>
 #include <types.h>
 
-struct SNA_LOOPINDICES {
-  int j1, j2, j;
+#ifdef __SYCL_DEVICE_ONLY__
+#include <CL/sycl.hpp>
+#endif
+
+struct WignerWrapper {
+  using double = double;
+  using complex = SNAComplex<double>;
+  static constexpr int vector_length = vector_length_;
+
+  const int offset; // my offset into the vector (0, ..., vector_length - 1)
+  double* buffer; // buffer of real numbers
+
+  KOKKOS_INLINE_FUNCTION
+  WignerWrapper(complex* buffer_, const int offset_)
+   : offset(offset_), buffer(reinterpret_cast<double*>(buffer_))
+  { ; }
+
+  KOKKOS_INLINE_FUNCTION
+  complex get(const int& ma) const {
+    return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void set(const int& ma, const complex& store) const {
+    buffer[offset + 2 * vector_length * ma] = store.re;
+    buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
+  }
 };
 
+struct alignas(8) FullHalfMapper {
+  int idxu_half;
+  int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj
+};
+
+template<class DeviceType, typename double_, int vector_length_>
 class SNA {
 
-public:
-  typedef Kokkos::View<int*> t_sna_1i;
-  typedef Kokkos::View<double*> t_sna_1d;
-  typedef Kokkos::View<double**, Kokkos::LayoutRight> t_sna_2d;
-  typedef Kokkos::View<double***, Kokkos::LayoutRight> t_sna_3d;
-  typedef Kokkos::View<double***, Kokkos::LayoutRight,Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_3d_atomic;
-  typedef Kokkos::View<double***[3], Kokkos::LayoutRight> t_sna_4d;
-  typedef Kokkos::View<double**[3], Kokkos::LayoutRight> t_sna_3d3;
-  typedef Kokkos::View<double*****, Kokkos::LayoutRight> t_sna_5d;
+ public:
+  using double = double_;
+  using complex = SNAComplex<double>;
+  static constexpr int vector_length = vector_length_;
+
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
+
+  typedef Kokkos::View<int*, DeviceType> t_sna_1i;
+  typedef Kokkos::View<double*, DeviceType> t_sna_1d;
+  typedef Kokkos::View<double*, KKDeviceType, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1d_atomic;
+  typedef Kokkos::View<int**, DeviceType> t_sna_2i;
+  typedef Kokkos::View<double**, DeviceType> t_sna_2d;
+  typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
+  typedef Kokkos::View<double***, DeviceType> t_sna_3d;
+  typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
+  typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
+  typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
+  typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
+  typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
+
+  typedef Kokkos::View<complex*, DeviceType> t_sna_1c;
+  typedef Kokkos::View<complex*, KKDeviceType, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1c_atomic;
+  typedef Kokkos::View<complex**, DeviceType> t_sna_2c;
+  typedef Kokkos::View<complex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
+  typedef Kokkos::View<complex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
+  typedef Kokkos::View<complex***, DeviceType> t_sna_3c;
+  typedef Kokkos::View<complex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
+  typedef Kokkos::View<complex***[3], DeviceType> t_sna_4c;
+  typedef Kokkos::View<complex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
+  typedef Kokkos::View<complex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
+  typedef Kokkos::View<complex**[3], DeviceType> t_sna_3c3;
+  typedef Kokkos::View<complex*****, DeviceType> t_sna_5c;
+
   inline
   SNA() {};
+
   KOKKOS_INLINE_FUNCTION
-  SNA(const SNA& sna, const Kokkos::TeamPolicy<>::member_type& team);
+  SNA(const SNA<DeviceType,double,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
+
   inline
-  SNA(double, int, int, int, double, int, int);
+  SNA(double, int, double, int, int, int, int, int, int, int);
 
   KOKKOS_INLINE_FUNCTION
   ~SNA();
+
   inline
   void build_indexlist(); // SNA()
+
   inline
   void init();            //
-  inline
-  T_INT size_team_scratch_arrays();
-  inline
-  T_INT size_thread_scratch_arrays();
+
+  double memory_usage();
 
   int ncoeff;
+  int host_flag;
 
-  // functions for bispectrum coefficients
+  // functions for bispectrum coefficients, GPU only
+  KOKKOS_INLINE_FUNCTION
+  void compute_cayley_klein(const int&, const int&, const int&);
+  KOKKOS_INLINE_FUNCTION
+  void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
 
+  // version of the code with parallelism over j_bend
   KOKKOS_INLINE_FUNCTION
-  void compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int); // ForceSNAP
+  void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
+  // version of the code without parallelism over j_bend
   KOKKOS_INLINE_FUNCTION
-  void compute_zi(const Kokkos::TeamPolicy<>::member_type& team);    // ForceSNAP
+  void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
 
-  // functions for derivatives
+  KOKKOS_INLINE_FUNCTION
+  void compute_zi(const int&, const int&, const int&);    // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_yi(int,int,int,
+   const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_yi_with_zlist(int,int,int,
+   const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_bi(const int&, const int&, const int&);    // ForceSNAP
 
+  // functions for derivatives, GPU only
+  // version of the code with parallelism over j_bend
+  template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void compute_duidrj(const Kokkos::TeamPolicy<>::member_type& team, double*, double, double); //ForceSNAP
+  void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
+  // version of the code without parallelism over j_bend
+  template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void compute_dbidrj(const Kokkos::TeamPolicy<>::member_type& team); //ForceSNAP
+  void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
+
+  // core "evaluation" functions that get plugged into "compute" functions
+  // plugged into compute_ui_small, compute_ui_large
+  KOKKOS_FORCEINLINE_FUNCTION
+  void evaluate_ui_jbend(const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&, const int&,
+                        const int&, const int&, const int&);
+  // plugged into compute_zi, compute_yi
+  KOKKOS_FORCEINLINE_FUNCTION
+  complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
+                        const int&, const int&, const int&, const int&, const double*);
+  // plugged into compute_yi, compute_yi_with_zlist
+  KOKKOS_FORCEINLINE_FUNCTION
+  double evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
+                        const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &);
+  // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
+  KOKKOS_FORCEINLINE_FUNCTION
+  double evaluate_duidrj_jbend(const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&,
+                        const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&,
+                        const int&, const int&, const int&, const int&);
+
+  // functions for bispectrum coefficients, CPU only
   KOKKOS_INLINE_FUNCTION
-  void copy_dbi2dbvec(const Kokkos::TeamPolicy<>::member_type& team); //ForceSNAP
+  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  double compute_sfac(double, double); // add_uarraytot, compute_duarray
+  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_zi_cpu(const int&);    // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_yi_cpu(int,
+   const Kokkos::View<double**, DeviceType> &beta); // ForceSNAP
+    KOKKOS_INLINE_FUNCTION
+  void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
+
+  // functions for derivatives, CPU only
+  KOKKOS_INLINE_FUNCTION
+  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+
+  KOKKOS_INLINE_FUNCTION
+  double compute_sfac(double, double, double, double); // add_uarraytot, compute_duarray
+
   KOKKOS_INLINE_FUNCTION
-  double compute_dsfac(double, double); // compute_duarray
+  double compute_dsfac(double, double, double, double); // compute_duarray
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_s_dsfac(const double, const double, const double, const double, double&, double&); // compute_cayley_klein
 
 #ifdef TIMING_INFO
   double* timers;
@@ -125,99 +244,137 @@ class SNA {
 
   //per sna class instance for OMP use
 
-
   // Per InFlight Particle
-  t_sna_2d rij;
-  t_sna_1i inside;
-  t_sna_1d wj;
-  t_sna_1d rcutij;
-  int nmax;
-
-  void grow_rij(int);
+  t_sna_3d rij;
+  t_sna_2i inside;
+  t_sna_2d wj;
+  t_sna_2d rcutij;
+  t_sna_2d sinnerij;
+  t_sna_2d dinnerij;
+  t_sna_2i element;
+  t_sna_3d dedr;
+  int natom, nmax;
+
+  void grow_rij(int, int);
 
   int twojmax, diagonalstyle;
-  // Per InFlight Particle
-  t_sna_3d uarraytot_r, uarraytot_i;
-  t_sna_3d_atomic uarraytot_r_a, uarraytot_i_a;
-  t_sna_5d zarray_r, zarray_i;
 
-  // Per InFlight Interaction
-  t_sna_3d uarray_r, uarray_i;
+  t_sna_3d blist;
+  t_sna_3c_ll ulisttot;
+  t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
+  t_sna_3c_ll zlist;
 
-  // derivatives of data
-  Kokkos::View<double*[3], Kokkos::LayoutRight> dbvec;
-  t_sna_4d duarray_r, duarray_i;
-  t_sna_4d dbarray;
+  t_sna_3c_ll ulist;
+  t_sna_3c_ll ylist;
 
-private:
+  // derivatives of data
+  t_sna_4c3_ll dulist;
+
+  // Modified structures for GPU backend
+  t_sna_3c_ll a_pack; // Cayley-Klein `a`
+  t_sna_3c_ll b_pack; // `b`
+  t_sna_4c_ll da_pack; // `da`
+  t_sna_4c_ll db_pack; // `db`
+  t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z}
+
+  t_sna_4d_ll ulisttot_re_pack; // split real,
+  t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened
+  t_sna_4c_ll ulisttot_pack; // AoSoA layout
+  t_sna_4c_ll zlist_pack; // AoSoA layout
+  t_sna_4d_ll blist_pack;
+  t_sna_4d_ll ylist_pack_re; // split real,
+  t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
+
+  int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;
+
+  // Chem snap counts
+  int nelements;
+  int ndoubles;
+  int ntriples;
+
+ private:
   double rmin0, rfac0;
 
   //use indexlist instead of loops, constructor generates these
-  // Same accross all SNA
-  Kokkos::View<SNA_LOOPINDICES*> idxj,idxj_full;
-  int idxj_max,idxj_full_max;
+  // Same across all SNA
+  Kokkos::View<int*[10], DeviceType> idxz;
+  Kokkos::View<int*[3], DeviceType> idxb;
+  Kokkos::View<int***, DeviceType> idxcg_block;
+
+ public:
+  Kokkos::View<int*, DeviceType> idxu_block;
+  Kokkos::View<int*, DeviceType> idxu_half_block;
+  Kokkos::View<int*, DeviceType> idxu_cache_block;
+  Kokkos::View<FullHalfMapper*, DeviceType> idxu_full_half;
+
+ private:
+  Kokkos::View<int***, DeviceType> idxz_block;
+  Kokkos::View<int***, DeviceType> idxb_block;
+
   // data for bispectrum coefficients
 
-  // Same accross all SNA
-  t_sna_5d cgarray;
+  // Same across all SNA
+  t_sna_1d cglist;
   t_sna_2d rootpqarray;
 
-
   static const int nmaxfactorial = 167;
-  KOKKOS_INLINE_FUNCTION
+  static const double nfac_table[];
+  inline
   double factorial(int);
 
   KOKKOS_INLINE_FUNCTION
-  void create_team_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team); // SNA()
+  void create_team_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNA()
   KOKKOS_INLINE_FUNCTION
-  void create_thread_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team); // SNA()
+  void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNA()
+
   inline
   void init_clebsch_gordan(); // init()
+
   inline
   void init_rootpqarray();    // init()
-  KOKKOS_INLINE_FUNCTION
-  void zero_uarraytot(const Kokkos::TeamPolicy<>::member_type& team);      // compute_ui
-  KOKKOS_INLINE_FUNCTION
-  void addself_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double); // compute_ui
-  KOKKOS_INLINE_FUNCTION
-  void add_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double, double, double); // compute_ui
 
   KOKKOS_INLINE_FUNCTION
-  void compute_uarray(const Kokkos::TeamPolicy<>::member_type& team,
-                      double, double, double,
-                      double, double); // compute_ui
+  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui
+
   KOKKOS_INLINE_FUNCTION
+  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
+                      const double&, const double&, const double&,
+                      const double&, const double&); // compute_ui_cpu
+
+
+  inline
   double deltacg(int, int, int);  // init_clebsch_gordan
+
   inline
   int compute_ncoeff();           // SNA()
   KOKKOS_INLINE_FUNCTION
-  void compute_duarray(const Kokkos::TeamPolicy<>::member_type& team,
-                       double, double, double, // compute_duidrj
-                       double, double, double, double, double);
-
-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
+  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
+                       const double&, const double&, const double&, // compute_duidrj_cpu
+                           const double&, const double&, const double&, const double&, const double&,
+                           const double&, const double&);
 
   // Sets the style for the switching function
   // 0 = none
   // 1 = cosine
   int switch_flag;
 
+  // Sets the style for the inner switching function
+  // 0 = none
+  // 1 = cosine
+  int switch_inner_flag;
+
+  // Chem snap flags
+  int chem_flag;
+  int bnorm_flag;
+
   // Self-weight
   double wself;
+  int wselfall_flag;
+
+  int bzero_flag; // 1 if bzero subtracted from barray
+  Kokkos::View<double*, DeviceType> bzero; // array of B values for isolated atoms
 };
 
 #include<sna_impl.hpp>
 #endif
 
-/* ERROR/WARNING messages:
-
-E: Invalid argument to factorial %d
-
-N must be >= 0 and <= 167, otherwise the factorial result is too
-large.
-
-*/
diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp
index 40f5bad..3bbcc4b 100644
--- a/src/force_types/sna_impl.hpp
+++ b/src/force_types/sna_impl.hpp
@@ -1,6 +1,7 @@
-/* ----------------------------------------------------------------------
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -12,68 +13,69 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Aidan Thompson, Christian Trott, SNL
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
 #include "sna.h"
-#include <math.h>
-#include <string.h>
-#include <stdlib.h>
+#include "memory_kokkos.h"
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#include <type_traits>
 
 static const double MY_PI  = 3.14159265358979323846; // pi
+static const double MY_PI2  = 1.57079632679489661923; // pi/2
 
 inline
 SNA::SNA(double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
-         double rmin0_in, int switch_flag_in, int bzero_flag_in) 
+         int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in,
+         int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in)
 {
-  wself = 1.0;
-  
-  use_shared_arrays = use_shared_arrays_in;
+  LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  host_flag = (execution_space == LAMMPS_NS::Host);
+
+  wself = static_cast<double>(1.0);
+
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
   switch_flag = switch_flag_in;
+  switch_inner_flag = switch_inner_flag_in;
+  bzero_flag = bzero_flag_in;
+
+  chem_flag = chem_flag_in;
+  if (chem_flag)
+    nelements = nelements_in;
+  else
+    nelements = 1;
+  bnorm_flag = bnorm_flag_in;
+  wselfall_flag = wselfall_flag_in;
 
   twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;
 
   ncoeff = compute_ncoeff();
 
-  //create_twojmax_arrays();
-
   nmax = 0;
-  
+  natom = 0;
+
   build_indexlist();
 
-  int jdim = twojmax + 1;
+  int jdimpq = twojmax + 2;
+  MemKK::realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq);
 
-  cgarray = t_sna_5d("SNA::cgarray",jdim,jdim,jdim,jdim,jdim);
-  rootpqarray = t_sna_2d("SNA::rootpqarray",jdim+1,jdim+1);
+  MemKK::realloc_kokkos(cglist,"SNA::cglist",idxcg_max);
 
-}
+  if (bzero_flag) {
+    MemKK::realloc_kokkos(bzero,"sna:bzero",twojmax+1);
+    auto h_bzero = Kokkos::create_mirror_view(bzero);
 
-KOKKOS_INLINE_FUNCTION
-SNA::SNA(const SNA& sna, const Kokkos::TeamPolicy<>::member_type& team) {
-  wself = sna.wself;
-
-  use_shared_arrays = sna.use_shared_arrays;
-  rfac0 = sna.rfac0;
-  rmin0 = sna.rmin0;
-  switch_flag = sna.switch_flag;
-
-  twojmax = sna.twojmax;
-  diagonalstyle = sna.diagonalstyle;
-
-  ncoeff = sna.ncoeff;
-  nmax = sna.nmax;
-  idxj = sna.idxj;
-  idxj_max = sna.idxj_max;
-  idxj_full = sna.idxj_full;
-  idxj_full_max = sna.idxj_full_max;
-  cgarray = sna.cgarray;
-  rootpqarray = sna.rootpqarray;
-  create_team_scratch_arrays(team);
-  create_thread_scratch_arrays(team);
+    double www = wself*wself*wself;
+    for (int j = 0; j <= twojmax; j++)
+      if (bnorm_flag)
+        h_bzero[j] = www;
+      else
+        h_bzero[j] = www*(j+1);
+    Kokkos::deep_copy(bzero,h_bzero);
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -86,50 +88,190 @@ SNA::~SNA()
 inline
 void SNA::build_indexlist()
 {
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-    int idxj_full_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) idxj_count++;
-          idxj_full_count++;
-        }
+  // index list for cglist
 
-    // indexList can be changed here
+  int jdim = twojmax + 1;
+  MemKK::realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim);
+  auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block);
 
-    idxj = Kokkos::View<SNA_LOOPINDICES*>("SNA::idxj",idxj_count);
-    idxj_full = Kokkos::View<SNA_LOOPINDICES*>("SNA::idxj_full",idxj_full_count);
-    auto h_idxj = Kokkos::create_mirror_view(idxj);
-    auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
+  int idxcg_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        h_idxcg_block(j1,j2,j) = idxcg_count;
+        for (int m1 = 0; m1 <= j1; m1++)
+          for (int m2 = 0; m2 <= j2; m2++)
+            idxcg_count++;
+      }
+  idxcg_max = idxcg_count;
+  Kokkos::deep_copy(idxcg_block,h_idxcg_block);
 
-    idxj_max = idxj_count;
-    idxj_full_max = idxj_full_count;
+  // index list for uarray
+  // need to include both halves
 
-    idxj_count = 0;
-    idxj_full_count = 0;
+  MemKK::realloc_kokkos(idxu_block,"SNA::idxu_block",jdim);
+  auto h_idxu_block = Kokkos::create_mirror_view(idxu_block);
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) {
-            h_idxj[idxj_count].j1 = j1;
-            h_idxj[idxj_count].j2 = j2;
-            h_idxj[idxj_count].j = j;
-            idxj_count++;
-          }
-          h_idxj_full[idxj_full_count].j1 = j1;
-          h_idxj_full[idxj_full_count].j2 = j2;
-          h_idxj_full[idxj_full_count].j = j;
-          idxj_full_count++;
+  int idxu_count = 0;
+
+  for (int j = 0; j <= twojmax; j++) {
+    h_idxu_block[j] = idxu_count;
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++)
+        idxu_count++;
+  }
+  idxu_max = idxu_count;
+  Kokkos::deep_copy(idxu_block,h_idxu_block);
+
+  // index list for half uarray
+  MemKK::realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim);
+  auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
+
+  int idxu_half_count = 0;
+  for (int j = 0; j <= twojmax; j++) {
+    h_idxu_half_block[j] = idxu_half_count;
+    for (int mb = 0; 2*mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++)
+        idxu_half_count++;
+  }
+  idxu_half_max = idxu_half_count;
+  Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
+
+  // mapping between full and half indexing, encoding flipping
+  MemKK::realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max);
+  auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half);
+
+  idxu_count = 0;
+  for (int j = 0; j <= twojmax; j++) {
+    int jju_half = h_idxu_half_block[j];
+    for (int mb = 0; mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+        FullHalfMapper mapper;
+        if (2*mb <= j) {
+          mapper.idxu_half = jju_half + mb * (j + 1) + ma;
+          mapper.flip_sign = 0;
+        } else {
+          mapper.idxu_half = jju_half + (j + 1 - mb) * (j + 1) - (ma + 1);
+          mapper.flip_sign = (((ma+mb)%2==0)?1:-1);
         }
-    Kokkos::deep_copy(idxj,h_idxj);
-    Kokkos::deep_copy(idxj_full,h_idxj_full);
+        h_idxu_full_half[idxu_count] = mapper;
+        idxu_count++;
+      }
+    }
+  }
 
+  Kokkos::deep_copy(idxu_full_half, h_idxu_full_half);
+
+  // index list for "cache" uarray
+  // this is the GPU scratch memory requirements
+  // applied the CPU structures
+  MemKK::realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim);
+  auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
+
+  int idxu_cache_count = 0;
+  for (int j = 0; j <= twojmax; j++) {
+    h_idxu_cache_block[j] = idxu_cache_count;
+    for (int mb = 0; mb < ((j+3)/2); mb++)
+      for (int ma = 0; ma <= j; ma++)
+        idxu_cache_count++;
   }
+  idxu_cache_max = idxu_cache_count;
+  Kokkos::deep_copy(idxu_cache_block, h_idxu_cache_block);
+
+  // index list for beta and B
+
+  int idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) idxb_count++;
+
+  idxb_max = idxb_count;
+  MemKK::realloc_kokkos(idxb,"SNA::idxb",idxb_max);
+  auto h_idxb = Kokkos::create_mirror_view(idxb);
+
+  idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) {
+          h_idxb(idxb_count,0) = j1;
+          h_idxb(idxb_count,1) = j2;
+          h_idxb(idxb_count,2) = j;
+          idxb_count++;
+        }
+  Kokkos::deep_copy(idxb,h_idxb);
+
+  // reverse index list for beta and b
+
+  MemKK::realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim);
+  auto h_idxb_block = Kokkos::create_mirror_view(idxb_block);
+
+  idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        if (j >= j1) {
+          h_idxb_block(j1,j2,j) = idxb_count;
+          idxb_count++;
+        }
+      }
+  Kokkos::deep_copy(idxb_block,h_idxb_block);
+
+  // index list for zlist
+
+  int idxz_count = 0;
+
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++)
+            idxz_count++;
+
+  idxz_max = idxz_count;
+  MemKK::realloc_kokkos(idxz,"SNA::idxz",idxz_max);
+  auto h_idxz = Kokkos::create_mirror_view(idxz);
+
+  MemKK::realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim);
+  auto h_idxz_block = Kokkos::create_mirror_view(idxz_block);
+
+  idxz_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        h_idxz_block(j1,j2,j) = idxz_count;
+
+        // find right beta(ii,jjb) entry
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3
+
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++) {
+            h_idxz(idxz_count,0) = j1;
+            h_idxz(idxz_count,1) = j2;
+            h_idxz(idxz_count,2) = j;
+            h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2);
+            h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2;
+            h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2);
+            h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2;
+            h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1;
+            h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;
+
+            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+            // ylist is "compressed" via symmetry in its
+            // contraction with dulist
+            const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma;
+            h_idxz(idxz_count,9) = jju_half;
+
+            idxz_count++;
+          }
+      }
+  Kokkos::deep_copy(idxz,h_idxz);
+  Kokkos::deep_copy(idxz_block,h_idxz_block);
 
 }
+
 /* ---------------------------------------------------------------------- */
 
 inline
@@ -140,19 +282,268 @@ void SNA::init()
 }
 
 inline
-void SNA::grow_rij(int newnmax)
+void SNA::grow_rij(int newnatom, int newnmax)
 {
-  if(newnmax <= nmax) return;
+  if (newnatom <= natom && newnmax <= nmax) return;
+  natom = newnatom;
   nmax = newnmax;
+
+  MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3);
+  MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax);
+  MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax);
+  MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax);
+  MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax);
+  MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax);
+  MemKK::realloc_kokkos(element,"sna:element",natom,nmax);
+  MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3);
+
+#ifdef LMP_KOKKOS_GPU
+  if (!host_flag) {
+    const int natom_div = (natom + vector_length - 1) / vector_length;
+
+    MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div);
+    MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div);
+    MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3);
+    MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3);
+    MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4);
+    MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation
+    MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1);
+    MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div);
+    MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div);
+    MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div);
+    MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1);
+    MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1);
+    MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div);
+    MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
+    MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div);
+    MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1);
+    MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div);
+    MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div);
+    MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1);
+  } else {
+#endif
+    MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1);
+    MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1);
+    MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1);
+    MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1);
+    MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1);
+    MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom);
+    MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom);
+    MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1);
+    MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1);
+    MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1);
+    MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax);
+    MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom);
+    MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1);
+    MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
+    MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1);
+    MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom);
+    MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1);
+    MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1);
+    MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax);
+
+#ifdef LMP_KOKKOS_GPU
+  }
+#endif
 }
+
+/* ----------------------------------------------------------------------
+ * GPU routines
+ * ----------------------------------------------------------------------*/
+
+
 /* ----------------------------------------------------------------------
-   compute Ui by summing over neighbors j
+   Precompute the Cayley-Klein parameters and the derivatives thereof.
+   This routine better exploits parallelism than the GPU ComputeUi and
+   ComputeFusedDeidrj, which are one warp per atom-neighbor pair.
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int jnum)
+void SNA::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div)
+{
+  const int iatom = iatom_mod + vector_length * iatom_div;
+  const double x = rij(iatom,jnbor,0);
+  const double y = rij(iatom,jnbor,1);
+  const double z = rij(iatom,jnbor,2);
+  const double rsq = x * x + y * y + z * z;
+  const double r = sqrt(rsq);
+  const double rcut = rcutij(iatom, jnbor);
+  const double sinner = sinnerij(iatom, jnbor);
+  const double dinner = dinnerij(iatom, jnbor);
+  const double rscale0 = rfac0 * static_cast<double>(MY_PI) / (rcut - rmin0);
+  const double theta0 = (r - rmin0) * rscale0;
+  const double sn = sin(theta0);
+  const double cs = cos(theta0);
+  const double z0 = r * cs / sn;
+  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+
+  const double wj_local = wj(iatom, jnbor);
+  double sfac, dsfac;
+  compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac);
+  sfac *= wj_local;
+  dsfac *= wj_local;
+
+  const double rinv = static_cast<double>(1.0) / r;
+  const double ux = x * rinv;
+  const double uy = y * rinv;
+  const double uz = z * rinv;
+
+  const double r0inv = static_cast<double>(1.0) / sqrt(r * r + z0 * z0);
+
+  const complex a = { z0 * r0inv, -z * r0inv };
+  const complex b = { r0inv * y, -r0inv * x };
+
+  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+
+  const double dr0invx = dr0invdr * ux;
+  const double dr0invy = dr0invdr * uy;
+  const double dr0invz = dr0invdr * uz;
+
+  const double dz0x = dz0dr * ux;
+  const double dz0y = dz0dr * uy;
+  const double dz0z = dz0dr * uz;
+
+  const complex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
+  const complex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
+  const complex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
+
+  const complex dbx = { y * dr0invx, -x * dr0invx - r0inv };
+  const complex dby = { y * dr0invy + r0inv, -x * dr0invy };
+  const complex dbz = { y * dr0invz, -x * dr0invz };
+
+  const double dsfacux = dsfac * ux;
+  const double dsfacuy = dsfac * uy;
+  const double dsfacuz = dsfac * uz;
+
+  a_pack(iatom_mod,jnbor,iatom_div) = a;
+  b_pack(iatom_mod,jnbor,iatom_div) = b;
+
+  da_pack(iatom_mod,jnbor,iatom_div,0) = dax;
+  db_pack(iatom_mod,jnbor,iatom_div,0) = dbx;
+
+  da_pack(iatom_mod,jnbor,iatom_div,1) = day;
+  db_pack(iatom_mod,jnbor,iatom_div,1) = dby;
+
+  da_pack(iatom_mod,jnbor,iatom_div,2) = daz;
+  db_pack(iatom_mod,jnbor,iatom_div,2) = dbz;
+
+  sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac;
+  sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux;
+  sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy;
+  sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz;
+
+  // we need to explicitly zero `dedr` somewhere before hitting
+  // ComputeFusedDeidrj --- this is just a convenient place to do it.
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast<double>(0.);
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast<double>(0.);
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast<double>(0.);
+
+}
+
+/* ----------------------------------------------------------------------
+   Initialize ulisttot with self-energy terms.
+   Ulisttot uses a "half" data layout which takes
+   advantage of the symmetry of the Wigner U matrices.
+------------------------------------------------------------------------- */
+
+KOKKOS_INLINE_FUNCTION
+void SNA::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div)
+{
+
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    int jju_half = idxu_half_block(j);
+
+    // Only diagonal elements get initialized
+    // Top half only: gets symmetrized by TransformUi
+
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+
+        double re_part = static_cast<double>(0.);
+        if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; }
+
+        ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part;
+        ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast<double>(0.);
+
+        jju_half++;
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   compute Ui by computing Wigner U-functions for one neighbor and
+   accumulating to the total. GPU only.
+------------------------------------------------------------------------- */
+
+// Version of the code that exposes additional parallelism by threading over `j_bend` values
+
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
+{
+
+  // get shared memory offset
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size;
+
+  // extract and wrap
+  const WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+
+  // load parameters
+  const complex a = a_pack(iatom_mod, jnbor, iatom_div);
+  const complex b = b_pack(iatom_mod, jnbor, iatom_div);
+  const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
+
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
+
+  // we need to "choose" when to bend
+  // this for loop is here for context --- we expose additional
+  // parallelism over this loop instead
+  //for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
+  evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div);
+}
+
+// Version of the code that loops over all `j_bend` values which reduces integer arithmetic
+// and some amount of load imbalance, at the expense of reducing parallelism
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
+{
+  // get shared memory offset
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size;
+
+  // extract and wrap
+  const WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+
+  // load parameters
+  const complex a = a_pack(iatom_mod, jnbor, iatom_div);
+  const complex b = b_pack(iatom_mod, jnbor, iatom_div);
+  const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
+
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
+
+  // we need to "choose" when to bend
+  #ifdef LMP_KK_DEVICE_COMPILE
+  #pragma unroll
+  #endif
+  for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
+    evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div);
+  }
+}
+
+// Core "evaluation" kernel that gets reused in `compute_ui_small` and `compute_ui_large`
+KOKKOS_FORCEINLINE_FUNCTION
+void SNA::evaluate_ui_jbend(const WignerWrapper<double, vector_length>& ulist_wrapper,
+          const complex& a, const complex& b, const double& sfac, const int& jelem,
+          const int& iatom_mod, const int& j_bend, const int& iatom_div)
 {
-  double rsq, r, x, y, z, z0, theta0;
 
   // utot(j,ma,mb) = 0 for all j,ma,ma
   // utot(j,ma,ma) = 1 for all j,ma
@@ -160,488 +551,1087 @@ void SNA::compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int jnum)
   //   compute r0 = (x,y,z,z0)
   //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
 
-  if(team.team_rank() == 0) {
-    zero_uarraytot(team);
-    //Kokkos::single(Kokkos::PerThread(team), [&] (){
-    addself_uarraytot(team,wself);
-    //});
+  // level 0 is just 1.
+  ulist_wrapper.set(0, complex::one());
+
+  // j from before the bend, don't store, mb == 0
+  for (int j = 1; j <= j_bend; j++) {
+
+    constexpr int mb = 0; // intentional for readability, compiler should optimize this out
+
+    complex ulist_accum = complex::zero();
+
+    int ma;
+    for (ma = 0; ma < j; ma++) {
+
+      // grab the cached value
+      const complex ulist_prev = ulist_wrapper.get(ma);
+
+      // ulist_accum += rootpq * a.conj() * ulist_prev;
+      double rootpq = rootpqarray(j - ma, j - mb);
+      ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im);
+      ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re);
+
+      // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here
+      ulist_wrapper.set(ma, ulist_accum);
+
+      // next value
+      // ulist_accum = -rootpq * b.conj() * ulist_prev;
+      rootpq = rootpqarray(ma + 1, j - mb);
+      ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im);
+      ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
+
+    }
+
+    ulist_wrapper.set(ma, ulist_accum);
   }
-  team.team_barrier();
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum),
-      [&] (const int& j) {
-  //for(int j = 0; j < jnum; j++) {
-    x = rij(j,0);
-    y = rij(j,1);
-    z = rij(j,2);
-    rsq = x * x + y * y + z * z;
-    r = sqrt(rsq);
-
-    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
-    //    theta0 = (r - rmin0) * rscale0;
-    z0 = r / tan(theta0);
-
-    compute_uarray(team,x, y, z, z0, r);
-    //Kokkos::single(Kokkos::PerThread(team), [&] (){
-    add_uarraytot(team,r, wj[j], rcutij[j]);
-    //});
-  });
+  // now we're after the bend, start storing but only up to the "half way point"
+  const int j_half_way = MIN(2 * j_bend, twojmax);
+
+  int mb = 1;
+  int j; //= j_bend + 1; // need this value below
+  for (j = j_bend + 1; j <= j_half_way; j++) {
+
+    const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+    complex ulist_accum = complex::zero();
 
+    int ma;
+    for (ma = 0; ma < j; ma++) {
+
+      // grab the cached value
+      const complex ulist_prev = ulist_wrapper.get(ma);
+
+      // atomic add the previous level here
+      Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
+      Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
+
+      // ulist_accum += rootpq * b * ulist_prev;
+      double rootpq = rootpqarray(j - ma, mb);
+      ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im);
+      ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re);
+
+      // store ulist_accum
+      ulist_wrapper.set(ma, ulist_accum);
+
+      // next value
+      // ulist_accum = rootpq * a * ulist_prev;
+      rootpq = rootpqarray(ma + 1, mb);
+      ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im);
+      ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re);
+    }
+
+    ulist_wrapper.set(ma, ulist_accum);
+
+    mb++;
+  }
+
+  // atomic add the last level
+  const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+  for (int ma = 0; ma < j; ma++) {
+    const complex ulist_prev = ulist_wrapper.get(ma);
+
+    // atomic add the previous level here
+    Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
+    Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   compute Zi by summing over products of Ui,
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version
+------------------------------------------------------------------------- */
+
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div)
+{
+
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+
+  const double* cgblock = cglist.data() + idxcg_block(j1, j2, j);
+
+  int idouble = 0;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+
+      zlist_pack(iatom_mod,jjz,idouble,iatom_div) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock);
+
+      idouble++;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
-   compute Zi by summing over products of Ui
+   compute Bi by summing conj(Ui)*Zi
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence.
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_zi(const Kokkos::TeamPolicy<>::member_type& team)
+void SNA::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div)
 {
   // for j1 = 0,...,twojmax
   //   for j2 = 0,twojmax
   //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,jmid
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
+  //        b(j1,j2,j) = 0
+  //        for mb = 0,...,jmid
+  //          for ma = 0,...,j
+  //            b(j1,j2,j) +=
+  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
+
+  const int j1 = idxb(jjb,0);
+  const int j2 = idxb(jjb,1);
+  const int j = idxb(jjb,2);
+
+  const int jjz = idxz_block(j1,j2,j);
+  const int jju = idxu_block[j];
+
+  int itriple = 0;
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+
+        double sumzu = 0.0;
+        double sumzu_temp = 0.0;
+
+        for (int mb = 0; 2*mb < j; mb++) {
+          for (int ma = 0; ma <= j; ma++) {
+            const int jju_index = jju+mb*(j+1)+ma;
+            const int jjz_index = jjz+mb*(j+1)+ma;
+            if (2*mb == j) return; // I think we can remove this?
+            const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+            const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
+            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
+          }
+        }
+        sumzu += sumzu_temp;
 
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
-
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxj_full_max),
-      [&] (const int& idx) {
-    const int j1 = idxj_full(idx).j1;
-    const int j2 = idxj_full(idx).j2;
-    const int j =  idxj_full(idx).j;
-
-    const int bound = (j+2)/2;
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+1)*bound),
-        [&] (const int mbma ) {
-	//for(int mb = 0; 2*mb <= j; mb++)
-	  //for(int ma = 0; ma <= j; ma++) {
-      const int ma = mbma%(j+1);
-      const int mb = mbma/(j+1);
-	    //zarray_r(j1,j2,j,ma,mb) = 0.0;
-	    //zarray_i(j1,j2,j,ma,mb) = 0.0;
-      double z_r = 0.0;
-      double z_i = 0.0;
-
-	    for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-		ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-	      double sumb1_r = 0.0;
-	      double sumb1_i = 0.0;
-
-	      const int ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-
-	      for(int mb1  = MAX( 0, (2 * mb - j - j2 + j1) / 2);
-                mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
-
-		const int mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-    const double cga = cgarray(j1,j2,j,mb1,mb2);
-    const double uat1_r = uarraytot_r(j1,ma1,mb1);
-    const double uat1_i = uarraytot_i(j1,ma1,mb1);
-    const double uat2_r = uarraytot_r(j2,ma2,mb2);
-    const double uat2_i = uarraytot_i(j2,ma2,mb2);
-    sumb1_r += cga * (uat1_r * uat2_r - uat1_i * uat2_i);
-    sumb1_i += cga * (uat1_r * uat2_i + uat1_i * uat2_r);
-		/*sumb1_r += cgarray(j1,j2,j,mb1,mb2) *
-		  (uarraytot_r(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2) -
-		   uarraytot_i(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2));
-		sumb1_i += cgarray(j1,j2,j,mb1,mb2) *
-		  (uarraytot_r(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2) +
-		   uarraytot_i(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2));*/
-	      } // end loop over mb1
-
-        const double cga = cgarray(j1,j2,j,ma1,ma2);
-	      z_r += sumb1_r * cga;//rray(j1,j2,j,ma1,ma2);
-	      z_i += sumb1_i * cga;//rray(j1,j2,j,ma1,ma2);
-	    } // end loop over ma1
-      zarray_r(j1,j2,j,mb,ma) = z_r;
-	    zarray_i(j1,j2,j,mb,ma) = z_i;
-	  }); // end loop over ma, mb
-    //  }
-    //}
-  });
-      //} // end loop over j
-    //} // end loop over j1, j2
+        // For j even, special treatment for middle column
+        if (j%2 == 0) {
+          sumzu_temp = 0.;
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+          const int mb = j/2;
+          for (int ma = 0; ma < mb; ma++) {
+            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+
+            const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+            const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
+            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
+
+          }
+          sumzu += sumzu_temp;
+
+          const int ma = mb;
+          const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+          const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+
+          const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+          const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
+          sumzu += static_cast<double>(0.5) * (utot.re * zloc.re + utot.im * zloc.im);
+        } // end if jeven
+
+        sumzu *= static_cast<double>(2.0);
+        if (bzero_flag) {
+          if (!wselfall_flag) {
+            if (elem1 == elem2 && elem1 == elem3) {
+              sumzu -= bzero[j];
+            }
+          } else {
+            sumzu -= bzero[j];
+          }
+        }
+        blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu;
+            //} // end loop over j
+          //} // end loop over j1, j2
+        itriple++;
+      } // end loop over elem3
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
 }
 
 
 /* ----------------------------------------------------------------------
-   calculate derivative of Ui w.r.t. atom j
+   compute Yi from Ui without storing Zi, looping over zlist indices.
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version.
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_duidrj(const Kokkos::TeamPolicy<>::member_type& team,
-                         double* rij, double wj, double rcut)
+void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div,
+ const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
 {
-  double rsq, r, x, y, z, z0, theta0, cs, sn;
-  double dz0dr;
 
-  x = rij[0];
-  y = rij[1];
-  z = rij[2];
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
-  double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
-  theta0 = (r - rmin0) * rscale0;
-  cs = cos(theta0);
-  sn = sin(theta0);
-  z0 = r * cs / sn;
-  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+  const int jju_half = idxz(jjz, 9);
+
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
+  //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+
+      const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock);
+
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
+
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+
+        const double betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack);
+
+        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re);
+        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im);
+      } // end loop over elem3
+    } // end loop over elem2
+  } // end loop over elem1
+}
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
+/* ----------------------------------------------------------------------
+   compute Yi from Ui without storing Zi, looping over zlist indices.
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version.
+------------------------------------------------------------------------- */
 
-  compute_duarray(team, x, y, z, z0, r, dz0dr, wj, rcut);
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
+ const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
+{
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int jju_half = idxz(jjz, 9);
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      const complex ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div);
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+
+        const double betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack);
+
+        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re);
+        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im);
+      } // end loop over elem3
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
+}
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+// Core "evaluation" kernel that computes a single zlist value
+// which gets used in both `compute_zi` and `compute_yi`
+KOKKOS_FORCEINLINE_FUNCTION
+typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int& j,
+        const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb,
+        const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const double* cgblock) {
+
+  complex ztmp = complex::zero();
+
+  int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+  int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+  int icgb = mb1min*(j2+1) + mb2max;
+
+  #ifdef LMP_KK_DEVICE_COMPILE
+  #pragma unroll
+  #endif
+  for (int ib = 0; ib < nb; ib++) {
+
+    int ma1 = ma1min;
+    int ma2 = ma2max;
+    int icga = ma1min*(j2+1) + ma2max;
+
+    #ifdef LMP_KK_DEVICE_COMPILE
+    #pragma unroll
+    #endif
+    for (int ia = 0; ia < na; ia++) {
+      const complex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div);
+      const complex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
+      const double cgcoeff_a = cgblock[icga];
+      const double cgcoeff_b = cgblock[icgb];
+      ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
+      ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
+      ma1++;
+      ma2--;
+      icga += j2;
+    } // end loop over ia
+
+    jju1 += j1 + 1;
+    jju2 -= j2 + 1;
+    icgb += j2;
+  } // end loop over ib
+
+  if (bnorm_flag) {
+    const double scale = static_cast<double>(1) / static_cast<double>(j + 1);
+    ztmp.re *= scale;
+    ztmp.im *= scale;
+  }
+
+  return ztmp;
+}
+
+// Core "evaluation" kernel that extracts and rescales the appropriate `beta` value,
+// which gets used in both `compute_yi` and `compute_yi_from_zlist
+KOKKOS_FORCEINLINE_FUNCTION
+typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j,
+          const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div,
+          const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack) {
+
+  double betaj = 0;
+
+  if (j >= j1) {
+    const int jjb = idxb_block(j1, j2, j);
+    const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
+    if (j1 == j) {
+      if (j2 == j) betaj = static_cast<double>(3) * beta_pack(iatom_mod, itriple, iatom_div);
+      else betaj = static_cast<double>(2) * beta_pack(iatom_mod, itriple, iatom_div);
+    } else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+  } else if (j >= j2) {
+    const int jjb = idxb_block(j, j2, j1);
+    const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
+    if (j2 == j) betaj = static_cast<double>(2) * beta_pack(iatom_mod, itriple, iatom_div);
+    else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+  } else {
+    const int jjb = idxb_block(j2, j, j1);
+    const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
+    betaj = beta_pack(iatom_mod, itriple, iatom_div);
+  }
+
+  if (!bnorm_flag && j1 > j) {
+    const double scale = static_cast<double>(j1 + 1) / static_cast<double>(j + 1);
+    betaj *= scale;
+  }
+
+  return betaj;
 
 }
 
 /* ----------------------------------------------------------------------
-   calculate derivative of Bi w.r.t. atom j
-   variant using indexlist for j1,j2,j
-   variant using symmetry relation
+   Fused calculation of the derivative of Ui w.r.t. atom j
+   and accumulation into dEidRj. GPU only.
 ------------------------------------------------------------------------- */
 
+// Version of the code that exposes additional parallelism by threading over `j_bend` values
+template<int dir>
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_dbidrj(const Kokkos::TeamPolicy<>::member_type& team)
+void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
 {
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        zdb = 0
-  //        for mb = 0,...,jmid
-  //          for ma = 0,...,j
-  //            zdb +=
-  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
-  //        dbdr(j1,j2,j) += 2*zdb
-  //        zdb = 0
-  //        for mb1 = 0,...,j1mid
-  //          for ma1 = 0,...,j1
-  //            zdb +=
-  //              Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)
-  //        dbdr(j1,j2,j) += 2*zdb*(j+1)/(j1+1)
-  //        zdb = 0
-  //        for mb2 = 0,...,j2mid
-  //          for ma2 = 0,...,j2
-  //            zdb +=
-  //              Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
-  //        dbdr(j1,j2,j) += 2*zdb*(j+1)/(j2+1)
-
-  double* dudr_r, *dudr_i;
-  double jjjmambzarray_r;
-  double jjjmambzarray_i;
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max),
-          [&] (const int& JJ) {
-  //for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-//    dbdr = &dbarray(j1,j2,j,0);
-//    dbdr[0] = 0.0;
-//    dbdr[1] = 0.0;
-//    dbdr[2] = 0.0;
-
-    t_scalar3<double> dbdr,sumzdu_r;
-    // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
-
-    // use zarray j1/j2 symmetry (optional)
-
-    int j_,j1_,j2_;
-    if (j1 >= j2) {
-      //jjjzarray_r = &zarray_r(j1,j2,j);
-      //jjjzarray_i = &zarray_i(j1,j2,j);
-      j1_ = j1;
-      j2_ = j2;
-      j_ = j;
-    } else {
-      j1_ = j2;
-      j2_ = j1;
-      j_ = j;
-      //jjjzarray_r = &zarray_r(j2,j1,j);
-      //jjjzarray_i = &zarray_i(j2,j1,j);
+  // get shared memory offset
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size;
+
+  // extract, wrap shared memory buffer
+  WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper<double, vector_length> dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+
+  // load parameters
+  const complex a = a_pack(iatom_mod, jnbor, iatom_div);
+  const complex b = b_pack(iatom_mod, jnbor, iatom_div);
+  const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir);
+  const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir);
+  const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
+  const double dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u
+
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
+
+  // compute the contribution to dedr_full_sum for one "bend" location
+  const double dedr_full_sum = evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu,
+                                                       jelem, iatom_mod, j_bend, iatom_div);
+
+  // dedr gets zeroed out at the start of each iteration in compute_cayley_klein
+  Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast<double>(2.0) * dedr_full_sum);
+
+}
+
+// Version of the code that loops over all `j_bend` values which reduces integer arithmetic
+// and some amount of load imbalance, at the expense of reducing parallelism
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
+{
+  // get shared memory offset
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size;
+
+  // extract, wrap shared memory buffer
+  WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper<double, vector_length> dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+
+  // load parameters
+  const complex a = a_pack(iatom_mod, jnbor, iatom_div);
+  const complex b = b_pack(iatom_mod, jnbor, iatom_div);
+  const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir);
+  const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir);
+  const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
+  const double dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u
+
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
+
+  // compute the contributions to dedr_full_sum for all "bend" locations
+  double dedr_full_sum = static_cast<double>(0);
+  #ifdef LMP_KK_DEVICE_COMPILE
+  #pragma unroll
+  #endif
+  for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
+    dedr_full_sum += evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu,
+                                          jelem, iatom_mod, j_bend, iatom_div);
+  }
+
+  // there's one thread per atom, neighbor pair, so no need to make this atomic
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, dir) = static_cast<double>(2.0) * dedr_full_sum;
+
+}
+
+// Core "evaluation" kernel that gets reused in `compute_fused_deidrj_small` and
+// `compute_fused_deidrj_large`
+KOKKOS_FORCEINLINE_FUNCTION
+typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper<double, vector_length>& ulist_wrapper, const complex& a, const complex& b, const double& sfac,
+                      const WignerWrapper<double, vector_length>& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu,
+                      const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) {
+
+  double dedr_full_sum = static_cast<double>(0);
+
+  // level 0 is just 1, 0
+  ulist_wrapper.set(0, complex::one());
+  dulist_wrapper.set(0, complex::zero());
+
+  // j from before the bend, don't store, mb == 0
+  // this is "creeping up the side"
+  for (int j = 1; j <= j_bend; j++) {
+
+    constexpr int mb = 0; // intentional for readability, compiler should optimize this out
+
+    complex ulist_accum = complex::zero();
+    complex dulist_accum = complex::zero();
+
+    int ma;
+    for (ma = 0; ma < j; ma++) {
+
+      // grab the cached value
+      const complex ulist_prev = ulist_wrapper.get(ma);
+      const complex dulist_prev = dulist_wrapper.get(ma);
+
+      // ulist_accum += rootpq * a.conj() * ulist_prev;
+      double rootpq = rootpqarray(j - ma, j - mb);
+      ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im);
+      ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re);
+
+      // product rule of above
+      dulist_accum.re += rootpq * (da.re * ulist_prev.re + da.im * ulist_prev.im + a.re * dulist_prev.re + a.im * dulist_prev.im);
+      dulist_accum.im += rootpq * (da.re * ulist_prev.im - da.im * ulist_prev.re + a.re * dulist_prev.im - a.im * dulist_prev.re);
+
+      // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here
+      ulist_wrapper.set(ma, ulist_accum);
+      dulist_wrapper.set(ma, dulist_accum);
+
+      // next value
+      // ulist_accum = -rootpq * b.conj() * ulist_prev;
+      rootpq = rootpqarray(ma + 1, j - mb);
+      ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im);
+      ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
+
+      // product rule of above
+      dulist_accum.re = -rootpq * (db.re * ulist_prev.re + db.im * ulist_prev.im + b.re * dulist_prev.re + b.im * dulist_prev.im);
+      dulist_accum.im = -rootpq * (db.re * ulist_prev.im - db.im * ulist_prev.re + b.re * dulist_prev.im - b.im * dulist_prev.re);
+
     }
 
-    for(int mb = 0; 2*mb < j; mb++)
-      for(int ma = 0; ma <= j; ma++) {
+    ulist_wrapper.set(ma, ulist_accum);
+    dulist_wrapper.set(ma, dulist_accum);
+  }
 
-        dudr_r = &duarray_r(j,mb,ma,0);
-        dudr_i = &duarray_i(j,mb,ma,0);
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb,ma);
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb,ma);
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
+  // now we're after the bend, start storing but only up to the "half way point"
+  const int j_half_way = MIN(2 * j_bend, twojmax);
 
-      } //end loop over ma mb
+  int mb = 1;
+  int j; //= j_bend + 1; // need this value below
+  for (j = j_bend + 1; j <= j_half_way; j++) {
 
-    // For j even, handle middle column
+    const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
 
-    if (j%2 == 0) {
-      int mb = j/2;
-      for(int ma = 0; ma <= mb; ma++) {
-        dudr_r = &duarray_r(j,mb,ma,0);
-        dudr_i = &duarray_i(j,mb,ma,0);
-        const double factor = ma==mb?0.5:1.0;
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb,ma) * factor;
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb,ma) * factor;
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
-      }
-    } // end if jeven
+    complex ulist_accum = complex::zero();
+    complex dulist_accum = complex::zero();
+
+    int ma;
+    for (ma = 0; ma < j; ma++) {
 
-      dbdr += 2.0*sumzdu_r;
+      // grab y_local early
+      // this will never be the last element of a row, no need to rescale.
+      complex y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div));
 
-    // Sum over Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)
+      // grab the cached value
+      const complex ulist_prev = ulist_wrapper.get(ma);
+      const complex dulist_prev = dulist_wrapper.get(ma);
 
-    double j1fac = (j+1)/(j1+1.0);
+      // ulist_accum += rootpq * b * ulist_prev;
+      double rootpq = rootpqarray(j - ma, mb);
+      ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im);
+      ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re);
 
-    sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0;
+      // product rule of above
+      dulist_accum.re += rootpq * (db.re * ulist_prev.re - db.im * ulist_prev.im + b.re * dulist_prev.re - b.im * dulist_prev.im);
+      dulist_accum.im += rootpq * (db.re * ulist_prev.im + db.im * ulist_prev.re + b.re * dulist_prev.im + b.im * dulist_prev.re);
 
-    // use zarray j1/j2 symmetry (optional)
+      // store ulist_accum
+      ulist_wrapper.set(ma, ulist_accum);
+      dulist_wrapper.set(ma, dulist_accum);
 
-    if (j >= j2) {
-      j1_ = j;
-      j2_ = j2;
-      j_ = j1;
+      // Directly accumulate deidrj into sum_tmp
+      const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev);
+      dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im;
+
+      // next value
+      // ulist_accum = rootpq * a * ulist_prev;
+      rootpq = rootpqarray(ma + 1, mb);
+      ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im);
+      ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re);
+
+      // product rule of above
+      dulist_accum.re = rootpq * (da.re * ulist_prev.re - da.im * ulist_prev.im + a.re * dulist_prev.re - a.im * dulist_prev.im);
+      dulist_accum.im = rootpq * (da.re * ulist_prev.im + da.im * ulist_prev.re + a.re * dulist_prev.im + a.im * dulist_prev.re);
 
-      //jjjzarray_r = zarray_r(j,j2,j1);
-      //jjjzarray_i = zarray_i(j,j2,j1);
-    } else {
-      j1_ = j2;
-      j2_ = j;
-      j_ = j1;
-      //jjjzarray_r = zarray_r(j2,j,j1);
-      //jjjzarray_i = zarray_i(j2,j,j1);
     }
 
-    for(int mb1 = 0; 2*mb1 < j1; mb1++)
-      for(int ma1 = 0; ma1 <= j1; ma1++) {
-
-        dudr_r = &duarray_r(j1,mb1,ma1,0);
-        dudr_i = &duarray_i(j1,mb1,ma1,0);
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1);
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1);
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
-      } //end loop over ma1 mb1
-
-    // For j1 even, handle middle column
-
-    if (j1%2 == 0) {
-      const int mb1 = j1/2;
-      for(int ma1 = 0; ma1 <= mb1; ma1++) {
-        dudr_r = &duarray_r(j1,mb1,ma1,0);
-        dudr_i = &duarray_i(j1,mb1,ma1,0);
-        const double factor = ma1==mb1?0.5:1.0;
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1) * factor;
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1) * factor;
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
-      }
-    } // end if j1even
+    ulist_wrapper.set(ma, ulist_accum);
+    dulist_wrapper.set(ma, dulist_accum);
 
-      dbdr += 2.0*sumzdu_r*j1fac;
+    mb++;
+  }
+
+  // accumulate the last level
+  const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+  for (int ma = 0; ma < j; ma++) {
+    // grab y_local early
+    complex y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div));
+    if (j % 2 == 1 && 2*(mb-1) == j-1) { // double check me...
+      if (ma == (mb-1)) { y_local = static_cast<double>(0.5)*y_local; }
+      else if (ma > (mb-1)) { y_local.re = static_cast<double>(0.); y_local.im = static_cast<double>(0.); } // can probably avoid this outright
+      // else the ma < mb gets "double counted", cancelling the 0.5.
+    }
 
-    // Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
+    const complex ulist_prev = ulist_wrapper.get(ma);
+    const complex dulist_prev = dulist_wrapper.get(ma);
 
-    double j2fac = (j+1)/(j2+1.0);
+    // Directly accumulate deidrj into sum_tmp
+    const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev);
+    dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im;
 
-    sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0;
+  }
 
-    // use zarray j1/j2 symmetry (optional)
+  return dedr_full_sum;
+}
 
-    if (j1 >= j) {
-      j1_ = j1;
-      j2_ = j;
-      j_ = j2;
-      //jjjzarray_r = zarray_r(j1,j,j2);
-      //jjjzarray_i = zarray_i(j1,j,j2);
-    } else {
-      j1_ = j;
-      j2_ = j1;
-      j_ = j2;
-      //jjjzarray_r = zarray_r(j,j1,j2);
-      //jjjzarray_i = zarray_i(j,j1,j2);
+/* ----------------------------------------------------------------------
+ * CPU routines
+ * ----------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------
+   Ulisttot uses a "half" data layout which takes
+   advantage of the symmetry of the Wigner U matrices.
+ * ------------------------------------------------------------------------- */
+
+KOKKOS_INLINE_FUNCTION
+void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
+{
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    for (int j = 0; j <= twojmax; j++) {
+      int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug
+
+      // Only diagonal elements get initialized
+      // for (int m = 0; m < (j+1)*(j/2+1); m++)
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
+        [&] (const int m) {
+
+        const int jjup = jju + m;
+
+        // if m is on the "diagonal", initialize it with the self energy.
+        // Otherwise zero it out
+        complex init(static_cast<double>(0.),static_cast<double>(0.));
+        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element
+
+        ulisttot(jjup, jelem, iatom) = init;
+      });
     }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------
+   compute Ui by summing over bispectrum components. CPU only.
+   See comments above compute_uarray_cpu and add_uarraytot for
+   data layout comments.
+------------------------------------------------------------------------- */
+
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+{
+  double rsq, r, x, y, z, z0, theta0;
+
+  // utot(j,ma,mb) = 0 for all j,ma,ma
+  // utot(j,ma,ma) = 1 for all j,ma
+  // for j in neighbors of i:
+  //   compute r0 = (x,y,z,z0)
+  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
+
+  x = rij(iatom,jnbor,0);
+  y = rij(iatom,jnbor,1);
+  z = rij(iatom,jnbor,2);
+  rsq = x * x + y * y + z * z;
+  r = sqrt(rsq);
+
+  theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
+  //    theta0 = (r - rmin0) * rscale0;
+  z0 = r / tan(theta0);
+
+  compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
+  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
+
+}
+/* ----------------------------------------------------------------------
+   compute Zi by summing over products of Ui, CPU version
+------------------------------------------------------------------------- */
 
-    for(int mb2 = 0; 2*mb2 < j2; mb2++)
-      for(int ma2 = 0; ma2 <= j2; ma2++) {
-
-        dudr_r = &duarray_r(j2,mb2,ma2,0);
-        dudr_i = &duarray_i(j2,mb2,ma2,0);
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2);
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2);
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
-      } //end loop over ma2 mb2
-
-    // For j2 even, handle middle column
-
-    if (j2%2 == 0) {
-      const int mb2 = j2/2;
-      for(int ma2 = 0; ma2 <= mb2; ma2++) {
-        dudr_r = &duarray_r(j2,mb2,ma2,0);
-        dudr_i = &duarray_i(j2,mb2,ma2,0);
-        const double factor = ma2==mb2?0.5:1.0;
-        jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2) * factor;
-        jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2) * factor;
-        sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i);
-        sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i);
-        sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i);
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_zi_cpu(const int& iter)
+{
+  const int iatom = iter / idxz_max;
+  const int jjz = iter % idxz_max;
+
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+
+  int idouble = 0;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      zlist(jjz, idouble, iatom).re = static_cast<double>(0.0);
+      zlist(jjz, idouble, iatom).im = static_cast<double>(0.0);
+
+      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+      int icgb = mb1min*(j2+1) + mb2max;
+      for (int ib = 0; ib < nb; ib++) {
+
+        double suma1_r = static_cast<double>(0.0);
+        double suma1_i = static_cast<double>(0.0);
+
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min * (j2 + 1) + ma2max;
+        for (int ia = 0; ia < na; ia++) {
+          suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
+
+        zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r;
+        zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i;
+
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
+
+      if (bnorm_flag) {
+        const double scale = static_cast<double>(1) / static_cast<double>(j + 1);
+        zlist(jjz, idouble, iatom).re *= scale;
+        zlist(jjz, idouble, iatom).im *= scale;
       }
-    } // end if j2even
-
-    dbdr += 2.0*sumzdu_r*j2fac;
-    dbarray(j1,j2,j,0) = dbdr.x;
-    dbarray(j1,j2,j,1) = dbdr.y;
-    dbarray(j1,j2,j,2) = dbdr.z;
-  }); //end loop over j1 j2 j
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
+}
+
+
+/* ----------------------------------------------------------------------
+   compute Bi by summing conj(Ui)*Zi, CPU version
+------------------------------------------------------------------------- */
+
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
+{
+  // for j1 = 0,...,twojmax
+  //   for j2 = 0,twojmax
+  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
+  //        b(j1,j2,j) = 0
+  //        for mb = 0,...,jmid
+  //          for ma = 0,...,j
+  //            b(j1,j2,j) +=
+  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
+
+  int itriple = 0;
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      int jalloy = idouble; // must be non-const to work around gcc compiler bug
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
+          [&] (const int& jjb) {
+          const int j1 = idxb(jjb, 0);
+          const int j2 = idxb(jjb, 1);
+          int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug
+
+          int jjz = idxz_block(j1, j2, j);
+          int jju = idxu_block[j];
+          double sumzu = static_cast<double>(0.0);
+          double sumzu_temp = static_cast<double>(0.0);
+          const int bound = (j+2)/2;
+          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
+              [&] (const int mbma, double& sum) {
+              const int ma = mbma % (j + 1);
+              const int mb = mbma / (j + 1);
+              const int jju_index = jju + mb * (j + 1) + ma;
+              const int jjz_index = jjz + mb * (j + 1) + ma;
+              if (2*mb == j) return;
+              sum +=
+                ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+            },sumzu_temp); // end loop over ma, mb
+            sumzu += sumzu_temp;
+
+          // For j even, special treatment for middle column
+
+          if (j%2 == 0) {
+            int mb = j/2; // removed "const" to work around GCC 7 bug
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
+                [&] (const int ma, double& sum) {
+              const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+              const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+              sum +=
+                ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+            },sumzu_temp); // end loop over ma
+            sumzu += sumzu_temp;
+
+            const int ma = mb;
+            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+            sumzu += static_cast<double>(0.5)*
+              (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+               ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
+          } // end if jeven
+
+          Kokkos::single(Kokkos::PerThread(team), [&] () {
+            sumzu *= static_cast<double>(2.0);
+
+            // apply bzero shift
+
+            if (bzero_flag) {
+              if (!wselfall_flag) {
+                if (elem1 == elem2 && elem1 == elem3) {
+                  sumzu -= bzero[j];
+                }
+              } else {
+                sumzu -= bzero[j];
+              }
+            }
+
+            blist(iatom, itriple, jjb) = sumzu;
+          });
+        });
+          //} // end loop over j
+        //} // end loop over j1, j2
+        itriple++;
+      }
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
 
 }
 
 /* ----------------------------------------------------------------------
-   copy Bi derivatives into a vector
+   compute Yi from Ui without storing Zi, looping over zlist indices,
+   CPU version
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::copy_dbi2dbvec(const Kokkos::TeamPolicy<>::member_type& team)
+void SNA::compute_yi_cpu(int iter,
+ const Kokkos::View<double**, DeviceType> &beta)
 {
- /* int ncount, j1, j2, j;
+  double betaj;
+  const int iatom = iter / idxz_max;
+  const int jjz = iter % idxz_max;
+
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+  const int jju_half = idxz(jjz, 9);
+
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
+  //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+
+      double ztmp_r = 0.0;
+      double ztmp_i = 0.0;
+
+      int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
+      int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
+      int icgb = mb1min * (j2 +1) + mb2max;
+
+      for (int ib = 0; ib < nb; ib++) {
+
+        double suma1_r = 0.0;
+        double suma1_i = 0.0;
+
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min*(j2+1) + ma2max;
+
+        for (int ia = 0; ia < na; ia++) {
+          suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
+
+        ztmp_r += cgblock[icgb] * suma1_r;
+        ztmp_i += cgblock[icgb] * suma1_i;
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
+
+      if (bnorm_flag) {
+        const double scale = static_cast<double>(1) / static_cast<double>(j + 1);
+        ztmp_i *= scale;
+        ztmp_r *= scale;
+      }
 
-  ncount = 0;
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
+
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+
+        if (j >= j1) {
+          const int jjb = idxb_block(j1, j2, j);
+          const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
+          if (j1 == j) {
+            if (j2 == j) betaj = 3 * beta(itriple, iatom);
+            else betaj = 2 * beta(itriple, iatom);
+          } else betaj = beta(itriple, iatom);
+        } else if (j >= j2) {
+          const int jjb = idxb_block(j, j2, j1);
+          const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
+          if (j2 == j) betaj = 2 * beta(itriple, iatom);
+          else betaj = beta(itriple, iatom);
+        } else {
+          const int jjb = idxb_block(j2, j, j1);
+          const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
+          betaj = beta(itriple, iatom);
+        }
 
-  for(j1 = 0; j1 <= twojmax; j1++) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-	  if (j >= j1) {*/
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max),
-          [&] (const int& JJ) {
-  //for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-	    dbvec(JJ,0) = dbarray(j1,j2,j,0);
-	    dbvec(JJ,1) = dbarray(j1,j2,j,1);
-	    dbvec(JJ,2) = dbarray(j1,j2,j,2);
-	    //ncount++;
-	  //});
-  });
+        if (!bnorm_flag && j1 > j)
+          betaj *= static_cast<double>(j1 + 1) / static_cast<double>(j + 1);
+
+        Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r);
+        Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i);
+      } // end loop over elem3
+    } // end loop over elem2
+  } // end loop over elem1
 }
 
-/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   calculate derivative of Ui w.r.t. atom j
+   see comments above compute_duarray_cpu for comments on the
+   data layout
+------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::zero_uarraytot(const Kokkos::TeamPolicy<>::member_type& team)
+void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
-  {
-    double* const ptr = uarraytot_r.data();
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()),
-        [&] (const int& i) {
-      ptr[i] = 0.0;
-    });
-  }
-  {
-    double* const ptr = uarraytot_i.data();
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()),
-        [&] (const int& i) {
-      ptr[i] = 0.0;
-    });
-  }
+  double rsq, r, x, y, z, z0, theta0, cs, sn;
+  double dz0dr;
+
+  x = rij(iatom,jnbor,0);
+  y = rij(iatom,jnbor,1);
+  z = rij(iatom,jnbor,2);
+  rsq = x * x + y * y + z * z;
+  r = sqrt(rsq);
+  double rscale0 = rfac0 * static_cast<double>(MY_PI) / (rcutij(iatom,jnbor) - rmin0);
+  theta0 = (r - rmin0) * rscale0;
+  sn = sin(theta0);
+  cs = cos(theta0);
+  z0 = r * cs / sn;
+  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+
+  compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
 }
 
-/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   compute dEidRj, CPU path only.
+   dulist takes advantage of a `cached` data layout, similar to the
+   shared memory layout for the GPU routines, which is efficient for
+   compressing the calculation in compute_duarray_cpu. That said,
+   dulist only uses the "half" data layout part of that structure.
+------------------------------------------------------------------------- */
+
 
 KOKKOS_INLINE_FUNCTION
-void SNA::addself_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double wself_in)
+void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
-  //for (int j = 0; j <= twojmax; j++)
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
-    [&] (const int& j) {
-    for (int ma = 0; ma <= j; ma++) {
-      uarraytot_r(j,ma,ma) = wself_in;
-      uarraytot_i(j,ma,ma) = 0.0;
-    }
+  t_scalar3<double> final_sum;
+  const int jelem = element(iatom, jnbor);
+
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
+      [&] (const int& j, t_scalar3<double>& sum_tmp) {
+    int jju_half = idxu_half_block[j];
+    int jju_cache = idxu_cache_block[j];
+
+    for (int mb = 0; 2*mb < j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
+        jju_half++; jju_cache++;
+      } //end loop over ma mb
+
+    // For j even, handle middle column
+
+    if (j%2 == 0) {
+
+      int mb = j/2;
+      for (int ma = 0; ma < mb; ma++) {
+        sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
+        jju_half++; jju_cache++;
+      }
+
+      //int ma = mb;
+      sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                    dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5;
+      sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
+                    dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5;
+      sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
+                    dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5;
+    } // end if jeven
+
+  },final_sum); // end loop over j
+
+  Kokkos::single(Kokkos::PerThread(team), [&] () {
+    dedr(iatom,jnbor,0) = final_sum.x*2.0;
+    dedr(iatom,jnbor,1) = final_sum.y*2.0;
+    dedr(iatom,jnbor,2) = final_sum.z*2.0;
   });
+
 }
 
+
 /* ----------------------------------------------------------------------
    add Wigner U-functions for one neighbor to the total
+   ulist is in a "cached" data layout, which is a compressed layout
+   which still keeps the recursive calculation simple. On the other hand
+   `ulisttot` uses a "half" data layout, which fully takes advantage
+   of the symmetry of the Wigner U matrices.
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::add_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double r, double wj, double rcut)
+void SNA::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+    const double& r, const double& wj, const double& rcut,
+    const double& sinner, const double& dinner, int jelem)
 {
-  const double sfac = compute_sfac(r, rcut) * wj;
-
-/*
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r_a(j,ma,mb) +=
-          sfac * uarray_r(j,ma,mb);
-        uarraytot_i_a(j,ma,mb) +=
-          sfac * uarray_i(j,ma,mb);
-      }*/
-  const double* const ptr_r = uarray_r.data();
-  const double* const ptr_i = uarray_i.data();
-  double* const ptrtot_r = uarraytot_r.data();
-  double* const ptrtot_i = uarraytot_i.data();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()),
-      [&] (const int& i) {
-    Kokkos::atomic_add(ptrtot_r+i, sfac * ptr_r[i]);
-    Kokkos::atomic_add(ptrtot_i+i, sfac * ptr_i[i]);
+  const double sfac = compute_sfac(r, rcut, sinner, dinner) * wj;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
+      [&] (const int& j) {
+
+    int jju_half = idxu_half_block[j]; // index into ulisttot
+    int jju_cache = idxu_cache_block[j]; // index into ulist
+
+    int count = 0;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+        Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re);
+        Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im);
+        count++;
+      }
+    }
   });
 }
 
 /* ----------------------------------------------------------------------
-   compute Wigner U-functions for one neighbor
+   compute Wigner U-functions for one neighbor.
+   `ulisttot` uses a "cached" data layout, matching the amount of
+   information stored between layers via scratch memory on the GPU path
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team,
-                         double x, double y, double z,
-                         double z0, double r)
+void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+                         const double& x, const double& y, const double& z, const double& z0, const double& r)
 {
   double r0inv;
   double a_r, b_r, a_i, b_i;
@@ -649,7 +1639,7 @@ void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team,
 
   // compute Cayley-Klein parameters for unit quaternion
 
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  r0inv = static_cast<double>(1.0) / sqrt(r * r + z0 * z0);
   a_r = r0inv * z0;
   a_i = -r0inv * z;
   b_r = r0inv * y;
@@ -657,79 +1647,84 @@ void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team,
 
   // VMK Section 4.8.2
 
-  uarray_r(0,0,0) = 1.0;
-  uarray_i(0,0,0) = 0.0;
+  ulist(0,iatom,jnbor).re = 1.0;
+  ulist(0,iatom,jnbor).im = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug
+    int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug
 
     // fill in left side of matrix layer from previous layer
 
     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
         [&] (const int& mb) {
-      //const int mb = 2*mb_2;
     //for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r(j,0,mb) = 0.0;
-      uarray_i(j,0,mb) = 0.0;
+      const int jju_index = jju+mb+mb*j;
+      ulist(jju_index,iatom,jnbor).re = 0.0;
+      ulist(jju_index,iatom,jnbor).im = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
-	rootpq = rootpqarray(j - ma,j - mb);
-        uarray_r(j,ma,mb) +=
+        const int jju_index = jju+mb+mb*j+ma;
+        const int jjup_index = jjup+mb*j+ma;
+        rootpq = rootpqarray(j - ma,j - mb);
+        ulist(jju_index,iatom,jnbor).re +=
           rootpq *
-          (a_r * uarray_r(j - 1,ma,mb) +
-	   a_i * uarray_i(j - 1,ma,mb));
-        uarray_i(j,ma,mb) +=
+          (a_r * ulist(jjup_index,iatom,jnbor).re +
+           a_i * ulist(jjup_index,iatom,jnbor).im);
+        ulist(jju_index,iatom,jnbor).im +=
           rootpq *
-          (a_r * uarray_i(j - 1,ma,mb) -
-	   a_i * uarray_r(j - 1,ma,mb));
+          (a_r * ulist(jjup_index,iatom,jnbor).im -
+           a_i * ulist(jjup_index,iatom,jnbor).re);
 
-	rootpq = rootpqarray(ma + 1,j - mb);
-        uarray_r(j,ma + 1,mb) =
+        rootpq = rootpqarray(ma + 1,j - mb);
+        ulist(jju_index+1,iatom,jnbor).re =
           -rootpq *
-          (b_r * uarray_r(j - 1,ma,mb) +
-	   b_i * uarray_i(j - 1,ma,mb));
-        uarray_i(j,ma + 1,mb) =
+          (b_r * ulist(jjup_index,iatom,jnbor).re +
+           b_i * ulist(jjup_index,iatom,jnbor).im);
+        ulist(jju_index+1,iatom,jnbor).im =
           -rootpq *
-          (b_r * uarray_i(j - 1,ma,mb) -
-	   b_i * uarray_r(j - 1,ma,mb));
+          (b_r * ulist(jjup_index,iatom,jnbor).im -
+           b_i * ulist(jjup_index,iatom,jnbor).re);
       }
-    });
 
-    // copy left side to right side with inversion symmetry VMK 4.4(2)
-    // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
-
-    //int mbpar = -1;
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-//    for (int mb = 0; 2*mb <= j; mb++) {
-      int mbpar = (mb)%2==0?1:-1;
-      int mapar = -mbpar;
-      for (int ma = 0; ma <= j; ma++) {
-    	mapar = -mapar;
-    	if (mapar == 1) {
-    	  uarray_r(j,j-ma,j-mb) = uarray_r(j,ma,mb);
-    	  uarray_i(j,j-ma,j-mb) = -uarray_i(j,ma,mb);
-    	} else {
-    	  uarray_r(j,j-ma,j-mb) = -uarray_r(j,ma,mb);
-    	  uarray_i(j,j-ma,j-mb) = uarray_i(j,ma,mb);
-    	}
-    	//OK
-    	//printf("%lf %lf %lf %lf %lf %lf %lf SNAP-COMPARE: UARRAY\n",x,y,z,z0,r,uarray_r(j,ma,mb),uarray_i(j,ma,mb));
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
+
+      // Only need to add one symmetrized row for convenience
+      // Symmetry gets "unfolded" in accumulating ulisttot
+      if (j%2==1 && mb==(j/2)) {
+        const int mbpar = (mb)%2==0?1:-1;
+        int mapar = mbpar;
+        for (int ma = 0; ma <= j; ma++) {
+          const int jju_index = jju + mb*(j+1) + ma;
+          const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
+          if (mapar == 1) {
+            ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
+            ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
+          } else {
+            ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
+            ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
+          }
+          mapar = -mapar;
+        }
       }
     });
+
   }
 }
 
-
 /* ----------------------------------------------------------------------
    compute derivatives of Wigner U-functions for one neighbor
-   see comments in compute_uarray()
+   see comments in compute_uarray_cpu()
+   Uses same cached data layout of ulist
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team,
-                          double x, double y, double z,
-                          double z0, double r, double dz0dr,
-			  double wj, double rcut)
+void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+                          const double& x, const double& y, const double& z,
+                          const double& z0, const double& r, const double& dz0dr,
+                          const double& wj, const double& rcut,
+                          const double& sinner, const double& dinner)
 {
   double r0inv;
   double a_r, a_i, b_r, b_i;
@@ -748,7 +1743,7 @@ void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team,
   b_r = y * r0inv;
   b_i = -x * r0inv;
 
-  dr0invdr = -pow(r0inv, 3.0) * (r + z0 * dz0dr);
+  dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
 
   dr0inv[0] = dr0invdr * ux;
   dr0inv[1] = dr0invdr * uy;
@@ -773,208 +1768,315 @@ void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team,
   db_i[0] += -r0inv;
   db_r[1] += r0inv;
 
-  uarray_r(0,0,0) = 1.0;
-  duarray_r(0,0,0,0) = 0.0;
-  duarray_r(0,0,0,1) = 0.0;
-  duarray_r(0,0,0,2) = 0.0;
-  uarray_i(0,0,0) = 0.0;
-  duarray_i(0,0,0,0) = 0.0;
-  duarray_i(0,0,0,1) = 0.0;
-  duarray_i(0,0,0,2) = 0.0;
+  dulist(0,iatom,jnbor,0).re = 0.0;
+  dulist(0,iatom,jnbor,1).re = 0.0;
+  dulist(0,iatom,jnbor,2).re = 0.0;
+  dulist(0,iatom,jnbor,0).im = 0.0;
+  dulist(0,iatom,jnbor,1).im = 0.0;
+  dulist(0,iatom,jnbor,2).im = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_cache_block[j];
+    int jjup = idxu_cache_block[j-1];
     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
         [&] (const int& mb) {
-
     //for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r(j,0,mb) = 0.0;
-      duarray_r(j,mb,0,0) = 0.0;
-      duarray_r(j,mb,0,1) = 0.0;
-      duarray_r(j,mb,0,2) = 0.0;
-      uarray_i(j,0,mb) = 0.0;
-      duarray_i(j,mb,0,0) = 0.0;
-      duarray_i(j,mb,0,1) = 0.0;
-      duarray_i(j,mb,0,2) = 0.0;
+      const int jju_index = jju+mb+mb*j;
+      dulist(jju_index,iatom,jnbor,0).re = 0.0;
+      dulist(jju_index,iatom,jnbor,1).re = 0.0;
+      dulist(jju_index,iatom,jnbor,2).re = 0.0;
+      dulist(jju_index,iatom,jnbor,0).im = 0.0;
+      dulist(jju_index,iatom,jnbor,1).im = 0.0;
+      dulist(jju_index,iatom,jnbor,2).im = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
+        const int jju_index = jju+mb+mb*j+ma;
+        const int jjup_index = jjup+mb*j+ma;
         rootpq = rootpqarray(j - ma,j - mb);
-        uarray_r(j,ma,mb) += rootpq *
-                               (a_r *  uarray_r(j - 1,ma,mb) +
-                                a_i *  uarray_i(j - 1,ma,mb));
-        uarray_i(j,ma,mb) += rootpq *
-                               (a_r *  uarray_i(j - 1,ma,mb) -
-                                a_i *  uarray_r(j - 1,ma,mb));
-
         for (int k = 0; k < 3; k++) {
-          duarray_r(j,mb,ma,k) +=
-            rootpq * (da_r[k] * uarray_r(j - 1,ma,mb) +
-                      da_i[k] * uarray_i(j - 1,ma,mb) +
-                      a_r * duarray_r(j - 1,mb,ma,k) +
-                      a_i * duarray_i(j - 1,mb,ma,k));
-          duarray_i(j,mb,ma,k) +=
-            rootpq * (da_r[k] * uarray_i(j - 1,ma,mb) -
-                      da_i[k] * uarray_r(j - 1,ma,mb) +
-                      a_r * duarray_i(j - 1,mb,ma,k) -
-                      a_i * duarray_r(j - 1,mb,ma,k));
+          dulist(jju_index,iatom,jnbor,k).re +=
+            rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re +
+                      da_i[k] * ulist(jjup_index,iatom,jnbor).im +
+                      a_r * dulist(jjup_index,iatom,jnbor,k).re +
+                      a_i * dulist(jjup_index,iatom,jnbor,k).im);
+          dulist(jju_index,iatom,jnbor,k).im +=
+            rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im -
+                      da_i[k] * ulist(jjup_index,iatom,jnbor).re +
+                      a_r * dulist(jjup_index,iatom,jnbor,k).im -
+                      a_i * dulist(jjup_index,iatom,jnbor,k).re);
         }
 
-	rootpq = rootpqarray(ma + 1,j - mb);
-        uarray_r(j,ma + 1,mb) =
-          -rootpq * (b_r *  uarray_r(j - 1,ma,mb) +
-                     b_i *  uarray_i(j - 1,ma,mb));
-        uarray_i(j,ma + 1,mb) =
-          -rootpq * (b_r *  uarray_i(j - 1,ma,mb) -
-                     b_i *  uarray_r(j - 1,ma,mb));
-
+        rootpq = rootpqarray(ma + 1,j - mb);
         for (int k = 0; k < 3; k++) {
-          duarray_r(j,mb,ma + 1,k) =
-            -rootpq * (db_r[k] * uarray_r(j - 1,ma,mb) +
-                       db_i[k] * uarray_i(j - 1,ma,mb) +
-                       b_r * duarray_r(j - 1,mb,ma,k) +
-                       b_i * duarray_i(j - 1,mb,ma,k));
-          duarray_i(j,mb,ma + 1,k) =
-            -rootpq * (db_r[k] * uarray_i(j - 1,ma,mb) -
-                       db_i[k] * uarray_r(j - 1,ma,mb) +
-                       b_r * duarray_i(j - 1,mb,ma,k) -
-                       b_i * duarray_r(j - 1,mb,ma,k));
+          dulist(jju_index+1,iatom,jnbor,k).re =
+            -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re +
+                       db_i[k] * ulist(jjup_index,iatom,jnbor).im +
+                       b_r * dulist(jjup_index,iatom,jnbor,k).re +
+                       b_i * dulist(jjup_index,iatom,jnbor,k).im);
+          dulist(jju_index+1,iatom,jnbor,k).im =
+            -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im -
+                       db_i[k] * ulist(jjup_index,iatom,jnbor).re +
+                       b_r * dulist(jjup_index,iatom,jnbor,k).im -
+                       b_i * dulist(jjup_index,iatom,jnbor,k).re);
         }
       }
-    });
 
-    //int mbpar = -1;
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-//    for (int mb = 0; 2*mb <= j; mb++) {
-      int mbpar = (mb)%2==0?1:-1;
-      int mapar = -mbpar;
-      for (int ma = 0; ma <= j; ma++) {
-    	mapar = -mapar;
-    	if (mapar == 1) {
-    	  uarray_r(j,j-ma,j-mb) = uarray_r(j,ma,mb);
-    	  uarray_i(j,j-ma,j-mb) = -uarray_i(j,ma,mb);
-    	  for (int k = 0; k < 3; k++) {
-    	    duarray_r(j,j-mb,j-ma,k) = duarray_r(j,mb,ma,k);
-    	    duarray_i(j,j-mb,j-ma,k) = -duarray_i(j,mb,ma,k);
-    	  }
-    	} else {
-    	  uarray_r(j,j-ma,j-mb) = -uarray_r(j,ma,mb);
-    	  uarray_i(j,j-ma,j-mb) = uarray_i(j,ma,mb);
-    	  for (int k = 0; k < 3; k++) {
-    	    duarray_r(j,j-mb,j-ma,k) = -duarray_r(j,mb,ma,k);
-    	    duarray_i(j,j-mb,j-ma,k) = duarray_i(j,mb,ma,k);
-    	  }
-    	}
+      // Only need to add one symmetrized row for convenience
+      // Symmetry gets "unfolded" during the dedr accumulation
+
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+      if (j%2==1 && mb==(j/2)) {
+        const int mbpar = (mb)%2==0?1:-1;
+        int mapar = mbpar;
+        for (int ma = 0; ma <= j; ma++) {
+          const int jju_index = jju+mb*(j+1)+ma;
+          const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
+          if (mapar == 1) {
+            for (int k = 0; k < 3; k++) {
+              dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
+              dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
+            }
+          } else {
+            for (int k = 0; k < 3; k++) {
+              dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
+              dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
+            }
+          }
+          mapar = -mapar;
+        }
       }
     });
   }
 
-  double sfac = compute_sfac(r, rcut);
-  double dsfac = compute_dsfac(r, rcut);
+  double sfac = compute_sfac(r, rcut, sinner, dinner);
+  double dsfac = compute_dsfac(r, rcut, sinner, dinner);
 
   sfac *= wj;
   dsfac *= wj;
 
-  for (int j = 0; j <= twojmax; j++)
-    for (int mb = 0; mb <= j; mb++) 
+  // Even though we fill out a full "cached" data layout above,
+  // we only need the "half" data for the accumulation into dedr.
+  // Thus we skip updating any unnecessary data.
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_cache_block[j];
+    for (int mb = 0; 2*mb <= j; mb++)
       for (int ma = 0; ma <= j; ma++) {
-        duarray_r(j,mb,ma,0) = dsfac * uarray_r(j,ma,mb) * ux +
-                                  sfac * duarray_r(j,mb,ma,0);
-        duarray_i(j,mb,ma,0) = dsfac * uarray_i(j,ma,mb) * ux +
-                                  sfac * duarray_i(j,mb,ma,0);
-        duarray_r(j,mb,ma,1) = dsfac * uarray_r(j,ma,mb) * uy +
-                                  sfac * duarray_r(j,mb,ma,1);
-        duarray_i(j,mb,ma,1) = dsfac * uarray_i(j,ma,mb) * uy +
-                                  sfac * duarray_i(j,mb,ma,1);
-        duarray_r(j,mb,ma,2) = dsfac * uarray_r(j,ma,mb) * uz +
-                                  sfac * duarray_r(j,mb,ma,2);
-        duarray_i(j,mb,ma,2) = dsfac * uarray_i(j,ma,mb) * uz +
-                                  sfac * duarray_i(j,mb,ma,2);
+        dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
+                                  sfac * dulist(jju,iatom,jnbor,0).re;
+        dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux +
+                                  sfac * dulist(jju,iatom,jnbor,0).im;
+        dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy +
+                                  sfac * dulist(jju,iatom,jnbor,1).re;
+        dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy +
+                                  sfac * dulist(jju,iatom,jnbor,1).im;
+        dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz +
+                                  sfac * dulist(jju,iatom,jnbor,2).re;
+        dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz +
+                                  sfac * dulist(jju,iatom,jnbor,2).im;
+
+        jju++;
       }
+  }
 }
 
-/* ---------------------------------------------------------------------- */
-
-KOKKOS_INLINE_FUNCTION
-void SNA::create_team_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team)
-{
-  int jdim = twojmax + 1;
-  uarraytot_r_a = uarraytot_r = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim);
-  uarraytot_i_a = uarraytot_i = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim);
-  zarray_r = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim);
-  zarray_i = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim);
-
-  rij = t_sna_2d(team.team_scratch(1),nmax,3);
-  rcutij = t_sna_1d(team.team_scratch(1),nmax);
-  wj = t_sna_1d(team.team_scratch(1),nmax);
-  inside = t_sna_1i(team.team_scratch(1),nmax);
-}
-
+/* ----------------------------------------------------------------------
+   factorial n, wrapper for precomputed table
+------------------------------------------------------------------------- */
 
 inline
-T_INT SNA::size_team_scratch_arrays() {
-  T_INT size = 0;
-  int jdim = twojmax + 1;
-
-  size += t_sna_3d::shmem_size(jdim,jdim,jdim);
-  size += t_sna_3d::shmem_size(jdim,jdim,jdim);
-  size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim);
-  size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim);
-
-  size += t_sna_2d::shmem_size(nmax,3);
-  size += t_sna_1d::shmem_size(nmax);
-  size += t_sna_1d::shmem_size(nmax);
-  size += t_sna_1i::shmem_size(nmax);
-
-  return size;
-}
-
-/* ---------------------------------------------------------------------- */
-
-KOKKOS_INLINE_FUNCTION
-void SNA::create_thread_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team)
+double SNA::factorial(int n)
 {
-  int jdim = twojmax + 1;
-  dbvec = Kokkos::View<double*[3], Kokkos::LayoutRight>(team.thread_scratch(1),ncoeff);
-  dbarray = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim);
+  //if (n < 0 || n > nmaxfactorial) {
+  //  char str[128];
+  //  sprintf(str, "Invalid argument to factorial %d", n);
+  //  error->all(FLERR, str);
+  //}
 
-  uarray_r = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim);
-  uarray_i = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim);
-  duarray_r = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim);
-  duarray_i = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim);
-}
-
-inline
-T_INT SNA::size_thread_scratch_arrays() {
-  T_INT size = 0;
-  int jdim = twojmax + 1;
-  size += Kokkos::View<double*[3], Kokkos::LayoutRight>::shmem_size(ncoeff);
-  size += t_sna_4d::shmem_size(jdim,jdim,jdim);
-
-  size += t_sna_3d::shmem_size(jdim,jdim,jdim);
-  size += t_sna_3d::shmem_size(jdim,jdim,jdim);
-  size += t_sna_4d::shmem_size(jdim,jdim,jdim);
-  size += t_sna_4d::shmem_size(jdim,jdim,jdim);
-  return size;
+  return nfac_table[n];
 }
 
 /* ----------------------------------------------------------------------
-   factorial n
+   factorial n table, size SNA::nmaxfactorial+1
 ------------------------------------------------------------------------- */
 
-KOKKOS_INLINE_FUNCTION
-double SNA::factorial(int n)
-{
-  double result = 1.0;
-  for(int i=1; i<=n; i++)
-    result *= 1.0*i;
-  return result;
-}
+const double SNA::nfac_table[] = {
+  1,
+  1,
+  2,
+  6,
+  24,
+  120,
+  720,
+  5040,
+  40320,
+  362880,
+  3628800,
+  39916800,
+  479001600,
+  6227020800,
+  87178291200,
+  1307674368000,
+  20922789888000,
+  355687428096000,
+  6.402373705728e+15,
+  1.21645100408832e+17,
+  2.43290200817664e+18,
+  5.10909421717094e+19,
+  1.12400072777761e+21,
+  2.5852016738885e+22,
+  6.20448401733239e+23,
+  1.5511210043331e+25,
+  4.03291461126606e+26,
+  1.08888694504184e+28,
+  3.04888344611714e+29,
+  8.8417619937397e+30,
+  2.65252859812191e+32,
+  8.22283865417792e+33,
+  2.63130836933694e+35,
+  8.68331761881189e+36,
+  2.95232799039604e+38,
+  1.03331479663861e+40,
+  3.71993326789901e+41,
+  1.37637530912263e+43,
+  5.23022617466601e+44,
+  2.03978820811974e+46,
+  8.15915283247898e+47,
+  3.34525266131638e+49,
+  1.40500611775288e+51,
+  6.04152630633738e+52,
+  2.65827157478845e+54,
+  1.1962222086548e+56,
+  5.50262215981209e+57,
+  2.58623241511168e+59,
+  1.24139155925361e+61,
+  6.08281864034268e+62,
+  3.04140932017134e+64,
+  1.55111875328738e+66,
+  8.06581751709439e+67,
+  4.27488328406003e+69,
+  2.30843697339241e+71,
+  1.26964033536583e+73,
+  7.10998587804863e+74,
+  4.05269195048772e+76,
+  2.35056133128288e+78,
+  1.3868311854569e+80,
+  8.32098711274139e+81,
+  5.07580213877225e+83,
+  3.14699732603879e+85,
+  1.98260831540444e+87,
+  1.26886932185884e+89,
+  8.24765059208247e+90,
+  5.44344939077443e+92,
+  3.64711109181887e+94,
+  2.48003554243683e+96,
+  1.71122452428141e+98,
+  1.19785716699699e+100,
+  8.50478588567862e+101,
+  6.12344583768861e+103,
+  4.47011546151268e+105,
+  3.30788544151939e+107,
+  2.48091408113954e+109,
+  1.88549470166605e+111,
+  1.45183092028286e+113,
+  1.13242811782063e+115,
+  8.94618213078297e+116,
+  7.15694570462638e+118,
+  5.79712602074737e+120,
+  4.75364333701284e+122,
+  3.94552396972066e+124,
+  3.31424013456535e+126,
+  2.81710411438055e+128,
+  2.42270953836727e+130,
+  2.10775729837953e+132,
+  1.85482642257398e+134,
+  1.65079551609085e+136,
+  1.48571596448176e+138,
+  1.3520015276784e+140,
+  1.24384140546413e+142,
+  1.15677250708164e+144,
+  1.08736615665674e+146,
+  1.03299784882391e+148,
+  9.91677934870949e+149,
+  9.61927596824821e+151,
+  9.42689044888324e+153,
+  9.33262154439441e+155,
+  9.33262154439441e+157,
+  9.42594775983835e+159,
+  9.61446671503512e+161,
+  9.90290071648618e+163,
+  1.02990167451456e+166,
+  1.08139675824029e+168,
+  1.14628056373471e+170,
+  1.22652020319614e+172,
+  1.32464181945183e+174,
+  1.44385958320249e+176,
+  1.58824554152274e+178,
+  1.76295255109024e+180,
+  1.97450685722107e+182,
+  2.23119274865981e+184,
+  2.54355973347219e+186,
+  2.92509369349301e+188,
+  3.3931086844519e+190,
+  3.96993716080872e+192,
+  4.68452584975429e+194,
+  5.5745857612076e+196,
+  6.68950291344912e+198,
+  8.09429852527344e+200,
+  9.8750442008336e+202,
+  1.21463043670253e+205,
+  1.50614174151114e+207,
+  1.88267717688893e+209,
+  2.37217324288005e+211,
+  3.01266001845766e+213,
+  3.8562048236258e+215,
+  4.97450422247729e+217,
+  6.46685548922047e+219,
+  8.47158069087882e+221,
+  1.118248651196e+224,
+  1.48727070609069e+226,
+  1.99294274616152e+228,
+  2.69047270731805e+230,
+  3.65904288195255e+232,
+  5.01288874827499e+234,
+  6.91778647261949e+236,
+  9.61572319694109e+238,
+  1.34620124757175e+241,
+  1.89814375907617e+243,
+  2.69536413788816e+245,
+  3.85437071718007e+247,
+  5.5502938327393e+249,
+  8.04792605747199e+251,
+  1.17499720439091e+254,
+  1.72724589045464e+256,
+  2.55632391787286e+258,
+  3.80892263763057e+260,
+  5.71338395644585e+262,
+  8.62720977423323e+264,
+  1.31133588568345e+267,
+  2.00634390509568e+269,
+  3.08976961384735e+271,
+  4.78914290146339e+273,
+  7.47106292628289e+275,
+  1.17295687942641e+278,
+  1.85327186949373e+280,
+  2.94670227249504e+282,
+  4.71472363599206e+284,
+  7.59070505394721e+286,
+  1.22969421873945e+289,
+  2.0044015765453e+291,
+  3.28721858553429e+293,
+  5.42391066613159e+295,
+  9.00369170577843e+297,
+  1.503616514865e+300, // nmaxfactorial = 167
+};
 
 /* ----------------------------------------------------------------------
    the function delta given by VMK Eq. 8.2(1)
 ------------------------------------------------------------------------- */
 
-KOKKOS_INLINE_FUNCTION
+inline
 double SNA::deltacg(int j1, int j2, int j)
 {
   double sfaccg = factorial((j1 + j2 + j) / 2 + 1);
@@ -991,58 +2093,65 @@ double SNA::deltacg(int j1, int j2, int j)
 inline
 void SNA::init_clebsch_gordan()
 {
+  auto h_cglist = Kokkos::create_mirror_view(cglist);
+
   double sum,dcg,sfaccg;
   int m, aa2, bb2, cc2;
   int ifac;
-  auto h_cgarray = Kokkos::create_mirror_view(cgarray);
 
+  int idxcg_count = 0;
   for (int j1 = 0; j1 <= twojmax; j1++)
-    for (int j2 = 0; j2 <= twojmax; j2++)
-      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-        for (int m1 = 0; m1 <= j1; m1 += 1) {
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        for (int m1 = 0; m1 <= j1; m1++) {
           aa2 = 2 * m1 - j1;
 
-          for (int m2 = 0; m2 <= j2; m2 += 1) {
+          for (int m2 = 0; m2 <= j2; m2++) {
 
             // -c <= cc <= c
 
             bb2 = 2 * m2 - j2;
             m = (aa2 + bb2 + j) / 2;
 
-            if(m < 0 || m > j) continue;
-
-	    sum = 0.0;
-
-	    for (int z = MAX(0, MAX(-(j - j2 + aa2)
-				   / 2, -(j - j1 - bb2) / 2));
-		z <= MIN((j1 + j2 - j) / 2,
-			 MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
-		z++) {
-	      ifac = z % 2 ? -1 : 1;
-	      sum += ifac /
-		(factorial(z) *
-		 factorial((j1 + j2 - j) / 2 - z) *
-		 factorial((j1 - aa2) / 2 - z) *
-		 factorial((j2 + bb2) / 2 - z) *
-		 factorial((j - j2 + aa2) / 2 + z) *
-		 factorial((j - j1 - bb2) / 2 + z));
-	    }
-
-	    cc2 = 2 * m - j;
-	    dcg = deltacg(j1, j2, j);
-	    sfaccg = sqrt(factorial((j1 + aa2) / 2) *
-			factorial((j1 - aa2) / 2) *
-			factorial((j2 + bb2) / 2) *
-			factorial((j2 - bb2) / 2) *
-			factorial((j  + cc2) / 2) *
-			factorial((j  - cc2) / 2) *
-			(j + 1));
-
-	    h_cgarray(j1,j2,j,m1,m2) = sum * dcg * sfaccg;
-	    //printf("SNAP-COMPARE: CG: %i %i %i %i %i %e\n",j1,j2,j,m1,m2,cgarray(j1,j2,j,m1,m2));
-	  }
-	}
-  Kokkos::deep_copy(cgarray,h_cgarray);
+            if (m < 0 || m > j) {
+              h_cglist[idxcg_count] = 0.0;
+              idxcg_count++;
+              continue;
+            }
+
+            sum = 0.0;
+
+            for (int z = MAX(0, MAX(-(j - j2 + aa2)
+                                    / 2, -(j - j1 - bb2) / 2));
+                 z <= MIN((j1 + j2 - j) / 2,
+                          MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
+                 z++) {
+              ifac = z % 2 ? -1 : 1;
+              sum += ifac /
+                (factorial(z) *
+                 factorial((j1 + j2 - j) / 2 - z) *
+                 factorial((j1 - aa2) / 2 - z) *
+                 factorial((j2 + bb2) / 2 - z) *
+                 factorial((j - j2 + aa2) / 2 + z) *
+                 factorial((j - j1 - bb2) / 2 + z));
+            }
+
+            cc2 = 2 * m - j;
+            dcg = deltacg(j1, j2, j);
+            sfaccg = sqrt(factorial((j1 + aa2) / 2) *
+                          factorial((j1 - aa2) / 2) *
+                          factorial((j2 + bb2) / 2) *
+                          factorial((j2 - bb2) / 2) *
+                          factorial((j  + cc2) / 2) *
+                          factorial((j  - cc2) / 2) *
+                          (j + 1));
+
+            h_cglist[idxcg_count] = sum * dcg * sfaccg;
+            idxcg_count++;
+          }
+        }
+      }
+  Kokkos::deep_copy(cglist,h_cglist);
 }
 
 /* ----------------------------------------------------------------------
@@ -1056,12 +2165,13 @@ void SNA::init_rootpqarray()
   auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray);
   for (int p = 1; p <= twojmax; p++)
     for (int q = 1; q <= twojmax; q++)
-      h_rootpqarray(p,q) = sqrt(static_cast<double>(p)/q);
+      h_rootpqarray(p,q) = static_cast<double>(sqrt(static_cast<double>(p)/q));
   Kokkos::deep_copy(rootpqarray,h_rootpqarray);
 }
 
 
 /* ---------------------------------------------------------------------- */
+
 inline
 int SNA::compute_ncoeff()
 {
@@ -1070,25 +2180,14 @@ int SNA::compute_ncoeff()
   ncount = 0;
 
   for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
-      for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2;
+           j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) ncount++;
+
+  ndoubles = nelements*nelements;
+  ntriples = nelements*nelements*nelements;
+  if (chem_flag) ncount *= ntriples;
 
   return ncount;
 }
@@ -1096,34 +2195,118 @@ int SNA::compute_ncoeff()
 /* ---------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-double SNA::compute_sfac(double r, double rcut)
+double SNA::compute_sfac(double r, double rcut, double sinner, double  dinner)
 {
-  if (switch_flag == 0) return 1.0;
+  double sfac_outer;
+  constexpr double one = static_cast<double>(1.0);
+  constexpr double zero = static_cast<double>(0.0);
+  constexpr double onehalf = static_cast<double>(0.5);
+  if (switch_flag == 0) sfac_outer = one;
   if (switch_flag == 1) {
-    if(r <= rmin0) return 1.0;
-    else if(r > rcut) return 0.0;
+    if (r <= rmin0) sfac_outer = one;
+    else if (r > rcut) return zero;
     else {
-      double rcutfac = MY_PI / (rcut - rmin0);
-      return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
+      double rcutfac = static_cast<double>(MY_PI) / (rcut - rmin0);
+      sfac_outer = onehalf * (cos((r - rmin0) * rcutfac) + one);
     }
   }
-  return 0.0;
+
+  if (switch_inner_flag == 0) return sfac_outer;
+  if (switch_inner_flag == 1) {
+    if (r >= sinner + dinner)
+        return sfac_outer;
+    else if (r > sinner - dinner) {
+      double rcutfac = static_cast<double>(MY_PI2) / dinner;
+      return sfac_outer *
+        onehalf * (one - cos(static_cast<double>(MY_PI2) + (r - sinner) * rcutfac));
+    } else return zero;
+  }
+  return zero; // dummy return
 }
 
 /* ---------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-double SNA::compute_dsfac(double r, double rcut)
+double SNA::compute_dsfac(double r, double rcut, double sinner, double dinner)
 {
-  if (switch_flag == 0) return 0.0;
+  double sfac_outer, dsfac_outer, sfac_inner, dsfac_inner;
+  constexpr double one = static_cast<double>(1.0);
+  constexpr double zero = static_cast<double>(0.0);
+  constexpr double onehalf = static_cast<double>(0.5);
+  if (switch_flag == 0) dsfac_outer = zero;
   if (switch_flag == 1) {
-    if(r <= rmin0) return 0.0;
-    else if(r > rcut) return 0.0;
+    if (r <= rmin0) dsfac_outer = zero;
+    else if (r > rcut) return zero;
     else {
-      double rcutfac = MY_PI / (rcut - rmin0);
-      return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac;
+      double rcutfac = static_cast<double>(MY_PI) / (rcut - rmin0);
+      dsfac_outer = -onehalf * sin((r - rmin0) * rcutfac) * rcutfac;
     }
   }
-  return 0.0;
+
+  if (switch_inner_flag == 0) return dsfac_outer;
+  if (switch_inner_flag == 1) {
+    if (r >= sinner + dinner)
+      return dsfac_outer;
+    else if (r > sinner - dinner) {
+
+      // calculate sfac_outer
+
+      if (switch_flag == 0) sfac_outer = one;
+      if (switch_flag == 1) {
+        if (r <= rmin0) sfac_outer = one;
+        else if (r > rcut) sfac_outer = zero;
+        else {
+          double rcutfac = static_cast<double>(MY_PI) / (rcut - rmin0);
+          sfac_outer = onehalf * (cos((r - rmin0) * rcutfac) + one);
+        }
+      }
+
+      // calculate sfac_inner
+
+      double rcutfac = static_cast<double>(MY_PI2) / dinner;
+      sfac_inner = onehalf * (one - cos(static_cast<double>(MY_PI2) + (r - sinner) * rcutfac));
+      dsfac_inner = onehalf * rcutfac * sin(static_cast<double>(MY_PI2) + (r - sinner) * rcutfac);
+      return dsfac_outer * sfac_inner + sfac_outer * dsfac_inner;
+
+    } else return zero;
+  }
+  return zero; // dummy return
+}
+
+KOKKOS_INLINE_FUNCTION
+void SNA::compute_s_dsfac(const double r, const double rcut, const double sinner, const double dinner, double& sfac, double& dsfac) {
+
+  double sfac_outer, dsfac_outer, sfac_inner, dsfac_inner;
+  constexpr double one = static_cast<double>(1.0);
+  constexpr double zero = static_cast<double>(0.0);
+  constexpr double onehalf = static_cast<double>(0.5);
+
+  if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
+  else if (switch_flag == 1) {
+    if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
+    else if (r > rcut) { sfac = zero; dsfac = zero; return; }
+    else {
+      const double rcutfac = static_cast<double>(MY_PI) / (rcut - rmin0);
+      const double theta0 = (r - rmin0) * rcutfac;
+      const double sn = sin(theta0);
+      const double cs = cos(theta0);
+      sfac_outer = onehalf * (cs + one);
+      dsfac_outer = -onehalf * sn * rcutfac;
+    }
+  } else { sfac = zero; dsfac = zero; return; } // dummy return
+
+  if (switch_inner_flag == 0) { sfac = sfac_outer; dsfac = dsfac_outer; return; }
+  else if (switch_inner_flag == 1) {
+    if (r >= sinner + dinner) { sfac = sfac_outer; dsfac = dsfac_outer; return; }
+    else if (r > sinner - dinner) {
+      double rcutfac = static_cast<double>(MY_PI2) / dinner;
+      sfac_inner = onehalf * (one - cos(static_cast<double>(MY_PI2) + (r - sinner) * rcutfac));
+      dsfac_inner = onehalf * rcutfac * sin(static_cast<double>(MY_PI2) + (r - sinner) * rcutfac);
+      sfac = sfac_outer * sfac_inner;
+      dsfac = dsfac_outer * sfac_inner + sfac_outer * dsfac_inner;
+      return;
+    } else { sfac = zero; dsfac = zero; return; }
+  } else { sfac = zero; dsfac = zero; return; } // dummy return
+
 }
 

From b7cd1a43d1a08e36d9aa9d9aab6e02d967cf800e Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 13 Jul 2022 14:31:36 -0600
Subject: [PATCH 04/11] WIP

---
 src/force_types/sna.h        |   3 +-
 src/force_types/sna_impl.hpp | 117 +++++++++++++++++------------------
 src/types.h                  |  18 ++++++
 3 files changed, 77 insertions(+), 61 deletions(-)

diff --git a/src/force_types/sna.h b/src/force_types/sna.h
index f174ba6..f1e6c0b 100644
--- a/src/force_types/sna.h
+++ b/src/force_types/sna.h
@@ -95,11 +95,10 @@ struct alignas(8) FullHalfMapper {
   int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj
 };
 
-template<class DeviceType, typename double_, int vector_length_>
+template<class DeviceType, typename double, int vector_length_>
 class SNA {
 
  public:
-  using double = double_;
   using complex = SNAComplex<double>;
   static constexpr int vector_length = vector_length_;
 
diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp
index 3bbcc4b..01a6140 100644
--- a/src/force_types/sna_impl.hpp
+++ b/src/force_types/sna_impl.hpp
@@ -17,7 +17,6 @@
 ------------------------------------------------------------------------- */
 
 #include "sna.h"
-#include "memory_kokkos.h"
 #include <cmath>
 #include <cstring>
 #include <cstdlib>
@@ -60,12 +59,12 @@ SNA::SNA(double rfac0_in,
   build_indexlist();
 
   int jdimpq = twojmax + 2;
-  MemKK::realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq);
+  realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq);
 
-  MemKK::realloc_kokkos(cglist,"SNA::cglist",idxcg_max);
+  realloc_kokkos(cglist,"SNA::cglist",idxcg_max);
 
   if (bzero_flag) {
-    MemKK::realloc_kokkos(bzero,"sna:bzero",twojmax+1);
+    realloc_kokkos(bzero,"sna:bzero",twojmax+1);
     auto h_bzero = Kokkos::create_mirror_view(bzero);
 
     double www = wself*wself*wself;
@@ -91,7 +90,7 @@ void SNA::build_indexlist()
   // index list for cglist
 
   int jdim = twojmax + 1;
-  MemKK::realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim);
+  realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim);
   auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block);
 
   int idxcg_count = 0;
@@ -109,7 +108,7 @@ void SNA::build_indexlist()
   // index list for uarray
   // need to include both halves
 
-  MemKK::realloc_kokkos(idxu_block,"SNA::idxu_block",jdim);
+  realloc_kokkos(idxu_block,"SNA::idxu_block",jdim);
   auto h_idxu_block = Kokkos::create_mirror_view(idxu_block);
 
   int idxu_count = 0;
@@ -124,7 +123,7 @@ void SNA::build_indexlist()
   Kokkos::deep_copy(idxu_block,h_idxu_block);
 
   // index list for half uarray
-  MemKK::realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim);
+  realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim);
   auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
 
   int idxu_half_count = 0;
@@ -138,7 +137,7 @@ void SNA::build_indexlist()
   Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
 
   // mapping between full and half indexing, encoding flipping
-  MemKK::realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max);
+  realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max);
   auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half);
 
   idxu_count = 0;
@@ -165,7 +164,7 @@ void SNA::build_indexlist()
   // index list for "cache" uarray
   // this is the GPU scratch memory requirements
   // applied the CPU structures
-  MemKK::realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim);
+  realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim);
   auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
 
   int idxu_cache_count = 0;
@@ -187,7 +186,7 @@ void SNA::build_indexlist()
         if (j >= j1) idxb_count++;
 
   idxb_max = idxb_count;
-  MemKK::realloc_kokkos(idxb,"SNA::idxb",idxb_max);
+  realloc_kokkos(idxb,"SNA::idxb",idxb_max);
   auto h_idxb = Kokkos::create_mirror_view(idxb);
 
   idxb_count = 0;
@@ -204,7 +203,7 @@ void SNA::build_indexlist()
 
   // reverse index list for beta and b
 
-  MemKK::realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim);
+  realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim);
   auto h_idxb_block = Kokkos::create_mirror_view(idxb_block);
 
   idxb_count = 0;
@@ -230,10 +229,10 @@ void SNA::build_indexlist()
             idxz_count++;
 
   idxz_max = idxz_count;
-  MemKK::realloc_kokkos(idxz,"SNA::idxz",idxz_max);
+  realloc_kokkos(idxz,"SNA::idxz",idxz_max);
   auto h_idxz = Kokkos::create_mirror_view(idxz);
 
-  MemKK::realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim);
+  realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim);
   auto h_idxz_block = Kokkos::create_mirror_view(idxz_block);
 
   idxz_count = 0;
@@ -288,59 +287,59 @@ void SNA::grow_rij(int newnatom, int newnmax)
   natom = newnatom;
   nmax = newnmax;
 
-  MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3);
-  MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax);
-  MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax);
-  MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax);
-  MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax);
-  MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax);
-  MemKK::realloc_kokkos(element,"sna:element",natom,nmax);
-  MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3);
+  realloc_kokkos(rij,"sna:rij",natom,nmax,3);
+  realloc_kokkos(wj,"sna:wj",natom,nmax);
+  realloc_kokkos(rcutij,"sna:rcutij",natom,nmax);
+  realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax);
+  realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax);
+  realloc_kokkos(inside,"sna:inside",natom,nmax);
+  realloc_kokkos(element,"sna:element",natom,nmax);
+  realloc_kokkos(dedr,"sna:dedr",natom,nmax,3);
 
 #ifdef LMP_KOKKOS_GPU
   if (!host_flag) {
     const int natom_div = (natom + vector_length - 1) / vector_length;
 
-    MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div);
-    MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div);
-    MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3);
-    MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3);
-    MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4);
-    MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation
-    MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1);
-    MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div);
-    MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div);
-    MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div);
-    MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1);
-    MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1);
-    MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div);
-    MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
-    MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div);
-    MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1);
-    MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div);
-    MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div);
-    MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1);
+    realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div);
+    realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div);
+    realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3);
+    realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3);
+    realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4);
+    realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation
+    realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1);
+    realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div);
+    realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div);
+    realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div);
+    realloc_kokkos(ulist,"sna:ulist",1,1,1);
+    realloc_kokkos(zlist,"sna:zlist",1,1,1);
+    realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div);
+    realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
+    realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div);
+    realloc_kokkos(ylist,"sna:ylist",1,1,1);
+    realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div);
+    realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div);
+    realloc_kokkos(dulist,"sna:dulist",1,1,1);
   } else {
 #endif
-    MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1);
-    MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1);
-    MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1);
-    MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1);
-    MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1);
-    MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom);
-    MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom);
-    MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1);
-    MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1);
-    MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1);
-    MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax);
-    MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom);
-    MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1);
-    MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
-    MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1);
-    MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom);
-    MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1);
-    MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1);
-    MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax);
+    realloc_kokkos(a_pack,"sna:a_pack",1,1,1);
+    realloc_kokkos(b_pack,"sna:b_pack",1,1,1);
+    realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1);
+    realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1);
+    realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1);
+    realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom);
+    realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom);
+    realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1);
+    realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1);
+    realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1);
+    realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax);
+    realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom);
+    realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1);
+    realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max);
+    realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1);
+    realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom);
+    realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1);
+    realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1);
+    realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax);
 
 #ifdef LMP_KOKKOS_GPU
   }
diff --git a/src/types.h b/src/types.h
index c72e631..771c3d5 100644
--- a/src/types.h
+++ b/src/types.h
@@ -114,6 +114,24 @@ typedef Kokkos::View<const T_V_FLOAT*>    t_mass_const; // Mass
 
 typedef Kokkos::DefaultExecutionSpace::memory_space t_neigh_mem_space;
 
+namespace Kokkos {
+  static auto NoInit = [](std::string const& label) {
+    return Kokkos::view_alloc(Kokkos::WithoutInitializing, label);
+  };
+}
+
+/* ----------------------------------------------------------------------
+   reallocate Kokkos views without initialization
+   deallocate first to reduce memory use
+------------------------------------------------------------------------- */
+
+template <typename TYPE, typename... Indices>
+static void realloc_kokkos(TYPE &data, const char *name, Indices... ns)
+{
+  data = TYPE();
+  data = TYPE(Kokkos::NoInit(std::string(name)), ns...);
+}
+
 template<class Scalar>
 struct t_scalar3 {
   Scalar x,y,z;

From fbd684ae39dfb6fd3c0e5c00da5a722da61b09de Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 13 Jul 2022 15:19:02 -0600
Subject: [PATCH 05/11] WIP

---
 src/force_types/force_snap_neigh_impl.h |  30 +++---
 src/force_types/sna.h                   | 124 ++++++++++++------------
 src/force_types/sna_impl.hpp            |  53 +++++-----
 src/types.h                             |  78 +++++++++++++++
 4 files changed, 179 insertions(+), 106 deletions(-)

diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h
index de6ba6e..bcc8c7b 100644
--- a/src/force_types/force_snap_neigh_impl.h
+++ b/src/force_types/force_snap_neigh_impl.h
@@ -267,9 +267,9 @@ void ForceSNAP<NeighborClass>::init_coeff(int narg, char **arg)
     for (int i = 0; i < nelements; i++)
       delete[] elements[i];
     delete[] elements;
-    radelem = Kokkos::View<T_F_FLOAT*>();
-    wjelem = Kokkos::View<T_F_FLOAT*>();
-    coeffelem = Kokkos::View<T_F_FLOAT**, Kokkos::LayoutRight>();
+    radelem = Kokkos::View<double*>();
+    wjelem = Kokkos::View<double*>();
+    coeffelem = Kokkos::View<double**, Kokkos::LayoutRight>();
   }
 
   nelements = narg - 5 - system->ntypes;
@@ -422,9 +422,9 @@ void ForceSNAP<NeighborClass>::read_files(char *coefffilename, char *paramfilena
 
   // Set up element lists
 
-  radelem = Kokkos::View<T_F_FLOAT*>("pair:radelem",nelements);
-  wjelem = Kokkos::View<T_F_FLOAT*>("pair:wjelem",nelements);
-  coeffelem = Kokkos::View<T_F_FLOAT**, Kokkos::LayoutRight>("pair:coeffelem",nelements,ncoeffall);
+  radelem = Kokkos::View<double*>("pair:radelem",nelements);
+  wjelem = Kokkos::View<double*>("pair:wjelem",nelements);
+  coeffelem = Kokkos::View<double**, Kokkos::LayoutRight>("pair:coeffelem",nelements,ncoeffall);
 
   int *found = new int[nelements];
   for (int ielem = 0; ielem < nelements; ielem++)
@@ -811,12 +811,12 @@ void ForceSNAP<NeighborClass>::operator() (const Kokkos::TeamPolicy<>::member_ty
       [&] (const int jj, int& count) {
     Kokkos::single(Kokkos::PerThread(team), [&] (){
       T_INT j = neighs_i(jj);
-      const T_F_FLOAT dx = x(j,0) - x_i;
-      const T_F_FLOAT dy = x(j,1) - y_i;
-      const T_F_FLOAT dz = x(j,2) - z_i;
+      const double dx = x(j,0) - x_i;
+      const double dy = x(j,1) - y_i;
+      const double dz = x(j,2) - z_i;
 
       const int type_j = type(j);
-      const T_F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+      const double rsq = dx*dx + dy*dy + dz*dz;
       const int elem_j = map[type_j];
 
       if( rsq < rnd_cutsq(type_i,type_j) )
@@ -831,12 +831,12 @@ void ForceSNAP<NeighborClass>::operator() (const Kokkos::TeamPolicy<>::member_ty
       [&] (const int jj, int& offset, bool final){
   //for (int jj = 0; jj < num_neighs; jj++) {
     T_INT j = neighs_i(jj);
-    const T_F_FLOAT dx = x(j,0) - x_i;
-    const T_F_FLOAT dy = x(j,1) - y_i;
-    const T_F_FLOAT dz = x(j,2) - z_i;
+    const double dx = x(j,0) - x_i;
+    const double dy = x(j,1) - y_i;
+    const double dz = x(j,2) - z_i;
 
     const int type_j = type(j);
-    const T_F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const double rsq = dx*dx + dy*dy + dz*dz;
     const int elem_j = map[type_j];
 
     if( rsq < rnd_cutsq(type_i,type_j) ) {
@@ -884,7 +884,7 @@ void ForceSNAP<NeighborClass>::operator() (const Kokkos::TeamPolicy<>::member_ty
 
 
     Kokkos::single(Kokkos::PerThread(team), [&] (){
-    T_F_FLOAT fij[3];
+    double fij[3];
 
     fij[0] = 0.0;
     fij[1] = 0.0;
diff --git a/src/force_types/sna.h b/src/force_types/sna.h
index f1e6c0b..f2f99e6 100644
--- a/src/force_types/sna.h
+++ b/src/force_types/sna.h
@@ -66,9 +66,8 @@
 #endif
 
 struct WignerWrapper {
-  using double = double;
   using complex = SNAComplex<double>;
-  static constexpr int vector_length = vector_length_;
+  static constexpr int vector_length = 32;
 
   const int offset; // my offset into the vector (0, ..., vector_length - 1)
   double* buffer; // buffer of real numbers
@@ -95,46 +94,43 @@ struct alignas(8) FullHalfMapper {
   int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj
 };
 
-template<class DeviceType, typename double, int vector_length_>
 class SNA {
 
  public:
   using complex = SNAComplex<double>;
-  static constexpr int vector_length = vector_length_;
-
-  using KKDeviceType = typename KKDevice<DeviceType>::value;
-
-  typedef Kokkos::View<int*, DeviceType> t_sna_1i;
-  typedef Kokkos::View<double*, DeviceType> t_sna_1d;
-  typedef Kokkos::View<double*, KKDeviceType, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1d_atomic;
-  typedef Kokkos::View<int**, DeviceType> t_sna_2i;
-  typedef Kokkos::View<double**, DeviceType> t_sna_2d;
-  typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
-  typedef Kokkos::View<double***, DeviceType> t_sna_3d;
-  typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
-  typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
-  typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
-  typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
-  typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
-
-  typedef Kokkos::View<complex*, DeviceType> t_sna_1c;
-  typedef Kokkos::View<complex*, KKDeviceType, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1c_atomic;
-  typedef Kokkos::View<complex**, DeviceType> t_sna_2c;
-  typedef Kokkos::View<complex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
-  typedef Kokkos::View<complex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
-  typedef Kokkos::View<complex***, DeviceType> t_sna_3c;
-  typedef Kokkos::View<complex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
-  typedef Kokkos::View<complex***[3], DeviceType> t_sna_4c;
-  typedef Kokkos::View<complex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
-  typedef Kokkos::View<complex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
-  typedef Kokkos::View<complex**[3], DeviceType> t_sna_3c3;
-  typedef Kokkos::View<complex*****, DeviceType> t_sna_5c;
+  static constexpr int vector_length = 32;
+
+  typedef Kokkos::View<int*> t_sna_1i;
+  typedef Kokkos::View<double*> t_sna_1d;
+  typedef Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1d_atomic;
+  typedef Kokkos::View<int**> t_sna_2i;
+  typedef Kokkos::View<double**> t_sna_2d;
+  typedef Kokkos::View<double**, Kokkos::LayoutLeft> t_sna_2d_ll;
+  typedef Kokkos::View<double***> t_sna_3d;
+  typedef Kokkos::View<double***, Kokkos::LayoutLeft> t_sna_3d_ll;
+  typedef Kokkos::View<double***[3]> t_sna_4d;
+  typedef Kokkos::View<double****, Kokkos::LayoutLeft> t_sna_4d_ll;
+  typedef Kokkos::View<double**[3]> t_sna_3d3;
+  typedef Kokkos::View<double*****> t_sna_5d;
+
+  typedef Kokkos::View<complex*> t_sna_1c;
+  typedef Kokkos::View<complex*, Kokkos::MemoryTraits<Kokkos::Atomic>> t_sna_1c_atomic;
+  typedef Kokkos::View<complex**> t_sna_2c;
+  typedef Kokkos::View<complex**, Kokkos::LayoutLeft> t_sna_2c_ll;
+  typedef Kokkos::View<complex**, Kokkos::LayoutRight> t_sna_2c_lr;
+  typedef Kokkos::View<complex***> t_sna_3c;
+  typedef Kokkos::View<complex***, Kokkos::LayoutLeft> t_sna_3c_ll;
+  typedef Kokkos::View<complex***[3]> t_sna_4c;
+  typedef Kokkos::View<complex***[3], Kokkos::LayoutLeft> t_sna_4c3_ll;
+  typedef Kokkos::View<complex****, Kokkos::LayoutLeft> t_sna_4c_ll;
+  typedef Kokkos::View<complex**[3]> t_sna_3c3;
+  typedef Kokkos::View<complex*****> t_sna_5c;
 
   inline
   SNA() {};
 
   KOKKOS_INLINE_FUNCTION
-  SNA(const SNA<DeviceType,double,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
+  SNA(const SNA& sna, const typename Kokkos::TeamPolicy<>::member_type& team);
 
   inline
   SNA(double, int, double, int, int, int, int, int, int, int);
@@ -161,19 +157,19 @@ class SNA {
 
   // version of the code with parallelism over j_bend
   KOKKOS_INLINE_FUNCTION
-  void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
+  void compute_ui_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int, const int); // ForceSNAP
   // version of the code without parallelism over j_bend
   KOKKOS_INLINE_FUNCTION
-  void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
+  void compute_ui_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int); // ForceSNAP
 
   KOKKOS_INLINE_FUNCTION
   void compute_zi(const int&, const int&, const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_yi(int,int,int,
-   const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+   const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_yi_with_zlist(int,int,int,
-   const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+   const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_bi(const int&, const int&, const int&);    // ForceSNAP
 
@@ -181,16 +177,16 @@ class SNA {
   // version of the code with parallelism over j_bend
   template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
+  void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int, const int); //ForceSNAP
   // version of the code without parallelism over j_bend
   template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
+  void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int); //ForceSNAP
 
   // core "evaluation" functions that get plugged into "compute" functions
   // plugged into compute_ui_small, compute_ui_large
   KOKKOS_FORCEINLINE_FUNCTION
-  void evaluate_ui_jbend(const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&, const int&,
+  void evaluate_ui_jbend(const WignerWrapper, const complex&, const complex&, const double&, const int&,
                         const int&, const int&, const int&);
   // plugged into compute_zi, compute_yi
   KOKKOS_FORCEINLINE_FUNCTION
@@ -199,31 +195,31 @@ class SNA {
   // plugged into compute_yi, compute_yi_with_zlist
   KOKKOS_FORCEINLINE_FUNCTION
   double evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
-                        const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &);
+                        const Kokkos::View<double***, Kokkos::LayoutLeft> &);
   // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
   KOKKOS_FORCEINLINE_FUNCTION
-  double evaluate_duidrj_jbend(const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&,
-                        const WignerWrapper<double, vector_length>&, const complex&, const complex&, const double&,
+  double evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const double&,
+                        const WignerWrapper&, const complex&, const complex&, const double&,
                         const int&, const int&, const int&, const int&);
 
   // functions for bispectrum coefficients, CPU only
   KOKKOS_INLINE_FUNCTION
-  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
+  void pre_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team,const int&,const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+  void compute_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_zi_cpu(const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_yi_cpu(int,
-   const Kokkos::View<double**, DeviceType> &beta); // ForceSNAP
+   const Kokkos::View<double**> &beta); // ForceSNAP
     KOKKOS_INLINE_FUNCTION
-  void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
+  void compute_bi_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int);    // ForceSNAP
 
   // functions for derivatives, CPU only
   KOKKOS_INLINE_FUNCTION
-  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
+  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); //ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); // ForceSNAP
 
   KOKKOS_INLINE_FUNCTION
   double compute_sfac(double, double, double, double); // add_uarraytot, compute_duarray
@@ -296,19 +292,19 @@ class SNA {
 
   //use indexlist instead of loops, constructor generates these
   // Same across all SNA
-  Kokkos::View<int*[10], DeviceType> idxz;
-  Kokkos::View<int*[3], DeviceType> idxb;
-  Kokkos::View<int***, DeviceType> idxcg_block;
+  Kokkos::View<int*[10]> idxz;
+  Kokkos::View<int*[3]> idxb;
+  Kokkos::View<int***> idxcg_block;
 
  public:
-  Kokkos::View<int*, DeviceType> idxu_block;
-  Kokkos::View<int*, DeviceType> idxu_half_block;
-  Kokkos::View<int*, DeviceType> idxu_cache_block;
-  Kokkos::View<FullHalfMapper*, DeviceType> idxu_full_half;
+  Kokkos::View<int*> idxu_block;
+  Kokkos::View<int*> idxu_half_block;
+  Kokkos::View<int*> idxu_cache_block;
+  Kokkos::View<FullHalfMapper*> idxu_full_half;
 
  private:
-  Kokkos::View<int***, DeviceType> idxz_block;
-  Kokkos::View<int***, DeviceType> idxb_block;
+  Kokkos::View<int***> idxz_block;
+  Kokkos::View<int***> idxb_block;
 
   // data for bispectrum coefficients
 
@@ -322,9 +318,9 @@ class SNA {
   double factorial(int);
 
   KOKKOS_INLINE_FUNCTION
-  void create_team_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNA()
+  void create_team_scratch_arrays(const typename Kokkos::TeamPolicy<>::member_type& team); // SNA()
   KOKKOS_INLINE_FUNCTION
-  void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // SNA()
+  void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<>::member_type& team); // SNA()
 
   inline
   void init_clebsch_gordan(); // init()
@@ -333,10 +329,10 @@ class SNA {
   void init_rootpqarray();    // init()
 
   KOKKOS_INLINE_FUNCTION
-  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui
+  void add_uarraytot(const typename Kokkos::TeamPolicy<>::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui
 
   KOKKOS_INLINE_FUNCTION
-  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
+  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int,
                       const double&, const double&, const double&,
                       const double&, const double&); // compute_ui_cpu
 
@@ -347,7 +343,7 @@ class SNA {
   inline
   int compute_ncoeff();           // SNA()
   KOKKOS_INLINE_FUNCTION
-  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
+  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int,
                        const double&, const double&, const double&, // compute_duidrj_cpu
                            const double&, const double&, const double&, const double&, const double&,
                            const double&, const double&);
@@ -371,7 +367,7 @@ class SNA {
   int wselfall_flag;
 
   int bzero_flag; // 1 if bzero subtracted from barray
-  Kokkos::View<double*, DeviceType> bzero; // array of B values for isolated atoms
+  Kokkos::View<double*> bzero; // array of B values for isolated atoms
 };
 
 #include<sna_impl.hpp>
diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp
index 01a6140..fda5a36 100644
--- a/src/force_types/sna_impl.hpp
+++ b/src/force_types/sna_impl.hpp
@@ -30,8 +30,7 @@ SNA::SNA(double rfac0_in,
          int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in,
          int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in)
 {
-  LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  host_flag = (execution_space == LAMMPS_NS::Host);
+  host_flag = 0;
 
   wself = static_cast<double>(1.0);
 
@@ -479,7 +478,7 @@ void SNA::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int
 // Version of the code that exposes additional parallelism by threading over `j_bend` values
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
+void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
 {
 
   // get shared memory offset
@@ -490,7 +489,7 @@ void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member
   const int scratch_shift = team_rank * tile_size;
 
   // extract and wrap
-  const WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
   // load parameters
   const complex a = a_pack(iatom_mod, jnbor, iatom_div);
@@ -509,7 +508,7 @@ void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member
 // Version of the code that loops over all `j_bend` values which reduces integer arithmetic
 // and some amount of load imbalance, at the expense of reducing parallelism
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
+void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
 {
   // get shared memory offset
   // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
@@ -519,7 +518,7 @@ void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member
   const int scratch_shift = team_rank * tile_size;
 
   // extract and wrap
-  const WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
   // load parameters
   const complex a = a_pack(iatom_mod, jnbor, iatom_div);
@@ -539,7 +538,7 @@ void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member
 
 // Core "evaluation" kernel that gets reused in `compute_ui_small` and `compute_ui_large`
 KOKKOS_FORCEINLINE_FUNCTION
-void SNA::evaluate_ui_jbend(const WignerWrapper<double, vector_length>& ulist_wrapper,
+void SNA::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper,
           const complex& a, const complex& b, const double& sfac, const int& jelem,
           const int& iatom_mod, const int& j_bend, const int& iatom_div)
 {
@@ -773,7 +772,7 @@ void SNA::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div)
 
 KOKKOS_INLINE_FUNCTION
 void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div,
- const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
+ const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack)
 {
 
   const int j1 = idxz(jjz, 0);
@@ -821,7 +820,7 @@ void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div,
 
 KOKKOS_INLINE_FUNCTION
 void SNA::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
- const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
+ const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack)
 {
   const int j1 = idxz(jjz, 0);
   const int j2 = idxz(jjz, 1);
@@ -904,7 +903,7 @@ typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int&
 KOKKOS_FORCEINLINE_FUNCTION
 typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j,
           const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div,
-          const Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> &beta_pack) {
+          const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack) {
 
   double betaj = 0;
 
@@ -943,7 +942,7 @@ typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, con
 // Version of the code that exposes additional parallelism by threading over `j_bend` values
 template<int dir>
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
+void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
 {
   // get shared memory offset
   // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
@@ -953,8 +952,8 @@ void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceTyp
   const int scratch_shift = team_rank * tile_size;
 
   // extract, wrap shared memory buffer
-  WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
-  WignerWrapper<double, vector_length> dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
   // load parameters
   const complex a = a_pack(iatom_mod, jnbor, iatom_div);
@@ -979,7 +978,7 @@ void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceTyp
 // and some amount of load imbalance, at the expense of reducing parallelism
 template<int dir>
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
+void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div)
 {
   // get shared memory offset
   // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
@@ -989,8 +988,8 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceTyp
   const int scratch_shift = team_rank * tile_size;
 
   // extract, wrap shared memory buffer
-  WignerWrapper<double, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
-  WignerWrapper<double, vector_length> dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
   // load parameters
   const complex a = a_pack(iatom_mod, jnbor, iatom_div);
@@ -1020,8 +1019,8 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceTyp
 // Core "evaluation" kernel that gets reused in `compute_fused_deidrj_small` and
 // `compute_fused_deidrj_large`
 KOKKOS_FORCEINLINE_FUNCTION
-typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper<double, vector_length>& ulist_wrapper, const complex& a, const complex& b, const double& sfac,
-                      const WignerWrapper<double, vector_length>& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu,
+typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac,
+                      const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu,
                       const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) {
 
   double dedr_full_sum = static_cast<double>(0);
@@ -1167,7 +1166,7 @@ typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper<double, vect
  * ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
+void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, const int& iatom, const int& ielem)
 {
   for (int jelem = 0; jelem < nelements; jelem++) {
     for (int j = 0; j <= twojmax; j++) {
@@ -1200,7 +1199,7 @@ void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type&
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor)
 {
   double rsq, r, x, y, z, z0, theta0;
 
@@ -1298,7 +1297,7 @@ void SNA::compute_zi_cpu(const int& iter)
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
+void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom)
 {
   // for j1 = 0,...,twojmax
   //   for j2 = 0,twojmax
@@ -1396,7 +1395,7 @@ void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_t
 
 KOKKOS_INLINE_FUNCTION
 void SNA::compute_yi_cpu(int iter,
- const Kokkos::View<double**, DeviceType> &beta)
+ const Kokkos::View<double**> &beta)
 {
   double betaj;
   const int iatom = iter / idxz_max;
@@ -1503,7 +1502,7 @@ void SNA::compute_yi_cpu(int iter,
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor)
 {
   double rsq, r, x, y, z, z0, theta0, cs, sn;
   double dz0dr;
@@ -1534,7 +1533,7 @@ void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::memb
 
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor)
 {
   t_scalar3<double> final_sum;
   const int jelem = element(iatom, jnbor);
@@ -1599,7 +1598,7 @@ void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::memb
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNA::add_uarraytot(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor,
     const double& r, const double& wj, const double& rcut,
     const double& sinner, const double& dinner, int jelem)
 {
@@ -1629,7 +1628,7 @@ void SNA::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_ty
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor,
                          const double& x, const double& y, const double& z, const double& z0, const double& r)
 {
   double r0inv;
@@ -1719,7 +1718,7 @@ void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::memb
 ------------------------------------------------------------------------- */
 
 KOKKOS_INLINE_FUNCTION
-void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor,
                           const double& x, const double& y, const double& z,
                           const double& z0, const double& r, const double& dz0dr,
                           const double& wj, const double& rcut,
diff --git a/src/types.h b/src/types.h
index 771c3d5..48d30ff 100644
--- a/src/types.h
+++ b/src/types.h
@@ -112,6 +112,84 @@ typedef Kokkos::View<const T_FLOAT*>      t_q_const;    // Charge
 typedef Kokkos::View<T_V_FLOAT*>          t_mass;       // Mass
 typedef Kokkos::View<const T_V_FLOAT*>    t_mass_const; // Mass
 
+// intentional: SNAreal/complex gets reused beyond SNAP
+typedef double SNAreal;
+
+//typedef struct { SNAreal re, im; } SNAcomplex;
+template <typename real_type_>
+struct alignas(2*sizeof(real_type_)) SNAComplex
+{
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+  real_type re,im;
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex()
+   : re(static_cast<real_type>(0.)), im(static_cast<real_type>(0.)) { ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re)
+   : re(re), im(static_cast<real_type>(0.)) { ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re, real_type im)
+   : re(re), im(im) { ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other)
+   : re(other.re), im(other.im) { ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(const SNAComplex& other) {
+    re = other.re; im = other.im;
+    return *this;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(SNAComplex&& other)
+   : re(other.re), im(other.im) { ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(SNAComplex&& other) {
+    re = other.re; im = other.im;
+    return *this;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator+(SNAComplex const& other) {
+    return SNAComplex(re + other.re, im + other.im);
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator+=(SNAComplex const& other) {
+    re += other.re; im += other.im;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr complex zero() { return complex(static_cast<real_type>(0.), static_cast<real_type>(0.)); }
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr complex one() { return complex(static_cast<real_type>(1.), static_cast<real_type>(0.)); }
+
+  KOKKOS_INLINE_FUNCTION
+  const complex conj() const { return complex(re, -im); }
+
+  KOKKOS_INLINE_FUNCTION
+  const real_type real_part_product(const complex &cm2) { return re * cm2.re - im * cm2.im; }
+
+  KOKKOS_INLINE_FUNCTION
+  const real_type real_part_product(const real_type &r) const { return re * r; }
+};
+
+template <typename real_type>
+KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real_type> operator*(const real_type& r, const SNAComplex<real_type>& self) {
+  return SNAComplex<real_type>(r*self.re, r*self.im);
+}
+
+template <typename real_type>
+KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real_type> operator*(const SNAComplex<real_type>& self, const real_type& r) {
+  return SNAComplex<real_type>(r*self.re, r*self.im);
+}
+
+template <typename real_type>
+KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real_type> operator*(const SNAComplex<real_type>& self, const SNAComplex<real_type>& cm2) {
+  return SNAComplex<real_type>(self.re*cm2.re - self.im*cm2.im, self.re*cm2.im + self.im*cm2.re);
+}
+
+typedef SNAComplex<SNAreal> SNAcomplex;
+
 typedef Kokkos::DefaultExecutionSpace::memory_space t_neigh_mem_space;
 
 namespace Kokkos {

From 9f39618157bed93bda5d8e4c640371f71363a929 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 13 Jul 2022 15:28:15 -0600
Subject: [PATCH 06/11] WIP

---
 src/force_types/force_snap_neigh_impl.h | 40 ++++++++++++-------------
 src/force_types/sna.h                   |  2 +-
 src/force_types/sna_impl.hpp            |  4 +--
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h
index bcc8c7b..e068bd6 100644
--- a/src/force_types/force_snap_neigh_impl.h
+++ b/src/force_types/force_snap_neigh_impl.h
@@ -624,7 +624,7 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeighCPU,const typename Kok
   int ii = team.league_rank();
   const int i = d_ilist[ii + chunk_offset];
   // TODO: snaKK is used in lammps
-  SNAKokkos my_sna = snaKK;
+  SNA my_sna = snaKK;
   const double xtmp = x(i,0);
   const double ytmp = x(i,1);
   const double ztmp = x(i,2);
@@ -645,12 +645,12 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeighCPU,const typename Kok
       [&] (const int jj, int& count) {
     Kokkos::single(Kokkos::PerThread(team), [&] () {
       T_INT j = d_neighbors(i,jj);
-      const F_FLOAT dx = x(j,0) - xtmp;
-      const F_FLOAT dy = x(j,1) - ytmp;
-      const F_FLOAT dz = x(j,2) - ztmp;
+      const double dx = x(j,0) - xtmp;
+      const double dy = x(j,1) - ytmp;
+      const double dz = x(j,2) - ztmp;
 
       const int jtype = type(j);
-      const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+      const double rsq = dx*dx + dy*dy + dz*dz;
 
       if (rsq < rnd_cutsq(itype,jtype))
        count++;
@@ -664,12 +664,12 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeighCPU,const typename Kok
       [&] (const int jj, int& offset, bool final) {
   //for (int jj = 0; jj < num_neighs; jj++) {
     T_INT j = d_neighbors(i,jj);
-    const F_FLOAT dx = x(j,0) - xtmp;
-    const F_FLOAT dy = x(j,1) - ytmp;
-    const F_FLOAT dz = x(j,2) - ztmp;
+    const double dx = x(j,0) - xtmp;
+    const double dy = x(j,1) - ytmp;
+    const double dz = x(j,2) - ztmp;
 
     const int jtype = type(j);
-    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const double rsq = dx*dx + dy*dy + dz*dz;
     const int elem_j = d_map[jtype];
 
     if (rsq < rnd_cutsq(itype,jtype)) {
@@ -696,7 +696,7 @@ template<class NeighborClass>
 KOKKOS_INLINE_FUNCTION
 void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy<TagComputeNeigh>::member_type& team) const {
 
-  SNAKokkos my_sna = snaKK;
+  SNA my_sna = snaKK;
 
   // extract atom number
   int ii = team.team_rank() + team.league_rank() * team.team_size();
@@ -713,9 +713,9 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
 
   // Load various info about myself up front
   const int i = d_ilist[ii + chunk_offset];
-  const F_FLOAT xtmp = x(i,0);
-  const F_FLOAT ytmp = x(i,1);
-  const F_FLOAT ztmp = x(i,2);
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
   const int itype = type[i];
   const int ielem = d_map[itype];
   const double radi = d_radelem[ielem];
@@ -733,12 +733,12 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs),
     [&] (const int jj, int& count) {
     T_INT j = d_neighbors(i,jj);
-    const F_FLOAT dx = x(j,0) - xtmp;
-    const F_FLOAT dy = x(j,1) - ytmp;
-    const F_FLOAT dz = x(j,2) - ztmp;
+    const double dx = x(j,0) - xtmp;
+    const double dy = x(j,1) - ytmp;
+    const double dz = x(j,2) - ztmp;
 
     int jtype = type(j);
-    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const double rsq = dx*dx + dy*dy + dz*dz;
 
     if (rsq >= rnd_cutsq(itype,jtype)) {
       jtype = -1; // use -1 to signal it's outside the radius
@@ -760,9 +760,9 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
     if (jtype >= 0) {
       if (final) {
         T_INT j = d_neighbors(i,jj);
-        const F_FLOAT dx = x(j,0) - xtmp;
-        const F_FLOAT dy = x(j,1) - ytmp;
-        const F_FLOAT dz = x(j,2) - ztmp;
+        const double dx = x(j,0) - xtmp;
+        const double dy = x(j,1) - ytmp;
+        const double dz = x(j,2) - ztmp;
         const int elem_j = d_map[jtype];
         my_sna.rij(ii,offset,0) = dx;
         my_sna.rij(ii,offset,1) = dy;
diff --git a/src/force_types/sna.h b/src/force_types/sna.h
index f2f99e6..bdd183f 100644
--- a/src/force_types/sna.h
+++ b/src/force_types/sna.h
@@ -186,7 +186,7 @@ class SNA {
   // core "evaluation" functions that get plugged into "compute" functions
   // plugged into compute_ui_small, compute_ui_large
   KOKKOS_FORCEINLINE_FUNCTION
-  void evaluate_ui_jbend(const WignerWrapper, const complex&, const complex&, const double&, const int&,
+  void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const double&, const int&,
                         const int&, const int&, const int&);
   // plugged into compute_zi, compute_yi
   KOKKOS_FORCEINLINE_FUNCTION
diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp
index fda5a36..4804cc3 100644
--- a/src/force_types/sna_impl.hpp
+++ b/src/force_types/sna_impl.hpp
@@ -901,7 +901,7 @@ typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int&
 // Core "evaluation" kernel that extracts and rescales the appropriate `beta` value,
 // which gets used in both `compute_yi` and `compute_yi_from_zlist
 KOKKOS_FORCEINLINE_FUNCTION
-typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j,
+double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j,
           const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div,
           const Kokkos::View<double***, Kokkos::LayoutLeft> &beta_pack) {
 
@@ -1019,7 +1019,7 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member
 // Core "evaluation" kernel that gets reused in `compute_fused_deidrj_small` and
 // `compute_fused_deidrj_large`
 KOKKOS_FORCEINLINE_FUNCTION
-typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac,
+double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac,
                       const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu,
                       const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) {
 

From 4fc5d711368c99beeb5435774aeeaa8ba021972e Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 16 Aug 2022 10:57:05 -0600
Subject: [PATCH 07/11] WIP

---
 src/force_types/force_snap_neigh_impl.h | 331 +++++++++++++++++++++---
 1 file changed, 289 insertions(+), 42 deletions(-)

diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h
index e068bd6..b44b8ff 100644
--- a/src/force_types/force_snap_neigh_impl.h
+++ b/src/force_types/force_snap_neigh_impl.h
@@ -134,6 +134,8 @@ ForceSNAP<NeighborClass>::ForceSNAP(char** args, System* system_, bool half_neig
   // Need to set this because restart not handled by ForceHybrid
 
   cutsq = t_fparams("ForceSNAP::cutsq",system->ntypes,system->ntypes);
+
+  host_flag = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -185,54 +187,296 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
 
   NeighborClass* neighbor = (NeighborClass*) neighbor_;
   neigh_list = neighbor->get_neigh_list();
-  int max_neighs = 0;
   /*
   for (int i = 0; i < nlocal; i++) {
     typename t_neigh_list::t_neighs neighs_i = neigh_list.get_neighs(i);
     const int num_neighs = neighs_i.get_num_neighs();
     if(max_neighs<num_neighs) max_neighs = num_neighs;
   }*/
-  Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",nlocal, FindMaxNumNeighs<t_neigh_list>(neigh_list), Kokkos::Max<int>(max_neighs));
-  // snaKK (line 220 in lammps)
-  sna.nmax = max_neighs;
-  // Put LAMMPS code starting here
-  T_INT team_scratch_size = sna.size_team_scratch_arrays();
-  T_INT thread_scratch_size = sna.size_thread_scratch_arrays();
-
-  //printf("Sizes: %i %i\n",team_scratch_size/1024,thread_scratch_size/1024);
-  int vector_length = 8;
-  int team_size_max = Kokkos::TeamPolicy<>(nlocal,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
-#ifdef EMD_ENABLE_GPU
-  // Same as 207 in lammps
-  int team_size = 20;//max_neighs;
-  if(team_size*vector_length > team_size_max)
-    team_size = team_size_max/vector_length;
-#else
-  int team_size = 1;
-#endif
-      //ComputeNeigh -- lammps
+  max_neighs = 0;
+  Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(max_neighs));
+
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 32;//max_neighs;
+
+  if (beta_max < inum) {
+    beta_max = inum;
+    MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,inum);
+    if (!host_flag)
+      MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length);
+    MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",inum);
+  }
+
+  chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
+  chunk_offset = 0;
+
+  sna.grow_rij(chunk_size,max_neighs);
+
+  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > inum - chunk_offset)
+      chunk_size = inum - chunk_offset;
+
+    if (host_flag)
+    {
+      // Host codepath
+
+      //ComputeNeigh
+      /*{
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPComputeNeighCPU>(chunk_size,team_size);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU> policy_neigh(chunk_size,team_size,vector_length);
+        Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this);
+      }
+
+      //PreUi
+      {
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
+      }
+
+      // ComputeUi
+      {
+        int team_size = team_size_default;
+        // Fused calculation of ulist and accumulation into ulisttot using atomics
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
+      }
+
+      {
+        // Expand ulisttot -> ulisttot_full
+        // Zero out ylist
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
+        Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
+      }
+
+      //Compute bispectrum
+      if (quadraticflag || eflag) {
+        //ComputeZi
+        int idxz_max = sna.idxz_max;
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
+        Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
+
+        //ComputeBi
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
+        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
+      }
+
+      //ComputeYi
+      {
+        //Compute beta = dE_i/dB_i for all i in list
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
+        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
+
+        //ComputeYi
+        int idxz_max = sna.idxz_max;
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
+        Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
+      } // host flag
+
+      //ComputeDuidrj and Deidrj
+      {
+        int team_size = team_size_default;
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+
+        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
+      }*/
+
+    } else { // GPU
+
+#ifdef EMD_KOKKOS_GPU
+
+      // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+      const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+      //ComputeNeigh
       {
         // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
         int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
 
-        SnapAoSoATeamPolicy<team_size_compute_neigh, TagComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
         policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
-      }    
+      }
 
+      //ComputeCayleyKlein
+      /*{
+        // tile_size_compute_ck is defined in `pair_snap_kokkos.h`
+        Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagPairSNAPComputeCayleyKlein>
+            policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1});
+        Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
+      }
 
+      //PreUi
+      {
+        // tile_size_pre_ui is defined in `pair_snap_kokkos.h`
+        Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagPairSNAPPreUi>
+            policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
+        Kokkos::parallel_for("PreUi",policy_preui,*this);
+      }
+
+      // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
+      {
+        // team_size_compute_ui is defined in `pair_snap_kokkos.h`
+        // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+        const int tile_size = vector_length * (twojmax + 1);
+        const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+        if (chunk_size < parallel_thresh)
+        {
+          // Version with parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
+          const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+          const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
+        } else {
+          // Version w/out parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs)
+          const int n_teams = chunk_size_div * max_neighs;
+          const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
+        }
+      }
+
+      //TransformUi: un-"fold" ulisttot, zero ylist
+      {
+        // team_size_transform_ui is defined in `pair_snap_kokkos.h`
+        Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagPairSNAPTransformUi>
+            policy_transform_ui({0,0,0},{vector_length,sna.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
+        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+      }
+
+      //Compute bispectrum in AoSoA data layout, transform Bi
+      if (quadraticflag || eflag) {
+        // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+        //ComputeZi
+        const int idxz_max = sna.idxz_max;
+        Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagPairSNAPComputeZi>
+            policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
+        Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+
+        //ComputeBi
+        const int idxb_max = sna.idxb_max;
+        Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagPairSNAPComputeBi>
+            policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
+        Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+
+        //Transform data layout of blist out of AoSoA
+        //We need this because `blist` gets used in ComputeForce which doesn't
+        //take advantage of AoSoA, which at best would only be beneficial on the margins
+        Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
+            policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
+        Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+      }
+
+      //Note zeroing `ylist` is fused into `TransformUi`.
+      {
+        //Compute beta = dE_i/dB_i for all i in list
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
+        Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
+        const int idxz_max = sna.idxz_max;
+        if (quadraticflag || eflag) {
+          Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
+              policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
+          Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
+        } else {
+          Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
+              policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
+          Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
+        }
+      }
+
+      // Fused ComputeDuidrj, ComputeDeidrj
+      {
+        // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h`
+
+        // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer
+        const int tile_size = vector_length * (twojmax + 1);
+        const int scratch_size = scratch_size_helper<complex>(2 * team_size_compute_fused_deidrj * tile_size);
+
+        if (chunk_size < parallel_thresh)
+        {
+          // Version with parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
+          const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+          const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
+
+          // x direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this);
+
+          // y direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this);
+
+          // z direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this);
+        } else {
+          // Version w/out parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs)
+          const int n_teams = chunk_size_div * max_neighs;
+          const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
+
+          // x direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this);
+
+          // y direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this);
+
+          // z direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this);
+
+        }
+      }*/
+
+#endif // LMP_KOKKOS_GPU
+
+    }
+
+    //ComputeForce
+    /*{
+      if (neighflag == HALF) {
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
+        Kokkos::parallel_for(policy_force, *this);
+      } else if (neighflag == HALFTHREAD) {
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
+        Kokkos::parallel_for(policy_force, *this);
+      }
+    }*/
+    chunk_offset += chunk_size;
+
+  }
 
-/*
-  Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length);
-  // Replaced in LAMMPS by a lot of functors
-  Kokkos::parallel_for("ForceSNAP::compute",policy
-      .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
-      .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
-    ,*this);
-*/
-//static int step =0;
-//step++;
-//if(step%10==0)
 //        printf(" %e %e %e %e %e (%e %e): %e\n",t1,t2,t3,t4,t5,t6,t7,t1+t2+t3+t4+t5);
 }
 
@@ -344,7 +588,7 @@ void ForceSNAP<NeighborClass>::init_coeff(int narg, char **arg)
             diagonalstyle,use_shared_arrays,
 		        rmin0,switchflag,bzeroflag);
     //if (!use_shared_arrays)
-  sna.grow_rij(nmax);
+  sna.grow_rij(nmax,max_neighs);
   sna.init();
 
   //printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
@@ -623,8 +867,8 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeighCPU,const typename Kok
 
   int ii = team.league_rank();
   const int i = d_ilist[ii + chunk_offset];
-  // TODO: snaKK is used in lammps
-  SNA my_sna = snaKK;
+  // TODO: sna is used in lammps
+  SNA my_sna = sna;
   const double xtmp = x(i,0);
   const double ytmp = x(i,1);
   const double ztmp = x(i,2);
@@ -696,7 +940,7 @@ template<class NeighborClass>
 KOKKOS_INLINE_FUNCTION
 void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy<TagComputeNeigh>::member_type& team) const {
 
-  SNA my_sna = snaKK;
+  SNA my_sna = sna;
 
   // extract atom number
   int ii = team.team_rank() + team.league_rank() * team.team_size();
@@ -712,7 +956,12 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
   int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
 
   // Load various info about myself up front
-  const int i = d_ilist[ii + chunk_offset];
+  const int i = ii + chunk_offset;
+
+
+  typename t_neigh_list::t_neighs neighs_i = neigh_list.get_neighs(i);
+  const int num_neighs = neighs_i.get_num_neighs();
+
   const double xtmp = x(i,0);
   const double ytmp = x(i,1);
   const double ztmp = x(i,2);
@@ -720,8 +969,6 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
   const int ielem = d_map[itype];
   const double radi = d_radelem[ielem];
 
-  const int num_neighs = d_numneigh[i];
-
   // rij[][3] = displacements between atom I and those neighbors
   // inside = indices of neighbors of I within cutoff
   // wj = weights for neighbors of I within cutoff
@@ -732,7 +979,7 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
   int ninside = 0;
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs),
     [&] (const int jj, int& count) {
-    T_INT j = d_neighbors(i,jj);
+    T_INT j = neighs_i(jj);
     const double dx = x(j,0) - xtmp;
     const double dy = x(j,1) - ytmp;
     const double dz = x(j,2) - ztmp;
@@ -759,7 +1006,7 @@ void ForceSNAP<NeighborClass>::operator() (TagComputeNeigh,const typename Kokkos
 
     if (jtype >= 0) {
       if (final) {
-        T_INT j = d_neighbors(i,jj);
+        T_INT j = neighs_i(jj);
         const double dx = x(j,0) - xtmp;
         const double dy = x(j,1) - ytmp;
         const double dz = x(j,2) - ztmp;

From 76daaf40fdfb15b3355506d9706e710dc8199e43 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 16 Aug 2022 11:58:25 -0600
Subject: [PATCH 08/11] WIP

---
 src/force_types/force_snap_neigh.h      | 40 ++++++-----
 src/force_types/force_snap_neigh_impl.h | 93 +++++++++++++------------
 2 files changed, 69 insertions(+), 64 deletions(-)

diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h
index c342f10..d05b59b 100644
--- a/src/force_types/force_snap_neigh.h
+++ b/src/force_types/force_snap_neigh.h
@@ -142,10 +142,10 @@ struct TagComputeDeidrjCPU{};
   void operator() (TagPreUi,const int iatom_mod, const int j, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiSmall>::member_type& team) const;
+  void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicyTagComputeUiSmall>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiLarge>::member_type& team) const;
+  void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicyTagComputeUiLarge>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
@@ -170,21 +170,21 @@ struct TagComputeDeidrjCPU{};
 
   template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagComputeFusedDeidrjSmall<dir> >::member_type& team) const;
+  void operator() (TagComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicyTagComputeFusedDeidrjSmall<dir> >::member_type& team) const;
 
   template<int dir>
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagComputeFusedDeidrjLarge<dir> >::member_type& team) const;
+  void operator() (TagComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicyTagComputeFusedDeidrjLarge<dir> >::member_type& team) const;
 
   // CPU backend only
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeNeighCPU>::member_type& team) const;
+  void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicyTagComputeNeighCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPreUiCPU>::member_type& team) const;
+  void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicyTagPreUiCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeUiCPU>::member_type& team) const;
+  void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicyTagComputeUiCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagTransformUiCPU, const int j, const int iatom) const;
@@ -193,16 +193,16 @@ struct TagComputeDeidrjCPU{};
   void operator() (TagComputeZiCPU,const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeBiCPU>::member_type& team) const;
+  void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicyTagComputeBiCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagComputeYiCPU,const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeDuidrjCPU>::member_type& team) const;
+  void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicyTagComputeDuidrjCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagComputeDeidrjCPU>::member_type& team) const;
+  void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicyTagComputeDeidrjCPU>::member_type& team) const;
 */
 
 protected:
@@ -218,9 +218,9 @@ struct TagComputeDeidrjCPU{};
   t_dbvec dbvec;
   SNA sna;
 
-  int nmax;
+  int nmax,max_neighs;
    
-  int chunk_size,chunk_offset;
+  int chunk_size,chunk_offset,chunksize;
   int host_flag;
 
   // How many interactions can be run concurrently
@@ -280,7 +280,8 @@ struct TagComputeDeidrjCPU{};
   Kokkos::View<T_F_FLOAT*> wjelem;               // elements weights
   Kokkos::View<T_F_FLOAT**, Kokkos::LayoutRight> coeffelem;           // element bispectrum coefficients
   Kokkos::View<T_INT*> map;                     // mapping from atom types to elements
-  int twojmax, diagonalstyle, switchflag, bzeroflag, quadraticflag;
+  int twojmax, switchflag, bzeroflag, bnormflag, quadraticflag;
+  int chemflag, wselfallflag, switchinnerflag, diagonalstyle;
   double rcutfac, rfac0, rmin0, wj1, wj2;
   int rcutfacflag, twojmaxflag; // flags for required parameters
   typedef Kokkos::View<T_F_FLOAT**> t_fparams;
@@ -329,7 +330,7 @@ struct TagComputeDeidrjCPU{};
   static constexpr int tile_size_compute_yi = 8;
   static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2;
 #endif
-static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN;
+static constexpr int vector_length = 32;
 
 // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
   // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
@@ -344,10 +345,13 @@ static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN;
   template <int num_teams, class TagPairSNAP>
   using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Kokkos::LaunchBounds<vector_length * num_teams>, TagPairSNAP>;
 
-
-
-
-
+  Kokkos::View<T_F_FLOAT*> d_radelem;              // element radii
+  Kokkos::View<T_F_FLOAT*> d_wjelem;               // elements weights
+  Kokkos::View<T_F_FLOAT**, Kokkos::LayoutRight> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<T_F_FLOAT*> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<T_F_FLOAT*> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*> d_map;                    // mapping from atom types to elements
+  Kokkos::View<T_INT*> d_ninside;                // ninside for all atoms in list
 };
 
 
diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h
index b44b8ff..1feaa70 100644
--- a/src/force_types/force_snap_neigh_impl.h
+++ b/src/force_types/force_snap_neigh_impl.h
@@ -91,7 +91,6 @@ ForceSNAP<NeighborClass>::ForceSNAP(char** args, System* system_, bool half_neig
 
   nmax = 0;
 
-  vector_length = 8;
   concurrent_interactions =
 #if defined(KOKKOS_ENABLE_CUDA)
       std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value ?
@@ -194,29 +193,30 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
     if(max_neighs<num_neighs) max_neighs = num_neighs;
   }*/
   max_neighs = 0;
-  Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(max_neighs));
+  Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",nlocal, FindMaxNumNeighs<t_neigh_list>(neigh_list), Kokkos::Max<int>(max_neighs));
 
   int team_size_default = 1;
   if (!host_flag)
     team_size_default = 32;//max_neighs;
 
-  if (beta_max < inum) {
-    beta_max = inum;
-    MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,inum);
+/*
+  if (beta_max < nlocal) {
+    beta_max = nlocal;
+    MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,nlocal);
     if (!host_flag)
-      MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length);
-    MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",inum);
-  }
+      MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(nlocal + vector_length - 1) / vector_length);
+    MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",nlocal);
+  }*/
 
-  chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
+  chunk_size = MIN(chunksize,nlocal); // "chunksize" variable is set by user
   chunk_offset = 0;
 
   sna.grow_rij(chunk_size,max_neighs);
 
-  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
+  while (chunk_offset < nlocal) { // chunk up loop to prevent running out of memory
 
-    if (chunk_size > inum - chunk_offset)
-      chunk_size = inum - chunk_offset;
+    if (chunk_size > nlocal - chunk_offset)
+      chunk_size = nlocal - chunk_offset;
 
     if (host_flag)
     {
@@ -226,7 +226,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       /*{
         int team_size = team_size_default;
         check_team_size_for<TagPairSNAPComputeNeighCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU> policy_neigh(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPComputeNeighCPU> policy_neigh(chunk_size,team_size,vector_length);
         Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this);
       }
 
@@ -234,7 +234,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       {
         int team_size = team_size_default;
         check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
         Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
       }
 
@@ -242,14 +242,14 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       {
         int team_size = team_size_default;
         // Fused calculation of ulist and accumulation into ulisttot using atomics
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
         Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
       }
 
       {
         // Expand ulisttot -> ulisttot_full
         // Zero out ylist
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
+        typename Kokkos::MDRangePolicyKokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
         Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
       }
 
@@ -257,25 +257,25 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       if (quadraticflag || eflag) {
         //ComputeZi
         int idxz_max = sna.idxz_max;
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
+        typename Kokkos::RangePolicyTagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
         Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
 
         //ComputeBi
         int team_size = team_size_default;
         check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
         Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
       }
 
       //ComputeYi
       {
         //Compute beta = dE_i/dB_i for all i in list
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
+        typename Kokkos::RangePolicyTagPairSNAPBetaCPU> policy_beta(0,chunk_size);
         Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
 
         //ComputeYi
         int idxz_max = sna.idxz_max;
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
+        typename Kokkos::RangePolicyTagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
         Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
       } // host flag
 
@@ -283,10 +283,10 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       {
         int team_size = team_size_default;
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
         Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::TeamPolicyTagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
 
         Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
       }*/
@@ -303,7 +303,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
         // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
         int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
 
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+        SnapAoSoATeamPolicyteam_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
         policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
       }
@@ -311,7 +311,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       //ComputeCayleyKlein
       /*{
         // tile_size_compute_ck is defined in `pair_snap_kokkos.h`
-        Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagPairSNAPComputeCayleyKlein>
+        Snap3DRangePolicytile_size_compute_ck, TagPairSNAPComputeCayleyKlein>
             policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1});
         Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
       }
@@ -319,7 +319,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       //PreUi
       {
         // tile_size_pre_ui is defined in `pair_snap_kokkos.h`
-        Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagPairSNAPPreUi>
+        Snap3DRangePolicytile_size_pre_ui, TagPairSNAPPreUi>
             policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
         Kokkos::parallel_for("PreUi",policy_preui,*this);
       }
@@ -339,7 +339,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
           const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
           const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
           policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
         } else {
@@ -349,7 +349,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
           const int n_teams = chunk_size_div * max_neighs;
           const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
           policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
         }
@@ -358,7 +358,7 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       //TransformUi: un-"fold" ulisttot, zero ylist
       {
         // team_size_transform_ui is defined in `pair_snap_kokkos.h`
-        Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagPairSNAPTransformUi>
+        Snap3DRangePolicytile_size_transform_ui, TagPairSNAPTransformUi>
             policy_transform_ui({0,0,0},{vector_length,sna.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
         Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
       }
@@ -369,20 +369,20 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
 
         //ComputeZi
         const int idxz_max = sna.idxz_max;
-        Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagPairSNAPComputeZi>
+        Snap3DRangePolicytile_size_compute_zi, TagPairSNAPComputeZi>
             policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
         Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
 
         //ComputeBi
         const int idxb_max = sna.idxb_max;
-        Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagPairSNAPComputeBi>
+        Snap3DRangePolicytile_size_compute_bi, TagPairSNAPComputeBi>
             policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
         Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
 
         //Transform data layout of blist out of AoSoA
         //We need this because `blist` gets used in ComputeForce which doesn't
         //take advantage of AoSoA, which at best would only be beneficial on the margins
-        Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
+        Snap3DRangePolicytile_size_transform_bi, TagPairSNAPTransformBi>
             policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
         Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
       }
@@ -390,15 +390,15 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
       //Note zeroing `ylist` is fused into `TransformUi`.
       {
         //Compute beta = dE_i/dB_i for all i in list
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
+        typename Kokkos::RangePolicyTagPairSNAPBeta> policy_beta(0,chunk_size);
         Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
         const int idxz_max = sna.idxz_max;
         if (quadraticflag || eflag) {
-          Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
+          Snap3DRangePolicytile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
               policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
           Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
         } else {
-          Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
+          Snap3DRangePolicytile_size_compute_yi, TagPairSNAPComputeYi>
               policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
           Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
         }
@@ -421,17 +421,17 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
           const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
 
           // x direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this);
 
           // y direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this);
 
           // z direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this);
         } else {
@@ -442,17 +442,17 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
           const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
 
           // x direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this);
 
           // y direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this);
 
           // z direction
-          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
           policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
           Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this);
 
@@ -466,10 +466,10 @@ void ForceSNAP<NeighborClass>::compute(System* system, Binning* binning, Neighbo
     //ComputeForce
     /*{
       if (neighflag == HALF) {
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
+        typename Kokkos::RangePolicyTagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
         Kokkos::parallel_for(policy_force, *this);
       } else if (neighflag == HALFTHREAD) {
-        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
+        typename Kokkos::RangePolicyTagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
         Kokkos::parallel_for(policy_force, *this);
       }
     }*/
@@ -584,10 +584,11 @@ void ForceSNAP<NeighborClass>::init_coeff(int narg, char **arg)
   // is wrapped into the sna class
 
 
-  sna = SNA(rfac0,twojmax,
-            diagonalstyle,use_shared_arrays,
-		        rmin0,switchflag,bzeroflag);
-    //if (!use_shared_arrays)
+  sna = SNA(rfac0, twojmax,
+            rmin0, switchflag, bzeroflag,
+            chemflag, bnormflag, wselfallflag,
+            nelements, switchinnerflag);
+
   sna.grow_rij(nmax,max_neighs);
   sna.init();
 

From 55451bbd36b6be98466e1b1882700a69df1b012b Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Wed, 26 Oct 2022 16:41:13 -0600
Subject: [PATCH 09/11] Changes from last co-development session (july 13,
 2022)

---
 src/KokkosCore_Config_DeclareBackend.hpp | 51 ++++++++++++++++++++++++
 src/KokkosCore_Config_FwdBackend.hpp     | 51 ++++++++++++++++++++++++
 src/KokkosCore_Config_PostInclude.hpp    | 49 +++++++++++++++++++++++
 src/KokkosCore_Config_SetupBackend.hpp   | 50 +++++++++++++++++++++++
 src/Makefile                             |  6 ++-
 5 files changed, 205 insertions(+), 2 deletions(-)
 create mode 100644 src/KokkosCore_Config_DeclareBackend.hpp
 create mode 100644 src/KokkosCore_Config_FwdBackend.hpp
 create mode 100644 src/KokkosCore_Config_PostInclude.hpp
 create mode 100644 src/KokkosCore_Config_SetupBackend.hpp

diff --git a/src/KokkosCore_Config_DeclareBackend.hpp b/src/KokkosCore_Config_DeclareBackend.hpp
new file mode 100644
index 0000000..26a6b2a
--- /dev/null
+++ b/src/KokkosCore_Config_DeclareBackend.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef  KOKKOS_DECLARE_HPP_
+#define  KOKKOS_DECLARE_HPP_
+
+
+
+#endif
+#include <decl/Kokkos_Declare_CUDA.hpp>
+#include <decl/Kokkos_Declare_SERIAL.hpp>
diff --git a/src/KokkosCore_Config_FwdBackend.hpp b/src/KokkosCore_Config_FwdBackend.hpp
new file mode 100644
index 0000000..4c64356
--- /dev/null
+++ b/src/KokkosCore_Config_FwdBackend.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef  KOKKOS_FWD_HPP_
+#define  KOKKOS_FWD_HPP_
+
+
+
+#endif
+#include <fwd/Kokkos_Fwd_CUDA.hpp>
+#include <fwd/Kokkos_Fwd_SERIAL.hpp>
diff --git a/src/KokkosCore_Config_PostInclude.hpp b/src/KokkosCore_Config_PostInclude.hpp
new file mode 100644
index 0000000..7fe4704
--- /dev/null
+++ b/src/KokkosCore_Config_PostInclude.hpp
@@ -0,0 +1,49 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef  KOKKOS_POST_INCLUDE_HPP_
+#define  KOKKOS_POST_INCLUDE_HPP_
+
+
+
+#endif
diff --git a/src/KokkosCore_Config_SetupBackend.hpp b/src/KokkosCore_Config_SetupBackend.hpp
new file mode 100644
index 0000000..50795dc
--- /dev/null
+++ b/src/KokkosCore_Config_SetupBackend.hpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef  KOKKOS_SETUP_HPP_
+#define  KOKKOS_SETUP_HPP_
+
+
+
+#endif
+#include <setup/Kokkos_Setup_Cuda.hpp>
diff --git a/src/Makefile b/src/Makefile
index e763ae9..5741d9b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,10 +14,12 @@ EXAMINIMD_PATH = $(subst Makefile,,$(MAKEFILE_PATH))/..
 default: build
 	echo "Start Build"
 
+MY_MPI_EXE = $(shell which mpicxx)
+MY_MPI_PATH = $(dir ${MY_MPI_EXE})
 
 EXE = ExaMiniMD
-CXXFLAGS = -O3 -g
-LINKFLAGS = -O3 -g
+CXXFLAGS = -O3 -g -I${MY_MPI_PATH}../include
+LINKFLAGS = -O3 -g -L${MY_MPI_PATH}../lib -lmpi_ibm -lopen-pal -lopen-rte -lhwloc_ompi -levent_core -levent_pthreads -levent
 
 ifeq ($(MPI), 1)
   CXX = mpicxx

From 267b6e2b90d9ef3014bfae8cad8ea0f1b7ff161f Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Thu, 27 Oct 2022 10:44:25 -0600
Subject: [PATCH 10/11] add RIG to examinimd.h, comment nfac_table

---
 src/examinimd.h              |  5 ++++-
 src/force_types/sna.h        |  2 +-
 src/force_types/sna_impl.hpp | 16 ++++++++++------
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/examinimd.h b/src/examinimd.h
index 4738703..3e3c0c2 100644
--- a/src/examinimd.h
+++ b/src/examinimd.h
@@ -36,6 +36,9 @@
 //  Questions? Contact Christian R. Trott (crtrott@sandia.gov)
 //************************************************************************
 
+#ifndef EXAMINI_H
+#define EXAMINIMD_H
+
 #include <types.h>
 
 #include<system.h>
@@ -69,4 +72,4 @@ class ExaMiniMD {
 
     void shutdown();
 };
-
+#endif //EXAMINIMD_H
diff --git a/src/force_types/sna.h b/src/force_types/sna.h
index bdd183f..61fbb5f 100644
--- a/src/force_types/sna.h
+++ b/src/force_types/sna.h
@@ -313,7 +313,7 @@ class SNA {
   t_sna_2d rootpqarray;
 
   static const int nmaxfactorial = 167;
-  static const double nfac_table[];
+  //static const double nfac_table[];
   inline
   double factorial(int);
 
diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp
index 4804cc3..51a7465 100644
--- a/src/force_types/sna_impl.hpp
+++ b/src/force_types/sna_impl.hpp
@@ -1886,19 +1886,22 @@ void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type&
 inline
 double SNA::factorial(int n)
 {
-  //if (n < 0 || n > nmaxfactorial) {
-  //  char str[128];
-  //  sprintf(str, "Invalid argument to factorial %d", n);
-  //  error->all(FLERR, str);
-  //}
+  double result = 1.0;
+  for(int i=1; i<=n; i++)
+    result *= 1.0*i;
+  return result;  
 
-  return nfac_table[n];
 }
 
 /* ----------------------------------------------------------------------
    factorial n table, size SNA::nmaxfactorial+1
 ------------------------------------------------------------------------- */
 
+
+// Using nfac_table gives linking errors
+
+
+/*
 const double SNA::nfac_table[] = {
   1,
   1,
@@ -2069,6 +2072,7 @@ const double SNA::nfac_table[] = {
   9.00369170577843e+297,
   1.503616514865e+300, // nmaxfactorial = 167
 };
+*/
 
 /* ----------------------------------------------------------------------
    the function delta given by VMK Eq. 8.2(1)

From 8e85e61c5361ad093d993487afdd0ba54346297d Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Mon, 31 Oct 2022 11:21:32 -0600
Subject: [PATCH 11/11] CMakeLists.txt: STATUS messages to help identify Kokkos
 project

---
 CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c30c775..ba2e02d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,14 @@ cmake_minimum_required(VERSION 3.10)
 project(ExaMiniMD LANGUAGES CXX)
 
 find_package(Kokkos 3.0 REQUIRED)
+
+if ((Kokkos_DIR) AND (NOT Kokkos_ROOT))
+  message(STATUS "STATUS using lib64/cmake/Kokkos/KokkosConfig.cmake file (in Kokkos installation point) and Kokkos_DIR set to: ${Kokkos_DIR}")
+else()
+  message(STATUS "STATUS using Kokkos installation point with Kokkos_ROOT: ${Kokkos_ROOT}")
+  message(STATUS "STATUS Kokkos_DIR: ${Kokkos_DIR} automatically set when using Kokkos_ROOT")
+endif()
+
 option(USE_MPI "Build with MPI" ON)
 
 if (USE_MPI)