From 7ab670478399ae1b2fa9fefefa2aa4d9e1c19402 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 31 May 2022 10:05:43 -0600 Subject: [PATCH 01/11] Initial update to SNAP routine --- src/force_types/force_snap_neigh.h | 175 +++++++++++++++++++ src/force_types/force_snap_neigh_impl.h | 214 +++++++++++++++++++++++- 2 files changed, 386 insertions(+), 3 deletions(-) diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h index d6f4e1e..2ddb01b 100644 --- a/src/force_types/force_snap_neigh.h +++ b/src/force_types/force_snap_neigh.h @@ -77,6 +77,43 @@ template class ForceSNAP : public Force { public: + // lammps splice +// Routines for both the CPU and GPU backend +template +struct TagComputeForce{}; + + +// GPU backend only +struct TagComputeNeigh{}; +struct TagComputeCayleyKlein{}; +struct TagPreUi{}; +struct TagComputeUiSmall{}; // more parallelism, more divergence +struct TagComputeUiLarge{}; // less parallelism, no divergence +struct TagTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagComputeZi{}; +struct TagBeta{}; +struct TagComputeBi{}; +struct TagTransformBi{}; // re-order blist from AoSoA to AoS +struct TagComputeYi{}; +struct TagComputeYiWithZlist{}; +template +struct TagComputeFusedDeidrjSmall{}; // more parallelism, more divergence +template +struct TagComputeFusedDeidrjLarge{}; // less parallelism, no divergence + +// CPU backend only +struct TagComputeNeighCPU{}; +struct TagPreUiCPU{}; +struct TagComputeUiCPU{}; +struct TagTransformUiCPU{}; +struct TagComputeZiCPU{}; +struct TagBetaCPU{}; +struct TagComputeBiCPU{}; +struct TagZeroYiCPU{}; +struct TagComputeYiCPU{}; +struct TagComputeDuidrjCPU{}; +struct TagComputeDeidrjCPU{}; + ForceSNAP(char** args, System* system, bool half_neigh_); ~ForceSNAP(); @@ -84,6 +121,89 @@ class ForceSNAP : public Force { void compute(System* system, Binning* binning, Neighbor* neighbor ); const char* name() {return "ForceSNAP";} +// lammps splice +/* template + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeForce,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagBetaCPU,const int& ii) const; +*/ + + // GPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + +/* + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPreUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagBeta, const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const; + + // CPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagTransformUiCPU, const int j, const int iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeZiCPU,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeYiCPU,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; +*/ protected: System* system; @@ -99,6 +219,9 @@ class ForceSNAP : public Force { SNA sna; int nmax; + + int chunk_size,chunk_offset; + int host_flag; // How much parallelism to use within an interaction int vector_length; @@ -178,8 +301,60 @@ class ForceSNAP : public Force { KOKKOS_INLINE_FUNCTION void operator() (const Kokkos::TeamPolicy<>::member_type& team) const; + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + +// Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int tile_size_compute_bi = 2; + static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(double) == 4 ? 8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2; +#endif + + +// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + //using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagPairSNAP>; + + + + + }; + + + #define FORCE_MODULES_EXTERNAL_TEMPLATE #define FORCETYPE_DECLARE_TEMPLATE_MACRO(NeighType) ForceSNAP #include diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h index eb89ed3..de6ba6e 100644 --- a/src/force_types/force_snap_neigh_impl.h +++ b/src/force_types/force_snap_neigh_impl.h @@ -53,6 +53,10 @@ #include #include #include "force_snap_neigh.h" +// Headers added by AJP to fix compile issues +//#include "pair_snap_kokkos.h" +//#include "neighbor_kokkos.h" + #define MAXLINE 1024 #define MAXWORD 3 @@ -70,6 +74,14 @@ //static double t7 = 0.0; /* ---------------------------------------------------------------------- */ +// AJP, Stan, For SNAP update: + +//int num_teams; +//int vector_length_; +//static constexpr int vector_length = vector_length_; +//template +//using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, Tag>; + template ForceSNAP::ForceSNAP(char** args, System* system_, bool half_neigh_):Force(args,system_,half_neigh_) { @@ -156,6 +168,8 @@ struct FindMaxNumNeighs { This version is a straightforward implementation ---------------------------------------------------------------------- */ +// Functor that will be replaced by LAMMPS version + template void ForceSNAP::compute(System* system, Binning* binning, Neighbor* neighbor_) { @@ -179,9 +193,9 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo if(max_neighs(neigh_list), Kokkos::Max(max_neighs)); - + // snaKK (line 220 in lammps) sna.nmax = max_neighs; - + // Put LAMMPS code starting here T_INT team_scratch_size = sna.size_team_scratch_arrays(); T_INT thread_scratch_size = sna.size_thread_scratch_arrays(); @@ -189,18 +203,33 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo int vector_length = 8; int team_size_max = Kokkos::TeamPolicy<>(nlocal,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); #ifdef EMD_ENABLE_GPU + // Same as 207 in lammps int team_size = 20;//max_neighs; if(team_size*vector_length > team_size_max) team_size = team_size_max/vector_length; #else int team_size = 1; #endif - Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length); + //ComputeNeigh -- lammps + { + // team_size_compute_neigh is defined in `pair_snap_kokkos.h` + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); + + SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + +/* + Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length); + // Replaced in LAMMPS by a lot of functors Kokkos::parallel_for("ForceSNAP::compute",policy .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) ,*this); +*/ //static int step =0; //step++; //if(step%10==0) @@ -586,6 +615,174 @@ void ForceSNAP::read_files(char *coefffilename, char *paramfilena delete[] found; } +/* +template +KOKKOS_INLINE_FUNCTION +void ForceSNAP::operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + + + int ii = team.league_rank(); + const int i = d_ilist[ii + chunk_offset]; + // TODO: snaKK is used in lammps + SNAKokkos my_sna = snaKK; + const double xtmp = x(i,0); + const double ytmp = x(i,1); + const double ztmp = x(i,2); + const int itype = type[i]; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + const int num_neighs = d_numneigh[i]; + + // rij[][3] = displacements between atom I and those neighbors + // inside = indices of neighbors of I within cutoff + // wj = weights for neighbors of I within cutoff + // rcutij = cutoffs for neighbors of I within cutoff + // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi + + int ninside = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,num_neighs), + [&] (const int jj, int& count) { + Kokkos::single(Kokkos::PerThread(team), [&] () { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + const int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + if (rsq < rnd_cutsq(itype,jtype)) + count++; + }); + },ninside); + + d_ninside(ii) = ninside; + + if (team.team_rank() == 0) + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& offset, bool final) { + //for (int jj = 0; jj < num_neighs; jj++) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + const int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const int elem_j = d_map[jtype]; + + if (rsq < rnd_cutsq(itype,jtype)) { + if (final) { + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + my_sna.wj(ii,offset) = static_cast(d_wjelem[elem_j]); + my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[elem_j])*rcutfac); + my_sna.inside(ii,offset) = j; + if (chemflag) + my_sna.element(ii,offset) = elem_j; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); +} +*/ +// lammps splice + +template +KOKKOS_INLINE_FUNCTION +void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + SNAKokkos my_sna = snaKK; + + // extract atom number + int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + const int tile_size = max_neighs; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // Load various info about myself up front + const int i = d_ilist[ii + chunk_offset]; + const F_FLOAT xtmp = x(i,0); + const F_FLOAT ytmp = x(i,1); + const F_FLOAT ztmp = x(i,2); + const int itype = type[i]; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + const int num_neighs = d_numneigh[i]; + + // rij[][3] = displacements between atom I and those neighbors + // inside = indices of neighbors of I within cutoff + // wj = weights for neighbors of I within cutoff + // rcutij = cutoffs for neighbors of I within cutoff + // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi + + // Compute the number of neighbors, store rsq + int ninside = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& count) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + if (rsq >= rnd_cutsq(itype,jtype)) { + jtype = -1; // use -1 to signal it's outside the radius + } + + type_cache[jj] = jtype; + + if (jtype >= 0) + count++; + }, ninside); + + d_ninside(ii) = ninside; + + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& offset, bool final) { + + const int jtype = type_cache[jj]; + + if (jtype >= 0) { + if (final) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const int elem_j = d_map[jtype]; + my_sna.rij(ii,offset,0) = dx; + my_sna.rij(ii,offset,1) = dy; + my_sna.rij(ii,offset,2) = dz; + my_sna.wj(ii,offset) = d_wjelem[elem_j]; + my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac; + my_sna.inside(ii,offset) = j; + if (chemflag) + my_sna.element(ii,offset) = elem_j; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); +} + + +/* +// This is multiple smaller functors in lammps template KOKKOS_INLINE_FUNCTION void ForceSNAP::operator() (const Kokkos::TeamPolicy<>::member_type& team) const { @@ -723,3 +920,14 @@ void ForceSNAP::operator() (const Kokkos::TeamPolicy<>::member_ty }); //t5 += timer.seconds(); timer.reset(); } +*/ + + +template +template +int ForceSNAP::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + From 12c2fe2fd4357cefcfff3a12342927350c9afb86 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 31 May 2022 10:42:58 -0600 Subject: [PATCH 02/11] Resolve compile errors in src/force_types/force_snap_neigh.h --- src/force_types/force_snap_neigh.h | 6 ++---- src/types.h | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h index 2ddb01b..c342f10 100644 --- a/src/force_types/force_snap_neigh.h +++ b/src/force_types/force_snap_neigh.h @@ -223,8 +223,6 @@ struct TagComputeDeidrjCPU{}; int chunk_size,chunk_offset; int host_flag; - // How much parallelism to use within an interaction - int vector_length; // How many interactions can be run concurrently int concurrent_interactions; @@ -331,13 +329,13 @@ struct TagComputeDeidrjCPU{}; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2; #endif - +static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN; // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> // and reduces the verbosity of the LaunchBound by hiding the explicit // multiplication by vector_length - template + //template //using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches diff --git a/src/types.h b/src/types.h index 4d70c17..c72e631 100644 --- a/src/types.h +++ b/src/types.h @@ -183,5 +183,11 @@ t_scalar3 operator * #define EMD_ENABLE_GPU #endif +// Needed to handle vector_length template parameter +#ifdef EMD_ENABLE_GPU +#define SNAP_KOKKOS_DEVICE_VECLEN 32 +#else +#define SNAP_KOKKOS_DEVICE_VECLEN 1 #endif +#endif // Who is my partner? From 9d2f47d5232107f420518b5d9dff5e1a965aae62 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 13 Jul 2022 14:22:13 -0600 Subject: [PATCH 03/11] Work on sna class --- src/force_types/sna.h | 317 +++- src/force_types/sna_impl.hpp | 2681 ++++++++++++++++++++++++---------- 2 files changed, 2169 insertions(+), 829 deletions(-) diff --git a/src/force_types/sna.h b/src/force_types/sna.h index fc4c58f..f174ba6 100644 --- a/src/force_types/sna.h +++ b/src/force_types/sna.h @@ -61,60 +61,179 @@ #include #include -struct SNA_LOOPINDICES { - int j1, j2, j; +#ifdef __SYCL_DEVICE_ONLY__ +#include +#endif + +struct WignerWrapper { + using double = double; + using complex = SNAComplex; + static constexpr int vector_length = vector_length_; + + const int offset; // my offset into the vector (0, ..., vector_length - 1) + double* buffer; // buffer of real numbers + + KOKKOS_INLINE_FUNCTION + WignerWrapper(complex* buffer_, const int offset_) + : offset(offset_), buffer(reinterpret_cast(buffer_)) + { ; } + + KOKKOS_INLINE_FUNCTION + complex get(const int& ma) const { + return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]); + } + + KOKKOS_INLINE_FUNCTION + void set(const int& ma, const complex& store) const { + buffer[offset + 2 * vector_length * ma] = store.re; + buffer[offset + vector_length + 2 * vector_length * ma] = store.im; + } }; +struct alignas(8) FullHalfMapper { + int idxu_half; + int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj +}; + +template class SNA { -public: - typedef Kokkos::View t_sna_1i; - typedef Kokkos::View t_sna_1d; - typedef Kokkos::View t_sna_2d; - typedef Kokkos::View t_sna_3d; - typedef Kokkos::View > t_sna_3d_atomic; - typedef Kokkos::View t_sna_4d; - typedef Kokkos::View t_sna_3d3; - typedef Kokkos::View t_sna_5d; + public: + using double = double_; + using complex = SNAComplex; + static constexpr int vector_length = vector_length_; + + using KKDeviceType = typename KKDevice::value; + + typedef Kokkos::View t_sna_1i; + typedef Kokkos::View t_sna_1d; + typedef Kokkos::View> t_sna_1d_atomic; + typedef Kokkos::View t_sna_2i; + typedef Kokkos::View t_sna_2d; + typedef Kokkos::View t_sna_2d_ll; + typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; + typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; + typedef Kokkos::View t_sna_3d3; + typedef Kokkos::View t_sna_5d; + + typedef Kokkos::View t_sna_1c; + typedef Kokkos::View> t_sna_1c_atomic; + typedef Kokkos::View t_sna_2c; + typedef Kokkos::View t_sna_2c_ll; + typedef Kokkos::View t_sna_2c_lr; + typedef Kokkos::View t_sna_3c; + typedef Kokkos::View t_sna_3c_ll; + typedef Kokkos::View t_sna_4c; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_3c3; + typedef Kokkos::View t_sna_5c; + inline SNA() {}; + KOKKOS_INLINE_FUNCTION - SNA(const SNA& sna, const Kokkos::TeamPolicy<>::member_type& team); + SNA(const SNA& sna, const typename Kokkos::TeamPolicy::member_type& team); + inline - SNA(double, int, int, int, double, int, int); + SNA(double, int, double, int, int, int, int, int, int, int); KOKKOS_INLINE_FUNCTION ~SNA(); + inline void build_indexlist(); // SNA() + inline void init(); // - inline - T_INT size_team_scratch_arrays(); - inline - T_INT size_thread_scratch_arrays(); + + double memory_usage(); int ncoeff; + int host_flag; - // functions for bispectrum coefficients + // functions for bispectrum coefficients, GPU only + KOKKOS_INLINE_FUNCTION + void compute_cayley_klein(const int&, const int&, const int&); + KOKKOS_INLINE_FUNCTION + void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP + // version of the code with parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int); // ForceSNAP + void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); // ForceSNAP + // version of the code without parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_zi(const Kokkos::TeamPolicy<>::member_type& team); // ForceSNAP + void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); // ForceSNAP - // functions for derivatives + KOKKOS_INLINE_FUNCTION + void compute_zi(const int&, const int&, const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi(int,int,int, + const Kokkos::View &beta_pack); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi_with_zlist(int,int,int, + const Kokkos::View &beta_pack); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_bi(const int&, const int&, const int&); // ForceSNAP + // functions for derivatives, GPU only + // version of the code with parallelism over j_bend + template KOKKOS_INLINE_FUNCTION - void compute_duidrj(const Kokkos::TeamPolicy<>::member_type& team, double*, double, double); //ForceSNAP + void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); //ForceSNAP + // version of the code without parallelism over j_bend + template KOKKOS_INLINE_FUNCTION - void compute_dbidrj(const Kokkos::TeamPolicy<>::member_type& team); //ForceSNAP + void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); //ForceSNAP + + // core "evaluation" functions that get plugged into "compute" functions + // plugged into compute_ui_small, compute_ui_large + KOKKOS_FORCEINLINE_FUNCTION + void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const double&, const int&, + const int&, const int&, const int&); + // plugged into compute_zi, compute_yi + KOKKOS_FORCEINLINE_FUNCTION + complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, + const int&, const int&, const int&, const int&, const double*); + // plugged into compute_yi, compute_yi_with_zlist + KOKKOS_FORCEINLINE_FUNCTION + double evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, + const Kokkos::View &); + // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large + KOKKOS_FORCEINLINE_FUNCTION + double evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const double&, + const WignerWrapper&, const complex&, const complex&, const double&, + const int&, const int&, const int&, const int&); + + // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void copy_dbi2dbvec(const Kokkos::TeamPolicy<>::member_type& team); //ForceSNAP + void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - double compute_sfac(double, double); // add_uarraytot, compute_duarray + void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_zi_cpu(const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi_cpu(int, + const Kokkos::View &beta); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + + // functions for derivatives, CPU only + KOKKOS_INLINE_FUNCTION + void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + + KOKKOS_INLINE_FUNCTION + double compute_sfac(double, double, double, double); // add_uarraytot, compute_duarray + KOKKOS_INLINE_FUNCTION - double compute_dsfac(double, double); // compute_duarray + double compute_dsfac(double, double, double, double); // compute_duarray + + KOKKOS_INLINE_FUNCTION + void compute_s_dsfac(const double, const double, const double, const double, double&, double&); // compute_cayley_klein #ifdef TIMING_INFO double* timers; @@ -125,99 +244,137 @@ class SNA { //per sna class instance for OMP use - // Per InFlight Particle - t_sna_2d rij; - t_sna_1i inside; - t_sna_1d wj; - t_sna_1d rcutij; - int nmax; - - void grow_rij(int); + t_sna_3d rij; + t_sna_2i inside; + t_sna_2d wj; + t_sna_2d rcutij; + t_sna_2d sinnerij; + t_sna_2d dinnerij; + t_sna_2i element; + t_sna_3d dedr; + int natom, nmax; + + void grow_rij(int, int); int twojmax, diagonalstyle; - // Per InFlight Particle - t_sna_3d uarraytot_r, uarraytot_i; - t_sna_3d_atomic uarraytot_r_a, uarraytot_i_a; - t_sna_5d zarray_r, zarray_i; - // Per InFlight Interaction - t_sna_3d uarray_r, uarray_i; + t_sna_3d blist; + t_sna_3c_ll ulisttot; + t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only + t_sna_3c_ll zlist; - // derivatives of data - Kokkos::View dbvec; - t_sna_4d duarray_r, duarray_i; - t_sna_4d dbarray; + t_sna_3c_ll ulist; + t_sna_3c_ll ylist; -private: + // derivatives of data + t_sna_4c3_ll dulist; + + // Modified structures for GPU backend + t_sna_3c_ll a_pack; // Cayley-Klein `a` + t_sna_3c_ll b_pack; // `b` + t_sna_4c_ll da_pack; // `da` + t_sna_4c_ll db_pack; // `db` + t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z} + + t_sna_4d_ll ulisttot_re_pack; // split real, + t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened + t_sna_4c_ll ulisttot_pack; // AoSoA layout + t_sna_4c_ll zlist_pack; // AoSoA layout + t_sna_4d_ll blist_pack; + t_sna_4d_ll ylist_pack_re; // split real, + t_sna_4d_ll ylist_pack_im; // imag AoSoA layout + + int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; + + // Chem snap counts + int nelements; + int ndoubles; + int ntriples; + + private: double rmin0, rfac0; //use indexlist instead of loops, constructor generates these - // Same accross all SNA - Kokkos::View idxj,idxj_full; - int idxj_max,idxj_full_max; + // Same across all SNA + Kokkos::View idxz; + Kokkos::View idxb; + Kokkos::View idxcg_block; + + public: + Kokkos::View idxu_block; + Kokkos::View idxu_half_block; + Kokkos::View idxu_cache_block; + Kokkos::View idxu_full_half; + + private: + Kokkos::View idxz_block; + Kokkos::View idxb_block; + // data for bispectrum coefficients - // Same accross all SNA - t_sna_5d cgarray; + // Same across all SNA + t_sna_1d cglist; t_sna_2d rootpqarray; - static const int nmaxfactorial = 167; - KOKKOS_INLINE_FUNCTION + static const double nfac_table[]; + inline double factorial(int); KOKKOS_INLINE_FUNCTION - void create_team_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team); // SNA() + void create_team_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team); // SNA() KOKKOS_INLINE_FUNCTION - void create_thread_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team); // SNA() + void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team); // SNA() + inline void init_clebsch_gordan(); // init() + inline void init_rootpqarray(); // init() - KOKKOS_INLINE_FUNCTION - void zero_uarraytot(const Kokkos::TeamPolicy<>::member_type& team); // compute_ui - KOKKOS_INLINE_FUNCTION - void addself_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double); // compute_ui - KOKKOS_INLINE_FUNCTION - void add_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double, double, double); // compute_ui KOKKOS_INLINE_FUNCTION - void compute_uarray(const Kokkos::TeamPolicy<>::member_type& team, - double, double, double, - double, double); // compute_ui + void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui + KOKKOS_INLINE_FUNCTION + void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + const double&, const double&, const double&, + const double&, const double&); // compute_ui_cpu + + + inline double deltacg(int, int, int); // init_clebsch_gordan + inline int compute_ncoeff(); // SNA() KOKKOS_INLINE_FUNCTION - void compute_duarray(const Kokkos::TeamPolicy<>::member_type& team, - double, double, double, // compute_duidrj - double, double, double, double, double); - - // if number of atoms are small use per atom arrays - // for twojmax arrays, rij, inside, bvec - // this will increase the memory footprint considerably, - // but allows parallel filling and reuse of these arrays - int use_shared_arrays; + void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + const double&, const double&, const double&, // compute_duidrj_cpu + const double&, const double&, const double&, const double&, const double&, + const double&, const double&); // Sets the style for the switching function // 0 = none // 1 = cosine int switch_flag; + // Sets the style for the inner switching function + // 0 = none + // 1 = cosine + int switch_inner_flag; + + // Chem snap flags + int chem_flag; + int bnorm_flag; + // Self-weight double wself; + int wselfall_flag; + + int bzero_flag; // 1 if bzero subtracted from barray + Kokkos::View bzero; // array of B values for isolated atoms }; #include #endif -/* ERROR/WARNING messages: - -E: Invalid argument to factorial %d - -N must be >= 0 and <= 167, otherwise the factorial result is too -large. - -*/ diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp index 40f5bad..3bbcc4b 100644 --- a/src/force_types/sna_impl.hpp +++ b/src/force_types/sna_impl.hpp @@ -1,6 +1,7 @@ -/* ---------------------------------------------------------------------- +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories + https://www.lammps.org/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract @@ -12,68 +13,69 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Aidan Thompson, Christian Trott, SNL + Contributing authors: Christian Trott (SNL), Stan Moore (SNL) ------------------------------------------------------------------------- */ #include "sna.h" -#include -#include -#include +#include "memory_kokkos.h" +#include +#include +#include +#include static const double MY_PI = 3.14159265358979323846; // pi +static const double MY_PI2 = 1.57079632679489661923; // pi/2 inline SNA::SNA(double rfac0_in, - int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in, - double rmin0_in, int switch_flag_in, int bzero_flag_in) + int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in, + int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in) { - wself = 1.0; - - use_shared_arrays = use_shared_arrays_in; + LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + host_flag = (execution_space == LAMMPS_NS::Host); + + wself = static_cast(1.0); + rfac0 = rfac0_in; rmin0 = rmin0_in; switch_flag = switch_flag_in; + switch_inner_flag = switch_inner_flag_in; + bzero_flag = bzero_flag_in; + + chem_flag = chem_flag_in; + if (chem_flag) + nelements = nelements_in; + else + nelements = 1; + bnorm_flag = bnorm_flag_in; + wselfall_flag = wselfall_flag_in; twojmax = twojmax_in; - diagonalstyle = diagonalstyle_in; ncoeff = compute_ncoeff(); - //create_twojmax_arrays(); - nmax = 0; - + natom = 0; + build_indexlist(); - int jdim = twojmax + 1; + int jdimpq = twojmax + 2; + MemKK::realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq); - cgarray = t_sna_5d("SNA::cgarray",jdim,jdim,jdim,jdim,jdim); - rootpqarray = t_sna_2d("SNA::rootpqarray",jdim+1,jdim+1); + MemKK::realloc_kokkos(cglist,"SNA::cglist",idxcg_max); -} + if (bzero_flag) { + MemKK::realloc_kokkos(bzero,"sna:bzero",twojmax+1); + auto h_bzero = Kokkos::create_mirror_view(bzero); -KOKKOS_INLINE_FUNCTION -SNA::SNA(const SNA& sna, const Kokkos::TeamPolicy<>::member_type& team) { - wself = sna.wself; - - use_shared_arrays = sna.use_shared_arrays; - rfac0 = sna.rfac0; - rmin0 = sna.rmin0; - switch_flag = sna.switch_flag; - - twojmax = sna.twojmax; - diagonalstyle = sna.diagonalstyle; - - ncoeff = sna.ncoeff; - nmax = sna.nmax; - idxj = sna.idxj; - idxj_max = sna.idxj_max; - idxj_full = sna.idxj_full; - idxj_full_max = sna.idxj_full_max; - cgarray = sna.cgarray; - rootpqarray = sna.rootpqarray; - create_team_scratch_arrays(team); - create_thread_scratch_arrays(team); + double www = wself*wself*wself; + for (int j = 0; j <= twojmax; j++) + if (bnorm_flag) + h_bzero[j] = www; + else + h_bzero[j] = www*(j+1); + Kokkos::deep_copy(bzero,h_bzero); + } } /* ---------------------------------------------------------------------- */ @@ -86,50 +88,190 @@ SNA::~SNA() inline void SNA::build_indexlist() { - if(diagonalstyle == 3) { - int idxj_count = 0; - int idxj_full_count = 0; - - for(int j1 = 0; j1 <= twojmax; j1++) - for(int j2 = 0; j2 <= j1; j2++) - for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) { - if (j >= j1) idxj_count++; - idxj_full_count++; - } + // index list for cglist - // indexList can be changed here + int jdim = twojmax + 1; + MemKK::realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim); + auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block); - idxj = Kokkos::View("SNA::idxj",idxj_count); - idxj_full = Kokkos::View("SNA::idxj_full",idxj_full_count); - auto h_idxj = Kokkos::create_mirror_view(idxj); - auto h_idxj_full = Kokkos::create_mirror_view(idxj_full); + int idxcg_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + h_idxcg_block(j1,j2,j) = idxcg_count; + for (int m1 = 0; m1 <= j1; m1++) + for (int m2 = 0; m2 <= j2; m2++) + idxcg_count++; + } + idxcg_max = idxcg_count; + Kokkos::deep_copy(idxcg_block,h_idxcg_block); - idxj_max = idxj_count; - idxj_full_max = idxj_full_count; + // index list for uarray + // need to include both halves - idxj_count = 0; - idxj_full_count = 0; + MemKK::realloc_kokkos(idxu_block,"SNA::idxu_block",jdim); + auto h_idxu_block = Kokkos::create_mirror_view(idxu_block); - for(int j1 = 0; j1 <= twojmax; j1++) - for(int j2 = 0; j2 <= j1; j2++) - for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) { - if (j >= j1) { - h_idxj[idxj_count].j1 = j1; - h_idxj[idxj_count].j2 = j2; - h_idxj[idxj_count].j = j; - idxj_count++; - } - h_idxj_full[idxj_full_count].j1 = j1; - h_idxj_full[idxj_full_count].j2 = j2; - h_idxj_full[idxj_full_count].j = j; - idxj_full_count++; + int idxu_count = 0; + + for (int j = 0; j <= twojmax; j++) { + h_idxu_block[j] = idxu_count; + for (int mb = 0; mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxu_count++; + } + idxu_max = idxu_count; + Kokkos::deep_copy(idxu_block,h_idxu_block); + + // index list for half uarray + MemKK::realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim); + auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block); + + int idxu_half_count = 0; + for (int j = 0; j <= twojmax; j++) { + h_idxu_half_block[j] = idxu_half_count; + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxu_half_count++; + } + idxu_half_max = idxu_half_count; + Kokkos::deep_copy(idxu_half_block, h_idxu_half_block); + + // mapping between full and half indexing, encoding flipping + MemKK::realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max); + auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half); + + idxu_count = 0; + for (int j = 0; j <= twojmax; j++) { + int jju_half = h_idxu_half_block[j]; + for (int mb = 0; mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + FullHalfMapper mapper; + if (2*mb <= j) { + mapper.idxu_half = jju_half + mb * (j + 1) + ma; + mapper.flip_sign = 0; + } else { + mapper.idxu_half = jju_half + (j + 1 - mb) * (j + 1) - (ma + 1); + mapper.flip_sign = (((ma+mb)%2==0)?1:-1); } - Kokkos::deep_copy(idxj,h_idxj); - Kokkos::deep_copy(idxj_full,h_idxj_full); + h_idxu_full_half[idxu_count] = mapper; + idxu_count++; + } + } + } + Kokkos::deep_copy(idxu_full_half, h_idxu_full_half); + + // index list for "cache" uarray + // this is the GPU scratch memory requirements + // applied the CPU structures + MemKK::realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim); + auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block); + + int idxu_cache_count = 0; + for (int j = 0; j <= twojmax; j++) { + h_idxu_cache_block[j] = idxu_cache_count; + for (int mb = 0; mb < ((j+3)/2); mb++) + for (int ma = 0; ma <= j; ma++) + idxu_cache_count++; } + idxu_cache_max = idxu_cache_count; + Kokkos::deep_copy(idxu_cache_block, h_idxu_cache_block); + + // index list for beta and B + + int idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) idxb_count++; + + idxb_max = idxb_count; + MemKK::realloc_kokkos(idxb,"SNA::idxb",idxb_max); + auto h_idxb = Kokkos::create_mirror_view(idxb); + + idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) { + h_idxb(idxb_count,0) = j1; + h_idxb(idxb_count,1) = j2; + h_idxb(idxb_count,2) = j; + idxb_count++; + } + Kokkos::deep_copy(idxb,h_idxb); + + // reverse index list for beta and b + + MemKK::realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim); + auto h_idxb_block = Kokkos::create_mirror_view(idxb_block); + + idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + if (j >= j1) { + h_idxb_block(j1,j2,j) = idxb_count; + idxb_count++; + } + } + Kokkos::deep_copy(idxb_block,h_idxb_block); + + // index list for zlist + + int idxz_count = 0; + + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxz_count++; + + idxz_max = idxz_count; + MemKK::realloc_kokkos(idxz,"SNA::idxz",idxz_max); + auto h_idxz = Kokkos::create_mirror_view(idxz); + + MemKK::realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim); + auto h_idxz_block = Kokkos::create_mirror_view(idxz_block); + + idxz_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + h_idxz_block(j1,j2,j) = idxz_count; + + // find right beta(ii,jjb) entry + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + h_idxz(idxz_count,0) = j1; + h_idxz(idxz_count,1) = j2; + h_idxz(idxz_count,2) = j; + h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2); + h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2; + h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2); + h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2; + h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1; + h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1; + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // ylist is "compressed" via symmetry in its + // contraction with dulist + const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma; + h_idxz(idxz_count,9) = jju_half; + + idxz_count++; + } + } + Kokkos::deep_copy(idxz,h_idxz); + Kokkos::deep_copy(idxz_block,h_idxz_block); } + /* ---------------------------------------------------------------------- */ inline @@ -140,19 +282,268 @@ void SNA::init() } inline -void SNA::grow_rij(int newnmax) +void SNA::grow_rij(int newnatom, int newnmax) { - if(newnmax <= nmax) return; + if (newnatom <= natom && newnmax <= nmax) return; + natom = newnatom; nmax = newnmax; + + MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3); + MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax); + MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax); + MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax); + MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax); + MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax); + MemKK::realloc_kokkos(element,"sna:element",natom,nmax); + MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); + +#ifdef LMP_KOKKOS_GPU + if (!host_flag) { + const int natom_div = (natom + vector_length - 1) / vector_length; + + MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); + MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); + MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3); + MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3); + MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4); + MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation + MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); + MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); + MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div); + MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div); + MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); + MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); + MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); + MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); + MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); + MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); + MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); + MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); + } else { +#endif + MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1); + MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1); + MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); + MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); + MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); + MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom); + MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom); + MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); + MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); + MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); + MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax); + MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom); + MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); + MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); + MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom); + MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); + MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); + MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); + +#ifdef LMP_KOKKOS_GPU + } +#endif } + +/* ---------------------------------------------------------------------- + * GPU routines + * ----------------------------------------------------------------------*/ + + /* ---------------------------------------------------------------------- - compute Ui by summing over neighbors j + Precompute the Cayley-Klein parameters and the derivatives thereof. + This routine better exploits parallelism than the GPU ComputeUi and + ComputeFusedDeidrj, which are one warp per atom-neighbor pair. ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int jnum) +void SNA::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) +{ + const int iatom = iatom_mod + vector_length * iatom_div; + const double x = rij(iatom,jnbor,0); + const double y = rij(iatom,jnbor,1); + const double z = rij(iatom,jnbor,2); + const double rsq = x * x + y * y + z * z; + const double r = sqrt(rsq); + const double rcut = rcutij(iatom, jnbor); + const double sinner = sinnerij(iatom, jnbor); + const double dinner = dinnerij(iatom, jnbor); + const double rscale0 = rfac0 * static_cast(MY_PI) / (rcut - rmin0); + const double theta0 = (r - rmin0) * rscale0; + const double sn = sin(theta0); + const double cs = cos(theta0); + const double z0 = r * cs / sn; + const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + + const double wj_local = wj(iatom, jnbor); + double sfac, dsfac; + compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac); + sfac *= wj_local; + dsfac *= wj_local; + + const double rinv = static_cast(1.0) / r; + const double ux = x * rinv; + const double uy = y * rinv; + const double uz = z * rinv; + + const double r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); + + const complex a = { z0 * r0inv, -z * r0inv }; + const complex b = { r0inv * y, -r0inv * x }; + + const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + + const double dr0invx = dr0invdr * ux; + const double dr0invy = dr0invdr * uy; + const double dr0invz = dr0invdr * uz; + + const double dz0x = dz0dr * ux; + const double dz0y = dz0dr * uy; + const double dz0z = dz0dr * uz; + + const complex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx }; + const complex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy }; + const complex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv }; + + const complex dbx = { y * dr0invx, -x * dr0invx - r0inv }; + const complex dby = { y * dr0invy + r0inv, -x * dr0invy }; + const complex dbz = { y * dr0invz, -x * dr0invz }; + + const double dsfacux = dsfac * ux; + const double dsfacuy = dsfac * uy; + const double dsfacuz = dsfac * uz; + + a_pack(iatom_mod,jnbor,iatom_div) = a; + b_pack(iatom_mod,jnbor,iatom_div) = b; + + da_pack(iatom_mod,jnbor,iatom_div,0) = dax; + db_pack(iatom_mod,jnbor,iatom_div,0) = dbx; + + da_pack(iatom_mod,jnbor,iatom_div,1) = day; + db_pack(iatom_mod,jnbor,iatom_div,1) = dby; + + da_pack(iatom_mod,jnbor,iatom_div,2) = daz; + db_pack(iatom_mod,jnbor,iatom_div,2) = dbz; + + sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac; + sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux; + sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy; + sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz; + + // we need to explicitly zero `dedr` somewhere before hitting + // ComputeFusedDeidrj --- this is just a convenient place to do it. + dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast(0.); + dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast(0.); + dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast(0.); + +} + +/* ---------------------------------------------------------------------- + Initialize ulisttot with self-energy terms. + Ulisttot uses a "half" data layout which takes + advantage of the symmetry of the Wigner U matrices. +------------------------------------------------------------------------- */ + +KOKKOS_INLINE_FUNCTION +void SNA::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) +{ + + for (int jelem = 0; jelem < nelements; jelem++) { + int jju_half = idxu_half_block(j); + + // Only diagonal elements get initialized + // Top half only: gets symmetrized by TransformUi + + for (int mb = 0; 2*mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + + double re_part = static_cast(0.); + if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } + + ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part; + ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast(0.); + + jju_half++; + } + } + } + +} + +/* ---------------------------------------------------------------------- + compute Ui by computing Wigner U-functions for one neighbor and + accumulating to the total. GPU only. +------------------------------------------------------------------------- */ + +// Version of the code that exposes additional parallelism by threading over `j_bend` values + +KOKKOS_INLINE_FUNCTION +void SNA::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +{ + + // get shared memory offset + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; + + // extract and wrap + const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + + // load parameters + const complex a = a_pack(iatom_mod, jnbor, iatom_div); + const complex b = b_pack(iatom_mod, jnbor, iatom_div); + const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + + // we need to "choose" when to bend + // this for loop is here for context --- we expose additional + // parallelism over this loop instead + //for (int j_bend = 0; j_bend <= twojmax; j_bend++) { + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); +} + +// Version of the code that loops over all `j_bend` values which reduces integer arithmetic +// and some amount of load imbalance, at the expense of reducing parallelism +KOKKOS_INLINE_FUNCTION +void SNA::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +{ + // get shared memory offset + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; + + // extract and wrap + const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + + // load parameters + const complex a = a_pack(iatom_mod, jnbor, iatom_div); + const complex b = b_pack(iatom_mod, jnbor, iatom_div); + const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + + // we need to "choose" when to bend + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int j_bend = 0; j_bend <= twojmax; j_bend++) { + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); + } +} + +// Core "evaluation" kernel that gets reused in `compute_ui_small` and `compute_ui_large` +KOKKOS_FORCEINLINE_FUNCTION +void SNA::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, + const complex& a, const complex& b, const double& sfac, const int& jelem, + const int& iatom_mod, const int& j_bend, const int& iatom_div) { - double rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma @@ -160,488 +551,1087 @@ void SNA::compute_ui(const Kokkos::TeamPolicy<>::member_type& team, int jnum) // compute r0 = (x,y,z,z0) // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - if(team.team_rank() == 0) { - zero_uarraytot(team); - //Kokkos::single(Kokkos::PerThread(team), [&] (){ - addself_uarraytot(team,wself); - //}); + // level 0 is just 1. + ulist_wrapper.set(0, complex::one()); + + // j from before the bend, don't store, mb == 0 + for (int j = 1; j <= j_bend; j++) { + + constexpr int mb = 0; // intentional for readability, compiler should optimize this out + + complex ulist_accum = complex::zero(); + + int ma; + for (ma = 0; ma < j; ma++) { + + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + + // ulist_accum += rootpq * a.conj() * ulist_prev; + double rootpq = rootpqarray(j - ma, j - mb); + ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im); + ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re); + + // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here + ulist_wrapper.set(ma, ulist_accum); + + // next value + // ulist_accum = -rootpq * b.conj() * ulist_prev; + rootpq = rootpqarray(ma + 1, j - mb); + ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im); + ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); + + } + + ulist_wrapper.set(ma, ulist_accum); } - team.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum), - [&] (const int& j) { - //for(int j = 0; j < jnum; j++) { - x = rij(j,0); - y = rij(j,1); - z = rij(j,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray(team,x, y, z, z0, r); - //Kokkos::single(Kokkos::PerThread(team), [&] (){ - add_uarraytot(team,r, wj[j], rcutij[j]); - //}); - }); + // now we're after the bend, start storing but only up to the "half way point" + const int j_half_way = MIN(2 * j_bend, twojmax); + + int mb = 1; + int j; //= j_bend + 1; // need this value below + for (j = j_bend + 1; j <= j_half_way; j++) { + + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + complex ulist_accum = complex::zero(); + int ma; + for (ma = 0; ma < j; ma++) { + + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + + // atomic add the previous level here + Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + + // ulist_accum += rootpq * b * ulist_prev; + double rootpq = rootpqarray(j - ma, mb); + ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im); + ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re); + + // store ulist_accum + ulist_wrapper.set(ma, ulist_accum); + + // next value + // ulist_accum = rootpq * a * ulist_prev; + rootpq = rootpqarray(ma + 1, mb); + ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im); + ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re); + } + + ulist_wrapper.set(ma, ulist_accum); + + mb++; + } + + // atomic add the last level + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + for (int ma = 0; ma < j; ma++) { + const complex ulist_prev = ulist_wrapper.get(ma); + + // atomic add the previous level here + Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + } + +} + +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui, + AoSoA data layout to take advantage of coalescing, avoiding warp + divergence. GPU version +------------------------------------------------------------------------- */ + +KOKKOS_INLINE_FUNCTION +void SNA::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) +{ + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + + const double* cgblock = cglist.data() + idxcg_block(j1, j2, j); + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + zlist_pack(iatom_mod,jjz,idouble,iatom_div) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); + + idouble++; + } + } } /* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui + compute Bi by summing conj(Ui)*Zi + AoSoA data layout to take advantage of coalescing, avoiding warp + divergence. ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_zi(const Kokkos::TeamPolicy<>::member_type& team) +void SNA::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // for ma = 0,...,j - // for mb = 0,...,jmid - // z(j1,j2,j,ma,mb) = 0 - // for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2) - // sumb1 = 0 - // ma2 = ma-ma1+(j1+j2-j)/2; - // for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2) - // mb2 = mb-mb1+(j1+j2-j)/2; - // sumb1 += cg(j1,mb1,j2,mb2,j) * - // u(j1,ma1,mb1) * u(j2,ma2,mb2) - // z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j) - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &starttime); -#endif + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + const int j1 = idxb(jjb,0); + const int j2 = idxb(jjb,1); + const int j = idxb(jjb,2); + + const int jjz = idxz_block(j1,j2,j); + const int jju = idxu_block[j]; + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int elem3 = 0; elem3 < nelements; elem3++) { + + double sumzu = 0.0; + double sumzu_temp = 0.0; + + for (int mb = 0; 2*mb < j; mb++) { + for (int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return; // I think we can remove this? + const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; - // compute_dbidrj() requires full j1/j2/j chunk of z elements - // use zarray j1/j2 symmetry - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxj_full_max), - [&] (const int& idx) { - const int j1 = idxj_full(idx).j1; - const int j2 = idxj_full(idx).j2; - const int j = idxj_full(idx).j; - - const int bound = (j+2)/2; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma ) { - //for(int mb = 0; 2*mb <= j; mb++) - //for(int ma = 0; ma <= j; ma++) { - const int ma = mbma%(j+1); - const int mb = mbma/(j+1); - //zarray_r(j1,j2,j,ma,mb) = 0.0; - //zarray_i(j1,j2,j,ma,mb) = 0.0; - double z_r = 0.0; - double z_i = 0.0; - - for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2); - ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) { - double sumb1_r = 0.0; - double sumb1_i = 0.0; - - const int ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2; - - for(int mb1 = MAX( 0, (2 * mb - j - j2 + j1) / 2); - mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) { - - const int mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2; - const double cga = cgarray(j1,j2,j,mb1,mb2); - const double uat1_r = uarraytot_r(j1,ma1,mb1); - const double uat1_i = uarraytot_i(j1,ma1,mb1); - const double uat2_r = uarraytot_r(j2,ma2,mb2); - const double uat2_i = uarraytot_i(j2,ma2,mb2); - sumb1_r += cga * (uat1_r * uat2_r - uat1_i * uat2_i); - sumb1_i += cga * (uat1_r * uat2_i + uat1_i * uat2_r); - /*sumb1_r += cgarray(j1,j2,j,mb1,mb2) * - (uarraytot_r(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2) - - uarraytot_i(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2)); - sumb1_i += cgarray(j1,j2,j,mb1,mb2) * - (uarraytot_r(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2) + - uarraytot_i(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2));*/ - } // end loop over mb1 - - const double cga = cgarray(j1,j2,j,ma1,ma2); - z_r += sumb1_r * cga;//rray(j1,j2,j,ma1,ma2); - z_i += sumb1_i * cga;//rray(j1,j2,j,ma1,ma2); - } // end loop over ma1 - zarray_r(j1,j2,j,mb,ma) = z_r; - zarray_i(j1,j2,j,mb,ma) = z_i; - }); // end loop over ma, mb - // } - //} - }); - //} // end loop over j - //} // end loop over j1, j2 + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif + const int mb = j/2; + for (int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= static_cast(2.0); + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu; + //} // end loop over j + //} // end loop over j1, j2 + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 } /* ---------------------------------------------------------------------- - calculate derivative of Ui w.r.t. atom j + compute Yi from Ui without storing Zi, looping over zlist indices. + AoSoA data layout to take advantage of coalescing, avoiding warp + divergence. GPU version. ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_duidrj(const Kokkos::TeamPolicy<>::member_type& team, - double* rij, double wj, double rcut) +void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div, + const Kokkos::View &beta_pack) { - double rsq, r, x, y, z, z0, theta0, cs, sn; - double dz0dr; - x = rij[0]; - y = rij[1]; - z = rij[2]; - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - double rscale0 = rfac0 * MY_PI / (rcut - rmin0); - theta0 = (r - rmin0) * rscale0; - cs = cos(theta0); - sn = sin(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + const int jju_half = idxz(jjz, 9); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + const double betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); + + Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 +} -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &starttime); -#endif +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices. + AoSoA data layout to take advantage of coalescing, avoiding warp + divergence. GPU version. +------------------------------------------------------------------------- */ - compute_duarray(team, x, y, z, z0, r, dz0dr, wj, rcut); +KOKKOS_INLINE_FUNCTION +void SNA::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, + const Kokkos::View &beta_pack) +{ + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int jju_half = idxz(jjz, 9); + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const complex ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + const double betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); + + Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif +// Core "evaluation" kernel that computes a single zlist value +// which gets used in both `compute_zi` and `compute_yi` +KOKKOS_FORCEINLINE_FUNCTION +typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int& j, + const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, + const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const double* cgblock) { + + complex ztmp = complex::zero(); + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int ib = 0; ib < nb; ib++) { + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int ia = 0; ia < na; ia++) { + const complex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); + const complex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); + const double cgcoeff_a = cgblock[icga]; + const double cgcoeff_b = cgblock[icgb]; + ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + const double scale = static_cast(1) / static_cast(j + 1); + ztmp.re *= scale; + ztmp.im *= scale; + } + + return ztmp; +} + +// Core "evaluation" kernel that extracts and rescales the appropriate `beta` value, +// which gets used in both `compute_yi` and `compute_yi_from_zlist +KOKKOS_FORCEINLINE_FUNCTION +typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, + const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, + const Kokkos::View &beta_pack) { + + double betaj = 0; + + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = static_cast(3) * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); + } else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else { + const int jjb = idxb_block(j2, j, j1); + const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta_pack(iatom_mod, itriple, iatom_div); + } + + if (!bnorm_flag && j1 > j) { + const double scale = static_cast(j1 + 1) / static_cast(j + 1); + betaj *= scale; + } + + return betaj; } /* ---------------------------------------------------------------------- - calculate derivative of Bi w.r.t. atom j - variant using indexlist for j1,j2,j - variant using symmetry relation + Fused calculation of the derivative of Ui w.r.t. atom j + and accumulation into dEidRj. GPU only. ------------------------------------------------------------------------- */ +// Version of the code that exposes additional parallelism by threading over `j_bend` values +template KOKKOS_INLINE_FUNCTION -void SNA::compute_dbidrj(const Kokkos::TeamPolicy<>::member_type& team) +void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) { - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // zdb = 0 - // for mb = 0,...,jmid - // for ma = 0,...,j - // zdb += - // Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) - // dbdr(j1,j2,j) += 2*zdb - // zdb = 0 - // for mb1 = 0,...,j1mid - // for ma1 = 0,...,j1 - // zdb += - // Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1) - // dbdr(j1,j2,j) += 2*zdb*(j+1)/(j1+1) - // zdb = 0 - // for mb2 = 0,...,j2mid - // for ma2 = 0,...,j2 - // zdb += - // Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2) - // dbdr(j1,j2,j) += 2*zdb*(j+1)/(j2+1) - - double* dudr_r, *dudr_i; - double jjjmambzarray_r; - double jjjmambzarray_i; - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &starttime); -#endif - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max), - [&] (const int& JJ) { - //for(int JJ = 0; JJ < idxj_max; JJ++) { - const int j1 = idxj[JJ].j1; - const int j2 = idxj[JJ].j2; - const int j = idxj[JJ].j; - -// dbdr = &dbarray(j1,j2,j,0); -// dbdr[0] = 0.0; -// dbdr[1] = 0.0; -// dbdr[2] = 0.0; - - t_scalar3 dbdr,sumzdu_r; - // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) - - // use zarray j1/j2 symmetry (optional) - - int j_,j1_,j2_; - if (j1 >= j2) { - //jjjzarray_r = &zarray_r(j1,j2,j); - //jjjzarray_i = &zarray_i(j1,j2,j); - j1_ = j1; - j2_ = j2; - j_ = j; - } else { - j1_ = j2; - j2_ = j1; - j_ = j; - //jjjzarray_r = &zarray_r(j2,j1,j); - //jjjzarray_i = &zarray_i(j2,j1,j); + // get shared memory offset + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; + + // extract, wrap shared memory buffer + WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + + // load parameters + const complex a = a_pack(iatom_mod, jnbor, iatom_div); + const complex b = b_pack(iatom_mod, jnbor, iatom_div); + const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); + const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); + const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const double dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + + // compute the contribution to dedr_full_sum for one "bend" location + const double dedr_full_sum = evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, + jelem, iatom_mod, j_bend, iatom_div); + + // dedr gets zeroed out at the start of each iteration in compute_cayley_klein + Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast(2.0) * dedr_full_sum); + +} + +// Version of the code that loops over all `j_bend` values which reduces integer arithmetic +// and some amount of load imbalance, at the expense of reducing parallelism +template +KOKKOS_INLINE_FUNCTION +void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +{ + // get shared memory offset + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; + + // extract, wrap shared memory buffer + WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + + // load parameters + const complex a = a_pack(iatom_mod, jnbor, iatom_div); + const complex b = b_pack(iatom_mod, jnbor, iatom_div); + const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); + const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); + const double sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const double dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + + // compute the contributions to dedr_full_sum for all "bend" locations + double dedr_full_sum = static_cast(0); + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int j_bend = 0; j_bend <= twojmax; j_bend++) { + dedr_full_sum += evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, + jelem, iatom_mod, j_bend, iatom_div); + } + + // there's one thread per atom, neighbor pair, so no need to make this atomic + dedr(iatom_mod + vector_length * iatom_div, jnbor, dir) = static_cast(2.0) * dedr_full_sum; + +} + +// Core "evaluation" kernel that gets reused in `compute_fused_deidrj_small` and +// `compute_fused_deidrj_large` +KOKKOS_FORCEINLINE_FUNCTION +typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac, + const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu, + const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { + + double dedr_full_sum = static_cast(0); + + // level 0 is just 1, 0 + ulist_wrapper.set(0, complex::one()); + dulist_wrapper.set(0, complex::zero()); + + // j from before the bend, don't store, mb == 0 + // this is "creeping up the side" + for (int j = 1; j <= j_bend; j++) { + + constexpr int mb = 0; // intentional for readability, compiler should optimize this out + + complex ulist_accum = complex::zero(); + complex dulist_accum = complex::zero(); + + int ma; + for (ma = 0; ma < j; ma++) { + + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); + + // ulist_accum += rootpq * a.conj() * ulist_prev; + double rootpq = rootpqarray(j - ma, j - mb); + ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im); + ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re); + + // product rule of above + dulist_accum.re += rootpq * (da.re * ulist_prev.re + da.im * ulist_prev.im + a.re * dulist_prev.re + a.im * dulist_prev.im); + dulist_accum.im += rootpq * (da.re * ulist_prev.im - da.im * ulist_prev.re + a.re * dulist_prev.im - a.im * dulist_prev.re); + + // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); + + // next value + // ulist_accum = -rootpq * b.conj() * ulist_prev; + rootpq = rootpqarray(ma + 1, j - mb); + ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im); + ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); + + // product rule of above + dulist_accum.re = -rootpq * (db.re * ulist_prev.re + db.im * ulist_prev.im + b.re * dulist_prev.re + b.im * dulist_prev.im); + dulist_accum.im = -rootpq * (db.re * ulist_prev.im - db.im * ulist_prev.re + b.re * dulist_prev.im - b.im * dulist_prev.re); + } - for(int mb = 0; 2*mb < j; mb++) - for(int ma = 0; ma <= j; ma++) { + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); + } - dudr_r = &duarray_r(j,mb,ma,0); - dudr_i = &duarray_i(j,mb,ma,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb,ma); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb,ma); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); + // now we're after the bend, start storing but only up to the "half way point" + const int j_half_way = MIN(2 * j_bend, twojmax); - } //end loop over ma mb + int mb = 1; + int j; //= j_bend + 1; // need this value below + for (j = j_bend + 1; j <= j_half_way; j++) { - // For j even, handle middle column + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; - if (j%2 == 0) { - int mb = j/2; - for(int ma = 0; ma <= mb; ma++) { - dudr_r = &duarray_r(j,mb,ma,0); - dudr_i = &duarray_i(j,mb,ma,0); - const double factor = ma==mb?0.5:1.0; - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb,ma) * factor; - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb,ma) * factor; - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); - } - } // end if jeven + complex ulist_accum = complex::zero(); + complex dulist_accum = complex::zero(); + + int ma; + for (ma = 0; ma < j; ma++) { - dbdr += 2.0*sumzdu_r; + // grab y_local early + // this will never be the last element of a row, no need to rescale. + complex y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div)); - // Sum over Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1) + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); - double j1fac = (j+1)/(j1+1.0); + // ulist_accum += rootpq * b * ulist_prev; + double rootpq = rootpqarray(j - ma, mb); + ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im); + ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re); - sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0; + // product rule of above + dulist_accum.re += rootpq * (db.re * ulist_prev.re - db.im * ulist_prev.im + b.re * dulist_prev.re - b.im * dulist_prev.im); + dulist_accum.im += rootpq * (db.re * ulist_prev.im + db.im * ulist_prev.re + b.re * dulist_prev.im + b.im * dulist_prev.re); - // use zarray j1/j2 symmetry (optional) + // store ulist_accum + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); - if (j >= j2) { - j1_ = j; - j2_ = j2; - j_ = j1; + // Directly accumulate deidrj into sum_tmp + const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev); + dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im; + + // next value + // ulist_accum = rootpq * a * ulist_prev; + rootpq = rootpqarray(ma + 1, mb); + ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im); + ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re); + + // product rule of above + dulist_accum.re = rootpq * (da.re * ulist_prev.re - da.im * ulist_prev.im + a.re * dulist_prev.re - a.im * dulist_prev.im); + dulist_accum.im = rootpq * (da.re * ulist_prev.im + da.im * ulist_prev.re + a.re * dulist_prev.im + a.im * dulist_prev.re); - //jjjzarray_r = zarray_r(j,j2,j1); - //jjjzarray_i = zarray_i(j,j2,j1); - } else { - j1_ = j2; - j2_ = j; - j_ = j1; - //jjjzarray_r = zarray_r(j2,j,j1); - //jjjzarray_i = zarray_i(j2,j,j1); } - for(int mb1 = 0; 2*mb1 < j1; mb1++) - for(int ma1 = 0; ma1 <= j1; ma1++) { - - dudr_r = &duarray_r(j1,mb1,ma1,0); - dudr_i = &duarray_i(j1,mb1,ma1,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); - } //end loop over ma1 mb1 - - // For j1 even, handle middle column - - if (j1%2 == 0) { - const int mb1 = j1/2; - for(int ma1 = 0; ma1 <= mb1; ma1++) { - dudr_r = &duarray_r(j1,mb1,ma1,0); - dudr_i = &duarray_i(j1,mb1,ma1,0); - const double factor = ma1==mb1?0.5:1.0; - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1) * factor; - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1) * factor; - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); - } - } // end if j1even + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); - dbdr += 2.0*sumzdu_r*j1fac; + mb++; + } + + // accumulate the last level + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + for (int ma = 0; ma < j; ma++) { + // grab y_local early + complex y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div)); + if (j % 2 == 1 && 2*(mb-1) == j-1) { // double check me... + if (ma == (mb-1)) { y_local = static_cast(0.5)*y_local; } + else if (ma > (mb-1)) { y_local.re = static_cast(0.); y_local.im = static_cast(0.); } // can probably avoid this outright + // else the ma < mb gets "double counted", cancelling the 0.5. + } - // Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2) + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); - double j2fac = (j+1)/(j2+1.0); + // Directly accumulate deidrj into sum_tmp + const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev); + dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im; - sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0; + } - // use zarray j1/j2 symmetry (optional) + return dedr_full_sum; +} - if (j1 >= j) { - j1_ = j1; - j2_ = j; - j_ = j2; - //jjjzarray_r = zarray_r(j1,j,j2); - //jjjzarray_i = zarray_i(j1,j,j2); - } else { - j1_ = j; - j2_ = j1; - j_ = j2; - //jjjzarray_r = zarray_r(j,j1,j2); - //jjjzarray_i = zarray_i(j,j1,j2); +/* ---------------------------------------------------------------------- + * CPU routines + * ----------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------- + Ulisttot uses a "half" data layout which takes + advantage of the symmetry of the Wigner U matrices. + * ------------------------------------------------------------------------- */ + +KOKKOS_INLINE_FUNCTION +void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) +{ + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug + + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j/2+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)), + [&] (const int m) { + + const int jjup = jju + m; + + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + complex init(static_cast(0.),static_cast(0.)); + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element + + ulisttot(jjup, jelem, iatom) = init; + }); } + } + +} + + +/* ---------------------------------------------------------------------- + compute Ui by summing over bispectrum components. CPU only. + See comments above compute_uarray_cpu and add_uarraytot for + data layout comments. +------------------------------------------------------------------------- */ + +KOKKOS_INLINE_FUNCTION +void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + double rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); + add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); + +} +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui, CPU version +------------------------------------------------------------------------- */ - for(int mb2 = 0; 2*mb2 < j2; mb2++) - for(int ma2 = 0; ma2 <= j2; ma2++) { - - dudr_r = &duarray_r(j2,mb2,ma2,0); - dudr_i = &duarray_i(j2,mb2,ma2,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); - } //end loop over ma2 mb2 - - // For j2 even, handle middle column - - if (j2%2 == 0) { - const int mb2 = j2/2; - for(int ma2 = 0; ma2 <= mb2; ma2++) { - dudr_r = &duarray_r(j2,mb2,ma2,0); - dudr_i = &duarray_i(j2,mb2,ma2,0); - const double factor = ma2==mb2?0.5:1.0; - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2) * factor; - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2) * factor; - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); +KOKKOS_INLINE_FUNCTION +void SNA::compute_zi_cpu(const int& iter) +{ + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(jjz, idouble, iatom).re = static_cast(0.0); + zlist(jjz, idouble, iatom).im = static_cast(0.0); + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for (int ib = 0; ib < nb; ib++) { + + double suma1_r = static_cast(0.0); + double suma1_i = static_cast(0.0); + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min * (j2 + 1) + ma2max; + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - + ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + + ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; + zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + const double scale = static_cast(1) / static_cast(j + 1); + zlist(jjz, idouble, iatom).re *= scale; + zlist(jjz, idouble, iatom).im *= scale; } - } // end if j2even - - dbdr += 2.0*sumzdu_r*j2fac; - dbarray(j1,j2,j,0) = dbdr.x; - dbarray(j1,j2,j,1) = dbdr.y; - dbarray(j1,j2,j,2) = dbdr.z; - }); //end loop over j1 j2 j - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} + + +/* ---------------------------------------------------------------------- + compute Bi by summing conj(Ui)*Zi, CPU version +------------------------------------------------------------------------- */ + +KOKKOS_INLINE_FUNCTION +void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +{ + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + int jalloy = idouble; // must be non-const to work around gcc compiler bug + for (int elem3 = 0; elem3 < nelements; elem3++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max), + [&] (const int& jjb) { + const int j1 = idxb(jjb, 0); + const int j2 = idxb(jjb, 1); + int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug + + int jjz = idxz_block(j1, j2, j); + int jju = idxu_block[j]; + double sumzu = static_cast(0.0); + double sumzu_temp = static_cast(0.0); + const int bound = (j+2)/2; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), + [&] (const int mbma, double& sum) { + const int ma = mbma % (j + 1); + const int mb = mbma / (j + 1); + const int jju_index = jju + mb * (j + 1) + ma; + const int jjz_index = jjz + mb * (j + 1) + ma; + if (2*mb == j) return; + sum += + ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; + },sumzu_temp); // end loop over ma, mb + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + + if (j%2 == 0) { + int mb = j/2; // removed "const" to work around GCC 7 bug + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb), + [&] (const int ma, double& sum) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + sum += + ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; + },sumzu_temp); // end loop over ma + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + sumzu += static_cast(0.5)* + (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im); + } // end if jeven + + Kokkos::single(Kokkos::PerThread(team), [&] () { + sumzu *= static_cast(2.0); + + // apply bzero shift + + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + + blist(iatom, itriple, jjb) = sumzu; + }); + }); + //} // end loop over j + //} // end loop over j1, j2 + itriple++; + } + idouble++; + } // end loop over elem2 + } // end loop over elem1 } /* ---------------------------------------------------------------------- - copy Bi derivatives into a vector + compute Yi from Ui without storing Zi, looping over zlist indices, + CPU version ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::copy_dbi2dbvec(const Kokkos::TeamPolicy<>::member_type& team) +void SNA::compute_yi_cpu(int iter, + const Kokkos::View &beta) { - /* int ncount, j1, j2, j; + double betaj; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + const int jju_half = idxz(jjz, 9); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + double ztmp_r = 0.0; + double ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 +1) + mb2max; + + for (int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - + ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + + ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + const double scale = static_cast(1) / static_cast(j + 1); + ztmp_i *= scale; + ztmp_r *= scale; + } - ncount = 0; + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta(itriple, iatom); + else betaj = 2 * beta(itriple, iatom); + } else betaj = beta(itriple, iatom); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta(itriple, iatom); + else betaj = beta(itriple, iatom); + } else { + const int jjb = idxb_block(j2, j, j1); + const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta(itriple, iatom); + } - for(j1 = 0; j1 <= twojmax; j1++) { - for(j2 = 0; j2 <= j1; j2++) - for(j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - if (j >= j1) {*/ - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max), - [&] (const int& JJ) { - //for(int JJ = 0; JJ < idxj_max; JJ++) { - const int j1 = idxj[JJ].j1; - const int j2 = idxj[JJ].j2; - const int j = idxj[JJ].j; - - dbvec(JJ,0) = dbarray(j1,j2,j,0); - dbvec(JJ,1) = dbarray(j1,j2,j,1); - dbvec(JJ,2) = dbarray(j1,j2,j,2); - //ncount++; - //}); - }); + if (!bnorm_flag && j1 > j) + betaj *= static_cast(j1 + 1) / static_cast(j + 1); + + Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 } -/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + calculate derivative of Ui w.r.t. atom j + see comments above compute_duarray_cpu for comments on the + data layout +------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::zero_uarraytot(const Kokkos::TeamPolicy<>::member_type& team) +void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - { - double* const ptr = uarraytot_r.data(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()), - [&] (const int& i) { - ptr[i] = 0.0; - }); - } - { - double* const ptr = uarraytot_i.data(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()), - [&] (const int& i) { - ptr[i] = 0.0; - }); - } + double rsq, r, x, y, z, z0, theta0, cs, sn; + double dz0dr; + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + double rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); + theta0 = (r - rmin0) * rscale0; + sn = sin(theta0); + cs = cos(theta0); + z0 = r * cs / sn; + dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + + compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); } -/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. + dulist takes advantage of a `cached` data layout, similar to the + shared memory layout for the GPU routines, which is efficient for + compressing the calculation in compute_duarray_cpu. That said, + dulist only uses the "half" data layout part of that structure. +------------------------------------------------------------------------- */ + KOKKOS_INLINE_FUNCTION -void SNA::addself_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double wself_in) +void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - //for (int j = 0; j <= twojmax; j++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j) { - for (int ma = 0; ma <= j; ma++) { - uarraytot_r(j,ma,ma) = wself_in; - uarraytot_i(j,ma,ma) = 0.0; - } + t_scalar3 final_sum; + const int jelem = element(iatom, jnbor); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j, t_scalar3& sum_tmp) { + int jju_half = idxu_half_block[j]; + int jju_cache = idxu_cache_block[j]; + + for (int mb = 0; 2*mb < j; mb++) + for (int ma = 0; ma <= j; ma++) { + sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + jju_half++; jju_cache++; + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for (int ma = 0; ma < mb; ma++) { + sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + jju_half++; jju_cache++; + } + + //int ma = mb; + sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5; + sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5; + sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + + dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5; + } // end if jeven + + },final_sum); // end loop over j + + Kokkos::single(Kokkos::PerThread(team), [&] () { + dedr(iatom,jnbor,0) = final_sum.x*2.0; + dedr(iatom,jnbor,1) = final_sum.y*2.0; + dedr(iatom,jnbor,2) = final_sum.z*2.0; }); + } + /* ---------------------------------------------------------------------- add Wigner U-functions for one neighbor to the total + ulist is in a "cached" data layout, which is a compressed layout + which still keeps the recursive calculation simple. On the other hand + `ulisttot` uses a "half" data layout, which fully takes advantage + of the symmetry of the Wigner U matrices. ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::add_uarraytot(const Kokkos::TeamPolicy<>::member_type& team, double r, double wj, double rcut) +void SNA::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const double& r, const double& wj, const double& rcut, + const double& sinner, const double& dinner, int jelem) { - const double sfac = compute_sfac(r, rcut) * wj; - -/* - for (int j = 0; j <= twojmax; j++) - for (int ma = 0; ma <= j; ma++) - for (int mb = 0; mb <= j; mb++) { - uarraytot_r_a(j,ma,mb) += - sfac * uarray_r(j,ma,mb); - uarraytot_i_a(j,ma,mb) += - sfac * uarray_i(j,ma,mb); - }*/ - const double* const ptr_r = uarray_r.data(); - const double* const ptr_i = uarray_i.data(); - double* const ptrtot_r = uarraytot_r.data(); - double* const ptrtot_i = uarraytot_i.data(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()), - [&] (const int& i) { - Kokkos::atomic_add(ptrtot_r+i, sfac * ptr_r[i]); - Kokkos::atomic_add(ptrtot_i+i, sfac * ptr_i[i]); + const double sfac = compute_sfac(r, rcut, sinner, dinner) * wj; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j) { + + int jju_half = idxu_half_block[j]; // index into ulisttot + int jju_cache = idxu_cache_block[j]; // index into ulist + + int count = 0; + for (int mb = 0; 2*mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re); + Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im); + count++; + } + } }); } /* ---------------------------------------------------------------------- - compute Wigner U-functions for one neighbor + compute Wigner U-functions for one neighbor. + `ulisttot` uses a "cached" data layout, matching the amount of + information stored between layers via scratch memory on the GPU path ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team, - double x, double y, double z, - double z0, double r) +void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const double& x, const double& y, const double& z, const double& z0, const double& r) { double r0inv; double a_r, b_r, a_i, b_i; @@ -649,7 +1639,7 @@ void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team, // compute Cayley-Klein parameters for unit quaternion - r0inv = 1.0 / sqrt(r * r + z0 * z0); + r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); a_r = r0inv * z0; a_i = -r0inv * z; b_r = r0inv * y; @@ -657,79 +1647,84 @@ void SNA::compute_uarray(const Kokkos::TeamPolicy<>::member_type& team, // VMK Section 4.8.2 - uarray_r(0,0,0) = 1.0; - uarray_i(0,0,0) = 0.0; + ulist(0,iatom,jnbor).re = 1.0; + ulist(0,iatom,jnbor).im = 0.0; for (int j = 1; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug + int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug // fill in left side of matrix layer from previous layer Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { - //const int mb = 2*mb_2; //for (int mb = 0; 2*mb <= j; mb++) { - uarray_r(j,0,mb) = 0.0; - uarray_i(j,0,mb) = 0.0; + const int jju_index = jju+mb+mb*j; + ulist(jju_index,iatom,jnbor).re = 0.0; + ulist(jju_index,iatom,jnbor).im = 0.0; for (int ma = 0; ma < j; ma++) { - rootpq = rootpqarray(j - ma,j - mb); - uarray_r(j,ma,mb) += + const int jju_index = jju+mb+mb*j+ma; + const int jjup_index = jjup+mb*j+ma; + rootpq = rootpqarray(j - ma,j - mb); + ulist(jju_index,iatom,jnbor).re += rootpq * - (a_r * uarray_r(j - 1,ma,mb) + - a_i * uarray_i(j - 1,ma,mb)); - uarray_i(j,ma,mb) += + (a_r * ulist(jjup_index,iatom,jnbor).re + + a_i * ulist(jjup_index,iatom,jnbor).im); + ulist(jju_index,iatom,jnbor).im += rootpq * - (a_r * uarray_i(j - 1,ma,mb) - - a_i * uarray_r(j - 1,ma,mb)); + (a_r * ulist(jjup_index,iatom,jnbor).im - + a_i * ulist(jjup_index,iatom,jnbor).re); - rootpq = rootpqarray(ma + 1,j - mb); - uarray_r(j,ma + 1,mb) = + rootpq = rootpqarray(ma + 1,j - mb); + ulist(jju_index+1,iatom,jnbor).re = -rootpq * - (b_r * uarray_r(j - 1,ma,mb) + - b_i * uarray_i(j - 1,ma,mb)); - uarray_i(j,ma + 1,mb) = + (b_r * ulist(jjup_index,iatom,jnbor).re + + b_i * ulist(jjup_index,iatom,jnbor).im); + ulist(jju_index+1,iatom,jnbor).im = -rootpq * - (b_r * uarray_i(j - 1,ma,mb) - - b_i * uarray_r(j - 1,ma,mb)); + (b_r * ulist(jjup_index,iatom,jnbor).im - + b_i * ulist(jjup_index,iatom,jnbor).re); } - }); - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) - - //int mbpar = -1; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { -// for (int mb = 0; 2*mb <= j; mb++) { - int mbpar = (mb)%2==0?1:-1; - int mapar = -mbpar; - for (int ma = 0; ma <= j; ma++) { - mapar = -mapar; - if (mapar == 1) { - uarray_r(j,j-ma,j-mb) = uarray_r(j,ma,mb); - uarray_i(j,j-ma,j-mb) = -uarray_i(j,ma,mb); - } else { - uarray_r(j,j-ma,j-mb) = -uarray_r(j,ma,mb); - uarray_i(j,j-ma,j-mb) = uarray_i(j,ma,mb); - } - //OK - //printf("%lf %lf %lf %lf %lf %lf %lf SNAP-COMPARE: UARRAY\n",x,y,z,z0,r,uarray_r(j,ma,mb),uarray_i(j,ma,mb)); + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + + // Only need to add one symmetrized row for convenience + // Symmetry gets "unfolded" in accumulating ulisttot + if (j%2==1 && mb==(j/2)) { + const int mbpar = (mb)%2==0?1:-1; + int mapar = mbpar; + for (int ma = 0; ma <= j; ma++) { + const int jju_index = jju + mb*(j+1) + ma; + const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1); + if (mapar == 1) { + ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re; + ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im; + } else { + ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re; + ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im; + } + mapar = -mapar; + } } }); + } } - /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor - see comments in compute_uarray() + see comments in compute_uarray_cpu() + Uses same cached data layout of ulist ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team, - double x, double y, double z, - double z0, double r, double dz0dr, - double wj, double rcut) +void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const double& x, const double& y, const double& z, + const double& z0, const double& r, const double& dz0dr, + const double& wj, const double& rcut, + const double& sinner, const double& dinner) { double r0inv; double a_r, a_i, b_r, b_i; @@ -748,7 +1743,7 @@ void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team, b_r = y * r0inv; b_i = -x * r0inv; - dr0invdr = -pow(r0inv, 3.0) * (r + z0 * dz0dr); + dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); dr0inv[0] = dr0invdr * ux; dr0inv[1] = dr0invdr * uy; @@ -773,208 +1768,315 @@ void SNA::compute_duarray(const Kokkos::TeamPolicy<>::member_type& team, db_i[0] += -r0inv; db_r[1] += r0inv; - uarray_r(0,0,0) = 1.0; - duarray_r(0,0,0,0) = 0.0; - duarray_r(0,0,0,1) = 0.0; - duarray_r(0,0,0,2) = 0.0; - uarray_i(0,0,0) = 0.0; - duarray_i(0,0,0,0) = 0.0; - duarray_i(0,0,0,1) = 0.0; - duarray_i(0,0,0,2) = 0.0; + dulist(0,iatom,jnbor,0).re = 0.0; + dulist(0,iatom,jnbor,1).re = 0.0; + dulist(0,iatom,jnbor,2).re = 0.0; + dulist(0,iatom,jnbor,0).im = 0.0; + dulist(0,iatom,jnbor,1).im = 0.0; + dulist(0,iatom,jnbor,2).im = 0.0; for (int j = 1; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; + int jjup = idxu_cache_block[j-1]; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - uarray_r(j,0,mb) = 0.0; - duarray_r(j,mb,0,0) = 0.0; - duarray_r(j,mb,0,1) = 0.0; - duarray_r(j,mb,0,2) = 0.0; - uarray_i(j,0,mb) = 0.0; - duarray_i(j,mb,0,0) = 0.0; - duarray_i(j,mb,0,1) = 0.0; - duarray_i(j,mb,0,2) = 0.0; + const int jju_index = jju+mb+mb*j; + dulist(jju_index,iatom,jnbor,0).re = 0.0; + dulist(jju_index,iatom,jnbor,1).re = 0.0; + dulist(jju_index,iatom,jnbor,2).re = 0.0; + dulist(jju_index,iatom,jnbor,0).im = 0.0; + dulist(jju_index,iatom,jnbor,1).im = 0.0; + dulist(jju_index,iatom,jnbor,2).im = 0.0; for (int ma = 0; ma < j; ma++) { + const int jju_index = jju+mb+mb*j+ma; + const int jjup_index = jjup+mb*j+ma; rootpq = rootpqarray(j - ma,j - mb); - uarray_r(j,ma,mb) += rootpq * - (a_r * uarray_r(j - 1,ma,mb) + - a_i * uarray_i(j - 1,ma,mb)); - uarray_i(j,ma,mb) += rootpq * - (a_r * uarray_i(j - 1,ma,mb) - - a_i * uarray_r(j - 1,ma,mb)); - for (int k = 0; k < 3; k++) { - duarray_r(j,mb,ma,k) += - rootpq * (da_r[k] * uarray_r(j - 1,ma,mb) + - da_i[k] * uarray_i(j - 1,ma,mb) + - a_r * duarray_r(j - 1,mb,ma,k) + - a_i * duarray_i(j - 1,mb,ma,k)); - duarray_i(j,mb,ma,k) += - rootpq * (da_r[k] * uarray_i(j - 1,ma,mb) - - da_i[k] * uarray_r(j - 1,ma,mb) + - a_r * duarray_i(j - 1,mb,ma,k) - - a_i * duarray_r(j - 1,mb,ma,k)); + dulist(jju_index,iatom,jnbor,k).re += + rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re + + da_i[k] * ulist(jjup_index,iatom,jnbor).im + + a_r * dulist(jjup_index,iatom,jnbor,k).re + + a_i * dulist(jjup_index,iatom,jnbor,k).im); + dulist(jju_index,iatom,jnbor,k).im += + rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im - + da_i[k] * ulist(jjup_index,iatom,jnbor).re + + a_r * dulist(jjup_index,iatom,jnbor,k).im - + a_i * dulist(jjup_index,iatom,jnbor,k).re); } - rootpq = rootpqarray(ma + 1,j - mb); - uarray_r(j,ma + 1,mb) = - -rootpq * (b_r * uarray_r(j - 1,ma,mb) + - b_i * uarray_i(j - 1,ma,mb)); - uarray_i(j,ma + 1,mb) = - -rootpq * (b_r * uarray_i(j - 1,ma,mb) - - b_i * uarray_r(j - 1,ma,mb)); - + rootpq = rootpqarray(ma + 1,j - mb); for (int k = 0; k < 3; k++) { - duarray_r(j,mb,ma + 1,k) = - -rootpq * (db_r[k] * uarray_r(j - 1,ma,mb) + - db_i[k] * uarray_i(j - 1,ma,mb) + - b_r * duarray_r(j - 1,mb,ma,k) + - b_i * duarray_i(j - 1,mb,ma,k)); - duarray_i(j,mb,ma + 1,k) = - -rootpq * (db_r[k] * uarray_i(j - 1,ma,mb) - - db_i[k] * uarray_r(j - 1,ma,mb) + - b_r * duarray_i(j - 1,mb,ma,k) - - b_i * duarray_r(j - 1,mb,ma,k)); + dulist(jju_index+1,iatom,jnbor,k).re = + -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re + + db_i[k] * ulist(jjup_index,iatom,jnbor).im + + b_r * dulist(jjup_index,iatom,jnbor,k).re + + b_i * dulist(jjup_index,iatom,jnbor,k).im); + dulist(jju_index+1,iatom,jnbor,k).im = + -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im - + db_i[k] * ulist(jjup_index,iatom,jnbor).re + + b_r * dulist(jjup_index,iatom,jnbor,k).im - + b_i * dulist(jjup_index,iatom,jnbor,k).re); } } - }); - //int mbpar = -1; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { -// for (int mb = 0; 2*mb <= j; mb++) { - int mbpar = (mb)%2==0?1:-1; - int mapar = -mbpar; - for (int ma = 0; ma <= j; ma++) { - mapar = -mapar; - if (mapar == 1) { - uarray_r(j,j-ma,j-mb) = uarray_r(j,ma,mb); - uarray_i(j,j-ma,j-mb) = -uarray_i(j,ma,mb); - for (int k = 0; k < 3; k++) { - duarray_r(j,j-mb,j-ma,k) = duarray_r(j,mb,ma,k); - duarray_i(j,j-mb,j-ma,k) = -duarray_i(j,mb,ma,k); - } - } else { - uarray_r(j,j-ma,j-mb) = -uarray_r(j,ma,mb); - uarray_i(j,j-ma,j-mb) = uarray_i(j,ma,mb); - for (int k = 0; k < 3; k++) { - duarray_r(j,j-mb,j-ma,k) = -duarray_r(j,mb,ma,k); - duarray_i(j,j-mb,j-ma,k) = duarray_i(j,mb,ma,k); - } - } + // Only need to add one symmetrized row for convenience + // Symmetry gets "unfolded" during the dedr accumulation + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + if (j%2==1 && mb==(j/2)) { + const int mbpar = (mb)%2==0?1:-1; + int mapar = mbpar; + for (int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjup_index = jju+(mb+2)*(j+1)-(ma+1); + if (mapar == 1) { + for (int k = 0; k < 3; k++) { + dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re; + dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im; + } + } else { + for (int k = 0; k < 3; k++) { + dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re; + dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im; + } + } + mapar = -mapar; + } } }); } - double sfac = compute_sfac(r, rcut); - double dsfac = compute_dsfac(r, rcut); + double sfac = compute_sfac(r, rcut, sinner, dinner); + double dsfac = compute_dsfac(r, rcut, sinner, dinner); sfac *= wj; dsfac *= wj; - for (int j = 0; j <= twojmax; j++) - for (int mb = 0; mb <= j; mb++) + // Even though we fill out a full "cached" data layout above, + // we only need the "half" data for the accumulation into dedr. + // Thus we skip updating any unnecessary data. + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; + for (int mb = 0; 2*mb <= j; mb++) for (int ma = 0; ma <= j; ma++) { - duarray_r(j,mb,ma,0) = dsfac * uarray_r(j,ma,mb) * ux + - sfac * duarray_r(j,mb,ma,0); - duarray_i(j,mb,ma,0) = dsfac * uarray_i(j,ma,mb) * ux + - sfac * duarray_i(j,mb,ma,0); - duarray_r(j,mb,ma,1) = dsfac * uarray_r(j,ma,mb) * uy + - sfac * duarray_r(j,mb,ma,1); - duarray_i(j,mb,ma,1) = dsfac * uarray_i(j,ma,mb) * uy + - sfac * duarray_i(j,mb,ma,1); - duarray_r(j,mb,ma,2) = dsfac * uarray_r(j,ma,mb) * uz + - sfac * duarray_r(j,mb,ma,2); - duarray_i(j,mb,ma,2) = dsfac * uarray_i(j,ma,mb) * uz + - sfac * duarray_i(j,mb,ma,2); + dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux + + sfac * dulist(jju,iatom,jnbor,0).re; + dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux + + sfac * dulist(jju,iatom,jnbor,0).im; + dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy + + sfac * dulist(jju,iatom,jnbor,1).re; + dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy + + sfac * dulist(jju,iatom,jnbor,1).im; + dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz + + sfac * dulist(jju,iatom,jnbor,2).re; + dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz + + sfac * dulist(jju,iatom,jnbor,2).im; + + jju++; } + } } -/* ---------------------------------------------------------------------- */ - -KOKKOS_INLINE_FUNCTION -void SNA::create_team_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team) -{ - int jdim = twojmax + 1; - uarraytot_r_a = uarraytot_r = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim); - uarraytot_i_a = uarraytot_i = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim); - zarray_r = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim); - zarray_i = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim); - - rij = t_sna_2d(team.team_scratch(1),nmax,3); - rcutij = t_sna_1d(team.team_scratch(1),nmax); - wj = t_sna_1d(team.team_scratch(1),nmax); - inside = t_sna_1i(team.team_scratch(1),nmax); -} - +/* ---------------------------------------------------------------------- + factorial n, wrapper for precomputed table +------------------------------------------------------------------------- */ inline -T_INT SNA::size_team_scratch_arrays() { - T_INT size = 0; - int jdim = twojmax + 1; - - size += t_sna_3d::shmem_size(jdim,jdim,jdim); - size += t_sna_3d::shmem_size(jdim,jdim,jdim); - size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); - size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); - - size += t_sna_2d::shmem_size(nmax,3); - size += t_sna_1d::shmem_size(nmax); - size += t_sna_1d::shmem_size(nmax); - size += t_sna_1i::shmem_size(nmax); - - return size; -} - -/* ---------------------------------------------------------------------- */ - -KOKKOS_INLINE_FUNCTION -void SNA::create_thread_scratch_arrays(const Kokkos::TeamPolicy<>::member_type& team) +double SNA::factorial(int n) { - int jdim = twojmax + 1; - dbvec = Kokkos::View(team.thread_scratch(1),ncoeff); - dbarray = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); + //if (n < 0 || n > nmaxfactorial) { + // char str[128]; + // sprintf(str, "Invalid argument to factorial %d", n); + // error->all(FLERR, str); + //} - uarray_r = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim); - uarray_i = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim); - duarray_r = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); - duarray_i = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); -} - -inline -T_INT SNA::size_thread_scratch_arrays() { - T_INT size = 0; - int jdim = twojmax + 1; - size += Kokkos::View::shmem_size(ncoeff); - size += t_sna_4d::shmem_size(jdim,jdim,jdim); - - size += t_sna_3d::shmem_size(jdim,jdim,jdim); - size += t_sna_3d::shmem_size(jdim,jdim,jdim); - size += t_sna_4d::shmem_size(jdim,jdim,jdim); - size += t_sna_4d::shmem_size(jdim,jdim,jdim); - return size; + return nfac_table[n]; } /* ---------------------------------------------------------------------- - factorial n + factorial n table, size SNA::nmaxfactorial+1 ------------------------------------------------------------------------- */ -KOKKOS_INLINE_FUNCTION -double SNA::factorial(int n) -{ - double result = 1.0; - for(int i=1; i<=n; i++) - result *= 1.0*i; - return result; -} +const double SNA::nfac_table[] = { + 1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880, + 3628800, + 39916800, + 479001600, + 6227020800, + 87178291200, + 1307674368000, + 20922789888000, + 355687428096000, + 6.402373705728e+15, + 1.21645100408832e+17, + 2.43290200817664e+18, + 5.10909421717094e+19, + 1.12400072777761e+21, + 2.5852016738885e+22, + 6.20448401733239e+23, + 1.5511210043331e+25, + 4.03291461126606e+26, + 1.08888694504184e+28, + 3.04888344611714e+29, + 8.8417619937397e+30, + 2.65252859812191e+32, + 8.22283865417792e+33, + 2.63130836933694e+35, + 8.68331761881189e+36, + 2.95232799039604e+38, + 1.03331479663861e+40, + 3.71993326789901e+41, + 1.37637530912263e+43, + 5.23022617466601e+44, + 2.03978820811974e+46, + 8.15915283247898e+47, + 3.34525266131638e+49, + 1.40500611775288e+51, + 6.04152630633738e+52, + 2.65827157478845e+54, + 1.1962222086548e+56, + 5.50262215981209e+57, + 2.58623241511168e+59, + 1.24139155925361e+61, + 6.08281864034268e+62, + 3.04140932017134e+64, + 1.55111875328738e+66, + 8.06581751709439e+67, + 4.27488328406003e+69, + 2.30843697339241e+71, + 1.26964033536583e+73, + 7.10998587804863e+74, + 4.05269195048772e+76, + 2.35056133128288e+78, + 1.3868311854569e+80, + 8.32098711274139e+81, + 5.07580213877225e+83, + 3.14699732603879e+85, + 1.98260831540444e+87, + 1.26886932185884e+89, + 8.24765059208247e+90, + 5.44344939077443e+92, + 3.64711109181887e+94, + 2.48003554243683e+96, + 1.71122452428141e+98, + 1.19785716699699e+100, + 8.50478588567862e+101, + 6.12344583768861e+103, + 4.47011546151268e+105, + 3.30788544151939e+107, + 2.48091408113954e+109, + 1.88549470166605e+111, + 1.45183092028286e+113, + 1.13242811782063e+115, + 8.94618213078297e+116, + 7.15694570462638e+118, + 5.79712602074737e+120, + 4.75364333701284e+122, + 3.94552396972066e+124, + 3.31424013456535e+126, + 2.81710411438055e+128, + 2.42270953836727e+130, + 2.10775729837953e+132, + 1.85482642257398e+134, + 1.65079551609085e+136, + 1.48571596448176e+138, + 1.3520015276784e+140, + 1.24384140546413e+142, + 1.15677250708164e+144, + 1.08736615665674e+146, + 1.03299784882391e+148, + 9.91677934870949e+149, + 9.61927596824821e+151, + 9.42689044888324e+153, + 9.33262154439441e+155, + 9.33262154439441e+157, + 9.42594775983835e+159, + 9.61446671503512e+161, + 9.90290071648618e+163, + 1.02990167451456e+166, + 1.08139675824029e+168, + 1.14628056373471e+170, + 1.22652020319614e+172, + 1.32464181945183e+174, + 1.44385958320249e+176, + 1.58824554152274e+178, + 1.76295255109024e+180, + 1.97450685722107e+182, + 2.23119274865981e+184, + 2.54355973347219e+186, + 2.92509369349301e+188, + 3.3931086844519e+190, + 3.96993716080872e+192, + 4.68452584975429e+194, + 5.5745857612076e+196, + 6.68950291344912e+198, + 8.09429852527344e+200, + 9.8750442008336e+202, + 1.21463043670253e+205, + 1.50614174151114e+207, + 1.88267717688893e+209, + 2.37217324288005e+211, + 3.01266001845766e+213, + 3.8562048236258e+215, + 4.97450422247729e+217, + 6.46685548922047e+219, + 8.47158069087882e+221, + 1.118248651196e+224, + 1.48727070609069e+226, + 1.99294274616152e+228, + 2.69047270731805e+230, + 3.65904288195255e+232, + 5.01288874827499e+234, + 6.91778647261949e+236, + 9.61572319694109e+238, + 1.34620124757175e+241, + 1.89814375907617e+243, + 2.69536413788816e+245, + 3.85437071718007e+247, + 5.5502938327393e+249, + 8.04792605747199e+251, + 1.17499720439091e+254, + 1.72724589045464e+256, + 2.55632391787286e+258, + 3.80892263763057e+260, + 5.71338395644585e+262, + 8.62720977423323e+264, + 1.31133588568345e+267, + 2.00634390509568e+269, + 3.08976961384735e+271, + 4.78914290146339e+273, + 7.47106292628289e+275, + 1.17295687942641e+278, + 1.85327186949373e+280, + 2.94670227249504e+282, + 4.71472363599206e+284, + 7.59070505394721e+286, + 1.22969421873945e+289, + 2.0044015765453e+291, + 3.28721858553429e+293, + 5.42391066613159e+295, + 9.00369170577843e+297, + 1.503616514865e+300, // nmaxfactorial = 167 +}; /* ---------------------------------------------------------------------- the function delta given by VMK Eq. 8.2(1) ------------------------------------------------------------------------- */ -KOKKOS_INLINE_FUNCTION +inline double SNA::deltacg(int j1, int j2, int j) { double sfaccg = factorial((j1 + j2 + j) / 2 + 1); @@ -991,58 +2093,65 @@ double SNA::deltacg(int j1, int j2, int j) inline void SNA::init_clebsch_gordan() { + auto h_cglist = Kokkos::create_mirror_view(cglist); + double sum,dcg,sfaccg; int m, aa2, bb2, cc2; int ifac; - auto h_cgarray = Kokkos::create_mirror_view(cgarray); + int idxcg_count = 0; for (int j1 = 0; j1 <= twojmax; j1++) - for (int j2 = 0; j2 <= twojmax; j2++) - for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) - for (int m1 = 0; m1 <= j1; m1 += 1) { + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + for (int m1 = 0; m1 <= j1; m1++) { aa2 = 2 * m1 - j1; - for (int m2 = 0; m2 <= j2; m2 += 1) { + for (int m2 = 0; m2 <= j2; m2++) { // -c <= cc <= c bb2 = 2 * m2 - j2; m = (aa2 + bb2 + j) / 2; - if(m < 0 || m > j) continue; - - sum = 0.0; - - for (int z = MAX(0, MAX(-(j - j2 + aa2) - / 2, -(j - j1 - bb2) / 2)); - z <= MIN((j1 + j2 - j) / 2, - MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); - z++) { - ifac = z % 2 ? -1 : 1; - sum += ifac / - (factorial(z) * - factorial((j1 + j2 - j) / 2 - z) * - factorial((j1 - aa2) / 2 - z) * - factorial((j2 + bb2) / 2 - z) * - factorial((j - j2 + aa2) / 2 + z) * - factorial((j - j1 - bb2) / 2 + z)); - } - - cc2 = 2 * m - j; - dcg = deltacg(j1, j2, j); - sfaccg = sqrt(factorial((j1 + aa2) / 2) * - factorial((j1 - aa2) / 2) * - factorial((j2 + bb2) / 2) * - factorial((j2 - bb2) / 2) * - factorial((j + cc2) / 2) * - factorial((j - cc2) / 2) * - (j + 1)); - - h_cgarray(j1,j2,j,m1,m2) = sum * dcg * sfaccg; - //printf("SNAP-COMPARE: CG: %i %i %i %i %i %e\n",j1,j2,j,m1,m2,cgarray(j1,j2,j,m1,m2)); - } - } - Kokkos::deep_copy(cgarray,h_cgarray); + if (m < 0 || m > j) { + h_cglist[idxcg_count] = 0.0; + idxcg_count++; + continue; + } + + sum = 0.0; + + for (int z = MAX(0, MAX(-(j - j2 + aa2) + / 2, -(j - j1 - bb2) / 2)); + z <= MIN((j1 + j2 - j) / 2, + MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); + z++) { + ifac = z % 2 ? -1 : 1; + sum += ifac / + (factorial(z) * + factorial((j1 + j2 - j) / 2 - z) * + factorial((j1 - aa2) / 2 - z) * + factorial((j2 + bb2) / 2 - z) * + factorial((j - j2 + aa2) / 2 + z) * + factorial((j - j1 - bb2) / 2 + z)); + } + + cc2 = 2 * m - j; + dcg = deltacg(j1, j2, j); + sfaccg = sqrt(factorial((j1 + aa2) / 2) * + factorial((j1 - aa2) / 2) * + factorial((j2 + bb2) / 2) * + factorial((j2 - bb2) / 2) * + factorial((j + cc2) / 2) * + factorial((j - cc2) / 2) * + (j + 1)); + + h_cglist[idxcg_count] = sum * dcg * sfaccg; + idxcg_count++; + } + } + } + Kokkos::deep_copy(cglist,h_cglist); } /* ---------------------------------------------------------------------- @@ -1056,12 +2165,13 @@ void SNA::init_rootpqarray() auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray); for (int p = 1; p <= twojmax; p++) for (int q = 1; q <= twojmax; q++) - h_rootpqarray(p,q) = sqrt(static_cast(p)/q); + h_rootpqarray(p,q) = static_cast(sqrt(static_cast(p)/q)); Kokkos::deep_copy(rootpqarray,h_rootpqarray); } /* ---------------------------------------------------------------------- */ + inline int SNA::compute_ncoeff() { @@ -1070,25 +2180,14 @@ int SNA::compute_ncoeff() ncount = 0; for (int j1 = 0; j1 <= twojmax; j1++) - if(diagonalstyle == 0) { - for (int j2 = 0; j2 <= j1; j2++) - for (int j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - ncount++; - } else if(diagonalstyle == 1) { - int j2 = j1; - - for (int j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - ncount++; - } else if(diagonalstyle == 2) { - ncount++; - } else if(diagonalstyle == 3) { - for (int j2 = 0; j2 <= j1; j2++) - for (int j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - if (j >= j1) ncount++; - } + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; + j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) ncount++; + + ndoubles = nelements*nelements; + ntriples = nelements*nelements*nelements; + if (chem_flag) ncount *= ntriples; return ncount; } @@ -1096,34 +2195,118 @@ int SNA::compute_ncoeff() /* ---------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -double SNA::compute_sfac(double r, double rcut) +double SNA::compute_sfac(double r, double rcut, double sinner, double dinner) { - if (switch_flag == 0) return 1.0; + double sfac_outer; + constexpr double one = static_cast(1.0); + constexpr double zero = static_cast(0.0); + constexpr double onehalf = static_cast(0.5); + if (switch_flag == 0) sfac_outer = one; if (switch_flag == 1) { - if(r <= rmin0) return 1.0; - else if(r > rcut) return 0.0; + if (r <= rmin0) sfac_outer = one; + else if (r > rcut) return zero; else { - double rcutfac = MY_PI / (rcut - rmin0); - return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0); + double rcutfac = static_cast(MY_PI) / (rcut - rmin0); + sfac_outer = onehalf * (cos((r - rmin0) * rcutfac) + one); } } - return 0.0; + + if (switch_inner_flag == 0) return sfac_outer; + if (switch_inner_flag == 1) { + if (r >= sinner + dinner) + return sfac_outer; + else if (r > sinner - dinner) { + double rcutfac = static_cast(MY_PI2) / dinner; + return sfac_outer * + onehalf * (one - cos(static_cast(MY_PI2) + (r - sinner) * rcutfac)); + } else return zero; + } + return zero; // dummy return } /* ---------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -double SNA::compute_dsfac(double r, double rcut) +double SNA::compute_dsfac(double r, double rcut, double sinner, double dinner) { - if (switch_flag == 0) return 0.0; + double sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; + constexpr double one = static_cast(1.0); + constexpr double zero = static_cast(0.0); + constexpr double onehalf = static_cast(0.5); + if (switch_flag == 0) dsfac_outer = zero; if (switch_flag == 1) { - if(r <= rmin0) return 0.0; - else if(r > rcut) return 0.0; + if (r <= rmin0) dsfac_outer = zero; + else if (r > rcut) return zero; else { - double rcutfac = MY_PI / (rcut - rmin0); - return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac; + double rcutfac = static_cast(MY_PI) / (rcut - rmin0); + dsfac_outer = -onehalf * sin((r - rmin0) * rcutfac) * rcutfac; } } - return 0.0; + + if (switch_inner_flag == 0) return dsfac_outer; + if (switch_inner_flag == 1) { + if (r >= sinner + dinner) + return dsfac_outer; + else if (r > sinner - dinner) { + + // calculate sfac_outer + + if (switch_flag == 0) sfac_outer = one; + if (switch_flag == 1) { + if (r <= rmin0) sfac_outer = one; + else if (r > rcut) sfac_outer = zero; + else { + double rcutfac = static_cast(MY_PI) / (rcut - rmin0); + sfac_outer = onehalf * (cos((r - rmin0) * rcutfac) + one); + } + } + + // calculate sfac_inner + + double rcutfac = static_cast(MY_PI2) / dinner; + sfac_inner = onehalf * (one - cos(static_cast(MY_PI2) + (r - sinner) * rcutfac)); + dsfac_inner = onehalf * rcutfac * sin(static_cast(MY_PI2) + (r - sinner) * rcutfac); + return dsfac_outer * sfac_inner + sfac_outer * dsfac_inner; + + } else return zero; + } + return zero; // dummy return +} + +KOKKOS_INLINE_FUNCTION +void SNA::compute_s_dsfac(const double r, const double rcut, const double sinner, const double dinner, double& sfac, double& dsfac) { + + double sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; + constexpr double one = static_cast(1.0); + constexpr double zero = static_cast(0.0); + constexpr double onehalf = static_cast(0.5); + + if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; } + else if (switch_flag == 1) { + if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; } + else if (r > rcut) { sfac = zero; dsfac = zero; return; } + else { + const double rcutfac = static_cast(MY_PI) / (rcut - rmin0); + const double theta0 = (r - rmin0) * rcutfac; + const double sn = sin(theta0); + const double cs = cos(theta0); + sfac_outer = onehalf * (cs + one); + dsfac_outer = -onehalf * sn * rcutfac; + } + } else { sfac = zero; dsfac = zero; return; } // dummy return + + if (switch_inner_flag == 0) { sfac = sfac_outer; dsfac = dsfac_outer; return; } + else if (switch_inner_flag == 1) { + if (r >= sinner + dinner) { sfac = sfac_outer; dsfac = dsfac_outer; return; } + else if (r > sinner - dinner) { + double rcutfac = static_cast(MY_PI2) / dinner; + sfac_inner = onehalf * (one - cos(static_cast(MY_PI2) + (r - sinner) * rcutfac)); + dsfac_inner = onehalf * rcutfac * sin(static_cast(MY_PI2) + (r - sinner) * rcutfac); + sfac = sfac_outer * sfac_inner; + dsfac = dsfac_outer * sfac_inner + sfac_outer * dsfac_inner; + return; + } else { sfac = zero; dsfac = zero; return; } + } else { sfac = zero; dsfac = zero; return; } // dummy return + } From b7cd1a43d1a08e36d9aa9d9aab6e02d967cf800e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 13 Jul 2022 14:31:36 -0600 Subject: [PATCH 04/11] WIP --- src/force_types/sna.h | 3 +- src/force_types/sna_impl.hpp | 117 +++++++++++++++++------------------ src/types.h | 18 ++++++ 3 files changed, 77 insertions(+), 61 deletions(-) diff --git a/src/force_types/sna.h b/src/force_types/sna.h index f174ba6..f1e6c0b 100644 --- a/src/force_types/sna.h +++ b/src/force_types/sna.h @@ -95,11 +95,10 @@ struct alignas(8) FullHalfMapper { int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj }; -template +template class SNA { public: - using double = double_; using complex = SNAComplex; static constexpr int vector_length = vector_length_; diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp index 3bbcc4b..01a6140 100644 --- a/src/force_types/sna_impl.hpp +++ b/src/force_types/sna_impl.hpp @@ -17,7 +17,6 @@ ------------------------------------------------------------------------- */ #include "sna.h" -#include "memory_kokkos.h" #include #include #include @@ -60,12 +59,12 @@ SNA::SNA(double rfac0_in, build_indexlist(); int jdimpq = twojmax + 2; - MemKK::realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq); + realloc_kokkos(rootpqarray,"SNA::rootpqarray",jdimpq,jdimpq); - MemKK::realloc_kokkos(cglist,"SNA::cglist",idxcg_max); + realloc_kokkos(cglist,"SNA::cglist",idxcg_max); if (bzero_flag) { - MemKK::realloc_kokkos(bzero,"sna:bzero",twojmax+1); + realloc_kokkos(bzero,"sna:bzero",twojmax+1); auto h_bzero = Kokkos::create_mirror_view(bzero); double www = wself*wself*wself; @@ -91,7 +90,7 @@ void SNA::build_indexlist() // index list for cglist int jdim = twojmax + 1; - MemKK::realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim); + realloc_kokkos(idxcg_block,"SNA::idxcg_block",jdim,jdim,jdim); auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block); int idxcg_count = 0; @@ -109,7 +108,7 @@ void SNA::build_indexlist() // index list for uarray // need to include both halves - MemKK::realloc_kokkos(idxu_block,"SNA::idxu_block",jdim); + realloc_kokkos(idxu_block,"SNA::idxu_block",jdim); auto h_idxu_block = Kokkos::create_mirror_view(idxu_block); int idxu_count = 0; @@ -124,7 +123,7 @@ void SNA::build_indexlist() Kokkos::deep_copy(idxu_block,h_idxu_block); // index list for half uarray - MemKK::realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim); + realloc_kokkos(idxu_half_block,"SNA::idxu_half_block",jdim); auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block); int idxu_half_count = 0; @@ -138,7 +137,7 @@ void SNA::build_indexlist() Kokkos::deep_copy(idxu_half_block, h_idxu_half_block); // mapping between full and half indexing, encoding flipping - MemKK::realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max); + realloc_kokkos(idxu_full_half,"SNA::idxu_full_half",idxu_max); auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half); idxu_count = 0; @@ -165,7 +164,7 @@ void SNA::build_indexlist() // index list for "cache" uarray // this is the GPU scratch memory requirements // applied the CPU structures - MemKK::realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim); + realloc_kokkos(idxu_cache_block,"SNA::idxu_cache_block",jdim); auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block); int idxu_cache_count = 0; @@ -187,7 +186,7 @@ void SNA::build_indexlist() if (j >= j1) idxb_count++; idxb_max = idxb_count; - MemKK::realloc_kokkos(idxb,"SNA::idxb",idxb_max); + realloc_kokkos(idxb,"SNA::idxb",idxb_max); auto h_idxb = Kokkos::create_mirror_view(idxb); idxb_count = 0; @@ -204,7 +203,7 @@ void SNA::build_indexlist() // reverse index list for beta and b - MemKK::realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim); + realloc_kokkos(idxb_block,"SNA::idxb_block",jdim,jdim,jdim); auto h_idxb_block = Kokkos::create_mirror_view(idxb_block); idxb_count = 0; @@ -230,10 +229,10 @@ void SNA::build_indexlist() idxz_count++; idxz_max = idxz_count; - MemKK::realloc_kokkos(idxz,"SNA::idxz",idxz_max); + realloc_kokkos(idxz,"SNA::idxz",idxz_max); auto h_idxz = Kokkos::create_mirror_view(idxz); - MemKK::realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim); + realloc_kokkos(idxz_block,"SNA::idxz_block", jdim,jdim,jdim); auto h_idxz_block = Kokkos::create_mirror_view(idxz_block); idxz_count = 0; @@ -288,59 +287,59 @@ void SNA::grow_rij(int newnatom, int newnmax) natom = newnatom; nmax = newnmax; - MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3); - MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax); - MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax); - MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax); - MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax); - MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax); - MemKK::realloc_kokkos(element,"sna:element",natom,nmax); - MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); + realloc_kokkos(rij,"sna:rij",natom,nmax,3); + realloc_kokkos(wj,"sna:wj",natom,nmax); + realloc_kokkos(rcutij,"sna:rcutij",natom,nmax); + realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax); + realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax); + realloc_kokkos(inside,"sna:inside",natom,nmax); + realloc_kokkos(element,"sna:element",natom,nmax); + realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); #ifdef LMP_KOKKOS_GPU if (!host_flag) { const int natom_div = (natom + vector_length - 1) / vector_length; - MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div); - MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); - MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); - MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); + realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); + realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); + realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3); + realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3); + realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4); + realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation + realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); + realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); + realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div); + realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div); + realloc_kokkos(ulist,"sna:ulist",1,1,1); + realloc_kokkos(zlist,"sna:zlist",1,1,1); + realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); + realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); + realloc_kokkos(ylist,"sna:ylist",1,1,1); + realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); + realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); + realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { #endif - MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom); - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); - MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax); - MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); - MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); - MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); + realloc_kokkos(a_pack,"sna:a_pack",1,1,1); + realloc_kokkos(b_pack,"sna:b_pack",1,1,1); + realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); + realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); + realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); + realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom); + realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom); + realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); + realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); + realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); + realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax); + realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom); + realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); + realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); + realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom); + realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); + realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); + realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); #ifdef LMP_KOKKOS_GPU } diff --git a/src/types.h b/src/types.h index c72e631..771c3d5 100644 --- a/src/types.h +++ b/src/types.h @@ -114,6 +114,24 @@ typedef Kokkos::View t_mass_const; // Mass typedef Kokkos::DefaultExecutionSpace::memory_space t_neigh_mem_space; +namespace Kokkos { + static auto NoInit = [](std::string const& label) { + return Kokkos::view_alloc(Kokkos::WithoutInitializing, label); + }; +} + +/* ---------------------------------------------------------------------- + reallocate Kokkos views without initialization + deallocate first to reduce memory use +------------------------------------------------------------------------- */ + +template +static void realloc_kokkos(TYPE &data, const char *name, Indices... ns) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)), ns...); +} + template struct t_scalar3 { Scalar x,y,z; From fbd684ae39dfb6fd3c0e5c00da5a722da61b09de Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 13 Jul 2022 15:19:02 -0600 Subject: [PATCH 05/11] WIP --- src/force_types/force_snap_neigh_impl.h | 30 +++--- src/force_types/sna.h | 124 ++++++++++++------------ src/force_types/sna_impl.hpp | 53 +++++----- src/types.h | 78 +++++++++++++++ 4 files changed, 179 insertions(+), 106 deletions(-) diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h index de6ba6e..bcc8c7b 100644 --- a/src/force_types/force_snap_neigh_impl.h +++ b/src/force_types/force_snap_neigh_impl.h @@ -267,9 +267,9 @@ void ForceSNAP::init_coeff(int narg, char **arg) for (int i = 0; i < nelements; i++) delete[] elements[i]; delete[] elements; - radelem = Kokkos::View(); - wjelem = Kokkos::View(); - coeffelem = Kokkos::View(); + radelem = Kokkos::View(); + wjelem = Kokkos::View(); + coeffelem = Kokkos::View(); } nelements = narg - 5 - system->ntypes; @@ -422,9 +422,9 @@ void ForceSNAP::read_files(char *coefffilename, char *paramfilena // Set up element lists - radelem = Kokkos::View("pair:radelem",nelements); - wjelem = Kokkos::View("pair:wjelem",nelements); - coeffelem = Kokkos::View("pair:coeffelem",nelements,ncoeffall); + radelem = Kokkos::View("pair:radelem",nelements); + wjelem = Kokkos::View("pair:wjelem",nelements); + coeffelem = Kokkos::View("pair:coeffelem",nelements,ncoeffall); int *found = new int[nelements]; for (int ielem = 0; ielem < nelements; ielem++) @@ -811,12 +811,12 @@ void ForceSNAP::operator() (const Kokkos::TeamPolicy<>::member_ty [&] (const int jj, int& count) { Kokkos::single(Kokkos::PerThread(team), [&] (){ T_INT j = neighs_i(jj); - const T_F_FLOAT dx = x(j,0) - x_i; - const T_F_FLOAT dy = x(j,1) - y_i; - const T_F_FLOAT dz = x(j,2) - z_i; + const double dx = x(j,0) - x_i; + const double dy = x(j,1) - y_i; + const double dz = x(j,2) - z_i; const int type_j = type(j); - const T_F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const double rsq = dx*dx + dy*dy + dz*dz; const int elem_j = map[type_j]; if( rsq < rnd_cutsq(type_i,type_j) ) @@ -831,12 +831,12 @@ void ForceSNAP::operator() (const Kokkos::TeamPolicy<>::member_ty [&] (const int jj, int& offset, bool final){ //for (int jj = 0; jj < num_neighs; jj++) { T_INT j = neighs_i(jj); - const T_F_FLOAT dx = x(j,0) - x_i; - const T_F_FLOAT dy = x(j,1) - y_i; - const T_F_FLOAT dz = x(j,2) - z_i; + const double dx = x(j,0) - x_i; + const double dy = x(j,1) - y_i; + const double dz = x(j,2) - z_i; const int type_j = type(j); - const T_F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const double rsq = dx*dx + dy*dy + dz*dz; const int elem_j = map[type_j]; if( rsq < rnd_cutsq(type_i,type_j) ) { @@ -884,7 +884,7 @@ void ForceSNAP::operator() (const Kokkos::TeamPolicy<>::member_ty Kokkos::single(Kokkos::PerThread(team), [&] (){ - T_F_FLOAT fij[3]; + double fij[3]; fij[0] = 0.0; fij[1] = 0.0; diff --git a/src/force_types/sna.h b/src/force_types/sna.h index f1e6c0b..f2f99e6 100644 --- a/src/force_types/sna.h +++ b/src/force_types/sna.h @@ -66,9 +66,8 @@ #endif struct WignerWrapper { - using double = double; using complex = SNAComplex; - static constexpr int vector_length = vector_length_; + static constexpr int vector_length = 32; const int offset; // my offset into the vector (0, ..., vector_length - 1) double* buffer; // buffer of real numbers @@ -95,46 +94,43 @@ struct alignas(8) FullHalfMapper { int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj }; -template class SNA { public: using complex = SNAComplex; - static constexpr int vector_length = vector_length_; - - using KKDeviceType = typename KKDevice::value; - - typedef Kokkos::View t_sna_1i; - typedef Kokkos::View t_sna_1d; - typedef Kokkos::View> t_sna_1d_atomic; - typedef Kokkos::View t_sna_2i; - typedef Kokkos::View t_sna_2d; - typedef Kokkos::View t_sna_2d_ll; - typedef Kokkos::View t_sna_3d; - typedef Kokkos::View t_sna_3d_ll; - typedef Kokkos::View t_sna_4d; - typedef Kokkos::View t_sna_4d_ll; - typedef Kokkos::View t_sna_3d3; - typedef Kokkos::View t_sna_5d; - - typedef Kokkos::View t_sna_1c; - typedef Kokkos::View> t_sna_1c_atomic; - typedef Kokkos::View t_sna_2c; - typedef Kokkos::View t_sna_2c_ll; - typedef Kokkos::View t_sna_2c_lr; - typedef Kokkos::View t_sna_3c; - typedef Kokkos::View t_sna_3c_ll; - typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c3_ll; - typedef Kokkos::View t_sna_4c_ll; - typedef Kokkos::View t_sna_3c3; - typedef Kokkos::View t_sna_5c; + static constexpr int vector_length = 32; + + typedef Kokkos::View t_sna_1i; + typedef Kokkos::View t_sna_1d; + typedef Kokkos::View> t_sna_1d_atomic; + typedef Kokkos::View t_sna_2i; + typedef Kokkos::View t_sna_2d; + typedef Kokkos::View t_sna_2d_ll; + typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; + typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; + typedef Kokkos::View t_sna_3d3; + typedef Kokkos::View t_sna_5d; + + typedef Kokkos::View t_sna_1c; + typedef Kokkos::View> t_sna_1c_atomic; + typedef Kokkos::View t_sna_2c; + typedef Kokkos::View t_sna_2c_ll; + typedef Kokkos::View t_sna_2c_lr; + typedef Kokkos::View t_sna_3c; + typedef Kokkos::View t_sna_3c_ll; + typedef Kokkos::View t_sna_4c; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_3c3; + typedef Kokkos::View t_sna_5c; inline SNA() {}; KOKKOS_INLINE_FUNCTION - SNA(const SNA& sna, const typename Kokkos::TeamPolicy::member_type& team); + SNA(const SNA& sna, const typename Kokkos::TeamPolicy<>::member_type& team); inline SNA(double, int, double, int, int, int, int, int, int, int); @@ -161,19 +157,19 @@ class SNA { // version of the code with parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); // ForceSNAP + void compute_ui_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int, const int); // ForceSNAP // version of the code without parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); // ForceSNAP + void compute_ui_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&, const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP + const Kokkos::View &beta_pack); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_with_zlist(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP + const Kokkos::View &beta_pack); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&, const int&); // ForceSNAP @@ -181,16 +177,16 @@ class SNA { // version of the code with parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int, const int); //ForceSNAP // version of the code without parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int, const int, const int); //ForceSNAP // core "evaluation" functions that get plugged into "compute" functions // plugged into compute_ui_small, compute_ui_large KOKKOS_FORCEINLINE_FUNCTION - void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const double&, const int&, + void evaluate_ui_jbend(const WignerWrapper, const complex&, const complex&, const double&, const int&, const int&, const int&, const int&); // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION @@ -199,31 +195,31 @@ class SNA { // plugged into compute_yi, compute_yi_with_zlist KOKKOS_FORCEINLINE_FUNCTION double evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const Kokkos::View &); + const Kokkos::View &); // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large KOKKOS_FORCEINLINE_FUNCTION - double evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const double&, - const WignerWrapper&, const complex&, const complex&, const double&, + double evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const double&, + const WignerWrapper&, const complex&, const complex&, const double&, const int&, const int&, const int&, const int&); // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + void pre_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_zi_cpu(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_cpu(int, - const Kokkos::View &beta); // ForceSNAP + const Kokkos::View &beta); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + void compute_bi_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int); // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION - void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION double compute_sfac(double, double, double, double); // add_uarraytot, compute_duarray @@ -296,19 +292,19 @@ class SNA { //use indexlist instead of loops, constructor generates these // Same across all SNA - Kokkos::View idxz; - Kokkos::View idxb; - Kokkos::View idxcg_block; + Kokkos::View idxz; + Kokkos::View idxb; + Kokkos::View idxcg_block; public: - Kokkos::View idxu_block; - Kokkos::View idxu_half_block; - Kokkos::View idxu_cache_block; - Kokkos::View idxu_full_half; + Kokkos::View idxu_block; + Kokkos::View idxu_half_block; + Kokkos::View idxu_cache_block; + Kokkos::View idxu_full_half; private: - Kokkos::View idxz_block; - Kokkos::View idxb_block; + Kokkos::View idxz_block; + Kokkos::View idxb_block; // data for bispectrum coefficients @@ -322,9 +318,9 @@ class SNA { double factorial(int); KOKKOS_INLINE_FUNCTION - void create_team_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team); // SNA() + void create_team_scratch_arrays(const typename Kokkos::TeamPolicy<>::member_type& team); // SNA() KOKKOS_INLINE_FUNCTION - void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team); // SNA() + void create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<>::member_type& team); // SNA() inline void init_clebsch_gordan(); // init() @@ -333,10 +329,10 @@ class SNA { void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui + void add_uarraytot(const typename Kokkos::TeamPolicy<>::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&, int); // compute_ui KOKKOS_INLINE_FUNCTION - void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + void compute_uarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int, const double&, const double&, const double&, const double&, const double&); // compute_ui_cpu @@ -347,7 +343,7 @@ class SNA { inline int compute_ncoeff(); // SNA() KOKKOS_INLINE_FUNCTION - void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + void compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int, int, const double&, const double&, const double&, // compute_duidrj_cpu const double&, const double&, const double&, const double&, const double&, const double&, const double&); @@ -371,7 +367,7 @@ class SNA { int wselfall_flag; int bzero_flag; // 1 if bzero subtracted from barray - Kokkos::View bzero; // array of B values for isolated atoms + Kokkos::View bzero; // array of B values for isolated atoms }; #include diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp index 01a6140..fda5a36 100644 --- a/src/force_types/sna_impl.hpp +++ b/src/force_types/sna_impl.hpp @@ -30,8 +30,7 @@ SNA::SNA(double rfac0_in, int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in, int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in) { - LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; - host_flag = (execution_space == LAMMPS_NS::Host); + host_flag = 0; wself = static_cast(1.0); @@ -479,7 +478,7 @@ void SNA::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int // Version of the code that exposes additional parallelism by threading over `j_bend` values KOKKOS_INLINE_FUNCTION -void SNA::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNA::compute_ui_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) { // get shared memory offset @@ -490,7 +489,7 @@ void SNA::compute_ui_small(const typename Kokkos::TeamPolicy::member const int scratch_shift = team_rank * tile_size; // extract and wrap - const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters const complex a = a_pack(iatom_mod, jnbor, iatom_div); @@ -509,7 +508,7 @@ void SNA::compute_ui_small(const typename Kokkos::TeamPolicy::member // Version of the code that loops over all `j_bend` values which reduces integer arithmetic // and some amount of load imbalance, at the expense of reducing parallelism KOKKOS_INLINE_FUNCTION -void SNA::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNA::compute_ui_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -519,7 +518,7 @@ void SNA::compute_ui_large(const typename Kokkos::TeamPolicy::member const int scratch_shift = team_rank * tile_size; // extract and wrap - const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters const complex a = a_pack(iatom_mod, jnbor, iatom_div); @@ -539,7 +538,7 @@ void SNA::compute_ui_large(const typename Kokkos::TeamPolicy::member // Core "evaluation" kernel that gets reused in `compute_ui_small` and `compute_ui_large` KOKKOS_FORCEINLINE_FUNCTION -void SNA::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, +void SNA::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac, const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { @@ -773,7 +772,7 @@ void SNA::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) KOKKOS_INLINE_FUNCTION void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) + const Kokkos::View &beta_pack) { const int j1 = idxz(jjz, 0); @@ -821,7 +820,7 @@ void SNA::compute_yi(int iatom_mod, int jjz, int iatom_div, KOKKOS_INLINE_FUNCTION void SNA::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) + const Kokkos::View &beta_pack) { const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -904,7 +903,7 @@ typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int& KOKKOS_FORCEINLINE_FUNCTION typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, - const Kokkos::View &beta_pack) { + const Kokkos::View &beta_pack) { double betaj = 0; @@ -943,7 +942,7 @@ typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, con // Version of the code that exposes additional parallelism by threading over `j_bend` values template KOKKOS_INLINE_FUNCTION -void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -953,8 +952,8 @@ void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); - WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters const complex a = a_pack(iatom_mod, jnbor, iatom_div); @@ -979,7 +978,7 @@ void SNA::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -989,8 +988,8 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); - WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters const complex a = a_pack(iatom_mod, jnbor, iatom_div); @@ -1020,8 +1019,8 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy& ulist_wrapper, const complex& a, const complex& b, const double& sfac, - const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu, +typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac, + const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu, const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { double dedr_full_sum = static_cast(0); @@ -1167,7 +1166,7 @@ typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper::member_type& team, const int& iatom, const int& ielem) +void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, const int& iatom, const int& ielem) { for (int jelem = 0; jelem < nelements; jelem++) { for (int j = 0; j <= twojmax; j++) { @@ -1200,7 +1199,7 @@ void SNA::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNA::compute_ui_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor) { double rsq, r, x, y, z, z0, theta0; @@ -1298,7 +1297,7 @@ void SNA::compute_zi_cpu(const int& iter) ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -1396,7 +1395,7 @@ void SNA::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_t KOKKOS_INLINE_FUNCTION void SNA::compute_yi_cpu(int iter, - const Kokkos::View &beta) + const Kokkos::View &beta) { double betaj; const int iatom = iter / idxz_max; @@ -1503,7 +1502,7 @@ void SNA::compute_yi_cpu(int iter, ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor) { double rsq, r, x, y, z, z0, theta0, cs, sn; double dz0dr; @@ -1534,7 +1533,7 @@ void SNA::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::memb KOKKOS_INLINE_FUNCTION -void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor) { t_scalar3 final_sum; const int jelem = element(iatom, jnbor); @@ -1599,7 +1598,7 @@ void SNA::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::memb ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNA::add_uarraytot(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor, const double& r, const double& wj, const double& rcut, const double& sinner, const double& dinner, int jelem) { @@ -1629,7 +1628,7 @@ void SNA::add_uarraytot(const typename Kokkos::TeamPolicy::member_ty ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor, const double& x, const double& y, const double& z, const double& z0, const double& r) { double r0inv; @@ -1719,7 +1718,7 @@ void SNA::compute_uarray_cpu(const typename Kokkos::TeamPolicy::memb ------------------------------------------------------------------------- */ KOKKOS_INLINE_FUNCTION -void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& team, int iatom, int jnbor, const double& x, const double& y, const double& z, const double& z0, const double& r, const double& dz0dr, const double& wj, const double& rcut, diff --git a/src/types.h b/src/types.h index 771c3d5..48d30ff 100644 --- a/src/types.h +++ b/src/types.h @@ -112,6 +112,84 @@ typedef Kokkos::View t_q_const; // Charge typedef Kokkos::View t_mass; // Mass typedef Kokkos::View t_mass_const; // Mass +// intentional: SNAreal/complex gets reused beyond SNAP +typedef double SNAreal; + +//typedef struct { SNAreal re, im; } SNAcomplex; +template +struct alignas(2*sizeof(real_type_)) SNAComplex +{ + using real_type = real_type_; + using complex = SNAComplex; + real_type re,im; + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex() + : re(static_cast(0.)), im(static_cast(0.)) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re) + : re(re), im(static_cast(0.)) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re, real_type im) + : re(re), im(im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other) + : re(other.re), im(other.im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(const SNAComplex& other) { + re = other.re; im = other.im; + return *this; + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(SNAComplex&& other) + : re(other.re), im(other.im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(SNAComplex&& other) { + re = other.re; im = other.im; + return *this; + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator+(SNAComplex const& other) { + return SNAComplex(re + other.re, im + other.im); + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator+=(SNAComplex const& other) { + re += other.re; im += other.im; + return *this; + } + + KOKKOS_INLINE_FUNCTION + static constexpr complex zero() { return complex(static_cast(0.), static_cast(0.)); } + + KOKKOS_INLINE_FUNCTION + static constexpr complex one() { return complex(static_cast(1.), static_cast(0.)); } + + KOKKOS_INLINE_FUNCTION + const complex conj() const { return complex(re, -im); } + + KOKKOS_INLINE_FUNCTION + const real_type real_part_product(const complex &cm2) { return re * cm2.re - im * cm2.im; } + + KOKKOS_INLINE_FUNCTION + const real_type real_part_product(const real_type &r) const { return re * r; } +}; + +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const real_type& r, const SNAComplex& self) { + return SNAComplex(r*self.re, r*self.im); +} + +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const SNAComplex& self, const real_type& r) { + return SNAComplex(r*self.re, r*self.im); +} + +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const SNAComplex& self, const SNAComplex& cm2) { + return SNAComplex(self.re*cm2.re - self.im*cm2.im, self.re*cm2.im + self.im*cm2.re); +} + +typedef SNAComplex SNAcomplex; + typedef Kokkos::DefaultExecutionSpace::memory_space t_neigh_mem_space; namespace Kokkos { From 9f39618157bed93bda5d8e4c640371f71363a929 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 13 Jul 2022 15:28:15 -0600 Subject: [PATCH 06/11] WIP --- src/force_types/force_snap_neigh_impl.h | 40 ++++++++++++------------- src/force_types/sna.h | 2 +- src/force_types/sna_impl.hpp | 4 +-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h index bcc8c7b..e068bd6 100644 --- a/src/force_types/force_snap_neigh_impl.h +++ b/src/force_types/force_snap_neigh_impl.h @@ -624,7 +624,7 @@ void ForceSNAP::operator() (TagComputeNeighCPU,const typename Kok int ii = team.league_rank(); const int i = d_ilist[ii + chunk_offset]; // TODO: snaKK is used in lammps - SNAKokkos my_sna = snaKK; + SNA my_sna = snaKK; const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -645,12 +645,12 @@ void ForceSNAP::operator() (TagComputeNeighCPU,const typename Kok [&] (const int jj, int& count) { Kokkos::single(Kokkos::PerThread(team), [&] () { T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; + const double dx = x(j,0) - xtmp; + const double dy = x(j,1) - ytmp; + const double dz = x(j,2) - ztmp; const int jtype = type(j); - const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const double rsq = dx*dx + dy*dy + dz*dz; if (rsq < rnd_cutsq(itype,jtype)) count++; @@ -664,12 +664,12 @@ void ForceSNAP::operator() (TagComputeNeighCPU,const typename Kok [&] (const int jj, int& offset, bool final) { //for (int jj = 0; jj < num_neighs; jj++) { T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; + const double dx = x(j,0) - xtmp; + const double dy = x(j,1) - ytmp; + const double dz = x(j,2) - ztmp; const int jtype = type(j); - const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const double rsq = dx*dx + dy*dy + dz*dz; const int elem_j = d_map[jtype]; if (rsq < rnd_cutsq(itype,jtype)) { @@ -696,7 +696,7 @@ template KOKKOS_INLINE_FUNCTION void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; + SNA my_sna = snaKK; // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); @@ -713,9 +713,9 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos // Load various info about myself up front const int i = d_ilist[ii + chunk_offset]; - const F_FLOAT xtmp = x(i,0); - const F_FLOAT ytmp = x(i,1); - const F_FLOAT ztmp = x(i,2); + const double xtmp = x(i,0); + const double ytmp = x(i,1); + const double ztmp = x(i,2); const int itype = type[i]; const int ielem = d_map[itype]; const double radi = d_radelem[ielem]; @@ -733,12 +733,12 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs), [&] (const int jj, int& count) { T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; + const double dx = x(j,0) - xtmp; + const double dy = x(j,1) - ytmp; + const double dz = x(j,2) - ztmp; int jtype = type(j); - const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const double rsq = dx*dx + dy*dy + dz*dz; if (rsq >= rnd_cutsq(itype,jtype)) { jtype = -1; // use -1 to signal it's outside the radius @@ -760,9 +760,9 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos if (jtype >= 0) { if (final) { T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; + const double dx = x(j,0) - xtmp; + const double dy = x(j,1) - ytmp; + const double dz = x(j,2) - ztmp; const int elem_j = d_map[jtype]; my_sna.rij(ii,offset,0) = dx; my_sna.rij(ii,offset,1) = dy; diff --git a/src/force_types/sna.h b/src/force_types/sna.h index f2f99e6..bdd183f 100644 --- a/src/force_types/sna.h +++ b/src/force_types/sna.h @@ -186,7 +186,7 @@ class SNA { // core "evaluation" functions that get plugged into "compute" functions // plugged into compute_ui_small, compute_ui_large KOKKOS_FORCEINLINE_FUNCTION - void evaluate_ui_jbend(const WignerWrapper, const complex&, const complex&, const double&, const int&, + void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const double&, const int&, const int&, const int&, const int&); // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp index fda5a36..4804cc3 100644 --- a/src/force_types/sna_impl.hpp +++ b/src/force_types/sna_impl.hpp @@ -901,7 +901,7 @@ typename SNA::complex SNA::evaluate_zi(const int& j1, const int& j2, const int& // Core "evaluation" kernel that extracts and rescales the appropriate `beta` value, // which gets used in both `compute_yi` and `compute_yi_from_zlist KOKKOS_FORCEINLINE_FUNCTION -typename SNA::double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, +double SNA::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, const Kokkos::View &beta_pack) { @@ -1019,7 +1019,7 @@ void SNA::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<>::member // Core "evaluation" kernel that gets reused in `compute_fused_deidrj_small` and // `compute_fused_deidrj_large` KOKKOS_FORCEINLINE_FUNCTION -typename SNA::double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac, +double SNA::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const double& sfac, const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const double& dsfacu, const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { From 4fc5d711368c99beeb5435774aeeaa8ba021972e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 16 Aug 2022 10:57:05 -0600 Subject: [PATCH 07/11] WIP --- src/force_types/force_snap_neigh_impl.h | 331 +++++++++++++++++++++--- 1 file changed, 289 insertions(+), 42 deletions(-) diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h index e068bd6..b44b8ff 100644 --- a/src/force_types/force_snap_neigh_impl.h +++ b/src/force_types/force_snap_neigh_impl.h @@ -134,6 +134,8 @@ ForceSNAP::ForceSNAP(char** args, System* system_, bool half_neig // Need to set this because restart not handled by ForceHybrid cutsq = t_fparams("ForceSNAP::cutsq",system->ntypes,system->ntypes); + + host_flag = 0; } /* ---------------------------------------------------------------------- */ @@ -185,54 +187,296 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo NeighborClass* neighbor = (NeighborClass*) neighbor_; neigh_list = neighbor->get_neigh_list(); - int max_neighs = 0; /* for (int i = 0; i < nlocal; i++) { typename t_neigh_list::t_neighs neighs_i = neigh_list.get_neighs(i); const int num_neighs = neighs_i.get_num_neighs(); if(max_neighs(neigh_list), Kokkos::Max(max_neighs)); - // snaKK (line 220 in lammps) - sna.nmax = max_neighs; - // Put LAMMPS code starting here - T_INT team_scratch_size = sna.size_team_scratch_arrays(); - T_INT thread_scratch_size = sna.size_thread_scratch_arrays(); - - //printf("Sizes: %i %i\n",team_scratch_size/1024,thread_scratch_size/1024); - int vector_length = 8; - int team_size_max = Kokkos::TeamPolicy<>(nlocal,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); -#ifdef EMD_ENABLE_GPU - // Same as 207 in lammps - int team_size = 20;//max_neighs; - if(team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; -#else - int team_size = 1; -#endif - //ComputeNeigh -- lammps + max_neighs = 0; + Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",inum, FindMaxNumNeighs(k_list), Kokkos::Max(max_neighs)); + + int team_size_default = 1; + if (!host_flag) + team_size_default = 32;//max_neighs; + + if (beta_max < inum) { + beta_max = inum; + MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,inum); + if (!host_flag) + MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); + MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",inum); + } + + chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_offset = 0; + + sna.grow_rij(chunk_size,max_neighs); + + while (chunk_offset < inum) { // chunk up loop to prevent running out of memory + + if (chunk_size > inum - chunk_offset) + chunk_size = inum - chunk_offset; + + if (host_flag) + { + // Host codepath + + //ComputeNeigh + /*{ + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); + } + + //PreUi + { + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); + } + + // ComputeUi + { + int team_size = team_size_default; + // Fused calculation of ulist and accumulation into ulisttot using atomics + typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } + + { + // Expand ulisttot -> ulisttot_full + // Zero out ylist + typename Kokkos::MDRangePolicy, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); + Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); + } + + //Compute bispectrum + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = sna.idxz_max; + typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); + + //ComputeBi + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); + } + + //ComputeYi + { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); + + //ComputeYi + int idxz_max = sna.idxz_max; + typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); + } // host flag + + //ComputeDuidrj and Deidrj + { + int team_size = team_size_default; + + typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + }*/ + + } else { // GPU + +#ifdef EMD_KOKKOS_GPU + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + //ComputeNeigh { // team_size_compute_neigh is defined in `pair_snap_kokkos.h` int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - } + } + //ComputeCayleyKlein + /*{ + // tile_size_compute_ck is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); + Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + } + //PreUi + { + // tile_size_pre_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `pair_snap_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // team_size_transform_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_transform_ui({0,0,0},{vector_length,sna.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } + + //Compute bispectrum in AoSoA data layout, transform Bi + if (quadraticflag || eflag) { + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + + //ComputeZi + const int idxz_max = sna.idxz_max; + Snap3DRangePolicy + policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + + //ComputeBi + const int idxb_max = sna.idxb_max; + Snap3DRangePolicy + policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + + //Transform data layout of blist out of AoSoA + //We need this because `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA, which at best would only be beneficial on the margins + Snap3DRangePolicy + policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + } + + //Note zeroing `ylist` is fused into `TransformUi`. + { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); + const int idxz_max = sna.idxz_max; + if (quadraticflag || eflag) { + Snap3DRangePolicy + policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); + Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); + } else { + Snap3DRangePolicy + policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); + Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); + } + } + + // Fused ComputeDuidrj, ComputeDeidrj + { + // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + + // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(2 * team_size_compute_fused_deidrj * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); + + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); + + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); + + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); + + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); + + } + }*/ + +#endif // LMP_KOKKOS_GPU + + } + + //ComputeForce + /*{ + if (neighflag == HALF) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for(policy_force, *this); + } else if (neighflag == HALFTHREAD) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for(policy_force, *this); + } + }*/ + chunk_offset += chunk_size; + + } -/* - Kokkos::TeamPolicy<> policy(nlocal,team_size,vector_length); - // Replaced in LAMMPS by a lot of functors - Kokkos::parallel_for("ForceSNAP::compute",policy - .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) - .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) - ,*this); -*/ -//static int step =0; -//step++; -//if(step%10==0) // printf(" %e %e %e %e %e (%e %e): %e\n",t1,t2,t3,t4,t5,t6,t7,t1+t2+t3+t4+t5); } @@ -344,7 +588,7 @@ void ForceSNAP::init_coeff(int narg, char **arg) diagonalstyle,use_shared_arrays, rmin0,switchflag,bzeroflag); //if (!use_shared_arrays) - sna.grow_rij(nmax); + sna.grow_rij(nmax,max_neighs); sna.init(); //printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff); @@ -623,8 +867,8 @@ void ForceSNAP::operator() (TagComputeNeighCPU,const typename Kok int ii = team.league_rank(); const int i = d_ilist[ii + chunk_offset]; - // TODO: snaKK is used in lammps - SNA my_sna = snaKK; + // TODO: sna is used in lammps + SNA my_sna = sna; const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -696,7 +940,7 @@ template KOKKOS_INLINE_FUNCTION void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - SNA my_sna = snaKK; + SNA my_sna = sna; // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); @@ -712,7 +956,12 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; // Load various info about myself up front - const int i = d_ilist[ii + chunk_offset]; + const int i = ii + chunk_offset; + + + typename t_neigh_list::t_neighs neighs_i = neigh_list.get_neighs(i); + const int num_neighs = neighs_i.get_num_neighs(); + const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -720,8 +969,6 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos const int ielem = d_map[itype]; const double radi = d_radelem[ielem]; - const int num_neighs = d_numneigh[i]; - // rij[][3] = displacements between atom I and those neighbors // inside = indices of neighbors of I within cutoff // wj = weights for neighbors of I within cutoff @@ -732,7 +979,7 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos int ninside = 0; Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs), [&] (const int jj, int& count) { - T_INT j = d_neighbors(i,jj); + T_INT j = neighs_i(jj); const double dx = x(j,0) - xtmp; const double dy = x(j,1) - ytmp; const double dz = x(j,2) - ztmp; @@ -759,7 +1006,7 @@ void ForceSNAP::operator() (TagComputeNeigh,const typename Kokkos if (jtype >= 0) { if (final) { - T_INT j = d_neighbors(i,jj); + T_INT j = neighs_i(jj); const double dx = x(j,0) - xtmp; const double dy = x(j,1) - ytmp; const double dz = x(j,2) - ztmp; From 76daaf40fdfb15b3355506d9706e710dc8199e43 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 16 Aug 2022 11:58:25 -0600 Subject: [PATCH 08/11] WIP --- src/force_types/force_snap_neigh.h | 40 ++++++----- src/force_types/force_snap_neigh_impl.h | 93 +++++++++++++------------ 2 files changed, 69 insertions(+), 64 deletions(-) diff --git a/src/force_types/force_snap_neigh.h b/src/force_types/force_snap_neigh.h index c342f10..d05b59b 100644 --- a/src/force_types/force_snap_neigh.h +++ b/src/force_types/force_snap_neigh.h @@ -142,10 +142,10 @@ struct TagComputeDeidrjCPU{}; void operator() (TagPreUi,const int iatom_mod, const int j, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeUiSmall,const typename Kokkos::TeamPolicyTagComputeUiSmall>::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeUiLarge,const typename Kokkos::TeamPolicyTagComputeUiLarge>::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagTransformUi,const int iatom_mod, const int j, const int iatom_div) const; @@ -170,21 +170,21 @@ struct TagComputeDeidrjCPU{}; template KOKKOS_INLINE_FUNCTION - void operator() (TagComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const; + void operator() (TagComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicyTagComputeFusedDeidrjSmall >::member_type& team) const; template KOKKOS_INLINE_FUNCTION - void operator() (TagComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const; + void operator() (TagComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicyTagComputeFusedDeidrjLarge >::member_type& team) const; // CPU backend only KOKKOS_INLINE_FUNCTION - void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeNeighCPU,const typename Kokkos::TeamPolicyTagComputeNeighCPU>::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPreUiCPU,const typename Kokkos::TeamPolicyTagPreUiCPU>::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeUiCPU,const typename Kokkos::TeamPolicyTagComputeUiCPU>::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagTransformUiCPU, const int j, const int iatom) const; @@ -193,16 +193,16 @@ struct TagComputeDeidrjCPU{}; void operator() (TagComputeZiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeBiCPU,const typename Kokkos::TeamPolicyTagComputeBiCPU>::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagComputeYiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeDuidrjCPU,const typename Kokkos::TeamPolicyTagComputeDuidrjCPU>::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagComputeDeidrjCPU,const typename Kokkos::TeamPolicyTagComputeDeidrjCPU>::member_type& team) const; */ protected: @@ -218,9 +218,9 @@ struct TagComputeDeidrjCPU{}; t_dbvec dbvec; SNA sna; - int nmax; + int nmax,max_neighs; - int chunk_size,chunk_offset; + int chunk_size,chunk_offset,chunksize; int host_flag; // How many interactions can be run concurrently @@ -280,7 +280,8 @@ struct TagComputeDeidrjCPU{}; Kokkos::View wjelem; // elements weights Kokkos::View coeffelem; // element bispectrum coefficients Kokkos::View map; // mapping from atom types to elements - int twojmax, diagonalstyle, switchflag, bzeroflag, quadraticflag; + int twojmax, switchflag, bzeroflag, bnormflag, quadraticflag; + int chemflag, wselfallflag, switchinnerflag, diagonalstyle; double rcutfac, rfac0, rmin0, wj1, wj2; int rcutfacflag, twojmaxflag; // flags for required parameters typedef Kokkos::View t_fparams; @@ -329,7 +330,7 @@ struct TagComputeDeidrjCPU{}; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(double) == 4 ? 4 : 2; #endif -static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN; +static constexpr int vector_length = 32; // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> @@ -344,10 +345,13 @@ static constexpr int vector_length = SNAP_KOKKOS_DEVICE_VECLEN; template using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagPairSNAP>; - - - - + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_ninside; // ninside for all atoms in list }; diff --git a/src/force_types/force_snap_neigh_impl.h b/src/force_types/force_snap_neigh_impl.h index b44b8ff..1feaa70 100644 --- a/src/force_types/force_snap_neigh_impl.h +++ b/src/force_types/force_snap_neigh_impl.h @@ -91,7 +91,6 @@ ForceSNAP::ForceSNAP(char** args, System* system_, bool half_neig nmax = 0; - vector_length = 8; concurrent_interactions = #if defined(KOKKOS_ENABLE_CUDA) std::is_same::value ? @@ -194,29 +193,30 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo if(max_neighs(k_list), Kokkos::Max(max_neighs)); + Kokkos::parallel_reduce("ForceSNAP::find_max_neighs",nlocal, FindMaxNumNeighs(neigh_list), Kokkos::Max(max_neighs)); int team_size_default = 1; if (!host_flag) team_size_default = 32;//max_neighs; - if (beta_max < inum) { - beta_max = inum; - MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,inum); +/* + if (beta_max < nlocal) { + beta_max = nlocal; + MemKK::realloc_kokkos(d_beta,"ForceSNAP:beta",ncoeff,nlocal); if (!host_flag) - MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); - MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",inum); - } + MemKK::realloc_kokkos(d_beta_pack,"ForceSNAP:beta_pack",vector_length,ncoeff,(nlocal + vector_length - 1) / vector_length); + MemKK::realloc_kokkos(d_ninside,"ForceSNAP:ninside",nlocal); + }*/ - chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_size = MIN(chunksize,nlocal); // "chunksize" variable is set by user chunk_offset = 0; sna.grow_rij(chunk_size,max_neighs); - while (chunk_offset < inum) { // chunk up loop to prevent running out of memory + while (chunk_offset < nlocal) { // chunk up loop to prevent running out of memory - if (chunk_size > inum - chunk_offset) - chunk_size = inum - chunk_offset; + if (chunk_size > nlocal - chunk_offset) + chunk_size = nlocal - chunk_offset; if (host_flag) { @@ -226,7 +226,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo /*{ int team_size = team_size_default; check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPComputeNeighCPU> policy_neigh(chunk_size,team_size,vector_length); Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); } @@ -234,7 +234,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo { int team_size = team_size_default; check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); } @@ -242,14 +242,14 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo { int team_size = team_size_default; // Fused calculation of ulist and accumulation into ulisttot using atomics - typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); } { // Expand ulisttot -> ulisttot_full // Zero out ylist - typename Kokkos::MDRangePolicy, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); + typename Kokkos::MDRangePolicyKokkos::IndexType, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); } @@ -257,25 +257,25 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo if (quadraticflag || eflag) { //ComputeZi int idxz_max = sna.idxz_max; - typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); + typename Kokkos::RangePolicyTagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); //ComputeBi int team_size = team_size_default; check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } //ComputeYi { //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); + typename Kokkos::RangePolicyTagPairSNAPBetaCPU> policy_beta(0,chunk_size); Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); //ComputeYi int idxz_max = sna.idxz_max; - typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); + typename Kokkos::RangePolicyTagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); } // host flag @@ -283,10 +283,10 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo { int team_size = team_size_default; - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::TeamPolicyTagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); }*/ @@ -303,7 +303,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo // team_size_compute_neigh is defined in `pair_snap_kokkos.h` int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + SnapAoSoATeamPolicyteam_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } @@ -311,7 +311,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //ComputeCayleyKlein /*{ // tile_size_compute_ck is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy + Snap3DRangePolicytile_size_compute_ck, TagPairSNAPComputeCayleyKlein> policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); } @@ -319,7 +319,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //PreUi { // tile_size_pre_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy + Snap3DRangePolicytile_size_pre_ui, TagPairSNAPPreUi> policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); Kokkos::parallel_for("PreUi",policy_preui,*this); } @@ -339,7 +339,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicyteam_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); } else { @@ -349,7 +349,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo const int n_teams = chunk_size_div * max_neighs; const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicyteam_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); } @@ -358,7 +358,7 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //TransformUi: un-"fold" ulisttot, zero ylist { // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy + Snap3DRangePolicytile_size_transform_ui, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{vector_length,sna.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); } @@ -369,20 +369,20 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //ComputeZi const int idxz_max = sna.idxz_max; - Snap3DRangePolicy + Snap3DRangePolicytile_size_compute_zi, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); //ComputeBi const int idxb_max = sna.idxb_max; - Snap3DRangePolicy + Snap3DRangePolicytile_size_compute_bi, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); //Transform data layout of blist out of AoSoA //We need this because `blist` gets used in ComputeForce which doesn't //take advantage of AoSoA, which at best would only be beneficial on the margins - Snap3DRangePolicy + Snap3DRangePolicytile_size_transform_bi, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); } @@ -390,15 +390,15 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //Note zeroing `ylist` is fused into `TransformUi`. { //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); + typename Kokkos::RangePolicyTagPairSNAPBeta> policy_beta(0,chunk_size); Kokkos::parallel_for("ComputeBeta",policy_beta,*this); const int idxz_max = sna.idxz_max; if (quadraticflag || eflag) { - Snap3DRangePolicy + Snap3DRangePolicytile_size_compute_yi, TagPairSNAPComputeYiWithZlist> policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); } else { - Snap3DRangePolicy + Snap3DRangePolicytile_size_compute_yi, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); } @@ -421,17 +421,17 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); } else { @@ -442,17 +442,17 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + SnapAoSoATeamPolicyteam_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); @@ -466,10 +466,10 @@ void ForceSNAP::compute(System* system, Binning* binning, Neighbo //ComputeForce /*{ if (neighflag == HALF) { - typename Kokkos::RangePolicy > policy_force(0,chunk_size); + typename Kokkos::RangePolicyTagPairSNAPComputeForce > policy_force(0,chunk_size); Kokkos::parallel_for(policy_force, *this); } else if (neighflag == HALFTHREAD) { - typename Kokkos::RangePolicy > policy_force(0,chunk_size); + typename Kokkos::RangePolicyTagPairSNAPComputeForce > policy_force(0,chunk_size); Kokkos::parallel_for(policy_force, *this); } }*/ @@ -584,10 +584,11 @@ void ForceSNAP::init_coeff(int narg, char **arg) // is wrapped into the sna class - sna = SNA(rfac0,twojmax, - diagonalstyle,use_shared_arrays, - rmin0,switchflag,bzeroflag); - //if (!use_shared_arrays) + sna = SNA(rfac0, twojmax, + rmin0, switchflag, bzeroflag, + chemflag, bnormflag, wselfallflag, + nelements, switchinnerflag); + sna.grow_rij(nmax,max_neighs); sna.init(); From 55451bbd36b6be98466e1b1882700a69df1b012b Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 26 Oct 2022 16:41:13 -0600 Subject: [PATCH 09/11] Changes from last co-development session (july 13, 2022) --- src/KokkosCore_Config_DeclareBackend.hpp | 51 ++++++++++++++++++++++++ src/KokkosCore_Config_FwdBackend.hpp | 51 ++++++++++++++++++++++++ src/KokkosCore_Config_PostInclude.hpp | 49 +++++++++++++++++++++++ src/KokkosCore_Config_SetupBackend.hpp | 50 +++++++++++++++++++++++ src/Makefile | 6 ++- 5 files changed, 205 insertions(+), 2 deletions(-) create mode 100644 src/KokkosCore_Config_DeclareBackend.hpp create mode 100644 src/KokkosCore_Config_FwdBackend.hpp create mode 100644 src/KokkosCore_Config_PostInclude.hpp create mode 100644 src/KokkosCore_Config_SetupBackend.hpp diff --git a/src/KokkosCore_Config_DeclareBackend.hpp b/src/KokkosCore_Config_DeclareBackend.hpp new file mode 100644 index 0000000..26a6b2a --- /dev/null +++ b/src/KokkosCore_Config_DeclareBackend.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_DECLARE_HPP_ +#define KOKKOS_DECLARE_HPP_ + + + +#endif +#include +#include diff --git a/src/KokkosCore_Config_FwdBackend.hpp b/src/KokkosCore_Config_FwdBackend.hpp new file mode 100644 index 0000000..4c64356 --- /dev/null +++ b/src/KokkosCore_Config_FwdBackend.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_FWD_HPP_ +#define KOKKOS_FWD_HPP_ + + + +#endif +#include +#include diff --git a/src/KokkosCore_Config_PostInclude.hpp b/src/KokkosCore_Config_PostInclude.hpp new file mode 100644 index 0000000..7fe4704 --- /dev/null +++ b/src/KokkosCore_Config_PostInclude.hpp @@ -0,0 +1,49 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_POST_INCLUDE_HPP_ +#define KOKKOS_POST_INCLUDE_HPP_ + + + +#endif diff --git a/src/KokkosCore_Config_SetupBackend.hpp b/src/KokkosCore_Config_SetupBackend.hpp new file mode 100644 index 0000000..50795dc --- /dev/null +++ b/src/KokkosCore_Config_SetupBackend.hpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_SETUP_HPP_ +#define KOKKOS_SETUP_HPP_ + + + +#endif +#include diff --git a/src/Makefile b/src/Makefile index e763ae9..5741d9b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,10 +14,12 @@ EXAMINIMD_PATH = $(subst Makefile,,$(MAKEFILE_PATH))/.. default: build echo "Start Build" +MY_MPI_EXE = $(shell which mpicxx) +MY_MPI_PATH = $(dir ${MY_MPI_EXE}) EXE = ExaMiniMD -CXXFLAGS = -O3 -g -LINKFLAGS = -O3 -g +CXXFLAGS = -O3 -g -I${MY_MPI_PATH}../include +LINKFLAGS = -O3 -g -L${MY_MPI_PATH}../lib -lmpi_ibm -lopen-pal -lopen-rte -lhwloc_ompi -levent_core -levent_pthreads -levent ifeq ($(MPI), 1) CXX = mpicxx From 267b6e2b90d9ef3014bfae8cad8ea0f1b7ff161f Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 27 Oct 2022 10:44:25 -0600 Subject: [PATCH 10/11] add RIG to examinimd.h, comment nfac_table --- src/examinimd.h | 5 ++++- src/force_types/sna.h | 2 +- src/force_types/sna_impl.hpp | 16 ++++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/examinimd.h b/src/examinimd.h index 4738703..3e3c0c2 100644 --- a/src/examinimd.h +++ b/src/examinimd.h @@ -36,6 +36,9 @@ // Questions? Contact Christian R. Trott (crtrott@sandia.gov) //************************************************************************ +#ifndef EXAMINI_H +#define EXAMINIMD_H + #include #include @@ -69,4 +72,4 @@ class ExaMiniMD { void shutdown(); }; - +#endif //EXAMINIMD_H diff --git a/src/force_types/sna.h b/src/force_types/sna.h index bdd183f..61fbb5f 100644 --- a/src/force_types/sna.h +++ b/src/force_types/sna.h @@ -313,7 +313,7 @@ class SNA { t_sna_2d rootpqarray; static const int nmaxfactorial = 167; - static const double nfac_table[]; + //static const double nfac_table[]; inline double factorial(int); diff --git a/src/force_types/sna_impl.hpp b/src/force_types/sna_impl.hpp index 4804cc3..51a7465 100644 --- a/src/force_types/sna_impl.hpp +++ b/src/force_types/sna_impl.hpp @@ -1886,19 +1886,22 @@ void SNA::compute_duarray_cpu(const typename Kokkos::TeamPolicy<>::member_type& inline double SNA::factorial(int n) { - //if (n < 0 || n > nmaxfactorial) { - // char str[128]; - // sprintf(str, "Invalid argument to factorial %d", n); - // error->all(FLERR, str); - //} + double result = 1.0; + for(int i=1; i<=n; i++) + result *= 1.0*i; + return result; - return nfac_table[n]; } /* ---------------------------------------------------------------------- factorial n table, size SNA::nmaxfactorial+1 ------------------------------------------------------------------------- */ + +// Using nfac_table gives linking errors + + +/* const double SNA::nfac_table[] = { 1, 1, @@ -2069,6 +2072,7 @@ const double SNA::nfac_table[] = { 9.00369170577843e+297, 1.503616514865e+300, // nmaxfactorial = 167 }; +*/ /* ---------------------------------------------------------------------- the function delta given by VMK Eq. 8.2(1) From 8e85e61c5361ad093d993487afdd0ba54346297d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 31 Oct 2022 11:21:32 -0600 Subject: [PATCH 11/11] CMakeLists.txt: STATUS messages to help identify Kokkos project --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c30c775..ba2e02d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,14 @@ cmake_minimum_required(VERSION 3.10) project(ExaMiniMD LANGUAGES CXX) find_package(Kokkos 3.0 REQUIRED) + +if ((Kokkos_DIR) AND (NOT Kokkos_ROOT)) + message(STATUS "STATUS using lib64/cmake/Kokkos/KokkosConfig.cmake file (in Kokkos installation point) and Kokkos_DIR set to: ${Kokkos_DIR}") +else() + message(STATUS "STATUS using Kokkos installation point with Kokkos_ROOT: ${Kokkos_ROOT}") + message(STATUS "STATUS Kokkos_DIR: ${Kokkos_DIR} automatically set when using Kokkos_ROOT") +endif() + option(USE_MPI "Build with MPI" ON) if (USE_MPI)