diff --git a/input/in.spme b/input/in.spme
new file mode 100644
index 00000000..f076676b
--- /dev/null
+++ b/input/in.spme
@@ -0,0 +1,34 @@
+# 3d NaCl melt
+
+units		metal
+atom_style	charge
+
+newton off
+lattice         fcc 5.64
+region	        box block 0 10 0 10 0 10
+create_box      2 box
+create_atoms    1 box
+
+lattice         fcc 5.64 origin 0.5 0.0 0.0
+region          box block 0 10 0 10 0 10
+create_atoms    2 box
+
+mass            1 22.990
+mass            2 35.453
+set             type 1 charge 1
+set             type 2 charge -1
+
+velocity        all create 300 87287 loop geom
+
+kspace_style    spme 1.0e-6
+pair_style      lj/cut/coul/long 8.0
+pair_coeff      1 1 0.00150286 2.52  8.0
+pair_coeff      2 2 0.01658406 3.85  8.0
+pair_coeff      1 2 0.00499235 3.185 8.0
+
+neighbor        0.3 bin
+neigh_modify    every 20
+fix             1 all nve
+thermo          10
+
+run             100
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 90351166..2f6c0072 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -60,7 +60,7 @@ configure_file(CabanaMD_config.hpp.cmakein CabanaMD_config.hpp @ONLY)
 #------------------------------------------------------------
 
 file(GLOB HEADERS_PUBLIC
-     GLOB *.h force_types/*.h neighbor_types/*.h system_types/*.h
+     GLOB *.h force_types/*.h longrange_force_types/*.h neighbor_types/*.h system_types/*.h
     )
 
 file(GLOB SOURCES
@@ -75,6 +75,11 @@ endif()
 
 list(APPEND SOURCES ${FORCE_TYPES} ${SYSTEM_TYPES} ${SYSNNP_TYPES})
 
+if( Cabana_ENABLE_HEFFTE )
+  file(GLOB LR_FORCE_TYPES longrange_force_types/*.cpp)
+  list(APPEND SOURCES ${LR_FORCE_TYPES})
+endif()
+
 install(FILES ${HEADERS_PUBLIC} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CabanaMD_config.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
@@ -85,6 +90,7 @@ add_library(CabanaMD ${SOURCES})
 # Sources linking against CabanaMD will implicitly include these directories:
 target_include_directories(CabanaMD PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
                                     ${CMAKE_CURRENT_SOURCE_DIR}/force_types
+                                    ${CMAKE_CURRENT_SOURCE_DIR}/longrange_force_types
                                     ${CMAKE_CURRENT_SOURCE_DIR}/neighbor_types
                                     ${CMAKE_CURRENT_SOURCE_DIR}/system_types
                                     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
@@ -100,4 +106,8 @@ if(CabanaMD_ENABLE_NNP AND N2P2_DIR)
   target_link_libraries(CabanaMD ${N2P2_LIB})
 endif()
 
+if(Cabana_ENABLE_HEFFTE)
+   target_link_libraries(CabanaMD Cabana::Cajita)
+endif()
+
 install(TARGETS CabanaMD DESTINATION lib)
diff --git a/src/cabanamd.h b/src/cabanamd.h
index bfb4582f..ba96e825 100644
--- a/src/cabanamd.h
+++ b/src/cabanamd.h
@@ -77,6 +77,7 @@ class CbnMD : public CabanaMD
     t_System *system;
     t_Neighbor *neighbor;
     Force<t_System, t_Neighbor> *force;
+    Force<t_System, t_Neighbor> *lrforce;
     Integrator<t_System> *integrator;
     Comm<t_System> *comm;
     Binning<t_System> *binning;
diff --git a/src/cabanamd_impl.h b/src/cabanamd_impl.h
index 69ce761b..17489d8a 100644
--- a/src/cabanamd_impl.h
+++ b/src/cabanamd_impl.h
@@ -49,6 +49,7 @@
 #include <CabanaMD_config.hpp>
 
 #include <Cabana_Core.hpp>
+#include <Cajita.hpp>
 #include <Kokkos_Core.hpp>
 
 #include <output.h>
@@ -72,23 +73,35 @@ void CbnMD<t_System, t_Neighbor>::init( InputCL commandline )
 
     // Create the Input class: Command line and LAMMPS input file
     input = new InputFile<t_System>( commandline, system );
+
+    // Create output/error streams and overwrite previous runs
+    std::ofstream out( input->output_file, std::ofstream::out );
+    std::ofstream err( input->error_file, std::ofstream::out );
+
     // Read input file
-    input->read_file();
+    input->read_file( out, err );
     nsteps = input->nsteps;
-    std::ofstream out( input->output_file, std::ofstream::app );
-    std::ofstream err( input->error_file, std::ofstream::app );
     log( out, "Read input file." );
 
     using exe_space = typename t_System::execution_space;
     if ( print_rank() )
+    {
         exe_space::print_configuration( out );
+    }
 
+    // Check that the requested pair_style/kspace_style was compiled
 #ifndef CabanaMD_ENABLE_NNP
-    // Check that the requested pair_style was compiled
     if ( input->force_type == FORCE_NNP )
     {
         log_err( err, "NNP requested, but not compiled!" );
     }
+#endif
+#ifndef Cabana_ENABLE_HEFFTE
+    if ( input->lrforce_type != FORCE_NONE )
+    {
+        log_err( err, "LongRange requested, but heFFTe and/or Cajita was not "
+                      "compiled with Cabana!" );
+    }
 #endif
     auto neigh_cutoff = input->force_cutoff + input->neighbor_skin;
     bool half_neigh = input->force_iteration_type == FORCE_ITER_NEIGH_HALF;
@@ -174,27 +187,60 @@ void CbnMD<t_System, t_Neighbor>::init( InputCL commandline )
     }
     force->init_coeff( input->force_coeff_lines );
 
-    log( out, "Using: SystemVectorLength: ", CabanaMD_VECTORLENGTH, " ",
-         system->name() );
-#ifdef CabanaMD_ENABLE_NNP
-    if ( input->force_type == FORCE_NNP )
-        log( out, "Using: SystemNNPVectorLength: ", CabanaMD_VECTORLENGTH_NNP,
-             " ", force->system_name() );
-#endif
-    log( out, "Using: ", force->name(), " ", neighbor->name(), " ",
-         comm->name(), " ", binning->name(), " ", integrator->name() );
-
     // Create atoms - from LAMMPS data file or create FCC/SC lattice
     if ( system->N == 0 && input->read_data_flag == true )
     {
-        read_lammps_data_file<t_System>( input, system, comm );
+        read_lammps_data_file<t_System>( input, system, comm, out, err );
     }
     else if ( system->N == 0 )
     {
-        input->create_lattice( comm );
+        input->create_lattices( comm, out );
     }
     log( out, "Created atoms." );
+    if ( input->create_velocity_flag )
+    {
+        input->create_velocities( comm, out );
+    }
 
+    // Create long range Force class: options in longrangeforce_types/ folder
+    // Delay because tuning (within init) uses atom count, domain size
+#ifdef Cabana_ENABLE_HEFFTE
+    if ( input->lrforce_type != FORCE_NONE )
+    {
+        bool half_neigh =
+            input->lrforce_iteration_type == FORCE_ITER_NEIGH_HALF;
+        if ( !half_neigh )
+            log( err, "Full neighbor list not implemented "
+                      "for the SPME longrange solver." );
+
+        if ( input->lrforce_type == FORCE_EWALD )
+            lrforce = new ForceEwald<t_System, t_Neighbor>( system );
+        else if ( input->lrforce_type == FORCE_SPME )
+            lrforce = new ForceSPME<t_System, t_Neighbor>( system );
+        else
+            log( err, "Invalid LongRangeForceType" );
+
+        lrforce->init_coeff( input->lrforce_coeff_lines );
+        lrforce->init_longrange( system, neigh_cutoff );
+    }
+#endif
+
+    log( out, "Using: SystemVectorLength: ", CabanaMD_VECTORLENGTH, " ",
+         system->name() );
+#ifdef CabanaMD_ENABLE_NNP
+    if ( input->force_type == FORCE_NNP )
+        log( out, "Using: SystemNNPVectorLength: ", CabanaMD_VECTORLENGTH_NNP,
+             " ", force->system_name() );
+#endif
+    if ( input->lrforce_type != FORCE_NONE )
+        log( out, "Using: ", force->name(), " ", lrforce->name(), " ",
+             neighbor->name(), " ", comm->name(), " ", binning->name(), " ",
+             integrator->name() );
+    else
+        log( out, "Using: ", force->name(), " ", neighbor->name(), " ",
+             comm->name(), " ", binning->name(), " ", integrator->name() );
+
+    // Run step 0
     // Set MPI rank neighbors
     comm->create_domain_decomposition();
 
@@ -210,12 +256,15 @@ void CbnMD<t_System, t_Neighbor>::init( InputCL commandline )
 
     // Compute atom neighbors
     neighbor->create( system );
+    // TODO: option for separate long range neighbors
 
     // Compute initial forces
     system->slice_f();
     auto f = system->f;
     Cabana::deep_copy( f, 0.0 );
     force->compute( system, neighbor );
+    if ( input->lrforce_type != FORCE_NONE )
+        lrforce->compute( system, neighbor );
 
     // Scatter ghost atom forces back to original MPI rank
     //   (update force for pair_style nnp even if full neighbor list)
@@ -233,6 +282,8 @@ void CbnMD<t_System, t_Neighbor>::init( InputCL commandline )
         KinE<t_System> kine( comm );
         auto T = temp.compute( system );
         auto PE = pote.compute( system, force, neighbor ) / system->N;
+        if ( input->lrforce_type != FORCE_NONE )
+            PE += pote.compute( system, lrforce, neighbor ) / system->N;
         auto KE = kine.compute( system ) / system->N;
         if ( !_print_lammps )
         {
@@ -273,16 +324,17 @@ void CbnMD<t_System, t_Neighbor>::run()
     KinE<t_System> kine( comm );
 
     double force_time = 0;
+    double lrforce_time = 0;
     double comm_time = 0;
     double neigh_time = 0;
     double integrate_time = 0;
     double other_time = 0;
 
     double last_time = 0;
-    Kokkos::Timer timer, force_timer, comm_timer, neigh_timer, integrate_timer,
-        other_timer;
+    Kokkos::Timer timer, force_timer, lrforce_timer, comm_timer, neigh_timer,
+        integrate_timer, other_timer;
 
-    // Main timestep loop
+    // Timestep Loop
     for ( int step = 1; step <= nsteps; step++ )
     {
         // Integrate atom positions - velocity Verlet first half
@@ -331,7 +383,15 @@ void CbnMD<t_System, t_Neighbor>::run()
         force->compute( system, neighbor );
         force_time += force_timer.seconds();
 
-        // This is where Bonds, Angles, and KSpace should go eventually
+        // Compute long range force
+        if ( input->lrforce_type != FORCE_NONE )
+        {
+            lrforce_timer.reset();
+            lrforce->compute( system, neighbor );
+            lrforce_time += lrforce_timer.seconds();
+        }
+
+        // This is where Bonds, Angles, etc. should go eventually
 
         // Scatter ghost atom forces back to original MPI rank
         //   (update force for pair_style nnp even if full neighbor list)
@@ -348,12 +408,13 @@ void CbnMD<t_System, t_Neighbor>::run()
         integrate_time += integrate_timer.seconds();
 
         other_timer.reset();
-
         // Print output
         if ( step % input->thermo_rate == 0 )
         {
             auto T = temp.compute( system );
             auto PE = pote.compute( system, force, neighbor ) / system->N;
+            if ( input->lrforce_type != FORCE_NONE )
+                PE += pote.compute( system, lrforce, neighbor ) / system->N;
             auto KE = kine.compute( system ) / system->N;
 
             if ( !_print_lammps )
@@ -391,16 +452,36 @@ void CbnMD<t_System, t_Neighbor>::run()
     {
         double steps_per_sec = 1.0 * nsteps / time;
         double atom_steps_per_sec = system->N * steps_per_sec;
-        log( out, std::fixed, std::setprecision( 2 ),
-             "\n#Procs Atoms | Time T_Force T_Neigh T_Comm T_Int ",
-             "T_Other |\n", comm->num_processes(), " ", system->N, " | ", time,
-             " ", force_time, " ", neigh_time, " ", comm_time, " ",
-             integrate_time, " ", other_time, " | PERFORMANCE\n", std::fixed,
-             comm->num_processes(), " ", system->N, " | ", 1.0, " ",
-             force_time / time, " ", neigh_time / time, " ", comm_time / time,
-             " ", integrate_time / time, " ", other_time / time,
-             " | FRACTION\n\n", "#Steps/s Atomsteps/s Atomsteps/(proc*s)\n",
-             std::scientific, steps_per_sec, " ", atom_steps_per_sec, " ",
+
+        if ( input->lrforce_type == FORCE_NONE )
+        {
+            log( out, std::fixed, std::setprecision( 2 ),
+                 "\n#Procs Atoms | Time T_Force T_Neigh T_Comm T_Int ",
+                 "T_Other |" );
+            log( out, comm->num_processes(), " ", system->N, " | ", time, " ",
+                 force_time, " ", neigh_time, " ", comm_time, " ",
+                 integrate_time, " ", other_time, " | PERFORMANCE" );
+            log( out, std::fixed, comm->num_processes(), " ", system->N, " | ",
+                 1.0, " ", force_time / time, " ", neigh_time / time, " ",
+                 comm_time / time, " ", integrate_time / time, " ",
+                 other_time / time, " | FRACTION\n" );
+        }
+        else
+        {
+            log( out, std::fixed, std::setprecision( 2 ),
+                 "\n#Procs Atoms | Time T_Force T_ForceLong T_Neigh T_Comm "
+                 "T_Int T_Other |" );
+            log( out, comm->num_processes(), " ", system->N, " | ", time, " ",
+                 force_time, " ", lrforce_time, " ", neigh_time, " ", comm_time,
+                 " ", integrate_time, " ", other_time, " | PERFORMANCE" );
+            log( out, std::fixed, comm->num_processes(), " ", system->N, " | ",
+                 1.0, " ", force_time / time, " ", lrforce_time / time, " ",
+                 neigh_time / time, " ", comm_time / time, " ",
+                 integrate_time / time, " ", other_time / time,
+                 " | FRACTION\n" );
+        }
+        log( out, "#Steps/s Atomsteps/s Atomsteps/(proc*s)\n", std::scientific,
+             steps_per_sec, " ", atom_steps_per_sec, " ",
              atom_steps_per_sec / comm->num_processes() );
     }
     else
@@ -457,13 +538,6 @@ void CbnMD<t_System, t_Neighbor>::dump_binary( int step )
     err.close();
 }
 
-// TODO: 1. Add path to Reference [DONE]
-//     2. Add MPI Rank file ids in Reference [DONE]
-//     3. Move to separate class
-//     4. Add pressure to thermo output
-//     5. basis_offset [DONE]
-//     6. correctness output to file [DONE]
-
 template <class t_System, class t_Neighbor>
 void CbnMD<t_System, t_Neighbor>::check_correctness( int step )
 {
diff --git a/src/force.h b/src/force.h
index a79fce2f..ba692094 100644
--- a/src/force.h
+++ b/src/force.h
@@ -62,6 +62,7 @@ class Force
     virtual ~Force() {}
 
     virtual void init_coeff( std::vector<std::vector<std::string>> args ) = 0;
+    virtual void init_longrange( t_System *, T_X_FLOAT ){};
 
     virtual void compute( t_System *system, t_Neighbor *neighbor ) = 0;
     virtual T_F_FLOAT compute_energy( t_System *, t_Neighbor * )
diff --git a/src/inputFile.h b/src/inputFile.h
index 16e90192..d91909cf 100644
--- a/src/inputFile.h
+++ b/src/inputFile.h
@@ -150,6 +150,7 @@ class InputFile
 {
   private:
     bool timestepflag; // input timestep?
+
   public:
     InputCL commandline;
     t_System *system;
@@ -157,10 +158,12 @@ class InputFile
     bool _print_rank;
     int units_style;
     int lattice_style;
-    double lattice_constant, lattice_offset_x, lattice_offset_y,
-        lattice_offset_z;
-    int lattice_nx, lattice_ny, lattice_nz;
-    int box[6];
+    std::vector<double> lattice_constant;
+    std::vector<double> lattice_offset_x, lattice_offset_y, lattice_offset_z;
+    std::vector<int> lattice_nx, lattice_ny, lattice_nz;
+    std::vector<double> charge;
+
+    int num_lattice;
 
     char *data_file;
     int data_file_type;
@@ -180,11 +183,16 @@ class InputFile
     int comm_exchange_rate;
 
     int force_type;
+
+    int lrforce_type;
     int force_iteration_type;
+    int lrforce_iteration_type;
     int force_neigh_parallel_type;
 
     T_F_FLOAT force_cutoff;
+    T_F_FLOAT lrforce_cutoff;
     std::vector<std::vector<std::string>> force_coeff_lines;
+    std::vector<std::vector<std::string>> lrforce_coeff_lines;
 
     T_F_FLOAT neighbor_skin;
     int neighbor_type;
@@ -199,14 +207,20 @@ class InputFile
     std::string input_data_file;
     std::string output_data_file;
     bool read_data_flag = false;
+    bool create_velocity_flag = false;
     bool write_data_flag = false;
 
     InputFile( InputCL cl, t_System *s );
-    void read_file( const char *filename = NULL );
+    void read_file( std::ofstream &out, std::ofstream &err,
+                    const char *filename = NULL );
     void read_lammps_file( std::ifstream &in, std::ofstream &out,
                            std::ofstream &err );
     void check_lammps_command( std::string line, std::ofstream &err );
-    void create_lattice( Comm<t_System> *comm );
+    void create_lattices( Comm<t_System> *comm, std::ofstream &out );
+    template <class t_HostSystem>
+    void create_one_lattice( Comm<t_System> *comm, std::ofstream &out,
+                             const int type, t_HostSystem &host_system );
+    void create_velocities( Comm<t_System> *comm, std::ofstream &out );
 };
 
 #include <inputFile_impl.h>
diff --git a/src/inputFile_impl.h b/src/inputFile_impl.h
index f4edc440..1c60aa89 100644
--- a/src/inputFile_impl.h
+++ b/src/inputFile_impl.h
@@ -75,11 +75,13 @@ InputFile<t_System>::InputFile( InputCL commandline_, t_System *system_ )
     integrator_type = INTEGRATOR_NVE;
     neighbor_type = NEIGH_VERLET_2D;
     force_type = FORCE_LJ;
+    lrforce_type = FORCE_NONE;
     binning_type = BINNING_LINKEDCELL;
 
     neighbor_type = commandline.neighbor_type;
     force_iteration_type = commandline.force_iteration_type;
     force_neigh_parallel_type = commandline.force_neigh_parallel_type;
+    lrforce_iteration_type = force_iteration_type;
 
     output_file = commandline.output_file;
     error_file = commandline.error_file;
@@ -95,19 +97,10 @@ InputFile<t_System>::InputFile( InputCL commandline_, t_System *system_ )
     correctnessflag = false;
     timestepflag = false;
 
-    lattice_offset_x = 0.0;
-    lattice_offset_y = 0.0;
-    lattice_offset_z = 0.0;
-    box[0] = 0;
-    box[2] = 0;
-    box[4] = 0;
-    box[1] = 40;
-    box[3] = 40;
-    box[5] = 40;
-
     units_style = UNITS_LJ;
     lattice_style = LATTICE_FCC;
-    lattice_constant = 0.8442;
+
+    num_lattice = 0;
 
     temperature_target = 1.4;
     temperature_seed = 87287;
@@ -121,16 +114,14 @@ InputFile<t_System>::InputFile( InputCL commandline_, t_System *system_ )
     comm_exchange_rate = 20;
 
     force_cutoff = 2.5;
+
+    charge = { 0 };
 }
 
 template <class t_System>
-void InputFile<t_System>::read_file( const char *filename )
+void InputFile<t_System>::read_file( std::ofstream &out, std::ofstream &err,
+                                     const char *filename )
 {
-    // This is the first time the streams are used - overwrite any previous
-    // output
-    std::ofstream out( output_file, std::ofstream::out );
-    std::ofstream err( error_file, std::ofstream::out );
-
     if ( filename == NULL )
     {
         filename = commandline.input_file;
@@ -142,8 +133,6 @@ void InputFile<t_System>::read_file( const char *filename )
         return;
     }
     log_err( err, "Unknown input file type: ", filename );
-    err.close();
-    out.close();
 }
 
 template <class t_System>
@@ -153,12 +142,11 @@ void InputFile<t_System>::read_lammps_file( std::ifstream &in,
 {
     log( out, "\n#InputFile:\n",
          "#=========================================================" );
+
     std::string line;
     while ( std::getline( in, line ) )
-    {
         check_lammps_command( line, err );
-        log( out, line );
-    }
+
     log( out, "#=========================================================\n" );
 }
 
@@ -239,21 +227,23 @@ void InputFile<t_System>::check_lammps_command( std::string line,
     }
     if ( keyword.compare( "lattice" ) == 0 )
     {
+        int curr_latt = lattice_constant.size();
+        lattice_constant.resize( curr_latt + 1 );
         if ( words.at( 1 ).compare( "sc" ) == 0 )
         {
             known = true;
             lattice_style = LATTICE_SC;
-            lattice_constant = std::stod( words.at( 2 ) );
+            lattice_constant.at( curr_latt ) = std::stod( words.at( 2 ) );
         }
         else if ( words.at( 1 ).compare( "fcc" ) == 0 )
         {
             known = true;
             lattice_style = LATTICE_FCC;
             if ( units_style == UNITS_LJ )
-                lattice_constant = std::pow(
+                lattice_constant.at( curr_latt ) = std::pow(
                     ( 4.0 / std::stod( words.at( 2 ) ) ), ( 1.0 / 3.0 ) );
             else
-                lattice_constant = std::stod( words.at( 2 ) );
+                lattice_constant.at( curr_latt ) = std::stod( words.at( 2 ) );
         }
         else
         {
@@ -261,25 +251,38 @@ void InputFile<t_System>::check_lammps_command( std::string line,
                      "LAMMPS-Command: 'lattice' command only supports 'sc' "
                      "and 'fcc' in CabanaMD" );
         }
+        system->lattice_constant = lattice_constant;
+
+        lattice_offset_x.resize( curr_latt + 1 );
+        lattice_offset_y.resize( curr_latt + 1 );
+        lattice_offset_z.resize( curr_latt + 1 );
         if ( words.size() > 3 )
         {
             if ( words.at( 3 ).compare( "origin" ) == 0 )
             {
-                lattice_offset_x = std::stod( words.at( 4 ) );
-                lattice_offset_y = std::stod( words.at( 5 ) );
-                lattice_offset_z = std::stod( words.at( 6 ) );
+                lattice_offset_x.at( curr_latt ) = std::stod( words.at( 4 ) );
+                lattice_offset_y.at( curr_latt ) = std::stod( words.at( 5 ) );
+                lattice_offset_z.at( curr_latt ) = std::stod( words.at( 6 ) );
             }
             else
             {
-                log_err(
-                    err,
-                    "LAMMPS-Command: 'lattice' command only supports 'origin' "
-                    "additional option in CabanaMD" );
+                log_err( err, "LAMMPS-Command: 'lattice' command only supports "
+                              "'origin' additional option in CabanaMD" );
             }
         }
+        else
+        {
+            lattice_offset_x.at( curr_latt ) = 0.0;
+            lattice_offset_y.at( curr_latt ) = 0.0;
+            lattice_offset_z.at( curr_latt ) = 0.0;
+        }
     }
     if ( keyword.compare( "region" ) == 0 )
     {
+        int curr_latt = lattice_nx.size();
+        lattice_nx.resize( curr_latt + 1 );
+        lattice_ny.resize( curr_latt + 1 );
+        lattice_nz.resize( curr_latt + 1 );
         if ( words.at( 2 ).compare( "block" ) == 0 )
         {
             known = true;
@@ -293,9 +296,9 @@ void InputFile<t_System>::check_lammps_command( std::string line,
             if ( ( box[0] != 0 ) || ( box[2] != 0 ) || ( box[4] != 0 ) )
                 log_err( err, "LAMMPS-Command: region only allows for boxes "
                               "with 0,0,0 offset in CabanaMD" );
-            lattice_nx = box[1];
-            lattice_ny = box[3];
-            lattice_nz = box[5];
+            lattice_nx.at( curr_latt ) = box[1];
+            lattice_ny.at( curr_latt ) = box[3];
+            lattice_nz.at( curr_latt ) = box[5];
         }
         else
         {
@@ -311,6 +314,7 @@ void InputFile<t_System>::check_lammps_command( std::string line,
     if ( keyword.compare( "create_atoms" ) == 0 )
     {
         known = true;
+        num_lattice += 1;
     }
     if ( keyword.compare( "mass" ) == 0 )
     {
@@ -323,6 +327,22 @@ void InputFile<t_System>::check_lammps_command( std::string line,
         T_V_FLOAT mass = std::stod( words.at( 2 ) );
         Kokkos::deep_copy( mass_one, mass );
     }
+    if ( keyword.compare( "set" ) == 0 )
+    {
+        known = true;
+        if ( words.at( 1 ).compare( "type" ) == 0 and
+             words.at( 3 ).compare( "charge" ) == 0 )
+        {
+            int type = stoi( words.at( 2 ) ) - 1;
+            charge.resize( charge.size() + 1 );
+            charge.at( type ) = std::stod( words.at( 4 ) );
+        }
+        else
+        {
+            log_err( err, "LAMMPS-Command: 'set' command only supports setting "
+                          "charges by type CabanaMD\n" );
+        }
+    }
     if ( keyword.compare( "read_data" ) == 0 )
     {
         known = true;
@@ -337,11 +357,16 @@ void InputFile<t_System>::check_lammps_command( std::string line,
     }
     if ( keyword.compare( "pair_style" ) == 0 )
     {
-        if ( words.at( 1 ).compare( "lj/cut" ) == 0 )
+        if ( words.at( 1 ).compare( "lj/cut" ) == 0 or
+             words.at( 1 ).compare( "lj/cut/coul/long" ) == 0 )
         {
             known = true;
             force_type = FORCE_LJ;
             force_cutoff = std::stod( words.at( 2 ) );
+            if ( words.at( 1 ).compare( "lj/cut/coul/long" ) == 0 )
+            {
+                lrforce_cutoff = std::stod( words.at( 2 ) );
+            }
         }
         if ( words.at( 1 ).compare( "snap" ) == 0 )
         {
@@ -357,8 +382,10 @@ void InputFile<t_System>::check_lammps_command( std::string line,
             force_coeff_lines.at( 0 ) = split( line );
         }
         if ( !known )
-            log_err( err, "LAMMPS-Command: 'pair_style' command only supports "
-                          "'lj/cut' and 'nnp' style in CabanaMD" );
+            log_err(
+                err,
+                "LAMMPS-Command: 'pair_style' command only supports 'lj/cut', "
+                "'lj/cut/coul/long', and 'nnp' style in CabanaMD" );
     }
     if ( keyword.compare( "pair_coeff" ) == 0 )
     {
@@ -372,6 +399,28 @@ void InputFile<t_System>::check_lammps_command( std::string line,
             force_coeff_lines.at( nlines ) = split( line );
         }
     }
+    if ( keyword.compare( "kspace_style" ) == 0 )
+    {
+        if ( words.at( 1 ).compare( "ewald" ) == 0 )
+        {
+            known = true;
+            lrforce_type = FORCE_EWALD;
+            lrforce_coeff_lines.resize( 1 );
+            lrforce_coeff_lines.at( 0 ) = split( line );
+        }
+        else if ( words.at( 1 ).compare( "spme" ) == 0 )
+        {
+            known = true;
+            lrforce_type = FORCE_SPME;
+            lrforce_coeff_lines.resize( 1 );
+            lrforce_coeff_lines.at( 0 ) = split( line );
+        }
+        else
+        {
+            log_err( err, "LAMMPS-Command: 'kspace_style' command only "
+                          "supports 'ewald' or 'spme' in CabanaMD\n" );
+        }
+    }
     if ( keyword.compare( "velocity" ) == 0 )
     {
         known = true;
@@ -387,6 +436,7 @@ void InputFile<t_System>::check_lammps_command( std::string line,
         }
         temperature_target = std::stod( words.at( 3 ) );
         temperature_seed = std::stoi( words.at( 4 ) );
+        create_velocity_flag = true;
     }
     if ( keyword.compare( "neighbor" ) == 0 )
     {
@@ -480,27 +530,44 @@ void InputFile<t_System>::check_lammps_command( std::string line,
 }
 
 template <class t_System>
-void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
+void InputFile<t_System>::create_lattices( Comm<t_System> *comm,
+                                           std::ofstream &out )
 {
-    std::ofstream out( output_file, std::ofstream::app );
-
     t_System s = *system;
     System<Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>,
            CabanaMD_LAYOUT>
         host_system;
 
-    using h_t_mass = typename t_System::h_t_mass;
-    h_t_mass h_mass = Kokkos::create_mirror_view( s.mass );
-    Kokkos::deep_copy( h_mass, s.mass );
+    for ( int i = 0; i < num_lattice; i++ )
+        create_one_lattice( comm, out, i, host_system );
+
+    system->deep_copy( host_system );
+}
+
+template <class t_System>
+template <class t_HostSystem>
+void InputFile<t_System>::create_one_lattice( Comm<t_System> *comm,
+                                              std::ofstream &out,
+                                              const int create_type,
+                                              t_HostSystem &host_system )
+{
+    auto curr_charge = charge.at( create_type );
+    auto curr_constant = lattice_constant.at( create_type );
+    auto curr_offset_x = lattice_offset_x.at( create_type );
+    auto curr_offset_y = lattice_offset_y.at( create_type );
+    auto curr_offset_z = lattice_offset_z.at( create_type );
+    auto curr_nx = lattice_nx.at( create_type );
+    auto curr_ny = lattice_ny.at( create_type );
+    auto curr_nz = lattice_nz.at( create_type );
 
     // Create the mesh.
-    T_X_FLOAT max_x = lattice_constant * lattice_nx;
-    T_X_FLOAT max_y = lattice_constant * lattice_ny;
-    T_X_FLOAT max_z = lattice_constant * lattice_nz;
+    T_X_FLOAT max_x = curr_constant * curr_nx;
+    T_X_FLOAT max_y = curr_constant * curr_ny;
+    T_X_FLOAT max_z = curr_constant * curr_nz;
     std::array<T_X_FLOAT, 3> global_low = { 0.0, 0.0, 0.0 };
     std::array<T_X_FLOAT, 3> global_high = { max_x, max_y, max_z };
     system->create_domain( global_low, global_high );
-    s = *system;
+    t_System s = *system;
 
     auto local_mesh_lo_x = s.local_mesh_lo_x;
     auto local_mesh_lo_y = s.local_mesh_lo_y;
@@ -509,12 +576,12 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
     auto local_mesh_hi_y = s.local_mesh_hi_y;
     auto local_mesh_hi_z = s.local_mesh_hi_z;
 
-    T_INT ix_start = local_mesh_lo_x / s.global_mesh_x * lattice_nx - 0.5;
-    T_INT iy_start = local_mesh_lo_y / s.global_mesh_y * lattice_ny - 0.5;
-    T_INT iz_start = local_mesh_lo_z / s.global_mesh_z * lattice_nz - 0.5;
-    T_INT ix_end = local_mesh_hi_x / s.global_mesh_x * lattice_nx + 0.5;
-    T_INT iy_end = local_mesh_hi_y / s.global_mesh_y * lattice_ny + 0.5;
-    T_INT iz_end = local_mesh_hi_z / s.global_mesh_z * lattice_nz + 0.5;
+    T_INT ix_start = local_mesh_lo_x / s.global_mesh_x * curr_nx - 1.5;
+    T_INT iy_start = local_mesh_lo_y / s.global_mesh_y * curr_ny - 1.5;
+    T_INT iz_start = local_mesh_lo_z / s.global_mesh_z * curr_nz - 1.5;
+    T_INT ix_end = local_mesh_hi_x / s.global_mesh_x * curr_nx + 1.5;
+    T_INT iy_end = local_mesh_hi_y / s.global_mesh_y * curr_ny + 1.5;
+    T_INT iz_end = local_mesh_hi_z / s.global_mesh_z * curr_nz + 1.5;
 
     // Create Simple Cubic Lattice
     if ( lattice_style == LATTICE_SC )
@@ -523,13 +590,13 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
 
         for ( T_INT iz = iz_start; iz <= iz_end; iz++ )
         {
-            T_FLOAT ztmp = lattice_constant * ( iz + lattice_offset_z );
+            T_FLOAT ztmp = curr_constant * ( iz + curr_offset_z );
             for ( T_INT iy = iy_start; iy <= iy_end; iy++ )
             {
-                T_FLOAT ytmp = lattice_constant * ( iy + lattice_offset_y );
+                T_FLOAT ytmp = curr_constant * ( iy + curr_offset_y );
                 for ( T_INT ix = ix_start; ix <= ix_end; ix++ )
                 {
-                    T_FLOAT xtmp = lattice_constant * ( ix + lattice_offset_x );
+                    T_FLOAT xtmp = curr_constant * ( ix + curr_offset_x );
                     if ( ( xtmp >= local_mesh_lo_x ) &&
                          ( ytmp >= local_mesh_lo_y ) &&
                          ( ztmp >= local_mesh_lo_z ) &&
@@ -542,9 +609,7 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                 }
             }
         }
-        system->N_local = n;
-        system->N = n;
-        system->resize( n );
+        system->resize( system->N_local + n );
         system->slice_all();
         s = *system;
         auto x = s.x;
@@ -553,16 +618,16 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
         auto type = s.type;
         auto q = s.q;
 
-        n = 0;
+        n = system->N_local;
         for ( T_INT iz = iz_start; iz <= iz_end; iz++ )
         {
-            T_FLOAT ztmp = lattice_constant * ( iz + lattice_offset_z );
+            T_FLOAT ztmp = curr_constant * ( iz + curr_offset_z );
             for ( T_INT iy = iy_start; iy <= iy_end; iy++ )
             {
-                T_FLOAT ytmp = lattice_constant * ( iy + lattice_offset_y );
+                T_FLOAT ytmp = curr_constant * ( iy + curr_offset_y );
                 for ( T_INT ix = ix_start; ix <= ix_end; ix++ )
                 {
-                    T_FLOAT xtmp = lattice_constant * ( ix + lattice_offset_x );
+                    T_FLOAT xtmp = curr_constant * ( ix + curr_offset_x );
                     if ( ( xtmp >= local_mesh_lo_x ) &&
                          ( ytmp >= local_mesh_lo_y ) &&
                          ( ztmp >= local_mesh_lo_z ) &&
@@ -573,13 +638,16 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                         x( n, 0 ) = xtmp;
                         x( n, 1 ) = ytmp;
                         x( n, 2 ) = ztmp;
-                        type( n ) = rand() % s.ntypes;
+                        type( n ) = create_type;
                         id( n ) = n + 1;
+                        q( n ) = curr_charge;
                         n++;
                     }
                 }
             }
         }
+        system->N_local = n;
+        system->N = n;
         comm->reduce_int( &system->N, 1 );
 
         // Make ids unique over all processes
@@ -608,9 +676,9 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
 
         for ( int i = 0; i < 4; i++ )
         {
-            basis[i][0] += lattice_offset_x;
-            basis[i][1] += lattice_offset_y;
-            basis[i][2] += lattice_offset_z;
+            basis[i][0] += curr_offset_x;
+            basis[i][1] += curr_offset_y;
+            basis[i][2] += curr_offset_z;
         }
 
         T_INT n = 0;
@@ -624,11 +692,11 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                     for ( int k = 0; k < 4; k++ )
                     {
                         T_FLOAT xtmp =
-                            lattice_constant * ( 1.0 * ix + basis[k][0] );
+                            curr_constant * ( 1.0 * ix + basis[k][0] );
                         T_FLOAT ytmp =
-                            lattice_constant * ( 1.0 * iy + basis[k][1] );
+                            curr_constant * ( 1.0 * iy + basis[k][1] );
                         T_FLOAT ztmp =
-                            lattice_constant * ( 1.0 * iz + basis[k][2] );
+                            curr_constant * ( 1.0 * iz + basis[k][2] );
                         if ( ( xtmp >= local_mesh_lo_x ) &&
                              ( ytmp >= local_mesh_lo_y ) &&
                              ( ztmp >= local_mesh_lo_z ) &&
@@ -642,19 +710,16 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                 }
             }
         }
+        system->resize( system->N_local + n );
 
-        system->N_local = n;
-        system->N = n;
-        system->resize( n );
-        system->slice_all();
-
-        host_system.resize( n );
+        host_system.resize( system->N_local + n );
         host_system.slice_all();
         auto h_x = host_system.x;
         auto h_id = host_system.id;
         auto h_type = host_system.type;
+        auto h_q = host_system.q;
 
-        n = 0;
+        n = system->N_local;
         for ( T_INT iz = iz_start; iz <= iz_end; iz++ )
         {
             for ( T_INT iy = iy_start; iy <= iy_end; iy++ )
@@ -664,11 +729,11 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                     for ( int k = 0; k < 4; k++ )
                     {
                         T_FLOAT xtmp =
-                            lattice_constant * ( 1.0 * ix + basis[k][0] );
+                            curr_constant * ( 1.0 * ix + basis[k][0] );
                         T_FLOAT ytmp =
-                            lattice_constant * ( 1.0 * iy + basis[k][1] );
+                            curr_constant * ( 1.0 * iy + basis[k][1] );
                         T_FLOAT ztmp =
-                            lattice_constant * ( 1.0 * iz + basis[k][2] );
+                            curr_constant * ( 1.0 * iz + basis[k][2] );
                         if ( ( xtmp >= local_mesh_lo_x ) &&
                              ( ytmp >= local_mesh_lo_y ) &&
                              ( ztmp >= local_mesh_lo_z ) &&
@@ -679,14 +744,17 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
                             h_x( n, 0 ) = xtmp;
                             h_x( n, 1 ) = ytmp;
                             h_x( n, 2 ) = ztmp;
-                            h_type( n ) = rand() % s.ntypes;
+                            h_type( n ) = create_type;
                             h_id( n ) = n + 1;
+                            h_q( n ) = curr_charge;
                             n++;
                         }
                     }
                 }
             }
         }
+        system->N_local = n;
+        system->N = n;
         comm->reduce_int( &system->N, 1 );
 
         // Make ids unique over all processes
@@ -694,22 +762,31 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
         comm->scan_int( &N_local_offset, 1 );
         for ( T_INT i = 0; i < n; i++ )
             h_id( i ) += N_local_offset - n;
-
-        // Already sliced
-        s = *system;
-        auto x = s.x;
-        auto id = s.id;
-        auto type = s.type;
-        system->deep_copy( host_system );
     }
     log( out, "Atoms: ", system->N, " ", system->N_local );
+}
 
+template <class t_System>
+void InputFile<t_System>::create_velocities( Comm<t_System> *comm,
+                                             std::ofstream &out )
+{
     // Initialize velocity using the equivalent of the LAMMPS
     // velocity geom option, i.e. uniform random kinetic energies.
     // zero out momentum of the whole system afterwards, to eliminate
     // drift (bad for energy statistics)
 
-    // already sliced
+    t_System s = *system;
+    System<Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>,
+           CabanaMD_LAYOUT>
+        host_system;
+    host_system.resize( system->N_local );
+    host_system.deep_copy( s );
+    host_system.slice_all();
+
+    using h_t_mass = typename t_System::h_t_mass;
+    h_t_mass h_mass = Kokkos::create_mirror_view( s.mass );
+    Kokkos::deep_copy( h_mass, s.mass );
+
     auto h_x = host_system.x;
     auto h_v = host_system.v;
     auto h_q = host_system.q;
@@ -735,8 +812,6 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
         h_v( i, 1 ) = vy / sqrt( mass_i );
         h_v( i, 2 ) = vz / sqrt( mass_i );
 
-        h_q( i ) = 0.0;
-
         total_mass += mass_i;
         total_momentum_x += mass_i * h_v( i, 0 );
         total_momentum_y += mass_i * h_v( i, 1 );
@@ -773,5 +848,5 @@ void InputFile<t_System>::create_lattice( Comm<t_System> *comm )
     }
     system->deep_copy( host_system );
 
-    out.close();
+    log( out, "Created velocities" );
 }
diff --git a/src/longrange_force_types/force_ewald_cabana_neigh.h b/src/longrange_force_types/force_ewald_cabana_neigh.h
new file mode 100644
index 00000000..2315d99a
--- /dev/null
+++ b/src/longrange_force_types/force_ewald_cabana_neigh.h
@@ -0,0 +1,69 @@
+/****************************************************************************
+ * Copyright (c) 2018-2019 by the Cabana authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#ifndef FORCE_EWALD_CABANA_NEIGH_H
+#define FORCE_EWALD_CABANA_NEIGH_H
+
+#include <Cabana_Core.hpp>
+#include <comm_mpi.h>
+#include <force.h>
+#include <system.h>
+#include <types.h>
+
+#include <assert.h>
+#include <string>
+#include <vector>
+
+template <class t_System, class t_Neighbor>
+class ForceEwald : public Force<t_System, t_Neighbor>
+{
+  private:
+    int N_local, ntypes;
+    typename t_System::t_x x;
+    typename t_System::t_f f;
+    typename t_System::t_f::atomic_access_slice f_a;
+    typename t_System::t_type type;
+
+    typedef typename t_Neighbor::t_neigh_list t_neigh_list;
+
+    T_F_FLOAT _alpha;
+    T_X_FLOAT _r_max;
+    T_X_FLOAT _k_max;
+    T_F_FLOAT accuracy;
+    T_INT _k_int;
+    T_INT n_kvec;
+    T_X_FLOAT lx, ly, lz;
+
+    // dielectric constant
+    T_F_FLOAT _eps_r = 1.0; // Assume 1 for now (vacuum)
+
+    MPI_Comm cart_comm;
+
+    using exe_space = typename t_System::execution_space;
+    using device_type = typename t_System::device_type;
+
+    Kokkos::View<T_F_FLOAT *, device_type> U_trigonometric;
+
+  public:
+    ForceEwald( t_System *system );
+
+    void init_coeff( std::vector<std::vector<std::string>> args ) override;
+    void init_longrange( t_System *system, T_X_FLOAT r_max ) override;
+    void tune( t_System *system );
+
+    void compute( t_System *system, t_Neighbor *neighbor ) override;
+    T_F_FLOAT compute_energy( t_System *system, t_Neighbor *neighbor ) override;
+
+    const char *name() override;
+};
+
+#include <force_ewald_cabana_neigh_impl.h>
+#endif
diff --git a/src/longrange_force_types/force_ewald_cabana_neigh_impl.h b/src/longrange_force_types/force_ewald_cabana_neigh_impl.h
new file mode 100644
index 00000000..5047813b
--- /dev/null
+++ b/src/longrange_force_types/force_ewald_cabana_neigh_impl.h
@@ -0,0 +1,370 @@
+/****************************************************************************
+ * Copyright (c) 2018-2019 by the Cabana authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+/* Ewald solver */
+
+template <class t_System, class t_Neighbor>
+ForceEwald<t_System, t_Neighbor>::ForceEwald( t_System *system )
+    : Force<t_System, t_Neighbor>( system )
+{
+    _alpha = 0.0;
+    _k_max = 0.0;
+    _r_max = 0.0;
+
+    std::vector<int> dims( 3 );
+    std::vector<int> periods( 3 );
+
+    dims.at( 0 ) = dims.at( 1 ) = dims.at( 2 ) = 0;
+    periods.at( 0 ) = periods.at( 1 ) = periods.at( 2 ) = 1;
+
+    int n_ranks;
+    MPI_Comm_size( MPI_COMM_WORLD, &n_ranks );
+
+    MPI_Dims_create( n_ranks, 3, dims.data() );
+
+    MPI_Cart_create( MPI_COMM_WORLD, 3, dims.data(), periods.data(), 0,
+                     &cart_comm );
+
+    int rank;
+    MPI_Comm_rank( cart_comm, &rank );
+
+    std::vector<int> loc_coords( 3 );
+    std::vector<int> cart_dims( 3 );
+    std::vector<int> cart_periods( 3 );
+    MPI_Cart_get( cart_comm, 3, cart_dims.data(), cart_periods.data(),
+                  loc_coords.data() );
+
+    std::vector<int> neighbor_low( 3 );
+    std::vector<int> neighbor_up( 3 );
+    for ( int dim = 0; dim < 3; ++dim )
+        MPI_Cart_shift( cart_comm, dim, 1, &neighbor_low.at( dim ),
+                        &neighbor_up.at( dim ) );
+}
+
+// TODO: allow user to specify parameters
+template <class t_System, class t_Neighbor>
+void ForceEwald<t_System, t_Neighbor>::init_coeff(
+    std::vector<std::vector<std::string>> args )
+{
+    // This is the kspace_style line (not coeff), so there's only one
+    accuracy = std::stod( args.at( 0 ).at( 2 ) );
+}
+
+template <class t_System, class t_Neighbor>
+void ForceEwald<t_System, t_Neighbor>::init_longrange( t_System *system,
+                                                       T_X_FLOAT r_max )
+{
+    _r_max = r_max;
+    tune( system );
+
+    _k_int = std::ceil( _k_max ) + 1;
+    n_kvec = ( 2 * _k_int + 1 ) * ( 2 * _k_int + 1 ) * ( 2 * _k_int + 1 );
+    U_trigonometric = Kokkos::View<T_F_FLOAT *, device_type>(
+        "ForceEwald::U_trig", 2 * n_kvec );
+}
+
+template <class t_System, class t_Neighbor>
+void ForceEwald<t_System, t_Neighbor>::tune( t_System *system )
+{
+    if ( system->global_mesh_x != system->global_mesh_y or
+         system->global_mesh_x != system->global_mesh_z )
+        throw std::runtime_error(
+            "Ewald needs symmetric system size for now." );
+    lx = system->global_mesh_x;
+    ly = system->global_mesh_y;
+    lz = system->global_mesh_z;
+
+    // Fincham 1994, Optimisation of the Ewald Sum for Large Systems
+    // only valid for cubic systems (needs adjustement for non-cubic systems)
+    auto p = -log( accuracy );
+    _k_max = 2 * p / _r_max;
+    _alpha = sqrt( p ) / _r_max;
+}
+
+template <class t_System, class t_Neighbor>
+void ForceEwald<t_System, t_Neighbor>::compute( t_System *system,
+                                                t_Neighbor *neighbor )
+{
+    // Per-atom slices
+    system->slice_force();
+    auto x = system->x;
+    auto f = system->f;
+    auto type = system->type;
+    system->slice_q();
+    auto q = system->q;
+
+    // Neighbor list
+    auto neigh_list = neighbor->get();
+
+    auto N_local = system->N_local;
+    auto alpha = _alpha;
+    auto k_int = _k_int;
+
+    // In order to compute the k-space contribution in parallel
+    // first the following sums need to be created for each
+    // k-vector:
+    //              sum(1<=i<=N_part) sin/cos (dot(k,r_i))
+    // This can be achieved by computing partial sums on each
+    // MPI process, reducing them over all processes and
+    // afterward using the pre-computed values to compute
+    // the forces and potentials acting on the particles
+    // in parallel independently again.
+
+    Kokkos::deep_copy( U_trigonometric, 0.0 );
+
+    // Compute partial sums
+    auto kspace_partial_sums = KOKKOS_LAMBDA( const int idx )
+    {
+        auto qi = q( idx );
+
+        for ( int kz = -k_int; kz <= k_int; ++kz )
+        {
+            // compute wave vector component
+            auto _kz = 2.0 * PI / lz * (T_X_FLOAT)kz;
+            for ( int ky = -k_int; ky <= k_int; ++ky )
+            {
+                // compute wave vector component
+                auto _ky = 2.0 * PI / ly * (T_X_FLOAT)ky;
+                for ( int kx = -k_int; kx <= k_int; ++kx )
+                {
+                    // no values required for the central box
+                    if ( kx == 0 && ky == 0 && kz == 0 )
+                        continue;
+                    // compute index in contribution array
+                    int kidx =
+                        ( kz + k_int ) * ( 2 * k_int + 1 ) * ( 2 * k_int + 1 ) +
+                        ( ky + k_int ) * ( 2 * k_int + 1 ) + ( kx + k_int );
+                    // compute wave vector component
+                    auto _kx = 2.0 * PI / lx * (T_X_FLOAT)kx;
+                    // compute dot product with local particle and wave
+                    // vector
+                    auto kr = _kx * x( idx, 0 ) + _ky * x( idx, 1 ) +
+                              _kz * x( idx, 2 );
+                    // add contributions
+                    Kokkos::atomic_add( &U_trigonometric( 2 * kidx ),
+                                        qi * cos( kr ) );
+                    Kokkos::atomic_add( &U_trigonometric( 2 * kidx + 1 ),
+                                        qi * sin( kr ) );
+                }
+            }
+        }
+    };
+    Kokkos::RangePolicy<exe_space> policy( 0, N_local );
+    Kokkos::parallel_for( "ForceEwald::KspacePartial", policy,
+                          kspace_partial_sums );
+    Kokkos::fence();
+
+    // reduce the partial results
+    MPI_Allreduce( MPI_IN_PLACE, U_trigonometric.data(), 2 * n_kvec, MPI_DOUBLE,
+                   MPI_SUM, cart_comm );
+
+    const double ELECTRON_CHARGE = 1.60217662E-19; // electron charge in
+                                                   // Coulombs
+    const double EPS_0 =
+        8.8541878128E-22; // permittivity of free space in Farads/Angstrom
+    const double ANGSTROMS = 1E-10; // convert meters to Angstroms
+    const double FORCE_CONVERSION_FACTOR =
+        ANGSTROMS * ELECTRON_CHARGE / ( 4.0 * PI * EPS_0 );
+
+    auto kspace_force = KOKKOS_LAMBDA( const int idx )
+    {
+        T_X_FLOAT k[3];
+
+        auto qi = q( idx );
+
+        for ( int kz = -k_int; kz <= k_int; ++kz )
+        {
+            // compute wave vector component
+            k[2] = 2.0 * PI / lz * (T_X_FLOAT)kz;
+            for ( int ky = -k_int; ky <= k_int; ++ky )
+            {
+                // compute wave vector component
+                k[1] = 2.0 * PI / ly * (T_X_FLOAT)ky;
+                for ( int kx = -k_int; kx <= k_int; ++kx )
+                {
+                    // no values required for the central box
+                    if ( kx == 0 && ky == 0 && kz == 0 )
+                        continue;
+                    // compute index in contribution array
+                    int kidx =
+                        ( kz + k_int ) * ( 2 * k_int + 1 ) * ( 2 * k_int + 1 ) +
+                        ( ky + k_int ) * ( 2 * k_int + 1 ) + ( kx + k_int );
+                    // compute wave vector component
+                    k[0] = 2.0 * PI / lx * (T_X_FLOAT)kx;
+                    // compute dot product of wave vector with itself
+                    auto kk = k[0] * k[0] + k[1] * k[1] + k[2] * k[2];
+
+                    // compute dot product with local particle and wave
+                    // vector
+                    auto kr = k[0] * x( idx, 0 ) + k[1] * x( idx, 1 ) +
+                              k[2] * x( idx, 2 );
+
+                    // coefficient dependent on wave vector
+                    auto k_coeff = exp( -kk / ( 4 * alpha * alpha ) ) / kk;
+
+                    for ( int dim = 0; dim < 3; ++dim )
+                        f( idx, dim ) +=
+                            k_coeff * 2.0 * qi * k[dim] *
+                            FORCE_CONVERSION_FACTOR *
+                            ( U_trigonometric( 2 * kidx + 1 ) * cos( kr ) -
+                              U_trigonometric( 2 * kidx ) * sin( kr ) );
+                }
+            }
+        }
+    };
+    Kokkos::parallel_for( "ForceEwald::Kspace", policy, kspace_force );
+    Kokkos::fence();
+
+    auto realspace_force = KOKKOS_LAMBDA( const int idx )
+    {
+        int num_n =
+            Cabana::NeighborList<t_neigh_list>::numNeighbor( neigh_list, idx );
+
+        auto rx = x( idx, 0 );
+        auto ry = x( idx, 1 );
+        auto rz = x( idx, 2 );
+        auto qi = q( idx );
+
+        for ( int ij = 0; ij < num_n; ++ij )
+        {
+            int j = Cabana::NeighborList<t_neigh_list>::getNeighbor( neigh_list,
+                                                                     idx, ij );
+            auto dx = x( j, 0 ) - rx;
+            auto dy = x( j, 1 ) - ry;
+            auto dz = x( j, 2 ) - rz;
+            auto d = sqrt( dx * dx + dy * dy + dz * dz );
+            auto qj = q( j );
+
+            // force computation
+            auto f_fact = qi * qj * FORCE_CONVERSION_FACTOR *
+                          ( 2.0 * sqrt( alpha / PI ) * exp( -alpha * d * d ) +
+                            erfc( sqrt( alpha ) * d ) ) /
+                          ( d * d );
+            Kokkos::atomic_add( &f( idx, 0 ), f_fact * dx );
+            Kokkos::atomic_add( &f( idx, 1 ), f_fact * dy );
+            Kokkos::atomic_add( &f( idx, 2 ), f_fact * dz );
+            Kokkos::atomic_add( &f( j, 0 ), -f_fact * dx );
+            Kokkos::atomic_add( &f( j, 1 ), -f_fact * dy );
+            Kokkos::atomic_add( &f( j, 2 ), -f_fact * dz );
+        }
+    };
+    Kokkos::parallel_for( policy, realspace_force );
+    Kokkos::fence();
+}
+
+template <class t_System, class t_Neighbor>
+T_V_FLOAT
+ForceEwald<t_System, t_Neighbor>::compute_energy( t_System *system,
+                                                  t_Neighbor *neighbor )
+{
+    auto N_local = system->N_local;
+    auto alpha = _alpha;
+    auto k_int = _k_int;
+
+    // Per-atom slices
+    system->slice_x();
+    auto x = system->x;
+    system->slice_q();
+    auto q = system->q;
+
+    // Neighbor list
+    auto neigh_list = neighbor->get();
+
+    const double ELECTRON_CHARGE = 1.60217662E-19; // electron charge in
+                                                   // Coulombs
+    const double EPS_0 =
+        8.8541878128E-22; // permittivity of free space in Farads/Angstrom
+    const double ENERGY_CONVERSION_FACTOR =
+        ELECTRON_CHARGE / ( 4.0 * PI * EPS_0 );
+
+    T_V_FLOAT energy_k = 0.0;
+
+    auto coeff = 4.0 * PI / ( lx * ly * lz );
+    T_X_FLOAT k[3];
+
+    for ( int kz = -k_int; kz <= k_int; ++kz )
+    {
+        // compute wave vector component
+        k[2] = 2.0 * PI / lz * (T_X_FLOAT)kz;
+        for ( int ky = -k_int; ky <= k_int; ++ky )
+        {
+            // compute wave vector component
+            k[1] = 2.0 * PI / ly * (T_X_FLOAT)ky;
+            for ( int kx = -k_int; kx <= k_int; ++kx )
+            {
+                // no values required for the central box
+                if ( kx == 0 && ky == 0 && kz == 0 )
+                    continue;
+                // compute index in contribution array
+                int kidx =
+                    ( kz + k_int ) * ( 2 * k_int + 1 ) * ( 2 * k_int + 1 ) +
+                    ( ky + k_int ) * ( 2 * k_int + 1 ) + ( kx + k_int );
+                // compute wave vector component
+                k[0] = 2.0 * PI / lx * (T_X_FLOAT)kx;
+                // compute dot product of wave vector with itself
+                auto kk = k[0] * k[0] + k[1] * k[1] + k[2] * k[2];
+
+                // coefficient dependent on wave vector
+                auto k_coeff = exp( -kk / ( 4 * alpha * alpha ) ) / kk;
+
+                // contribution to potential energy
+                energy_k += coeff * k_coeff *
+                            ( U_trigonometric( 2 * kidx ) *
+                                  U_trigonometric( 2 * kidx ) +
+                              U_trigonometric( 2 * kidx + 1 ) *
+                                  U_trigonometric( 2 * kidx + 1 ) ) *
+                            ENERGY_CONVERSION_FACTOR;
+            }
+        }
+    }
+    energy_k *= N_local;
+
+    T_V_FLOAT energy_r = 0.0;
+    auto realspace_potential = KOKKOS_LAMBDA( const int idx, T_V_FLOAT &PE )
+    {
+        int num_n =
+            Cabana::NeighborList<t_neigh_list>::numNeighbor( neigh_list, idx );
+        auto rx = x( idx, 0 );
+        auto ry = x( idx, 1 );
+        auto rz = x( idx, 2 );
+        auto qi = q( idx );
+
+        for ( int ij = 0; ij < num_n; ++ij )
+        {
+            int j = Cabana::NeighborList<t_neigh_list>::getNeighbor( neigh_list,
+                                                                     idx, ij );
+            auto dx = x( j, 0 ) - rx;
+            auto dy = x( j, 1 ) - ry;
+            auto dz = x( j, 2 ) - rz;
+            auto d = sqrt( dx * dx + dy * dy + dz * dz );
+            auto qj = q( j );
+
+            PE += ENERGY_CONVERSION_FACTOR * qi * qj * erfc( alpha * d ) / d;
+        }
+
+        // self-energy contribution
+        PE += -alpha / PI_SQRT * qi * qi * ENERGY_CONVERSION_FACTOR;
+    };
+    Kokkos::RangePolicy<exe_space> policy( 0, N_local );
+    Kokkos::parallel_reduce( "ForceEwald::RealSpacePE", policy,
+                             realspace_potential, energy_r );
+    Kokkos::fence();
+
+    // Not including dipole correction (usually unnecessary)
+    T_V_FLOAT energy = energy_k + energy_r;
+    return energy;
+}
+
+template <class t_System, class t_Neighbor>
+const char *ForceEwald<t_System, t_Neighbor>::name()
+{
+    return "LongRangeForce:Ewald";
+}
diff --git a/src/longrange_force_types/force_spme_cabana_neigh.h b/src/longrange_force_types/force_spme_cabana_neigh.h
new file mode 100644
index 00000000..fed6a02c
--- /dev/null
+++ b/src/longrange_force_types/force_spme_cabana_neigh.h
@@ -0,0 +1,84 @@
+/****************************************************************************
+ * Copyright (c) 2018-2019 by the Cabana authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#ifndef FORCE_SPME_CABANA_NEIGH_H
+#define FORCE_SPME_CABANA_NEIGH_H
+
+#include <comm_mpi.h>
+#include <force.h>
+#include <system.h>
+#include <types.h>
+
+#include <Cabana_Core.hpp>
+#include <Cajita.hpp>
+
+#include <assert.h>
+#include <string>
+#include <vector>
+
+template <class t_System, class t_Neighbor>
+class ForceSPME : public Force<t_System, t_Neighbor>
+{
+  private:
+    int N_local, ntypes;
+    typename t_System::t_x x;
+    typename t_System::t_f f;
+    typename t_System::t_f::atomic_access_slice f_a;
+    typename t_System::t_type type;
+
+    typedef typename t_Neighbor::t_neigh_list t_neigh_list;
+
+    T_F_FLOAT _alpha;
+    T_X_FLOAT _r_max;
+    T_X_FLOAT _k_max;
+    T_F_FLOAT accuracy;
+    T_INT _k_int;
+    T_INT n_kvec;
+
+    // dielectric constant
+    T_X_FLOAT _eps_r = 1.0; // Assume 1 for now (vacuum)
+
+    using exe_space = typename t_System::execution_space;
+    using mem_space = typename t_System::memory_space;
+    using device_type = typename t_System::device_type;
+
+  public:
+    std::shared_ptr<Cajita::LocalGrid<Cajita::UniformMesh<double>>> local_grid;
+    std::shared_ptr<Cajita::Array<double, Cajita::Node,
+                                  Cajita::UniformMesh<double>, device_type>>
+        Q;
+    std::shared_ptr<Cajita::Halo<mem_space>> Q_halo;
+    std::shared_ptr<Cajita::Array<double, Cajita::Node,
+                                  Cajita::UniformMesh<double>, device_type>>
+        BC_array;
+    std::shared_ptr<Cajita::Array<double, Cajita::Node,
+                                  Cajita::UniformMesh<double>, device_type>>
+        Qcomplex;
+
+    ForceSPME( t_System *system );
+
+    void init_coeff( std::vector<std::vector<std::string>> args ) override;
+    void init_longrange( t_System *system, T_X_FLOAT r_max ) override;
+    void tune( t_System *system );
+    void create_mesh( t_System *system );
+
+    double oneDspline( double x );
+    double oneDsplinederiv( double x );
+    double oneDeuler( int k, int meshwidth );
+
+    void compute( t_System *system, t_Neighbor *neighbor ) override;
+    T_F_FLOAT compute_energy( t_System *system, t_Neighbor *neighbor ) override;
+
+    const char *name() override;
+};
+
+#include <force_spme_cabana_neigh_impl.h>
+#endif
diff --git a/src/longrange_force_types/force_spme_cabana_neigh_impl.h b/src/longrange_force_types/force_spme_cabana_neigh_impl.h
new file mode 100644
index 00000000..69467fa6
--- /dev/null
+++ b/src/longrange_force_types/force_spme_cabana_neigh_impl.h
@@ -0,0 +1,467 @@
+/****************************************************************************
+ * Copyright (c) 2018-2019 by the Cabana authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the Cabana library. Cabana is distributed under a   *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in    *
+ * the top-level directory.                                                 *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#include <Cajita.hpp>
+#ifdef Cabana_ENABLE_Cuda
+#include <cufft.h>
+#include <cufftw.h>
+#else
+#include <fftw3.h>
+#endif
+
+#include <mpi.h>
+
+#include <string>
+
+/* Smooth particle mesh Ewald (SPME) solver
+ * - This method, from Essman et al. (1995) computes long-range Coulombic forces
+ *   with O(nlogN) scaling by using 3D FFT and interpolation to a mesh for the
+ *   reciprocal space part of the Ewald sum.
+ */
+
+template <class t_System, class t_Neighbor>
+ForceSPME<t_System, t_Neighbor>::ForceSPME( t_System *system )
+    : Force<t_System, t_Neighbor>( system )
+{
+    _alpha = 0.0;
+    _k_max = 0.0;
+    _r_max = 0.0;
+}
+
+// TODO: allow user to specify parameters
+template <class t_System, class t_Neighbor>
+void ForceSPME<t_System, t_Neighbor>::init_coeff(
+    std::vector<std::vector<std::string>> args )
+{
+    // This is the kspace_style line (not coeff), so there's only one
+    accuracy = std::stod( args.at( 0 ).at( 2 ) );
+}
+
+template <class t_System, class t_Neighbor>
+void ForceSPME<t_System, t_Neighbor>::init_longrange( t_System *system,
+                                                      double r_max )
+{
+    _r_max = r_max;
+    tune( system );
+    create_mesh( system );
+}
+
+// Tune to a given accuracy
+template <class t_System, class t_Neighbor>
+void ForceSPME<t_System, t_Neighbor>::tune( t_System *system )
+{
+    if ( system->global_mesh_x != system->global_mesh_y or
+         system->global_mesh_x != system->global_mesh_z )
+        throw std::runtime_error( "SPME needs symmetric system size for now." );
+
+    // Fincham 1994, Optimisation of the Ewald Sum for Large Systems
+    // only valid for cubic systems (needs adjustement for non-cubic systems)
+    auto p = -log( accuracy );
+    _k_max = 2 * p / _r_max;
+    _alpha = sqrt( p ) / _r_max;
+}
+
+// Compute a 1D cubic cardinal B-spline value, used in spreading charge to mesh
+//   Given the distance from the particle (x) in units of mesh spaces, this
+//   computes the fraction of that charge to place at a mesh point x mesh spaces
+//   away The cubic B-spline used here is shifted so that it is symmetric about
+//   zero All cubic B-splines are smooth functions that go to zero and are
+//   defined piecewise
+// TODO: replace use of this with Cajita functions
+template <class t_System, class t_Neighbor>
+KOKKOS_INLINE_FUNCTION double
+ForceSPME<t_System, t_Neighbor>::oneDspline( double x )
+{
+    if ( x >= 0.0 and x < 1.0 )
+    {
+        return ( 1.0 / 6.0 ) * x * x * x;
+    }
+    else if ( x >= 1.0 and x <= 2.0 )
+    {
+        return -( 1.0 / 2.0 ) * x * x * x + 2.0 * x * x - 2.0 * x +
+               ( 2.0 / 3.0 );
+    }
+    // Using the symmetry here, only need to define function between 0 and 2
+    // Beware: This means all input to this function should be made positive
+    {
+        return 0.0; // Zero if distance is >= 2 mesh spacings
+    }
+}
+
+// Compute derivative of 1D cubic cardinal B-spline
+// TODO: replace use of this with Cajita functions
+template <class t_System, class t_Neighbor>
+KOKKOS_INLINE_FUNCTION double
+ForceSPME<t_System, t_Neighbor>::oneDsplinederiv( double origx )
+{
+    double x = 2.0 - std::abs( origx );
+    double forcedir = 1.0;
+    if ( origx < 0.0 )
+    {
+        forcedir = -1.0;
+    }
+    if ( x >= 0.0 and x < 1.0 )
+    {
+        return ( 1.0 / 2.0 ) * x * x * forcedir;
+    }
+    else if ( x >= 1.0 and x <= 2.0 )
+    {
+        return ( -( 3.0 / 2.0 ) * x * x + 4.0 * x - 2.0 ) * forcedir;
+    }
+    // Using the symmetry here, only need to define function between 0 and 2
+    // Beware: This means all input to this function should be made positive
+    else
+    {
+        return 0.0; // Zero if distance is >= 2 mesh spacings
+    }
+}
+
+// Compute a 1-D Euler spline. This function is part of the "lattice structure
+// factor" and is given by:
+//   b(k, meshwidth) = exp(2*PI*i*3*k/meshwidth) / SUM_{l=0,2}(1Dspline(l+1) *
+//   exp(2*PI*i*k*l/meshwidth)) when using a non-shifted cubic B-spline in the
+//   charge spread, where meshwidth is the number of mesh points in that
+//   dimension and k is the scaled fractional coordinate
+template <class t_System, class t_Neighbor>
+KOKKOS_INLINE_FUNCTION double
+ForceSPME<t_System, t_Neighbor>::oneDeuler( int k, int meshwidth )
+{
+    double denomreal = 0.0;
+    double denomimag = 0.0;
+    // Compute the denominator sum first, splitting the complex exponential into
+    // sin and cos
+    for ( int l = 0; l < 3; l++ )
+    {
+        denomreal +=
+            ForceSPME::oneDspline( fmin( 4.0 - ( l + 1.0 ), l + 1.0 ) ) *
+            cos( 2.0 * PI * double( k ) * l / double( meshwidth ) );
+        denomimag +=
+            ForceSPME::oneDspline( fmin( 4.0 - ( l + 1.0 ), l + 1.0 ) ) *
+            sin( 2.0 * PI * double( k ) * l / double( meshwidth ) );
+    }
+    // Compute the numerator, again splitting the complex exponential
+    double numreal = cos( 2.0 * PI * 3.0 * double( k ) / double( meshwidth ) );
+    double numimag = sin( 2.0 * PI * 3.0 * double( k ) / double( meshwidth ) );
+    // Returning the norm of the 1-D Euler spline
+    return ( numreal * numreal + numimag * numimag ) /
+           ( denomreal * denomreal + denomimag * denomimag );
+}
+
+// Create uniform mesh for SPME method
+template <class t_System, class t_Neighbor>
+void ForceSPME<t_System, t_Neighbor>::create_mesh( t_System *system )
+{
+    // TODO: This should be configurable
+    double cell_size = system->global_mesh_x / 10.0;
+    std::array<double, 3> global_low_corner = {system->global_mesh_lo_x,
+                                               system->global_mesh_lo_y,
+                                               system->global_mesh_lo_z};
+    std::array<double, 3> global_high_corner = {system->global_mesh_hi_x,
+                                                system->global_mesh_hi_y,
+                                                system->global_mesh_hi_z};
+
+    // Create the global mesh, global grid, and local grid.
+    auto uniform_global_mesh = Cajita::createUniformGlobalMesh(
+        global_low_corner, global_high_corner, cell_size );
+    Cajita::UniformDimPartitioner partitioner;
+    auto global_grid = Cajita::createGlobalGrid(
+        MPI_COMM_WORLD, uniform_global_mesh, system->is_periodic, partitioner );
+    const int halo_width = 3;
+    local_grid = Cajita::createLocalGrid( global_grid, halo_width );
+
+    // Create a scalar field for charge on the grid.
+    // Using uppercase for grid/mesh variables and lowercase for particles
+    auto scalar_layout =
+        Cajita::createArrayLayout( local_grid, 1, Cajita::Node() );
+    Q = Cajita::createArray<double, device_type>( "meshq", scalar_layout );
+    Q_halo = Cajita::createHalo( *Q, Cajita::FullHaloPattern() );
+
+    auto owned_space = local_grid->indexSpace( Cajita::Own(), Cajita::Node(),
+                                               Cajita::Local() );
+
+    int meshwidth_x = global_grid->globalNumEntity( Cajita::Node(), 0 );
+    int meshwidth_y = global_grid->globalNumEntity( Cajita::Node(), 1 );
+    int meshwidth_z = global_grid->globalNumEntity( Cajita::Node(), 2 );
+    double alpha = _alpha;
+
+    // Calculating the values of the BC array involves first shifting the
+    // fractional coords then compute the B and C arrays as described in the
+    // paper This can be done once at the start of a run if the mesh stays
+    // constant
+    BC_array =
+        Cajita::createArray<double, device_type>( "BC_array", scalar_layout );
+    auto BC_view = BC_array->view();
+
+    auto BC_functor = KOKKOS_LAMBDA( const int kx, const int ky, const int kz )
+    {
+      // shift to avoid ghost cells. need indexing from 0 to num_cells_x-1
+      int kxx, kyy, kzz;
+      kxx = kx - 3; //TODO: Is it always 3 here?
+      kyy = ky - 3;
+      kzz = kz - 3;
+      int mx, my, mz;
+        if ( kxx + kyy + kzz > 0 )
+        {
+            // Shift the C array
+            mx = kxx;
+            my = kyy;
+            mz = kzz;
+            if ( mx > meshwidth_x / 2.0 )
+            {
+                mx = kxx - meshwidth_x;
+            }
+            if ( my > meshwidth_y / 2.0 )
+            {
+                my = kyy - meshwidth_y;
+            }
+            if ( mz > meshwidth_z / 2.0 )
+            {
+                mz = kzz - meshwidth_z;
+            }
+            auto m2 = ( mx * mx + my * my + mz * mz );
+            // Calculate BC.
+            BC_view( kx, ky, kz, 0 ) =
+                ForceSPME::oneDeuler( mx, meshwidth_x ) *
+                ForceSPME::oneDeuler( my, meshwidth_y ) *
+                ForceSPME::oneDeuler( mz, meshwidth_z ) *
+                exp( -PI * PI * m2 / ( alpha * alpha ) ) /
+                ( PI * system->global_mesh_x * system->global_mesh_y *
+                  system->global_mesh_z * m2 );
+        }
+        else
+        {
+            BC_view( kx, ky, kz, 0 ) = 0.0;
+        }
+    };
+    Kokkos::parallel_for(
+        Cajita::createExecutionPolicy( owned_space, exe_space() ), BC_functor );
+    Kokkos::fence();
+}
+
+// Compute the forces
+template <class t_System, class t_Neighbor>
+void ForceSPME<t_System, t_Neighbor>::compute( t_System *system,
+                                               t_Neighbor *neighbor )
+{
+    // For now, force symmetry
+    if ( system->global_mesh_x != system->global_mesh_y or
+         system->global_mesh_x != system->global_mesh_z )
+        throw std::runtime_error( "SPME needs symmetric system size for now." );
+
+    auto N_local = system->N_local;
+    auto alpha = _alpha;
+
+    // Per-atom properties
+    system->slice_force();
+    auto x = system->x;
+    auto f = system->f;
+    auto type = system->type;
+    system->slice_q();
+    auto q = system->q;
+
+    // Neighbor list
+    auto neigh_list = neighbor->get();
+
+    /* reciprocal-space contribution */
+    auto owned_space = local_grid->indexSpace( Cajita::Own(), Cajita::Node(),
+                                               Cajita::Local() );
+    auto grid_policy =
+        Cajita::createExecutionPolicy( owned_space, exe_space() );
+
+    // First, spread the charges onto the mesh
+    Cajita::ArrayOp::assign( *Q, 0.0, Cajita::Ghost() );
+    auto scalar_p2g = Cajita::createScalarValueP2G( q, 1.0 );
+    Cajita::p2g( scalar_p2g, x, N_local, Cajita::Spline<3>(), *Q_halo, *Q );
+
+    // Copy mesh charge into complex view for FFT work
+    auto vector_layout =
+        Cajita::createArrayLayout( local_grid, 2, Cajita::Node() );
+    Qcomplex =
+        Cajita::createArray<double, device_type>( "Qcomplex", vector_layout );
+    auto Qcomplex_view = Qcomplex->view();
+    auto Q_view = Q->view();
+
+    auto copy_charge = KOKKOS_LAMBDA( const int i, const int j, const int k )
+    {
+        Qcomplex_view( i, j, k, 0 ) = Q_view( i, j, k, 0 );
+        Qcomplex_view( i, j, k, 1 ) = 0.0;
+    };
+    Kokkos::parallel_for( grid_policy, copy_charge );
+    Kokkos::fence();
+
+    // Next, solve Poisson's equation taking some FFTs of charges on mesh grid
+
+    // Using default FastFourierTransformParams
+    auto fft = Cajita::Experimental::createHeffteFastFourierTransform<
+        double, device_type>( *vector_layout );
+
+    fft->reverse( *Qcomplex, Cajita::Experimental::FFTScaleNone() );
+
+    // update Q for later force calcs
+    auto BC_view = BC_array->view();
+    auto mult_BC_Qr = KOKKOS_LAMBDA( const int i, const int j, const int k )
+    {
+        Qcomplex_view( i, j, k, 0 ) *= BC_view( i, j, k, 0 );
+        Qcomplex_view( i, j, k, 1 ) *= BC_view( i, j, k, 0 );
+    };
+    Kokkos::parallel_for( grid_policy, mult_BC_Qr );
+    Kokkos::fence();
+
+    fft->forward( *Qcomplex, Cajita::Experimental::FFTScaleNone() );
+
+    // Copy real part of complex Qr to Q
+    auto copy_back_charge =
+        KOKKOS_LAMBDA( const int i, const int j, const int k )
+    {
+        Q_view( i, j, k, 0 ) = Qcomplex_view( i, j, k, 0 );
+    };
+    Kokkos::parallel_for( grid_policy, copy_back_charge );
+    Kokkos::fence();
+
+    // Now, compute forces on each particle
+    auto scalar_gradient_g2p = Cajita::createScalarGradientG2P( f, 1.0 );
+    Cajita::g2p( *Q, *Q_halo, x, N_local, Cajita::Spline<3>(),
+                 scalar_gradient_g2p );
+
+    // Now, multiply each value by particle's charge to get force
+    // TODO: This needs to only be applied to the longrange portion
+    auto mult_q = KOKKOS_LAMBDA( const int idx )
+    {
+        f( idx, 0 ) *= q( idx );
+        f( idx, 1 ) *= q( idx );
+        f( idx, 2 ) *= q( idx );
+    };
+    Kokkos::RangePolicy<exe_space> policy( 0, N_local );
+    Kokkos::parallel_for( policy, mult_q );
+    Kokkos::fence();
+
+    /* real-space contribution */
+    auto realspace_force = KOKKOS_LAMBDA( const int idx )
+    {
+        int num_n =
+            Cabana::NeighborList<t_neigh_list>::numNeighbor( neigh_list, idx );
+
+        auto rx = x( idx, 0 );
+        auto ry = x( idx, 1 );
+        auto rz = x( idx, 2 );
+        auto qi = q( idx );
+
+        for ( int ij = 0; ij < num_n; ++ij )
+        {
+            int j = Cabana::NeighborList<t_neigh_list>::getNeighbor( neigh_list,
+                                                                     idx, ij );
+            auto dx = x( j, 0 ) - rx;
+            auto dy = x( j, 1 ) - ry;
+            auto dz = x( j, 2 ) - rz;
+            auto d = sqrt( dx * dx + dy * dy + dz * dz );
+            auto qj = q( j );
+
+            // force computation
+            auto f_fact = qi * qj *
+                          ( 2.0 * sqrt( alpha / PI ) * exp( -alpha * d * d ) +
+                            erfc( sqrt( alpha ) * d ) ) /
+                          ( d * d );
+            Kokkos::atomic_add( &f( idx, 0 ), f_fact * dx );
+            Kokkos::atomic_add( &f( idx, 1 ), f_fact * dy );
+            Kokkos::atomic_add( &f( idx, 2 ), f_fact * dz );
+            Kokkos::atomic_add( &f( j, 0 ), -f_fact * dx );
+            Kokkos::atomic_add( &f( j, 1 ), -f_fact * dy );
+            Kokkos::atomic_add( &f( j, 2 ), -f_fact * dz );
+        }
+    };
+    Kokkos::parallel_for( policy, realspace_force );
+    Kokkos::fence();
+}
+
+template <class t_System, class t_Neighbor>
+T_V_FLOAT
+ForceSPME<t_System, t_Neighbor>::compute_energy( t_System *system,
+                                                 t_Neighbor *neighbor )
+{
+    N_local = system->N_local;
+    auto alpha = _alpha;
+
+    // Per-atom properties
+    system->slice_x();
+    auto x = system->x;
+    system->slice_q();
+    auto q = system->q;
+
+    // Neighbor list
+    auto neigh_list = neighbor->get();
+
+    auto owned_space = local_grid->indexSpace( Cajita::Own(), Cajita::Node(),
+                                               Cajita::Local() );
+    auto BC_view = BC_array->view();
+    auto Qcomplex_view = Qcomplex->view();
+
+    const double ELECTRON_CHARGE = 1.60217662E-19; // electron charge in
+                                                   // Coulombs
+    const double EPS_0 =
+        8.8541878128E-22; // permittivity of free space in Farads/Angstrom
+    const double ENERGY_CONVERSION_FACTOR =
+        ELECTRON_CHARGE / ( 4.0 * PI * EPS_0 );
+
+    T_V_FLOAT energy_k = 0.0;
+    auto kspace_potential =
+        KOKKOS_LAMBDA( const int i, const int j, const int k, T_V_FLOAT &PE )
+    {
+        PE += ENERGY_CONVERSION_FACTOR * BC_view( i, j, k, 0 ) *
+              ( ( Qcomplex_view( i, j, k, 0 ) * Qcomplex_view( i, j, k, 0 ) ) +
+                ( Qcomplex_view( i, j, k, 1 ) * Qcomplex_view( i, j, k, 1 ) ) );
+    };
+    Kokkos::parallel_reduce(
+        "ForceSPME::KspacePE",
+        Cajita::createExecutionPolicy( owned_space, exe_space() ),
+        kspace_potential, energy_k );
+    Kokkos::fence();
+
+    T_V_FLOAT energy_r = 0.0;
+    auto realspace_potential = KOKKOS_LAMBDA( const int idx, T_V_FLOAT &PE )
+    {
+        int num_n =
+            Cabana::NeighborList<t_neigh_list>::numNeighbor( neigh_list, idx );
+
+        auto rx = x( idx, 0 );
+        auto ry = x( idx, 1 );
+        auto rz = x( idx, 2 );
+        auto qi = q( idx );
+
+        for ( int ij = 0; ij < num_n; ++ij )
+        {
+            int j = Cabana::NeighborList<t_neigh_list>::getNeighbor( neigh_list,
+                                                                     idx, ij );
+            auto dx = x( j, 0 ) - rx;
+            auto dy = x( j, 1 ) - ry;
+            auto dz = x( j, 2 ) - rz;
+            auto d = sqrt( dx * dx + dy * dy + dz * dz );
+            auto qj = q( j );
+
+            PE += ENERGY_CONVERSION_FACTOR * qi * qj * erfc( alpha * d ) / d;
+        }
+
+        // self-energy contribution
+        PE += -alpha / PI_SQRT * qi * qi * ENERGY_CONVERSION_FACTOR;
+    };
+    Kokkos::RangePolicy<exe_space> policy( 0, N_local );
+    Kokkos::parallel_reduce( "ForceSPME::RealSpacePE", policy,
+                             realspace_potential, energy_r );
+    Kokkos::fence();
+
+    T_V_FLOAT energy = energy_k + energy_r;
+    return energy;
+}
+
+template <class t_System, class t_Neighbor>
+const char *ForceSPME<t_System, t_Neighbor>::name()
+{
+    return "LongRangeForce:SPME";
+}
diff --git a/src/modules_force.h b/src/modules_force.h
index c846254a..3cd37beb 100644
--- a/src/modules_force.h
+++ b/src/modules_force.h
@@ -48,8 +48,17 @@
 
 #include <CabanaMD_config.hpp>
 
+#include <Cabana_Core.hpp>
+#include <Cajita.hpp>
+
 #include <force_lj_cabana_neigh.h>
 
 #ifdef CabanaMD_ENABLE_NNP
 #include <force_nnp_cabana_neigh.h>
 #endif
+
+// Include Module header files for longrange force
+#ifdef Cabana_ENABLE_HEFFTE
+#include <force_ewald_cabana_neigh.h>
+#include <force_spme_cabana_neigh.h>
+#endif
diff --git a/src/read_data.h b/src/read_data.h
index 2a4490cf..bf687831 100644
--- a/src/read_data.h
+++ b/src/read_data.h
@@ -309,13 +309,12 @@ void read_lammps_pair( std::ifstream &file, t_System *s )
 
 template <class t_System>
 void read_lammps_data_file( InputFile<t_System> *input, t_System *s,
-                            Comm<t_System> *comm )
+                            Comm<t_System> *comm, std::ofstream &out,
+                            std::ofstream &err )
 {
     int atomflag = 0;
     std::string keyword;
     std::ifstream file( input->input_data_file );
-    std::ofstream out( input->output_file, std::ofstream::app );
-    std::ofstream err( input->error_file, std::ofstream::app );
 
     read_lammps_header<t_System>( file, err, s );
 
diff --git a/src/system.h b/src/system.h
index e286cdce..b62cf813 100644
--- a/src/system.h
+++ b/src/system.h
@@ -83,6 +83,9 @@ class SystemCommon
 
     // Simulation total domain
     T_X_FLOAT global_mesh_x, global_mesh_y, global_mesh_z;
+    T_X_FLOAT global_mesh_lo_x, global_mesh_lo_y, global_mesh_lo_z;
+    T_X_FLOAT global_mesh_hi_x, global_mesh_hi_y, global_mesh_hi_z;
+    std::vector<T_X_FLOAT> lattice_constant;
 
     // Simulation sub domain (single MPI rank)
     T_X_FLOAT local_mesh_x, local_mesh_y, local_mesh_z;
@@ -97,6 +100,8 @@ class SystemCommon
     std::array<int, 3> ranks_per_dim;
     std::array<int, 3> rank_dim_pos;
 
+    std::array<bool, 3> is_periodic;
+
     // Units
     T_FLOAT boltz, mvv2e, dt;
 
@@ -112,6 +117,8 @@ class SystemCommon
         mass = t_mass( "System::mass", ntypes );
 
         global_mesh_x = global_mesh_y = global_mesh_z = 0.0;
+        global_mesh_lo_x = global_mesh_lo_y = global_mesh_lo_z = 0.0;
+        global_mesh_hi_x = global_mesh_hi_y = global_mesh_hi_z = 0.0;
         local_mesh_lo_x = local_mesh_lo_y = local_mesh_lo_z = 0.0;
         local_mesh_hi_x = local_mesh_hi_y = local_mesh_hi_z = 0.0;
         ghost_mesh_lo_x = ghost_mesh_lo_y = ghost_mesh_lo_z = 0.0;
@@ -136,12 +143,8 @@ class SystemCommon
         auto global_mesh = Cajita::createUniformGlobalMesh(
             low_corner, high_corner, ranks_per_dim );
 
-        global_mesh_x = global_mesh->extent( 0 );
-        global_mesh_y = global_mesh->extent( 1 );
-        global_mesh_z = global_mesh->extent( 2 );
-
         // Create the global grid.
-        std::array<bool, 3> is_periodic = { true, true, true };
+        is_periodic = {true, true, true};
         auto global_grid = Cajita::createGlobalGrid(
             MPI_COMM_WORLD, global_mesh, is_periodic, partitioner );
 
@@ -155,6 +158,16 @@ class SystemCommon
         local_grid = Cajita::createLocalGrid( global_grid, halo_width );
         auto local_mesh = Cajita::createLocalMesh<t_device>( *local_grid );
 
+        global_mesh_x = global_mesh->extent( 0 );
+        global_mesh_y = global_mesh->extent( 1 );
+        global_mesh_z = global_mesh->extent( 2 );
+        global_mesh_lo_x = low_corner[0];
+        global_mesh_lo_y = low_corner[1];
+        global_mesh_lo_z = low_corner[2];
+        global_mesh_hi_x = high_corner[0];
+        global_mesh_hi_y = high_corner[1];
+        global_mesh_hi_z = high_corner[2];
+
         local_mesh_lo_x = local_mesh.lowCorner( Cajita::Own(), 0 );
         local_mesh_lo_y = local_mesh.lowCorner( Cajita::Own(), 1 );
         local_mesh_lo_z = local_mesh.lowCorner( Cajita::Own(), 2 );
diff --git a/src/types.h b/src/types.h
index 9d46cf5f..17c0a715 100644
--- a/src/types.h
+++ b/src/types.h
@@ -95,7 +95,10 @@ enum
 {
     FORCE_LJ,
     FORCE_SNAP,
-    FORCE_NNP
+    FORCE_NNP,
+    FORCE_EWALD,
+    FORCE_SPME,
+    FORCE_NONE
 };
 // Force Iteration Type
 enum
@@ -125,6 +128,11 @@ enum
     INPUT_LAMMPS
 };
 
+constexpr double PI( 3.141592653589793238462643 );
+constexpr double PI_SQRT( 1.772453850905516 );
+constexpr double PI_SQ( PI *PI );          // 9.869604401089359
+constexpr double PI_DIV_SQ( 1.0 / PI_SQ ); // 0.101321183642338
+
 // Macros to work around the fact that std::max/min is not available on GPUs
 #define MAX( a, b ) ( a > b ? a : b )
 #define MIN( a, b ) ( a < b ? a : b )