From 71df3bdbce519f104c8645bedd03efc9a1769bf7 Mon Sep 17 00:00:00 2001
From: Andrew Myers <atmyers2@gmail.com>
Date: Thu, 4 Jan 2024 21:31:07 -0800
Subject: [PATCH] initialize the distributions on the GPU (#498)

* initialize the distributions on the GPU

* Fix Thermal: `const` operator

* Python Support, Cleaning

* Cleanup: Unused, Resize

Fix the missing resize before redistribute.
Remove the old, unusued, serial initialization logic.

* Thermal Distribution: GPU Memory

Remove stack memory usage and use GPU memory for pre-computed CDF.

* Cosmetic Cleaning

* Waterbag: Clean C++ Math

Avoid promoting to double in C functions.

* Cosmetic Cleaning for Clarity

* Fix CUDA Compile

On older NVCC

* Analysis: Slightly Increase Sampling Tolerance

Run on local GPU (RTX A2000).

---------

Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja>
---
 docs/source/usage/python.rst                  |   3 +-
 examples/alignment/analysis_alignment.py      |   2 +-
 .../apochromatic/analysis_apochromatic.py     |   2 +-
 examples/chicane/analysis_chicane.py          |   4 +-
 examples/compression/analysis_compression.py  |   4 +-
 examples/cyclotron/analysis_cyclotron.py      |   2 +-
 examples/distgen/analysis_gaussian.py         |   4 +-
 examples/distgen/analysis_kurth4d.py          |   4 +-
 examples/distgen/analysis_semigaussian.py     |   4 +-
 examples/expanding_beam/analysis_expanding.py |   2 +-
 examples/fodo/analysis_fodo.py                |   4 +-
 examples/fodo_chromatic/analysis_fodo_chr.py  |   4 +-
 examples/fodo_rf/analysis_fodo_rf.py          |   4 +-
 examples/iota_lattice/analysis_iotalattice.py |   4 +-
 .../iota_lattice/analysis_iotalattice_sdep.py |   8 +-
 examples/kurth/analysis_kurth_periodic.py     |   4 +-
 examples/multipole/analysis_multipole.py      |   4 +-
 .../analysis_quadrupole_softedge.py           |   4 +-
 examples/solenoid/analysis_solenoid.py        |   4 +-
 src/initialization/InitDistribution.H         | 103 ++++++
 src/initialization/InitDistribution.cpp       |  58 ++--
 src/particles/ImpactXParticleContainer.H      |  20 +-
 src/particles/ImpactXParticleContainer.cpp    | 110 ++++---
 src/particles/distribution/Gaussian.H         |   9 +
 src/particles/distribution/KVdist.H           |   9 +
 src/particles/distribution/Kurth4D.H          |   9 +
 src/particles/distribution/Kurth6D.H          |   9 +
 src/particles/distribution/None.H             |   9 +
 src/particles/distribution/Semigaussian.H     |   9 +
 src/particles/distribution/Thermal.H          | 307 ++++++++++--------
 src/particles/distribution/Triangle.H         |   9 +
 src/particles/distribution/Waterbag.H         |  37 ++-
 src/python/ImpactXParticleContainer.cpp       |   2 -
 33 files changed, 499 insertions(+), 272 deletions(-)
 create mode 100644 src/initialization/InitDistribution.H

diff --git a/docs/source/usage/python.rst b/docs/source/usage/python.rst
index 2729edcbd..96cd5a83a 100644
--- a/docs/source/usage/python.rst
+++ b/docs/source/usage/python.rst
@@ -237,7 +237,7 @@ Particles
 
    This class stores particles, distributed over MPI ranks.
 
-   .. py:method:: add_n_particles(lev, x, y, t, px, py, pt, qm, bchchg)
+   .. py:method:: add_n_particles(x, y, t, px, py, pt, qm, bchchg)
 
       Add new particles to the container for fixed s.
 
@@ -245,7 +245,6 @@ Particles
             been created, meaning after the call to :py:meth:`ImpactX.init_grids`
             has been made in the ImpactX class.
 
-      :param lev: mesh-refinement level
       :param x: positions in x
       :param y: positions in y
       :param t: positions as time-of-flight in c*t
diff --git a/examples/alignment/analysis_alignment.py b/examples/alignment/analysis_alignment.py
index d84e95fd8..ddd4bd77b 100755
--- a/examples/alignment/analysis_alignment.py
+++ b/examples/alignment/analysis_alignment.py
@@ -76,7 +76,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = 1.8 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.0 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/apochromatic/analysis_apochromatic.py b/examples/apochromatic/analysis_apochromatic.py
index 78f98e707..c893b4fd6 100755
--- a/examples/apochromatic/analysis_apochromatic.py
+++ b/examples/apochromatic/analysis_apochromatic.py
@@ -108,7 +108,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = 15.0 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 19.0 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/chicane/analysis_chicane.py b/examples/chicane/analysis_chicane.py
index 07ceb5cdd..f09f3fbcc 100755
--- a/examples/chicane/analysis_chicane.py
+++ b/examples/chicane/analysis_chicane.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/compression/analysis_compression.py b/examples/compression/analysis_compression.py
index 775e1a903..f1aa959b1 100755
--- a/examples/compression/analysis_compression.py
+++ b/examples/compression/analysis_compression.py
@@ -69,7 +69,7 @@ def get_moments(beam):
 print(f"  gamma={initial_gamma_ref:e}")
 
 atol = 0.0  # ignored
-rtol = 1.8 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.9 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -99,7 +99,7 @@ def get_moments(beam):
 
 
 atol = 0.0  # ignored
-rtol = 1.8 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.9 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/cyclotron/analysis_cyclotron.py b/examples/cyclotron/analysis_cyclotron.py
index 472f8cf74..e90dd76ec 100755
--- a/examples/cyclotron/analysis_cyclotron.py
+++ b/examples/cyclotron/analysis_cyclotron.py
@@ -86,7 +86,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = 1.8 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.0 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/distgen/analysis_gaussian.py b/examples/distgen/analysis_gaussian.py
index c2c582b7d..11deb81ff 100755
--- a/examples/distgen/analysis_gaussian.py
+++ b/examples/distgen/analysis_gaussian.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/distgen/analysis_kurth4d.py b/examples/distgen/analysis_kurth4d.py
index 40851889b..c2a7560b4 100755
--- a/examples/distgen/analysis_kurth4d.py
+++ b/examples/distgen/analysis_kurth4d.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/distgen/analysis_semigaussian.py b/examples/distgen/analysis_semigaussian.py
index c2c582b7d..11deb81ff 100755
--- a/examples/distgen/analysis_semigaussian.py
+++ b/examples/distgen/analysis_semigaussian.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/expanding_beam/analysis_expanding.py b/examples/expanding_beam/analysis_expanding.py
index cdf5ac0cf..48eecad55 100755
--- a/examples/expanding_beam/analysis_expanding.py
+++ b/examples/expanding_beam/analysis_expanding.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/fodo/analysis_fodo.py b/examples/fodo/analysis_fodo.py
index d4c4fc29c..3b09df05e 100755
--- a/examples/fodo/analysis_fodo.py
+++ b/examples/fodo/analysis_fodo.py
@@ -59,7 +59,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -86,7 +86,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/fodo_chromatic/analysis_fodo_chr.py b/examples/fodo_chromatic/analysis_fodo_chr.py
index d4c4fc29c..3b09df05e 100755
--- a/examples/fodo_chromatic/analysis_fodo_chr.py
+++ b/examples/fodo_chromatic/analysis_fodo_chr.py
@@ -59,7 +59,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -86,7 +86,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/fodo_rf/analysis_fodo_rf.py b/examples/fodo_rf/analysis_fodo_rf.py
index 32ed5f6ee..0bcd71343 100755
--- a/examples/fodo_rf/analysis_fodo_rf.py
+++ b/examples/fodo_rf/analysis_fodo_rf.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/iota_lattice/analysis_iotalattice.py b/examples/iota_lattice/analysis_iotalattice.py
index f84cfffb3..d7529bd17 100755
--- a/examples/iota_lattice/analysis_iotalattice.py
+++ b/examples/iota_lattice/analysis_iotalattice.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # a big number
-rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.9 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -78,7 +78,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # a big number
-rtol = 1.8 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.9 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/iota_lattice/analysis_iotalattice_sdep.py b/examples/iota_lattice/analysis_iotalattice_sdep.py
index 13e457798..92998c22f 100755
--- a/examples/iota_lattice/analysis_iotalattice_sdep.py
+++ b/examples/iota_lattice/analysis_iotalattice_sdep.py
@@ -56,7 +56,7 @@ def read_all_files(file_pattern):
 print(f"  meanH={meanH:e} sigH={sigH:e} meanI={meanI:e} sigI={sigI:e}")
 
 atol = 0.0  # a big number
-rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.6 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -73,7 +73,7 @@ def read_all_files(file_pattern):
 print(f"  meanH={meanH:e} sigH={sigH:e} meanI={meanI:e} sigI={sigI:e}")
 
 atol = 0.0  # a big number
-rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.6 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -94,14 +94,14 @@ def read_all_files(file_pattern):
 Hrms = np.sqrt(sigH**2 + meanH**2)
 Irms = np.sqrt(sigI**2 + meanI**2)
 
-atol = 4.0e-3 * Hrms
+atol = 4.2e-3 * Hrms
 rtol = 0.0  # large number
 print()
 print(f"  atol={atol} (ignored: rtol~={rtol})")
 print(f"  dH_max={beam_joined['dH'].max()}")
 assert np.allclose(beam_joined["dH"], 0.0, rtol=rtol, atol=atol)
 
-atol = 5.5e-3 * Irms
+atol = 5.7e-3 * Irms
 rtol = 0.0
 print()
 print(f"  atol={atol} (ignored: rtol~={rtol})")
diff --git a/examples/kurth/analysis_kurth_periodic.py b/examples/kurth/analysis_kurth_periodic.py
index 88a7b7a5c..6a42288c3 100755
--- a/examples/kurth/analysis_kurth_periodic.py
+++ b/examples/kurth/analysis_kurth_periodic.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.7 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = 1.5 * num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.7 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/multipole/analysis_multipole.py b/examples/multipole/analysis_multipole.py
index 4aa2358b6..f3a87f2a3 100755
--- a/examples/multipole/analysis_multipole.py
+++ b/examples/multipole/analysis_multipole.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.5 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/quadrupole_softedge/analysis_quadrupole_softedge.py b/examples/quadrupole_softedge/analysis_quadrupole_softedge.py
index c2c582b7d..9c0c60bbd 100755
--- a/examples/quadrupole_softedge/analysis_quadrupole_softedge.py
+++ b/examples/quadrupole_softedge/analysis_quadrupole_softedge.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 2.2 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/examples/solenoid/analysis_solenoid.py b/examples/solenoid/analysis_solenoid.py
index 2a686f030..87c0594e3 100755
--- a/examples/solenoid/analysis_solenoid.py
+++ b/examples/solenoid/analysis_solenoid.py
@@ -58,7 +58,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
@@ -85,7 +85,7 @@ def get_moments(beam):
 )
 
 atol = 0.0  # ignored
-rtol = num_particles**-0.5  # from random sampling of a smooth distribution
+rtol = 1.3 * num_particles**-0.5  # from random sampling of a smooth distribution
 print(f"  rtol={rtol} (ignored: atol~={atol})")
 
 assert np.allclose(
diff --git a/src/initialization/InitDistribution.H b/src/initialization/InitDistribution.H
new file mode 100644
index 000000000..a3186b2ad
--- /dev/null
+++ b/src/initialization/InitDistribution.H
@@ -0,0 +1,103 @@
+/* Copyright 2022-2023 The Regents of the University of California, through Lawrence
+ *           Berkeley National Laboratory (subject to receipt of any required
+ *           approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * This file is part of ImpactX.
+ *
+ * Authors: Axel Huebl, Andrew Myers
+ * License: BSD-3-Clause-LBNL
+ */
+#ifndef IMPACTX_INITIALIZATION_INITDISTRIBUTION_H
+#define IMPACTX_INITIALIZATION_INITDISTRIBUTION_H
+
+#include "particles/ImpactXParticleContainer.H"
+
+#include <AMReX_Extension.H>  // for AMREX_RESTRICT
+#include <AMReX_REAL.H>
+
+#include <utility>  // for std::move
+
+
+namespace impactx::initialization
+{
+    /** Initialize a single particle's data using the given distribution
+     *
+     * Note: we usually would just write a C++ lambda below in ParallelFor. But, due to restrictions
+     * in NVCC as of 11.5, we cannot write a lambda in a lambda as we also std::visit the element
+     * types of our lattice elements list.
+     *    error #3206-D: An extended __device__ lambda cannot be defined inside a generic lambda expression("operator()").
+     * Thus, we fall back to writing a C++ functor here, instead of nesting two lambdas.
+     *
+     * Nvidia bug report: 3458976
+     * Minimal demonstrator: https://cuda.godbolt.org/z/39e4q53Ye
+     *
+     * @tparam T_Distribution This can be a \see Gaussian, \see Waterbag, \see Kurth6D, \see Thermal etc.
+     */
+    template <typename T_Distribution>
+    struct InitSingleParticleData
+    {
+        /** Constructor taking in pointers to particle data
+         *
+         * @param distribution the type of distribution function to call
+         * @param part_x the array to the particle position (x)
+         * @param part_y the array to the particle position (y)
+         * @param part_t the array to the particle position (t)
+         * @param part_px the array to the particle momentum (x)
+         * @param part_py the array to the particle momentum (y)
+         * @param part_pt the array to the particle momentum (t)
+         */
+        InitSingleParticleData (
+            T_Distribution distribution,
+            amrex::ParticleReal* AMREX_RESTRICT part_x,
+            amrex::ParticleReal* AMREX_RESTRICT part_y,
+            amrex::ParticleReal* AMREX_RESTRICT part_t,
+            amrex::ParticleReal* AMREX_RESTRICT part_px,
+            amrex::ParticleReal* AMREX_RESTRICT part_py,
+            amrex::ParticleReal* AMREX_RESTRICT part_pt
+        )
+        : m_distribution(std::move(distribution)),
+          m_part_x(part_x), m_part_y(part_y), m_part_t(part_t),
+          m_part_px(part_px), m_part_py(part_py), m_part_pt(part_pt)
+        {
+        }
+
+        InitSingleParticleData () = delete;
+        InitSingleParticleData (InitSingleParticleData const &) = default;
+        InitSingleParticleData (InitSingleParticleData &&) = default;
+        ~InitSingleParticleData () = default;
+
+        /** Initialize the data for a single particle
+         *
+         * @param i particle index in the current box
+         * @param engine a random number engine (with associated state)
+         */
+        AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+        void
+        operator() (
+            amrex::Long i,
+            amrex::RandomEngine const & engine
+        ) const
+        {
+            m_distribution(
+                m_part_x[i],
+                m_part_y[i],
+                m_part_t[i],
+                m_part_px[i],
+                m_part_py[i],
+                m_part_pt[i],
+                engine
+            );
+        }
+
+    private:
+        T_Distribution const m_distribution;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_x;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_y;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_t;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_px;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_py;
+        amrex::ParticleReal* const AMREX_RESTRICT m_part_pt;
+    };
+}  // namespace impactx::initialization
+
+#endif  // IMPACTX_INITIALIZATION_INITDISTRIBUTION_H
diff --git a/src/initialization/InitDistribution.cpp b/src/initialization/InitDistribution.cpp
index 5e0dd0700..64a31ec15 100644
--- a/src/initialization/InitDistribution.cpp
+++ b/src/initialization/InitDistribution.cpp
@@ -7,6 +7,8 @@
  * Authors: Axel Huebl, Chad Mitchell, Ji Qiang
  * License: BSD-3-Clause-LBNL
  */
+#include "initialization/InitDistribution.H"
+
 #include "ImpactX.H"
 #include "particles/ImpactXParticleContainer.H"
 #include "particles/distribution/All.H"
@@ -21,6 +23,7 @@
 #include <AMReX_Print.H>
 
 #include <string>
+#include <type_traits>
 #include <variant>
 
 
@@ -56,12 +59,6 @@ namespace impactx
             );
         }
 
-        // init particles
-        amrex::Vector<amrex::ParticleReal> x, y, t;
-        amrex::Vector<amrex::ParticleReal> px, py, pt;
-        amrex::ParticleReal ix, iy, it, ipx, ipy, ipt;
-        amrex::RandomEngine rng;
-
         // Logic: We initialize 1/Nth of particles, independent of their
         // position, per MPI rank. We then measure the distribution's spatial
         // extent, create a grid, resize it to fit the beam, and then
@@ -71,34 +68,41 @@ namespace impactx
         int const navg = npart / nprocs;
         int const nleft = npart - navg * nprocs;
         int npart_this_proc = (myproc < nleft) ? navg+1 : navg;
-        auto const rel_part_this_proc = amrex::ParticleReal(npart_this_proc) /
-                                        amrex::ParticleReal(npart);
+        auto const rel_part_this_proc =
+            amrex::ParticleReal(npart_this_proc) / amrex::ParticleReal(npart);
+
+        // alloc data for particle attributes
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> x, y, t;
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> px, py, pt;
+        x.resize(npart_this_proc);
+        y.resize(npart_this_proc);
+        t.resize(npart_this_proc);
+        px.resize(npart_this_proc);
+        py.resize(npart_this_proc);
+        pt.resize(npart_this_proc);
 
         std::visit([&](auto&& distribution){
             // initialize distributions
             distribution.initialize(bunch_charge, ref);
 
-            // alloc data for particle attributes
-            x.reserve(npart_this_proc);
-            y.reserve(npart_this_proc);
-            t.reserve(npart_this_proc);
-            px.reserve(npart_this_proc);
-            py.reserve(npart_this_proc);
-            pt.reserve(npart_this_proc);
-
-            for (amrex::Long i = 0; i < npart_this_proc; ++i) {
-                distribution(ix, iy, it, ipx, ipy, ipt, rng);
-                x.push_back(ix);
-                y.push_back(iy);
-                t.push_back(it);
-                px.push_back(ipx);
-                py.push_back(ipy);
-                pt.push_back(ipt);
-            }
+            amrex::ParticleReal * const AMREX_RESTRICT x_ptr = x.data();
+            amrex::ParticleReal * const AMREX_RESTRICT y_ptr = y.data();
+            amrex::ParticleReal * const AMREX_RESTRICT t_ptr = t.data();
+            amrex::ParticleReal * const AMREX_RESTRICT px_ptr = px.data();
+            amrex::ParticleReal * const AMREX_RESTRICT py_ptr = py.data();
+            amrex::ParticleReal * const AMREX_RESTRICT pt_ptr = pt.data();
+
+            using Distribution = std::remove_reference_t< std::remove_cv_t<decltype(distribution)> >;
+            initialization::InitSingleParticleData<Distribution> const init_single_particle_data(
+                distribution, x_ptr, y_ptr, t_ptr, px_ptr, py_ptr, pt_ptr);
+            amrex::ParallelForRNG(npart_this_proc, init_single_particle_data);
+
+            // finalize distributions and deallocate temporary device global memory
+            amrex::Gpu::streamSynchronize();
+            distribution.finalize();
         }, distr);
 
-        int const lev = 0;
-        m_particle_container->AddNParticles(lev, x, y, t, px, py, pt,
+        m_particle_container->AddNParticles(x, y, t, px, py, pt,
                                             ref.qm_qeeV(),
                                             bunch_charge * rel_part_this_proc);
 
diff --git a/src/particles/ImpactXParticleContainer.H b/src/particles/ImpactXParticleContainer.H
index 7ab1abed6..4d4f7dee5 100644
--- a/src/particles/ImpactXParticleContainer.H
+++ b/src/particles/ImpactXParticleContainer.H
@@ -156,7 +156,6 @@ namespace impactx
          *       or AmrCore::InitFromCheckpoint has been made in the ImpactX
          *       class.
          *
-         * @param lev mesh-refinement level
          * @param x positions in x
          * @param y positions in y
          * @param t positions as time-of-flight in c*t
@@ -167,15 +166,16 @@ namespace impactx
          * @param bchchg total charge within a bunch in C
          */
         void
-        AddNParticles (int lev,
-                       amrex::Vector<amrex::ParticleReal> const & x,
-                       amrex::Vector<amrex::ParticleReal> const & y,
-                       amrex::Vector<amrex::ParticleReal> const & t,
-                       amrex::Vector<amrex::ParticleReal> const & px,
-                       amrex::Vector<amrex::ParticleReal> const & py,
-                       amrex::Vector<amrex::ParticleReal> const & pt,
-                       amrex::ParticleReal const & qm,
-                       amrex::ParticleReal const & bchchg);
+        AddNParticles (
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & x,
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & y,
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & t,
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & px,
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & py,
+            amrex::Gpu::DeviceVector<amrex::ParticleReal> const & pt,
+            amrex::ParticleReal qm,
+            amrex::ParticleReal bchchg
+        );
 
         /** Register storage for lost particles
          *
diff --git a/src/particles/ImpactXParticleContainer.cpp b/src/particles/ImpactXParticleContainer.cpp
index d333b1e75..e18d53341 100644
--- a/src/particles/ImpactXParticleContainer.cpp
+++ b/src/particles/ImpactXParticleContainer.cpp
@@ -100,19 +100,19 @@ namespace impactx
     }
 
     void
-    ImpactXParticleContainer::AddNParticles (int lev,
-                                             amrex::Vector<amrex::ParticleReal> const & x,
-                                             amrex::Vector<amrex::ParticleReal> const & y,
-                                             amrex::Vector<amrex::ParticleReal> const & t,
-                                             amrex::Vector<amrex::ParticleReal> const & px,
-                                             amrex::Vector<amrex::ParticleReal> const & py,
-                                             amrex::Vector<amrex::ParticleReal> const & pt,
-                                             amrex::ParticleReal const & qm,
-                                             amrex::ParticleReal const & bchchg)
+    ImpactXParticleContainer::AddNParticles (
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & x,
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & y,
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & t,
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & px,
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & py,
+        amrex::Gpu::DeviceVector<amrex::ParticleReal> const & pt,
+        amrex::ParticleReal qm,
+        amrex::ParticleReal bchchg
+    )
     {
         BL_PROFILE("ImpactX::AddNParticles");
 
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lev == 0, "AddNParticles: only lev=0 is supported yet.");
         AMREX_ALWAYS_ASSERT(x.size() == y.size());
         AMREX_ALWAYS_ASSERT(x.size() == t.size());
         AMREX_ALWAYS_ASSERT(x.size() == px.size());
@@ -123,48 +123,60 @@ namespace impactx
         int const np = x.size();
 
         auto& particle_tile = DefineAndReturnParticleTile(0, 0, 0);
+        auto old_np = particle_tile.numParticles();
+        auto new_np = old_np + np;
+        particle_tile.resize(new_np);
 
-        /* Create a temporary tile to obtain data from simulation. This data
-         * is then copied to the permanent tile which is stored on the particle
-         * (particle_tile).
-         */
-        using PinnedTile = amrex::ParticleTile<
-            amrex::Particle<NStructReal, NStructInt>,
-            NArrayReal, NArrayInt,
-            amrex::PinnedArenaAllocator
-        >;
-        PinnedTile pinned_tile;
-        pinned_tile.define(NumRuntimeRealComps(), NumRuntimeIntComps());
-
-        for (int i = 0; i < np; i++)
+        // Update NextID to include particles created in this function
+        int pid;
+#ifdef AMREX_USE_OMP
+#pragma omp critical (add_beam_nextid)
+#endif
         {
-            ParticleType p;
-            p.id() = ParticleType::NextID();
-            p.cpu() = amrex::ParallelDescriptor::MyProc();
-            p.pos(RealAoS::x) = x[i];
-            p.pos(RealAoS::y) = y[i];
-            p.pos(RealAoS::t) = t[i];
-            // write position, creating cpu id, and particle id
-            pinned_tile.push_back(p);
+            pid = ParticleType::NextID();
+            ParticleType::NextID(pid+np);
         }
-
-        // write Real attributes (SoA) to particle initialized zero
-        DefineAndReturnParticleTile(0, 0, 0);
-
-        pinned_tile.push_back_real(RealSoA::px, px);
-        pinned_tile.push_back_real(RealSoA::py, py);
-        pinned_tile.push_back_real(RealSoA::pt, pt);
-        pinned_tile.push_back_real(RealSoA::qm, np, qm);
-        pinned_tile.push_back_real(RealSoA::w, np, bchchg/ablastr::constant::SI::q_e/np);
-
-        /* Redistributes particles to their respective tiles (spatial bucket
-         * sort per box over MPI ranks)
-         */
-        auto old_np = particle_tile.numParticles();
-        auto new_np = old_np + pinned_tile.numParticles();
-        particle_tile.resize(new_np);
-        amrex::copyParticles(
-                particle_tile, pinned_tile, 0, old_np, pinned_tile.numParticles());
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
+        static_cast<amrex::Long>(pid) + static_cast<amrex::Long>(np) < amrex::LongParticleIds::LastParticleID,
+            "ERROR: overflow on particle id numbers");
+
+        const int cpuid = amrex::ParallelDescriptor::MyProc();
+
+        auto * AMREX_RESTRICT pstructs = particle_tile.GetArrayOfStructs()().dataPtr();
+        auto & soa = particle_tile.GetStructOfArrays().GetRealData();
+        amrex::ParticleReal * const AMREX_RESTRICT px_arr = soa[RealSoA::px].dataPtr();
+        amrex::ParticleReal * const AMREX_RESTRICT py_arr = soa[RealSoA::py].dataPtr();
+        amrex::ParticleReal * const AMREX_RESTRICT pt_arr = soa[RealSoA::pt].dataPtr();
+        amrex::ParticleReal * const AMREX_RESTRICT qm_arr = soa[RealSoA::qm].dataPtr();
+        amrex::ParticleReal * const AMREX_RESTRICT w_arr  = soa[RealSoA::w ].dataPtr();
+
+        amrex::ParticleReal const * const AMREX_RESTRICT x_ptr = x.data();
+        amrex::ParticleReal const * const AMREX_RESTRICT y_ptr = y.data();
+        amrex::ParticleReal const * const AMREX_RESTRICT t_ptr = t.data();
+        amrex::ParticleReal const * const AMREX_RESTRICT px_ptr = px.data();
+        amrex::ParticleReal const * const AMREX_RESTRICT py_ptr = py.data();
+        amrex::ParticleReal const * const AMREX_RESTRICT pt_ptr = pt.data();
+
+        amrex::ParallelFor(np,
+        [=] AMREX_GPU_DEVICE (int i) noexcept
+        {
+            ParticleType& p = pstructs[old_np + i];
+            p.id() = pid + i;
+            p.cpu() = cpuid;
+            p.pos(RealAoS::x) = x_ptr[i];
+            p.pos(RealAoS::y) = y_ptr[i];
+            p.pos(RealAoS::t) = t_ptr[i];
+
+            px_arr[old_np+i] = px_ptr[i];
+            py_arr[old_np+i] = py_ptr[i];
+            pt_arr[old_np+i] = pt_ptr[i];
+            qm_arr[old_np+i] = qm;
+            w_arr[old_np+i]  = bchchg/ablastr::constant::SI::q_e/np;
+        });
+
+        // safety first: in case passed attribute arrays were temporary, we
+        // want to make sure the ParallelFor has ended here
+        amrex::Gpu::streamSynchronize();
     }
 
     void
diff --git a/src/particles/distribution/Gaussian.H b/src/particles/distribution/Gaussian.H
index 83b19e1da..89ceb5479 100644
--- a/src/particles/distribution/Gaussian.H
+++ b/src/particles/distribution/Gaussian.H
@@ -60,6 +60,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/KVdist.H b/src/particles/distribution/KVdist.H
index 65dac699c..4bf9ee94f 100644
--- a/src/particles/distribution/KVdist.H
+++ b/src/particles/distribution/KVdist.H
@@ -61,6 +61,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/Kurth4D.H b/src/particles/distribution/Kurth4D.H
index 988448241..0c3102db8 100644
--- a/src/particles/distribution/Kurth4D.H
+++ b/src/particles/distribution/Kurth4D.H
@@ -61,6 +61,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/Kurth6D.H b/src/particles/distribution/Kurth6D.H
index d1ed44ed4..86574ce04 100644
--- a/src/particles/distribution/Kurth6D.H
+++ b/src/particles/distribution/Kurth6D.H
@@ -63,6 +63,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/None.H b/src/particles/distribution/None.H
index 79e3657e3..0fa9501ab 100644
--- a/src/particles/distribution/None.H
+++ b/src/particles/distribution/None.H
@@ -37,6 +37,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * Sets all values to zero.
diff --git a/src/particles/distribution/Semigaussian.H b/src/particles/distribution/Semigaussian.H
index 9b91e568a..2f1a942cc 100644
--- a/src/particles/distribution/Semigaussian.H
+++ b/src/particles/distribution/Semigaussian.H
@@ -61,6 +61,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/Thermal.H b/src/particles/distribution/Thermal.H
index b2cc31819..63169332a 100644
--- a/src/particles/distribution/Thermal.H
+++ b/src/particles/distribution/Thermal.H
@@ -14,6 +14,7 @@
 
 #include <ablastr/constant.H>
 
+#include <AMReX_GpuContainers.H>
 #include <AMReX_Math.H>
 #include <AMReX_Print.H>
 #include <AMReX_Random.H>
@@ -23,6 +24,7 @@
 #include <cstdint>
 #include <random>
 #include <utility>
+#include <vector>
 
 
 namespace impactx
@@ -36,68 +38,69 @@ namespace distribution
         static constexpr amrex::ParticleReal rout = 10.0;  ///< final r value for numerical integration
         static constexpr int nsteps = 2000;  ///< number of radial steps for numerical integration
 
-        struct Rprofile
+        amrex::ParticleReal m_f1;  ///< cumulative distribution of first population
+        amrex::ParticleReal m_f2;  ///< cumulative distribution of second population
+        amrex::ParticleReal m_phi1;  ///< potential generated by first population
+        amrex::ParticleReal m_phi2; ///< potential generated by second population
+        amrex::ParticleReal m_p1;  ///< normalization constant of first population
+        amrex::ParticleReal m_p2; ///< normalization constant of second population
+        amrex::ParticleReal m_rmin;  ///< minimum r value for tabulated cdf
+        amrex::ParticleReal m_rmax;  ///< maximum r value for tabulated cdf
+        int m_nbins;  ///< number of radial bins for tabulated cdf
+        amrex::ParticleReal* m_cdf1 = nullptr;  ///< tabulated cumulative distribution (first)
+        amrex::ParticleReal* m_cdf2 = nullptr;  ///< tabulated cumulative distribution (second)
+        amrex::ParticleReal m_Cintensity;  ///< space charge intensity parameter
+        amrex::ParticleReal m_bg; ///< reference value of relativistic beta*gamma
+        amrex::ParticleReal m_k;  ///< linear focusing strength (1/meters)
+        amrex::ParticleReal m_T1; ///< temperature k*T of the primary (core) population
+        amrex::ParticleReal m_T2; ///< temperature k*T of the secondary (halo) population
+        amrex::ParticleReal m_w;  ///< weight of the secondary (halo) population
+
+        // GPU data
+        static inline amrex::Gpu::DeviceVector<amrex::ParticleReal> m_d_cdf1;
+        static inline amrex::Gpu::DeviceVector<amrex::ParticleReal> m_d_cdf2;
+
+        ThermalData (
+            amrex::ParticleReal kin,
+            amrex::ParticleReal T1in,
+            amrex::ParticleReal T2in,
+            amrex::ParticleReal p1in,
+            amrex::ParticleReal p2in,
+            amrex::ParticleReal win
+        )
         {
-            amrex::ParticleReal f1;  ///< cumulative distribution of first population
-            amrex::ParticleReal f2;  ///< cumulative distribution of second population
-            amrex::ParticleReal phi1;  ///< potential generated by first population
-            amrex::ParticleReal phi2; ///< potential generated by second population
-            amrex::ParticleReal p1;  ///< normalization constant of first population
-            amrex::ParticleReal p2; ///< normalization constant of second population
-            amrex::ParticleReal rmin;  ///< minimum r value for tabulated cdf
-            amrex::ParticleReal rmax;  ///< maximum r value for tabulated cdf
-            int nbins;  ///< number of radial bins for tabulated cdf
-            amrex::ParticleReal cdf1[2001];  ///< tabulated cumulative distribution (first)
-            amrex::ParticleReal cdf2[2001];  ///< tabulated cumulative distribution (second)
-            amrex::ParticleReal Cintensity;  ///< space charge intensity parameter
-            amrex::ParticleReal bg; ///< reference value of relativistic beta*gamma
-            amrex::ParticleReal k;  ///< linear focusing strength (1/meters)
-            amrex::ParticleReal T1; ///< temperature k*T of the primary (core) population
-            amrex::ParticleReal T2; ///< temperature k*T of the secondary (halo) population
-            amrex::ParticleReal w;  ///< weight of the secondary (halo) population
-
-            Rprofile (amrex::ParticleReal kin,
-                      amrex::ParticleReal T1in,
-                      amrex::ParticleReal T2in,
-                      amrex::ParticleReal p1in,
-                      amrex::ParticleReal p2in,
-                      amrex::ParticleReal win)
-            {
-                 k = kin;
-                 T1 = T1in;
-                 T2 = T2in;
-                 p1 = p1in;
-                 p2 = p2in;
-                 w = win;
-            }
-        };
+             m_k = kin;
+             m_T1 = T1in;
+             m_T2 = T2in;
+             m_p1 = p1in;
+             m_p2 = p2in;
+             m_w = win;
+        }
 
         /** Populate the radial CDF data.
          *
          * @param[in] bunch_charge the bunch charge in C
          * @param[in] refpart the reference particle
-         * @param[inout] data the data to contain the radial CDF
          */
-        static void
-        generate_radial_dist (amrex::ParticleReal bunch_charge, RefPart const & refpart, Rprofile & data)
-
+        void
+        generate_radial_dist (amrex::ParticleReal bunch_charge, RefPart const & refpart)
         {
             using namespace amrex::literals;  // for _rt and _prt
             using namespace ablastr::constant::math;  // for pi
 
             // Get relativistic beta*gamma
             amrex::ParticleReal bg = refpart.beta_gamma();
-            data.bg = bg;
+            m_bg = bg;
 
             // Get reference particle rest energy (eV) and charge (q_e)
             amrex::ParticleReal Erest = refpart.mass_MeV()*1.0e6;
             amrex::ParticleReal q_e = refpart.charge_qe();
 
             // Set space charge intensity
-            data.Cintensity = q_e*bunch_charge/(pow(bg,2)*Erest*ablastr::constant::SI::ep0);
+            m_Cintensity = q_e*bunch_charge/(pow(bg,2)*Erest*ablastr::constant::SI::ep0);
 
             // Set minimum and maximum radius
-            amrex::ParticleReal r_scale = matched_scale_radius(data);
+            amrex::ParticleReal r_scale = matched_scale_radius();
             amrex::ParticleReal rmin = rin*r_scale;
             amrex::ParticleReal rmax = rout*r_scale;
             // amrex::PrintToFile("equilibrium_params.out") << r_scale << " " << data.Cintensity << "\n";
@@ -105,49 +108,70 @@ namespace distribution
             // Scale the parameters p1 and p2
             amrex::ParticleReal rt2pi = sqrt(2.0_prt*pi);
             amrex::ParticleReal p_scale = pow(r_scale*rt2pi,-3);
-            data.p1 = data.p1*p_scale;
-            data.p2 = data.p2*p_scale;
+            m_p1 = m_p1*p_scale;
+            m_p2 = m_p2*p_scale;
 
             // store integration parameters
-            data.nbins = nsteps;
-            data.rmin = rmin;
-            data.rmax = rmax;
+            m_nbins = nsteps;
+            m_rmin = rmin;
+            m_rmax = rmax;
+
+            // allocate CDFs
+            std::vector<amrex::ParticleReal> cdf1(m_nbins+1);
+            std::vector<amrex::ParticleReal> cdf2(m_nbins+1);
+            m_cdf1 = cdf1.data();
+            m_cdf2 = cdf2.data();
 
             // set initial conditions
-            data.f1 = 0.0_prt;
-            data.f2 = 0.0_prt;
-            data.phi1 = 0.0_prt;
-            data.phi2 = 0.0_prt;
-            data.cdf1[0] = 0.0_prt;
-            data.cdf2[0] = 0.0_prt;
+            m_f1 = 0.0_prt;
+            m_f2 = 0.0_prt;
+            m_phi1 = 0.0_prt;
+            m_phi2 = 0.0_prt;
+            m_cdf1[0] = 0.0_prt;
+            m_cdf2[0] = 0.0_prt;
 
             // integrate ODEs for the radial density
-            integrate(data,rmin,rmax,nsteps);
+            integrate(rmin,rmax,nsteps);
 
             // a search over normalization parameters p1, p2 can be added here
 
+            // rescale cdf to ensure the exact range [0,1]
+            for (int n = 0; n < m_nbins; ++n) {
+                m_cdf1[n] = m_cdf1[n] / m_cdf1[m_nbins];
+                m_cdf2[n] = m_cdf2[n] / m_cdf2[m_nbins];
+            }
+
+            // copy CFD data to device
+            m_d_cdf1 = amrex::Gpu::DeviceVector<amrex::ParticleReal>(m_nbins+1);
+            m_d_cdf2 = amrex::Gpu::DeviceVector<amrex::ParticleReal>(m_nbins+1);
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice,
+                                  cdf1.begin(), cdf1.end(),
+                                  m_d_cdf1.begin());
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice,
+                                  cdf2.begin(), cdf2.end(),
+                                  m_d_cdf2.begin());
+            amrex::Gpu::streamSynchronize();
         }
 
-        static amrex::ParticleReal
-        matched_scale_radius(Rprofile & data)
+        amrex::ParticleReal
+        matched_scale_radius ()
         {
             using namespace amrex::literals;  // for _rt and _prt
             using namespace ablastr::constant::math;  // for pi
 
-            amrex::ParticleReal k = data.k;
-            amrex::ParticleReal kT = (1.0_prt-data.w)*data.T1 + data.w*data.T2;
-            amrex::ParticleReal a = data.Cintensity/(4.0_prt*pi*5.0_prt*sqrt(5.0_prt));
+            amrex::ParticleReal k = m_k;
+            amrex::ParticleReal kT = (1.0_prt - m_w) * m_T1 + m_w * m_T2;
+            amrex::ParticleReal a = m_Cintensity/(4.0_prt*pi*5.0_prt*sqrt(5.0_prt));
             amrex::ParticleReal rscale = sqrt(kT + pow(a*k,2.0/3.0))/k;
 
             return rscale;
         }
 
-        static void
+        void
         integrate (
-            Rprofile & data,
-            amrex::ParticleReal const in,
-            amrex::ParticleReal const out,
-            int const steps
+            amrex::ParticleReal in,
+            amrex::ParticleReal out,
+            int steps
         )
         {
             using namespace amrex::literals; // for _rt and _prt
@@ -163,12 +187,12 @@ namespace distribution
             for (int j=0; j < steps; ++j)
             {
                 // for a second-order Strang splitting
-                map1(half,data,reval);
-                map2(tau,data,reval);
-                map1(half,data,reval);
+                map1(half, reval);
+                map2(tau, reval);
+                map1(half, reval);
                 // store tabulated CDF
-                data.cdf1[j+1] = data.f1;
-                data.cdf2[j+1] = data.f2;
+                m_cdf1[j+1] = m_f1;
+                m_cdf2[j+1] = m_f2;
                 // write cumulative density to file for debugging
                 /* comment in for debugging
                 amrex::PrintToFile("density1.out") << reval << " " << data.f1 << "\n";
@@ -180,68 +204,65 @@ namespace distribution
 
         }
 
-        static void
+        void
         map1 (
             amrex::ParticleReal const tau,
-            Rprofile & data,
             amrex::ParticleReal & reval
         )
         {
             using namespace amrex::literals;  // for _rt and _prt
             using namespace ablastr::constant::math;  // for pi
 
-            amrex::ParticleReal const f1 = data.f1;
-            amrex::ParticleReal const f2 = data.f2;
-            amrex::ParticleReal const phi1 = data.phi1;
-            amrex::ParticleReal const phi2 = data.phi2;
+            amrex::ParticleReal const f1 = m_f1;
+            amrex::ParticleReal const f2 = m_f2;
+            amrex::ParticleReal const phi1 = m_phi1;
+            amrex::ParticleReal const phi2 = m_phi2;
             amrex::ParticleReal const r = reval;
 
             // Apply map to update phi1, phi2, and r:
             reval = r + tau;
-            data.phi1 = phi1 + f1/(4.0_prt*pi*reval) - f1/(4.0_prt*pi*r);
-            data.phi2 = phi2 + f2/(4.0_prt*pi*reval) - f2/(4.0_prt*pi*r);;
-            data.f1 = f1;
-            data.f2 = f2;
-
+            m_phi1 = phi1 + f1/(4.0_prt*pi*reval) - f1/(4.0_prt*pi*r);
+            m_phi2 = phi2 + f2/(4.0_prt*pi*reval) - f2/(4.0_prt*pi*r);;
+            m_f1 = f1;
+            m_f2 = f2;
         }
 
-        static void
+        void
         map2 (
             amrex::ParticleReal const tau,
-            Rprofile & data,
             amrex::ParticleReal & reval
         )
         {
             using namespace amrex::literals;  // for _rt and _prt
             using namespace ablastr::constant::math;  // for pi
 
-            amrex::ParticleReal const f1 = data.f1;
-            amrex::ParticleReal const f2 = data.f2;
-            amrex::ParticleReal const phi1 = data.phi1;
-            amrex::ParticleReal const phi2 = data.phi2;
+            amrex::ParticleReal const f1 = m_f1;
+            amrex::ParticleReal const f2 = m_f2;
+            amrex::ParticleReal const phi1 = m_phi1;
+            amrex::ParticleReal const phi2 = m_phi2;
             amrex::ParticleReal const r = reval;
-            amrex::ParticleReal const k = data.k;
-            amrex::ParticleReal const w = data.w;
-            amrex::ParticleReal const T1 = data.T1;
-            amrex::ParticleReal const T2 = data.T2;
+            amrex::ParticleReal const k = m_k;
+            amrex::ParticleReal const w = m_w;
+            amrex::ParticleReal const T1 = m_T1;
+            amrex::ParticleReal const T2 = m_T2;
 
             // Define space charge intensity parameters
-            amrex::ParticleReal const c1 = (1.0_prt-w)*data.Cintensity;
-            amrex::ParticleReal const c2 = w*data.Cintensity;
+            amrex::ParticleReal const c1 = (1.0_prt-w) * m_Cintensity;
+            amrex::ParticleReal const c2 = w * m_Cintensity;
 
             // Define intermediate quantities
             amrex::ParticleReal potential = 0.0_prt;
             potential = pow(k*r,2.0)/2.0_prt + c1*phi1 + c2*phi2;
-            amrex::ParticleReal Pdensity1 = data.p1*exp(-potential/T1);
-            amrex::ParticleReal Pdensity2 = data.p2*exp(-potential/T2);
+            amrex::ParticleReal Pdensity1 = m_p1*exp(-potential/T1);
+            amrex::ParticleReal Pdensity2 = m_p2*exp(-potential/T2);
             // amrex::ParticleReal Pdensity_tot = (1.0_prt-w)*Pdensity1 + w*Pdensity2;
             // amrex::PrintToFile("Pdensity.out") << reval << " " << Pdensity_tot << "\n";
 
             // Apply map to update f1 and f2:
-            data.phi1 = phi1;
-            data.phi2 = phi2;
-            data.f1 = f1 + tau*4.0_prt*pi*pow(r,2.0)*Pdensity1;
-            data.f2 = f2 + tau*4.0_prt*pi*pow(r,2.0)*Pdensity2;
+            m_phi1 = phi1;
+            m_phi2 = phi2;
+            m_f1 = f1 + tau*4.0_prt*pi*pow(r,2.0)*Pdensity1;
+            m_f2 = f2 + tau*4.0_prt*pi*pow(r,2.0)*Pdensity2;
             reval = r;
         }
     };
@@ -270,12 +291,11 @@ namespace distribution
             amrex::ParticleReal halo
         )
         : m_k(k),
-          m_kT(kT),
+          m_T1(kT),
           m_T2(kT_halo),
           m_normalize(normalize),
           m_normalize_halo(normalize_halo),
-          m_halo(halo),
-          m_data(ThermalData::Rprofile(m_k, m_kT, m_T2, m_normalize, m_normalize_halo, m_halo))
+          m_halo(halo)
         {
         }
 
@@ -289,7 +309,29 @@ namespace distribution
         void initialize (amrex::ParticleReal bunch_charge, RefPart const & ref)
         {
             // Generate the struct "data" containing the radial CDF:
-            distribution::ThermalData::generate_radial_dist(bunch_charge, ref, m_data);
+            ThermalData data(m_k, m_T1, m_T2, m_normalize, m_normalize_halo, m_halo);
+            data.generate_radial_dist(bunch_charge, ref);
+
+            // share data
+            m_w = data.m_w;
+            m_T1 = data.m_T1;
+            m_T2 = data.m_T2;
+            m_rmin = data.m_rmin;
+            m_rmax = data.m_rmax;
+            m_nbins = data.m_nbins;
+            m_bg = data.m_bg;
+            m_cdf1 = data.m_d_cdf1.data();
+            m_cdf2 = data.m_d_cdf2.data();
+        }
+
+        /** Close and deallocate all data and handles.
+         */
+        void
+        finalize ()
+        {
+            // deallocate
+            ThermalData::m_d_cdf1.clear();
+            ThermalData::m_d_cdf2.clear();
         }
 
         /** Return 1 6D particle coordinate
@@ -304,13 +346,15 @@ namespace distribution
          */
         AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         void operator() (
-                amrex::ParticleReal & AMREX_RESTRICT x,
-                amrex::ParticleReal & AMREX_RESTRICT y,
-                amrex::ParticleReal & AMREX_RESTRICT t,
-                amrex::ParticleReal & AMREX_RESTRICT px,
-                amrex::ParticleReal & AMREX_RESTRICT py,
-                amrex::ParticleReal & AMREX_RESTRICT pt,
-                amrex::RandomEngine const& engine) {
+            amrex::ParticleReal & AMREX_RESTRICT x,
+            amrex::ParticleReal & AMREX_RESTRICT y,
+            amrex::ParticleReal & AMREX_RESTRICT t,
+            amrex::ParticleReal & AMREX_RESTRICT px,
+            amrex::ParticleReal & AMREX_RESTRICT py,
+            amrex::ParticleReal & AMREX_RESTRICT pt,
+            amrex::RandomEngine const & engine
+        ) const
+        {
 
             using namespace amrex::literals;  // for _rt and _prt
             using namespace ablastr::constant::math;  // for pi
@@ -342,7 +386,7 @@ namespace distribution
             g6 = ln1*sin(2_prt*pi*u2);
 
             // Scale the last three variables to produce the momenta:
-            amrex::ParticleReal kT = (uhalo > m_data.w) ? m_data.T1 : m_data.T2;  // select core or halo value
+            amrex::ParticleReal kT = (uhalo > m_w) ? m_T1 : m_T2;  // select core or halo value
             px = sqrt(kT)*g4;
             py = sqrt(kT)*g5;
             pz = sqrt(kT)*g6;
@@ -354,47 +398,42 @@ namespace distribution
             g3 /= norm;
 
             // Collect the radial CDF returned by generate_radial_dist:
-            amrex::ParticleReal rmin = m_data.rmin;
-            amrex::ParticleReal rmax = m_data.rmax;
-            int const nbins = m_data.nbins;
-            constexpr int length = 2001;  // Ideally, int const length = nbins + 1;
-            amrex::ParticleReal cdf[length];
-            for (int n = 0; n < length; ++n) {
-                cdf[n] = (uhalo > m_data.w) ? m_data.cdf1[n] : m_data.cdf2[n]; //select core or halo CDF
-            }
-            for (int n = 0; n < length; ++n) {
-                cdf[n] = cdf[n]/cdf[nbins];  //rescale cdf to ensure the exact range [0,1]
-            }
+
+            // select core or halo CDF
+            amrex::ParticleReal const * cdf = (uhalo > m_w) ? m_cdf1 : m_cdf2;
 
             // Generate a radial coordinate from the CDF
             amrex::ParticleReal u = amrex::Random(engine);
-            amrex::ParticleReal *ptr = amrex::lower_bound(cdf, cdf + nbins + 1, u);
-            int off = amrex::max(0, (int)(ptr - cdf - 1));
+            amrex::ParticleReal const * ptr = amrex::lower_bound(cdf, cdf + m_nbins + 1, u);
+            int const off = amrex::max(0, int(ptr - cdf - 1));
             amrex::ParticleReal tv = (u - cdf[off]) / (cdf[off + 1] - cdf[off]);
-            amrex::ParticleReal xv = (off + tv) / (float)(nbins);
-            amrex::ParticleReal r = rmin + (rmax - rmin) * xv;
+            amrex::ParticleReal xv = (off + tv) / amrex::ParticleReal(m_nbins);
+            amrex::ParticleReal r = m_rmin + (m_rmax - m_rmin) * xv;
 
             // Scale to produce samples with the correct radial density:
             x = g1*r;
             y = g2*r;
             z = g3*r;
 
-            // Transform logitudinal variables into the laboratory frame:
-            t = -z/(m_data.bg);
-            pt = -pz*(m_data.bg);
-
-            (void) m_k;
-            (void) m_kT;
-            (void) m_T2;
-            (void) m_halo;
+            // Transform longitudinal variables into the laboratory frame:
+            t = -z / m_bg;
+            pt = -pz * m_bg;
         }
 
     private:
         amrex::ParticleReal m_k; //! linear focusing strength (1/meters)
-        amrex::ParticleReal m_kT, m_T2; //! temperature of each particle population
+        amrex::ParticleReal m_T1, m_T2; //! temperature of each particle population
         amrex::ParticleReal m_normalize, m_normalize_halo; //! normalization constant of first/second population
         amrex::ParticleReal m_halo; //! relative weight of halo population
-        ThermalData::Rprofile m_data; //! struct containing radial profile data
+        amrex::ParticleReal m_rmin;  ///< minimum r value for tabulated cdf
+        amrex::ParticleReal m_rmax;  ///< maximum r value for tabulated cdf
+        int m_nbins;  ///< number of radial bins for tabulated cdf
+        amrex::ParticleReal m_bg; ///< reference value of relativistic beta*gamma
+        amrex::ParticleReal m_w;  ///< weight of the secondary (halo) population
+
+        // radial profile data
+        amrex::ParticleReal const * m_cdf1 = nullptr; //! non-owning pointer to device core CDF
+        amrex::ParticleReal const * m_cdf2 = nullptr; //! non-owning pointer to device halo CDF
     };
 
 } // namespace distribution
diff --git a/src/particles/distribution/Triangle.H b/src/particles/distribution/Triangle.H
index 45c3fbaaa..61e916d51 100644
--- a/src/particles/distribution/Triangle.H
+++ b/src/particles/distribution/Triangle.H
@@ -58,6 +58,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
diff --git a/src/particles/distribution/Waterbag.H b/src/particles/distribution/Waterbag.H
index d41ed6eb2..b86837e78 100644
--- a/src/particles/distribution/Waterbag.H
+++ b/src/particles/distribution/Waterbag.H
@@ -60,6 +60,15 @@ namespace impactx::distribution
         {
         }
 
+        /** Close and deallocate all data and handles.
+         *
+         * Nothing to do here.
+         */
+        void
+        finalize ()
+        {
+        }
+
         /** Return 1 6D particle coordinate
          *
          * @param x particle position in x
@@ -91,22 +100,22 @@ namespace impactx::distribution
             // Generate six standard normal random variables using Box-Muller:
             u1 = amrex::Random(engine);
             u2 = amrex::Random(engine);
-            ln1 = sqrt(-2_prt*log(u1));
-            g1 = ln1*cos(2_prt*pi*u2);
-            g2 = ln1*sin(2_prt*pi*u2);
+            ln1 = std::sqrt(-2_prt*std::log(u1));
+            g1 = ln1*std::cos(2_prt*pi*u2);
+            g2 = ln1*std::sin(2_prt*pi*u2);
             u1 = amrex::Random(engine);
             u2 = amrex::Random(engine);
-            ln1 = sqrt(-2_prt*log(u1));
-            g3 = ln1*cos(2_prt*pi*u2);
-            g4 = ln1*sin(2_prt*pi*u2);
+            ln1 = std::sqrt(-2_prt*std::log(u1));
+            g3 = ln1*std::cos(2_prt*pi*u2);
+            g4 = ln1*std::sin(2_prt*pi*u2);
             u1 = amrex::Random(engine);
             u2 = amrex::Random(engine);
-            ln1 = sqrt(-2_prt*log(u1));
-            g5 = ln1*cos(2_prt*pi*u2);
-            g6 = ln1*sin(2_prt*pi*u2);
+            ln1 = std::sqrt(-2_prt*std::log(u1));
+            g5 = ln1*std::cos(2_prt*pi*u2);
+            g6 = ln1*std::sin(2_prt*pi*u2);
 
             // Normalize to produce uniform samples on the unit sphere:
-            norm = sqrt(g1*g1+g2*g2+g3*g3+g4*g4+g5*g5+g6*g6);
+            norm = std::sqrt(g1*g1+g2*g2+g3*g3+g4*g4+g5*g5+g6*g6);
             g1 /= norm;
             g2 /= norm;
             g3 /= norm;
@@ -116,7 +125,7 @@ namespace impactx::distribution
 
             // Scale to produce uniform samples in a ball (unit variance):
             u1 = amrex::Random(engine);
-            u2 = sqrt(8_prt)*pow(u1,1_prt/6);
+            u2 = std::sqrt(8_prt)*std::pow(u1,1_prt/6_prt);
             x = g1*u2;
             y = g2*u2;
             t = g3*u2;
@@ -125,17 +134,17 @@ namespace impactx::distribution
             pt = g6*u2;
 
             // Transform to produce the desired second moments/correlations:
-            root = sqrt(1.0_prt-m_muxpx*m_muxpx);
+            root = std::sqrt(1.0_prt-m_muxpx*m_muxpx);
             a1 = m_sigmaX*x/root;
             a2 = m_sigmaPx*(-m_muxpx*x/root+px);
             x = a1;
             px = a2;
-            root = sqrt(1.0_prt-m_muypy*m_muypy);
+            root = std::sqrt(1.0_prt-m_muypy*m_muypy);
             a1 = m_sigmaY*y/root;
             a2 = m_sigmaPy*(-m_muypy*y/root+py);
             y = a1;
             py = a2;
-            root = sqrt(1.0_prt-m_mutpt*m_mutpt);
+            root = std::sqrt(1.0_prt-m_mutpt*m_mutpt);
             a1 = m_sigmaT*t/root;
             a2 = m_sigmaPt*(-m_mutpt*t/root+pt);
             t = a1;
diff --git a/src/python/ImpactXParticleContainer.cpp b/src/python/ImpactXParticleContainer.cpp
index c1a645b8a..e9f25f9e3 100644
--- a/src/python/ImpactXParticleContainer.cpp
+++ b/src/python/ImpactXParticleContainer.cpp
@@ -59,7 +59,6 @@ void init_impactxparticlecontainer(py::module& m)
 
         .def("add_n_particles",
              &ImpactXParticleContainer::AddNParticles,
-             py::arg("lev"),
              py::arg("x"), py::arg("y"), py::arg("t"),
              py::arg("px"), py::arg("py"), py::arg("pt"),
              py::arg("qm"), py::arg("bchchg"),
@@ -67,7 +66,6 @@ void init_impactxparticlecontainer(py::module& m)
              "Note: This can only be used *after* the initialization (grids) have\n"
              "      been created, meaning after the call to ImpactX.init_grids\n"
              "      has been made in the ImpactX class.\n\n"
-             ":param lev: mesh-refinement level\n"
              ":param x: positions in x\n"
              ":param y: positions in y\n"
              ":param t: positions as time-of-flight in c*t\n"