diff --git a/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h index e9dfed7bca7a6..77cf567dca681 100644 --- a/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h +++ b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h @@ -4,17 +4,6 @@ #include #include -namespace pixelGPUConstants { -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumberOfHits = 24 * 1024; -#else - // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumberOfHits = 48 * 1024; -#endif -} // namespace pixelGPUConstants - namespace gpuClustering { #ifdef GPU_SMALL_EVENTS // kept for testing and debugging @@ -28,7 +17,6 @@ namespace gpuClustering { constexpr uint16_t maxNumModules = 2000; constexpr int32_t maxNumClustersPerModules = maxHitsInModule(); - constexpr uint32_t maxNumClusters = pixelGPUConstants::maxNumberOfHits; constexpr uint16_t invalidModuleId = std::numeric_limits::max() - 1; static_assert(invalidModuleId > maxNumModules); // invalidModuleId must be > maxNumModules diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h index bd39f3c4d3bfe..f74717c41e4d7 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -17,7 +17,7 @@ class TrackSoAHeterogeneousT { using Quality = pixelTrack::Quality; using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 967b5c6c8282f..7fa73432100f9 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -34,6 +34,7 @@ class TrackingRecHit2DHeterogeneous { auto hitsModuleStart() const { return m_hitsModuleStart; } auto hitsLayerStart() { return m_hitsLayerStart; } auto phiBinner() { return m_phiBinner; } + auto phiBinnerStorage() { return m_phiBinnerStorage; } auto iphi() { return m_iphi; } // only the local coord and detector index @@ -42,7 +43,7 @@ class TrackingRecHit2DHeterogeneous { private: static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 9; // number of elements in m_store32 + static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious unique_ptr m_store16; //! @@ -59,6 +60,7 @@ class TrackingRecHit2DHeterogeneous { // needed as kernel params... 
PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t* m_hitsLayerStart; int16_t* m_iphi; }; @@ -97,14 +99,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated m_store16 = Traits::template make_device_unique(nHits * n16, stream); - m_store32 = Traits::template make_device_unique(nHits * n32 + 11, stream); + m_store32 = + Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); m_PhiBinnerStore = Traits::template make_device_unique(stream); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; // copy all the pointers m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(9)); view->m_xl = get32(0); view->m_yl = get32(1); diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 7f3c59cd70faf..bebca103158ed 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -14,11 +14,9 @@ namespace pixelCPEforGPU { class TrackingRecHit2DSOAView { public: - static constexpr uint32_t maxHits() { return gpuClustering::maxNumClusters; } using hindex_type = uint32_t; // if above is <=2^32 - using PhiBinner = - cms::cuda::HistoContainer; + using PhiBinner = cms::cuda::HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; @@ -95,6 +93,7 @@ class TrackingRecHit2DSOAView { uint32_t* m_hitsLayerStart; PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t m_nHits; }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h b/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h new file mode 100644 index 0000000000000..f794fb53afe4d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h @@ -0,0 +1,49 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h +#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h + +#include + +namespace cms { + namespace cuda { + + template + class FlexiStorage { + public: + constexpr int capacity() const { return S; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I m_v[S]; + }; + + template + class FlexiStorage { + public: + constexpr void init(I* v, int s) { + m_v = v; + m_capacity = s; + } + + constexpr int capacity() const { return m_capacity; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I* m_v; + int m_capacity; + }; + + } // namespace cuda + +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index bd80281f75333..7bf5db603bccd 100644 
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -1,19 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h -#include -#ifndef __CUDA_ARCH__ -#include -#endif // __CUDA_ARCH__ -#include -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" namespace cms { namespace cuda { @@ -50,61 +38,27 @@ namespace cms { } } - template - inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); - assert(size >= int(sizeof(uint32_t) * Histo::totbins())); -#ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); -#else - ::memset(poff, 0, size); -#endif - } - - template - inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { -#ifdef __CUDACC__ - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); - auto nthreads = 1024; - auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; - multiBlockPrefixScan<<>>( - poff, poff, Histo::totbins(), ppsws); - cudaCheck(cudaGetLastError()); -#else - h->finalize(); -#endif - } - template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets, - uint32_t totSize, + int32_t totSize, int nthreads, + typename Histo::index_type *mem, cudaStream_t stream #ifndef __CUDACC__ = cudaStreamDefault #endif ) { - launchZero(h, stream); + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h, stream); + launchFinalize(view, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else @@ -114,11 +68,6 @@ namespace cms { #endif } - template - __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { - assoc->bulkFinalizeFill(*apc); - } - // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { @@ -142,20 +91,19 @@ namespace cms { } } - template - class HistoContainer { + class HistoContainer : public OneToManyAssoc { public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; + using Base = OneToManyAssoc; + using View = typename Base::View; + using Counter = typename Base::Counter; + using index_type = typename Base::index_type; using UT = typename std::make_unsigned::type; static constexpr uint32_t ilog2(uint32_t v) { @@ -176,7 +124,8 @@ namespace cms { static constexpr uint32_t 
nhists() { return NHISTS; } static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + + // static_assert(int32_t(totbins())==Base::ctNOnes()); static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } @@ -186,91 +135,18 @@ namespace cms { return (t >> shift) & mask; } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } - - __host__ __device__ __forceinline__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { -#ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); -#else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; -#endif - } - } - - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a++; -#endif - } - - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a--; -#endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! 
- off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - __host__ __device__ __forceinline__ void count(T t) { uint32_t b = bin(t); assert(b < nbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j) { uint32_t b = bin(t); assert(b < nbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; + this->content[w - 1] = j; } __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { @@ -278,7 +154,7 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { @@ -286,37 +162,12 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); + this->content[w - 1] = j; } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - int32_t psws; // prefix-scan working space - index_type bins[capacity()]; }; - template - using OneToManyAssoc = HistoContainer; - } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h new file mode 100644 index 0000000000000..01f48bca94f4b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -0,0 +1,282 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h +#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h" + +namespace cms { + namespace cuda { + + template + struct OneToManyAssocView { + using Counter = typename Assoc::Counter; + using index_type = typename Assoc::index_type; + + Assoc *assoc = nullptr; + Counter *offStorage = nullptr; + index_type *contentStorage = nullptr; + int32_t offSize = -1; + int32_t contentSize = -1; + }; + + // this MUST BE DONE in a single block (or in two kernels!) 
+ template + __global__ void zeroAndInit(OneToManyAssocView view) { + auto h = view.assoc; + assert(1 == gridDim.x); + assert(0 == blockIdx.x); + + int first = threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + __syncthreads(); + for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) { + h->off[i] = 0; + } + } + + template + inline __attribute__((always_inline)) void launchZero(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchZero(view, stream); + } + template + inline __attribute__((always_inline)) void launchZero(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + + if constexpr (Assoc::ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + } + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + } +#ifdef __CUDACC__ + auto nthreads = 1024; + auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels); + zeroAndInit<<>>(view); + cudaCheck(cudaGetLastError()); +#else + auto h = view.assoc; + assert(h); + h->initStorage(view); + h->zero(); + h->psws = 0; +#endif + } + + template + inline __attribute__((always_inline)) void launchFinalize(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchFinalize(view, stream); + } + + template + inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + auto h = view.assoc; + assert(h); +#ifdef __CUDACC__ + using Counter = typename Assoc::Counter; + Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + nOnes = view.offSize; + poff = view.offStorage; + } + assert(nOnes > 0); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); + auto nthreads = 1024; + auto nblocks = (nOnes + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(poff, poff, nOnes, ppsws); + cudaCheck(cudaGetLastError()); +#else + h->finalize(); +#endif + } + + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } + + template + class OneToManyAssoc { + public: + using View = OneToManyAssocView>; + using Counter = uint32_t; + + using CountersOnly = OneToManyAssoc; + + using index_type = I; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr int32_t ctNOnes() { return ONES; } + constexpr auto totOnes() const { return off.capacity(); } + constexpr auto nOnes() const { return totOnes() - 1; } + static constexpr int32_t ctCapacity() { return SIZE; } + constexpr auto capacity() const { return content.capacity(); } + + __host__ __device__ void initStorage(View view) { + assert(view.assoc == this); + if constexpr (ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + content.init(view.contentStorage, view.contentSize); + } + if constexpr (ctNOnes() < 
0) { + assert(view.offStorage); + assert(view.offSize > 0); + off.init(view.offStorage, view.offSize); + } + } + + __host__ __device__ void zero() { + for (int32_t i = 0; i < totOnes(); ++i) { + off[i] = 0; + } + } + + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (int32_t i = 0; i < totOnes(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off.data() + i, co.off[i]); +#else + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + __host__ __device__ __forceinline__ void count(int32_t b) { + assert(b < nOnes()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) { + assert(b < nOnes()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + content[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (int(c.m) >= nOnes()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + content[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + int m = apc.get().m; + auto n = apc.get().n; + if (m >= nOnes()) { // overflow! + off[nOnes()] = uint32_t(off[nOnes() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totOnes() - 1] == 0); + blockPrefixScan(off.data(), totOnes(), ws); + assert(off[totOnes() - 1] == off[totOnes() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totOnes() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return content.data(); } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; } + constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; } + + FlexiStorage off; + int32_t psws; // prefix-scan working space + FlexiStorage content; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index c663d4f824a77..53d41efcf4236 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -40,6 +40,10 @@ + + + + @@ -48,6 +52,14 @@ + + + + + + + + @@ -76,6 +88,19 @@ + + + + + + + + + + + + + @@ -104,4 +129,4 @@ - \ No newline at end of file + diff --git a/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp b/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp new file mode 100644 index 0000000000000..d16c4e44cc97d --- /dev/null +++ 
b/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp @@ -0,0 +1,34 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h" + +#include + +using namespace cms::cuda; + +int main() { + FlexiStorage a; + + assert(a.capacity() == 1024); + + FlexiStorage v; + + v.init(a.data(), 20); + + assert(v.capacity() == 20); + + assert(v.data() == a.data()); + + a[4] = 42; + + assert(42 == a[4]); + assert(42 == v[4]); + + auto const& ac = a; + auto const& vc = v; + + assert(42 == ac[4]); + assert(42 == vc[4]); + + assert(ac.data() == vc.data()); + + return 0; +}; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu new file mode 100644 index 0000000000000..b49498990fcbe --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::ctCapacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + uint32_t mem[N]; + auto h_d = make_device_unique(1, nullptr); + auto h_s = make_device_unique(N, nullptr); + // auto h_s = make_device_unique(N, nullptr); + + auto off_d = make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + assert(offsets[10] <= N); + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, h_s.get(), 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(h.capacity() == offsets[10]); + // get content + cudaCheck(cudaMemcpy(mem, h_s.get(), N * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + typename Hist::View view = {&h, nullptr, mem, -1, N}; + // plug correct content + h.initStorage(view); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 577e9bbb69b57..2109e5a0d5b38 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -25,51 +25,73 @@ void go() { constexpr int N = 12000; T v[N]; + using HistR = HistoContainer; using Hist = HistoContainer; using Hist4 = HistoContainer; + std::cout << "HistoContainerR " << HistR::nbits() << ' ' << HistR::nbins() << ' ' << HistR::totbins() << ' ' + << HistR::ctNOnes() << ' ' << HistR::ctCapacity() << ' ' << (rmax - rmin) / HistR::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' - << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist4::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; for (auto nh = 0; nh < 4; ++nh) std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + uint32_t mem[N]; + HistR hr; + typename HistR::View view{&hr, nullptr, mem, -1, N}; + hr.initStorage(view); + std::cout << "HistoContainerR " << hr.capacity() << std::endl; + assert(hr.capacity() == N); Hist h; Hist4 h4; + assert(h.capacity() == N); + assert(h4.capacity() == N); + for (int it = 0; it < 5; ++it) { for (long long j = 0; j < N; j++) v[j] = rgen(eng); if (it == 2) for (long long j = N / 2; j < N / 2 + N / 4; j++) v[j] = 4; + hr.zero(); h.zero(); h4.zero(); + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); for (long long j = 0; j < N; j++) { + hr.count(v[j]); h.count(v[j]); if (j < 2000) h4.count(v[j], 2); else h4.count(v[j], j % 4); } + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); + hr.finalize(); h.finalize(); h4.finalize(); assert(h.size() == N); assert(h4.size() == N); for (long long j = 0; j < N; j++) { + hr.fill(v[j], j); h.fill(v[j], j); if (j < 2000) h4.fill(v[j], j, 2); else h4.fill(v[j], j, j % 4); } + assert(hr.off[0] == 0); assert(h.off[0] == 0); assert(h4.off[0] == 0); + assert(hr.size() == N); assert(h.size() == N); assert(h4.size() == N); @@ -80,6 +102,11 @@ void go() { std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; }; + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + assert(h.size(i) == hr.size(i)); + assert(*h.begin(i) == *hr.begin(i)); + } + for (uint32_t i = 0; i < Hist::nbins(); ++i) { if (0 == h.size(i)) continue; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 
69974aeef0267..75f9cc0e626f5 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -28,9 +28,11 @@ void go() { using Hist = HistoContainer; std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' + << Hist::ctCapacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + assert(Hist::totbins() == Hist::ctNOnes()); + Hist h; auto h_d = make_device_unique(1, nullptr); @@ -55,6 +57,7 @@ void go() { offsets[8] = 256 * 11 + offsets[7]; offsets[9] = 44 + offsets[8]; offsets[10] = 3297 + offsets[9]; + assert(offsets[10] <= N); } cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); @@ -69,7 +72,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); - fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f08c768b43fa5..f2d74c8e7ce98 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -112,7 +112,7 @@ void go() { assert(v_d.get()); using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 36ab026ce1eb6..8ba9158cb69b7 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -13,16 +13,23 @@ #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #endif -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" using cms::cuda::AtomicPairCounter; constexpr uint32_t MaxElem = 64000; constexpr uint32_t MaxTk = 8000; constexpr uint32_t MaxAssocs = 4 * MaxTk; +#ifdef RUNTIME_SIZE +using Assoc = cms::cuda::OneToManyAssoc; +using SmallAssoc = cms::cuda::OneToManyAssoc; +using Multiplicity = cms::cuda::OneToManyAssoc; +#else using Assoc = cms::cuda::OneToManyAssoc; using SmallAssoc = cms::cuda::OneToManyAssoc; using Multiplicity = cms::cuda::OneToManyAssoc; +#endif + using TK = std::array; __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { @@ -32,7 +39,7 @@ __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr if (threadIdx.x == 0) local.zero(); __syncthreads(); - local.countDirect(2 + i % 4); + local.count(2 + i % 4); __syncthreads(); if (threadIdx.x == 0) assoc->add(local); @@ -42,12 +49,12 @@ __global__ 
void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr __global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < n; i += gridDim.x * blockDim.x) - assoc->countDirect(2 + i % 4); + assoc->count(2 + i % 4); } __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { auto first = blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + for (int i = first; i < m1->totOnes(); i += gridDim.x * blockDim.x) assert(m1->off[i] == m2->off[i]); } @@ -60,7 +67,7 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->countDirect(tk[k][j]); + assoc->count(tk[k][j]); } } @@ -73,11 +80,11 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->fillDirect(tk[k][j], k); + assoc->fill(tk[k][j], k); } } -__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } +__global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { @@ -90,9 +97,55 @@ __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Asso template __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { - if (apc->get().m >= Assoc::nbins()) - printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); - assert(assoc->size() < Assoc::capacity()); + if (int(apc->get().m) >= assoc->nOnes()) + printf("Overflow %d %d\n", apc->get().m, assoc->nOnes()); + assert(int(assoc->size()) < assoc->capacity()); +} + +template +__global__ void verifyFill(Assoc const* __restrict__ la, int n) { + printf("assoc size %d\n", la->size()); + int imax = 0; + long long ave = 0; + int z = 0; + for (int i = 0; i < n; ++i) { + auto x = la->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(n)); + printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); +} + +template +__global__ void verifyFinal(Assoc const* __restrict__ la, int N) { + printf("assoc size %d\n", la->size()); + + int imax = 0; + long long ave = 0; + for (int i = 0; i < N; ++i) { + auto x = la->size(i); + if (!(x == 4 || x == 3)) + printf("%d %d\n", i, x); + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(N)); + printf("found with ave occupancy %f %d\n", double(ave) / N, imax); +} + +template +auto make_unique(std::size_t size) { +#ifdef __CUDACC__ + return cms::cuda::make_device_unique(size, 0); +#else + return std::make_unique(size); +#endif } int main() { @@ -118,9 +171,9 @@ int main() { assert(gridDim.z == 1); #endif - std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; - std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' - << SmallAssoc::capacity() << std::endl; + std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::ctNOnes() << ' ' << Assoc::ctCapacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::ctNOnes() << ' ' + << SmallAssoc::ctCapacity() << std::endl; 
std::mt19937 eng; @@ -164,19 +217,36 @@ int main() { } std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + auto a_d = make_unique(1); + auto sa_d = make_unique(1); #ifdef __CUDACC__ auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); assert(v_d.get()); - auto a_d = cms::cuda::make_device_unique(1, nullptr); - auto sa_d = cms::cuda::make_device_unique(1, nullptr); cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else - auto a_d = std::make_unique(); - auto sa_d = std::make_unique(); auto v_d = tr.data(); #endif - launchZero(a_d.get(), 0); + Assoc::Counter* a_st = nullptr; + int a_n = MaxElem; + + Assoc::index_type* a_st2 = nullptr; + SmallAssoc::index_type* sa_st2 = nullptr; + int a_n2 = MaxAssocs; + +// storage +#ifdef RUNTIME_SIZE + auto a_st_d = make_unique(a_n); + auto a_st2_d = make_unique(a_n2); + auto sa_st2_d = make_unique(a_n2); + a_st = a_st_d.get(); + a_st2 = a_st2_d.get(); + sa_st2 = sa_st2_d.get(); +#endif + Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2}; + launchZero(aView, 0); + SmallAssoc::View saView = {sa_d.get(), nullptr, sa_st2, -1, a_n2}; + launchZero(saView, 0); #ifdef __CUDACC__ auto nThreads = 256; @@ -184,41 +254,21 @@ int main() { count<<>>(v_d.get(), a_d.get(), N); - launchFinalize(a_d.get(), 0); + launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); fill<<>>(v_d.get(), a_d.get(), N); + verifyFill<<<1, 1>>>(a_d.get(), n); + #else count(v_d, a_d.get(), N); - launchFinalize(a_d.get()); + launchFinalize(aView); verify(a_d.get()); fill(v_d, a_d.get(), N); -#endif + verifyFill(a_d.get(), n); - Assoc la; - -#ifdef __CUDACC__ - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); -#else - memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif - std::cout << la.size() << std::endl; - imax = 0; - ave = 0; - z = 0; - for (auto i = 0U; i < n; ++i) { - auto x = la.size(i); - if (x == 0) { - z++; - continue; - } - ave += x; - imax = std::max(imax, int(x)); - } - assert(0 == la.size(n)); - std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; - - // now the inverse map (actually this is the direct....) + // now the inverse map (actually this is the ....) 
AtomicPairCounter* dc_d; AtomicPairCounter dc(0); @@ -230,8 +280,8 @@ int main() { finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + verifyFinal<<<1, 1>>>(a_d.get(), N); cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); @@ -243,7 +293,8 @@ int main() { fillBulk(dc_d, v_d, a_d.get(), N); finalizeBulk(dc_d, a_d.get()); verifyBulk(a_d.get(), dc_d); - memcpy(&la, a_d.get(), sizeof(Assoc)); + + verifyFinal(a_d.get(), N); AtomicPairCounter sdc(0); fillBulk(&sdc, v_d, sa_d.get(), N); @@ -254,40 +305,35 @@ int main() { std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; - std::cout << la.size() << std::endl; - imax = 0; - ave = 0; - for (auto i = 0U; i < N; ++i) { - auto x = la.size(i); - if (!(x == 4 || x == 3)) - std::cout << i << ' ' << x << std::endl; - assert(x == 4 || x == 3); - ave += x; - imax = std::max(imax, int(x)); - } - assert(0 == la.size(N)); - std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; - // here verify use of block local counters -#ifdef __CUDACC__ - auto m1_d = cms::cuda::make_device_unique(1, nullptr); - auto m2_d = cms::cuda::make_device_unique(1, nullptr); -#else - auto m1_d = std::make_unique(); - auto m2_d = std::make_unique(); + auto m1_d = make_unique(1); + auto m2_d = make_unique(1); + + Multiplicity::index_type* m1_st = nullptr; + Multiplicity::index_type* m2_st = nullptr; + int m_n = 0; + +#ifdef RUNTIME_SIZE + m_n = MaxTk; + auto m1_st_d = make_unique(m_n); + auto m2_st_d = make_unique(m_n); + m1_st = m1_st_d.get(); + m2_st = m1_st_d.get(); #endif - launchZero(m1_d.get(), 0); - launchZero(m2_d.get(), 0); + Multiplicity::View view1 = {m1_d.get(), nullptr, m1_st, -1, m_n}; + Multiplicity::View view2 = {m2_d.get(), nullptr, m2_st, -1, m_n}; + launchZero(view1, 0); + launchZero(view2, 0); #ifdef __CUDACC__ nBlocks = (4 * N + nThreads - 1) / nThreads; countMulti<<>>(v_d.get(), m1_d.get(), N); countMultiLocal<<>>(v_d.get(), m2_d.get(), N); - verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get()); - launchFinalize(m1_d.get(), 0); - launchFinalize(m2_d.get(), 0); - verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + launchFinalize(view1, 0); + launchFinalize(view2, 0); + verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get()); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); @@ -296,8 +342,8 @@ int main() { countMultiLocal(v_d, m2_d.get(), N); verifyMulti(m1_d.get(), m2_d.get()); - launchFinalize(m1_d.get()); - launchFinalize(m2_d.get()); + launchFinalize(view1, 0); + launchFinalize(view2, 0); verifyMulti(m1_d.get(), m2_d.get()); #endif return 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e7267343d3d01..6bebcd9ea7aab 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -500,12 +500,6 @@ namespace pixelgpudetails { printf("moduleStart %d %d\n", i, moduleStart[i]); } #endif - - // avoid overflow - auto constexpr maxNumClusters = gpuClustering::maxNumClusters; - for (int i = first, 
iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) { - moduleStart[i] = std::clamp(moduleStart[i], 0U, maxNumClusters); - } } // Interface to outside diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index f75d5e3b3bef7..61c228bea3c56 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -47,30 +47,29 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - if (blocks) // protect from empty events + // protect from empty events + if (blocks) { gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } // assuming full warp of threads is better than a smaller number... if (nHits) { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - } - if (nHits) { - cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cms::cuda::fillManyFromVector( + hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); - } #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 09b90526bf7db..65846cbb652d6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -78,12 +78,6 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons iEvent.getByToken(tBeamSpot, hbs); auto const& bs = ctx.get(*hbs); - auto nHits = clusters.nClusters(); - if (nHits >= TrackingRecHit2DSOAView::maxHits()) { - edm::LogWarning("PixelRecHitGPUKernel") - << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits(); - } - ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream())); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 790b0da51ecfb..e2f9e1263e429 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -146,8 +146,6 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e if (clust.originalId() >= nhits) continue; auto ij = jnd(clust.originalId()); - if (ij >= TrackingRecHit2DSOAView::maxHits()) - continue; // overflow... 
LocalPoint lp(xl[ij], yl[ij]); LocalError le(xe[ij], 0, ye[ij]); SiPixelRecHitQuality::QualWordType rqw = 0; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index b3dafcc45d029..fbf3384ce328c 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -245,7 +245,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv output->hitsLayerStart(), numberOfHits, 256, - nullptr); + output->phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 2401fed6c5171..da7d3de7336a8 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -166,7 +166,6 @@ namespace gpuPixelRecHits { for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory - assert(h < TrackingRecHit2DSOAView::maxHits()); assert(h < hits.nHits()); assert(h < clusters.clusModuleStart(me + 1)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index ee5065e81fc45..87e93e6f0bd05 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -45,7 +45,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n", foundNtuplets->nbins()); + printf("%d total Ntuple\n", foundNtuplets->nOnes()); printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); } #endif @@ -58,7 +58,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nbins()); + assert(tkid < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 5342141d2c9e4..8ea536465814c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -74,8 +74,7 @@ namespace caConstants { using OuterHitOfCell = cms::cuda::VecArray; using TuplesContainer = cms::cuda::OneToManyAssoc; - using HitToTuple = - cms::cuda::OneToManyAssoc; // 3.5 should be enough + using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough using TupleMultiplicity = cms::cuda::OneToManyAssoc; } // namespace caConstants diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 379aa5802f7dd..5a2157bf87b7c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -84,7 +84,6 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchZero(tuples_d, cudaStream); auto 
nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); // std::cout << "N hits " << nhits << std::endl; // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; @@ -172,7 +171,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 160be9142ea4c..ca334223f8fc7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -3,7 +3,7 @@ template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + auto numberOfBlocks = (HitContainer::ctCapacity() + blockSize - 1) / blockSize; kernel_fillHitDetIndices<<>>( &tracks_d->hitIndices, hv, &tracks_d->detIndices); @@ -24,10 +24,12 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif // // applying conbinatoric cleaning such as fishbone at this stage is too expensive @@ -95,7 +97,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * #endif blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) @@ -136,7 +138,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); + int32_t nhits = hh.nHits(); #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -148,7 +150,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1, nhits), stream); assert(device_isOuterHitOfCell_.get()); cellStorage_ = cms::cuda::make_device_unique( @@ -162,7 +164,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr { int threadsPerBlock = 128; // at least one block! 
- int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_.get(), @@ -225,6 +227,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); + int32_t nhits = hh.nHits(); + auto blockSize = 64; // classify tracks based on kinematics @@ -245,30 +249,38 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) { // fill hit->track "map" + assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && + (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif } if (params_.doSharedHitCut_) { // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_sharedHitCleaner<<>>( hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } if (params_.doStats_) { - auto nhits = hh.nHits(); - numberOfBlocks = (std::max(nhits, params_.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tuples_d, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), @@ -286,7 +298,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { // counters (add flag???) 
@@ -286,7 +298,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA

   if (params_.doStats_) {
     // counters (add flag???)
-    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize;
     kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
     cudaCheck(cudaGetLastError());
     numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index f79119552e0bc..f5c2755ea8ac6 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -1,6 +1,8 @@
 #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
 #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h

+// #define GPU_DEBUG
+
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"

 #include "GPUCACell.h"
@@ -179,7 +181,7 @@ class CAHitNtupletGeneratorKernels {
   void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);

   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
-  void allocateOnGPU(cudaStream_t stream);
+  void allocateOnGPU(int32_t nHits, cudaStream_t stream);
   void cleanup(cudaStream_t cudaStream);

   static void printCounters(Counters const* counters);
@@ -200,6 +202,9 @@ class CAHitNtupletGeneratorKernels {
   uint32_t* device_nCells_ = nullptr;

   unique_ptr device_hitToTuple_;
+  unique_ptr device_hitToTupleStorage_;
+  HitToTuple::View hitToTupleView_;
+
   cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr;

   cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr;
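The two new members pair the associator with externally allocated offset storage, and `HitToTuple::View` bundles the pieces that `allocateOnGPU` wires together below. A schematic of such a view (field names taken from this diff; `Assoc` and the offset type are stand-ins, not the real header):

```cpp
#include <cstdint>

// Sketch of a runtime-capacity association view: the associator itself plus
// caller-owned offset storage whose size is chosen per event at allocation
// time. Field names match the diff; the surrounding types are stand-ins.
template <typename Assoc, typename Offset = uint32_t>
struct AssocViewSketch {
  Assoc* assoc = nullptr;        // the one-to-many associator object
  Offset* offStorage = nullptr;  // offset array, one counter per key plus one
  int32_t offSize = -1;          // number of offsets provided at runtime
};
```

This is what allows the hit->tuple map to be dimensioned by the event's actual hit count instead of a hard-coded maximum.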
N hits " << nHits << std::endl; +#endif + + nHits++; // storage requires one more counter; + assert(nHits > 0); device_hitToTuple_ = Traits::template make_unique(stream); + device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + hitToTupleView_.assoc = device_hitToTuple_.get(); + hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); + hitToTupleView_.offSize = nHits; device_tupleMultiplicity_ = Traits::template make_unique(stream); @@ -25,11 +35,16 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + // FIXME: consider collapsing these 3 in one adhoc kernel if constexpr (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; } cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index e55d7f124a9c5..550541fdca6fb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -3,6 +3,7 @@ // // #define NTUPLE_DEBUG +// #define GPU_DEBUG #include #include @@ -38,7 +39,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, - uint32_t nHits, + int32_t nHits, uint32_t maxNumberOfDoublets, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -55,23 +56,24 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, #ifdef NTUPLE_DEBUG if (0 == first) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n", *nCells, apc->get().m, apc->get().n, - nHits); + nHits, + hitToTuple->totOnes()); if (apc->get().m < caConstants::maxNumberOfQuadruplets()) { assert(foundNtuplets->size(apc->get().m) == 0); assert(foundNtuplets->size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) { + for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { if (foundNtuplets->size(idx) > 5) printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); assert(foundNtuplets->size(idx) < 6); for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(*ih < nHits); + assert(int(*ih) < nHits); } #endif @@ -84,6 +86,8 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("cellNeighbors overflow\n"); if (cellTracks && cellTracks->full()) printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -302,7 +306,7 
@@ -302,7 +306,7 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                          Quality const *__restrict__ quality,
                                          caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
@@ -312,7 +316,7 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
     if (nhits > 5)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits < 8);
-    tupleMultiplicity->countDirect(nhits);
+    tupleMultiplicity->count(nhits);
   }
 }

@@ -320,7 +324,7 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
@@ -330,7 +334,7 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
     if (nhits > 5)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits < 8);
-    tupleMultiplicity->fillDirect(nhits, it);
+    tupleMultiplicity->fill(nhits, it);
   }
 }

@@ -339,7 +343,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                       CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                       Quality *__restrict__ quality) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int it = first, nt = tuples->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tuples->size(it);
     if (nhits == 0)
       break;  // guard
@@ -377,7 +381,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
         (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3])));
     // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood)
     if (3.f * tracks->chi2(it) >= chi2Cut) {
-#ifdef NTUPLE_DEBUG
+#ifdef NTUPLE_FIT_DEBUG
       printf("Bad fit %d size %d pt %f eta %f chi2 %f\n",
              it,
              tuples->size(it),
@@ -406,7 +410,7 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                         Quality const *__restrict__ quality,
                                         CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  //guard
     if (quality[idx] != pixelTrack::Quality::loose)
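The pt-dependent chi2 cut in `kernel_classifyTracks` above is a cubic polynomial evaluated in Horner form (nested multiplications instead of explicit powers). Isolated for clarity, with placeholder coefficients rather than the tuned cuts:

```cpp
#include <array>

// chi2Cut(pt) = c0 + c1*pt + c2*pt^2 + c3*pt^3, in the nested Horner form
// used by kernel_classifyTracks; three multiplies, three adds, no pow().
float chi2CutFor(float pt, std::array<float, 4> const& c) {
  return c[0] + pt * (c[1] + pt * (c[2] + pt * c[3]));
}
```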
@@ -419,13 +423,13 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  // guard
     if (quality[idx] != pixelTrack::Quality::loose)
       continue;
     for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
-      hitToTuple->countDirect(*h);
+      hitToTuple->count(*h);
   }
 }

@@ -433,13 +437,13 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  // guard
     if (quality[idx] != pixelTrack::Quality::loose)
       continue;
     for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
-      hitToTuple->fillDirect(*h, idx);
+      hitToTuple->fill(*h, idx);
   }
 }

@@ -448,15 +452,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         HitContainer *__restrict__ hitDetIndices) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   // copy offsets
-  for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     hitDetIndices->off[idx] = tuples->off[idx];
   }
   // fill hit indices
   auto const &hh = *hhp;
   auto nhits = hh.nHits();
   for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    assert(tuples->bins[idx] < nhits);
-    hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]);
+    assert(tuples->content[idx] < nhits);
+    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
   }
 }

@@ -464,7 +468,7 @@ __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
   auto &c = *counters;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (hitToTuple->size(idx) == 0)
       continue;  // SHALL NOT BE break
     atomicAdd(&c.nUsedHits, 1);
@@ -491,7 +495,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
   int l1end = hh.hitsLayerStart()[1];

   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (hitToTuple.size(idx) < 2)
       continue;
@@ -540,12 +544,12 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
-                                           uint32_t maxPrint,
+                                           int32_t maxPrint,
                                            int iev) {
   auto const &foundNtuplets = *ptuples;
   auto const &tracks = *ptracks;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i < np; i += blockDim.x * gridDim.x) {
+  for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
     auto nh = foundNtuplets.size(i);
     if (nh < 3)
       continue;
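`kernel_fillHitDetIndices` above leans on the container's CSR layout: the offsets (`off`) are copied verbatim and only the flattened payload (`content`, renamed from `bins` in this PR) is remapped from hit indices to detector indices. A single-threaded sketch of that transformation (simplified types and a hypothetical helper, not the real `HitContainer`):

```cpp
#include <cstdint>
#include <vector>

// CSR-style container: off[i]..off[i+1] delimit tuple i's entries in content.
struct HitContainerSketch {
  std::vector<uint32_t> off;
  std::vector<uint32_t> content;
};

// Same offsets, remapped payload: hit index -> detector index of that hit.
HitContainerSketch fillHitDetIndices(HitContainerSketch const& tuples,
                                     std::vector<uint16_t> const& detIndexOfHit) {
  HitContainerSketch out;
  out.off = tuples.off;  // tuple boundaries are unchanged
  out.content.reserve(tuples.content.size());
  for (auto hit : tuples.content)
    out.content.push_back(detIndexOfHit[hit]);  // remap each entry
  return out;
}
```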
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 5a86ece3590f6..3f5ba1d04e7db 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -2,6 +2,8 @@
 // Original Author: Felice Pantaleo, CERN
 //

+// #define GPU_DEBUG
+
 #include
 #include
 #include
@@ -179,11 +181,11 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));

   auto* soa = tracks.get();
+  assert(soa);

   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
   kernels.setCounters(m_counters);
-
-  kernels.allocateOnGPU(stream);
+  kernels.allocateOnGPU(hits_d.nHits(), stream);

   kernels.buildDoublets(hits_d, stream);
   kernels.launchKernels(hits_d, soa, stream);
@@ -198,6 +200,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   }
   kernels.classifyTuples(hits_d, soa, stream);

+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+  std::cout << "finished building pixel tracks on GPU" << std::endl;
+#endif
+
   return tracks;
 }

@@ -209,7 +217,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
-  kernels.allocateOnGPU(nullptr);
+  kernels.allocateOnGPU(hits_d.nHits(), nullptr);

   kernels.buildDoublets(hits_d, nullptr);
   kernels.launchKernels(hits_d, soa, nullptr);
@@ -230,5 +238,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
   kernels.classifyTuples(hits_d, soa, nullptr);

+#ifdef GPU_DEBUG
+  std::cout << "finished building pixel tracks on CPU" << std::endl;
+#endif
+
   return tracks;
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
index 5b661bc3be028..926002d674b83 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
@@ -51,7 +51,7 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets,

   // get it from the ntuple container (one to one to helix)
   auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
-  assert(tkid < foundNtuplets->nbins());
+  assert(tkid < foundNtuplets->nOnes());

   assert(foundNtuplets->size(tkid) == nHits);
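A closing note on the recurring `GPU_DEBUG` blocks added throughout this PR: kernel launches are asynchronous, so a failure normally surfaces at some unrelated later API call. Synchronizing and re-checking after each stage pins the error to the stage that produced it. A minimal sketch of the idiom, with a simplified stand-in for CMSSW's `cudaCheck` (compile as .cu so the CUDA runtime API is available):

```cuda
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for cms::cuda's cudaCheck: abort on any CUDA error.
#define cudaCheckSketch(call)                                      \
  do {                                                             \
    cudaError_t err = (call);                                      \
    if (err != cudaSuccess) {                                      \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                   cudaGetErrorString(err), __FILE__, __LINE__);   \
      std::abort();                                                \
    }                                                              \
  } while (0)

void debugSyncPoint() {
#ifdef GPU_DEBUG
  // Flush all queued work, then surface any error a kernel left behind.
  cudaCheckSketch(cudaDeviceSynchronize());
  cudaCheckSketch(cudaGetLastError());
#endif
}
```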