diff --git a/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h index e9dfed7bca7a6..77cf567dca681 100644 --- a/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h +++ b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h @@ -4,17 +4,6 @@ #include #include -namespace pixelGPUConstants { -#ifdef GPU_SMALL_EVENTS - // kept for testing and debugging - constexpr uint32_t maxNumberOfHits = 24 * 1024; -#else - // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away - // tested on MC events with 55-75 pileup events - constexpr uint32_t maxNumberOfHits = 48 * 1024; -#endif -} // namespace pixelGPUConstants - namespace gpuClustering { #ifdef GPU_SMALL_EVENTS // kept for testing and debugging @@ -28,7 +17,6 @@ namespace gpuClustering { constexpr uint16_t maxNumModules = 2000; constexpr int32_t maxNumClustersPerModules = maxHitsInModule(); - constexpr uint32_t maxNumClusters = pixelGPUConstants::maxNumberOfHits; constexpr uint16_t invalidModuleId = std::numeric_limits::max() - 1; static_assert(invalidModuleId > maxNumModules); // invalidModuleId must be > maxNumModules diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h index bd39f3c4d3bfe..f74717c41e4d7 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -17,7 +17,7 @@ class TrackSoAHeterogeneousT { using Quality = pixelTrack::Quality; using hindex_type = uint32_t; - using HitContainer = cms::cuda::OneToManyAssoc; + using HitContainer = cms::cuda::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 967b5c6c8282f..7fa73432100f9 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -34,6 +34,7 @@ class TrackingRecHit2DHeterogeneous { auto hitsModuleStart() const { return m_hitsModuleStart; } auto hitsLayerStart() { return m_hitsLayerStart; } auto phiBinner() { return m_phiBinner; } + auto phiBinnerStorage() { return m_phiBinnerStorage; } auto iphi() { return m_iphi; } // only the local coord and detector index @@ -42,7 +43,7 @@ class TrackingRecHit2DHeterogeneous { private: static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 9; // number of elements in m_store32 + static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious unique_ptr m_store16; //! @@ -59,6 +60,7 @@ class TrackingRecHit2DHeterogeneous { // needed as kernel params... 
PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t* m_hitsLayerStart; int16_t* m_iphi; }; @@ -97,14 +99,20 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated m_store16 = Traits::template make_device_unique(nHits * n16, stream); - m_store32 = Traits::template make_device_unique(nHits * n32 + 11, stream); + m_store32 = + Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); m_PhiBinnerStore = Traits::template make_device_unique(stream); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); + static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; // copy all the pointers m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); + m_phiBinnerStorage = view->m_phiBinnerStorage = + reinterpret_cast(get32(9)); view->m_xl = get32(0); view->m_yl = get32(1); diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 7f3c59cd70faf..bebca103158ed 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -14,11 +14,9 @@ namespace pixelCPEforGPU { class TrackingRecHit2DSOAView { public: - static constexpr uint32_t maxHits() { return gpuClustering::maxNumClusters; } using hindex_type = uint32_t; // if above is <=2^32 - using PhiBinner = - cms::cuda::HistoContainer; + using PhiBinner = cms::cuda::HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; @@ -95,6 +93,7 @@ class TrackingRecHit2DSOAView { uint32_t* m_hitsLayerStart; PhiBinner* m_phiBinner; + PhiBinner::index_type* m_phiBinnerStorage; uint32_t m_nHits; }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h b/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h new file mode 100644 index 0000000000000..f794fb53afe4d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h @@ -0,0 +1,49 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h +#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h + +#include + +namespace cms { + namespace cuda { + + template + class FlexiStorage { + public: + constexpr int capacity() const { return S; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I m_v[S]; + }; + + template + class FlexiStorage { + public: + constexpr void init(I* v, int s) { + m_v = v; + m_capacity = s; + } + + constexpr int capacity() const { return m_capacity; } + + constexpr I& operator[](int i) { return m_v[i]; } + constexpr const I& operator[](int i) const { return m_v[i]; } + + constexpr I* data() { return m_v; } + constexpr I const* data() const { return m_v; } + + private: + I* m_v; + int m_capacity; + }; + + } // namespace cuda + +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index bd80281f75333..7bf5db603bccd 100644 
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -1,19 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h -#include -#ifndef __CUDA_ARCH__ -#include -#endif // __CUDA_ARCH__ -#include -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" namespace cms { namespace cuda { @@ -50,61 +38,27 @@ namespace cms { } } - template - inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t size = offsetof(Histo, bins) - offsetof(Histo, off); - assert(size >= int(sizeof(uint32_t) * Histo::totbins())); -#ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(poff, 0, size, stream)); -#else - ::memset(poff, 0, size); -#endif - } - - template - inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { -#ifdef __CUDACC__ - uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws)); - auto nthreads = 1024; - auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads; - multiBlockPrefixScan<<>>( - poff, poff, Histo::totbins(), ppsws); - cudaCheck(cudaGetLastError()); -#else - h->finalize(); -#endif - } - template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets, - uint32_t totSize, + int32_t totSize, int nthreads, + typename Histo::index_type *mem, cudaStream_t stream #ifndef __CUDACC__ = cudaStreamDefault #endif ) { - launchZero(h, stream); + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); countFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); - launchFinalize(h, stream); + launchFinalize(view, stream); fillFromVector<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else @@ -114,11 +68,6 @@ namespace cms { #endif } - template - __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { - assoc->bulkFinalizeFill(*apc); - } - // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { @@ -142,20 +91,19 @@ namespace cms { } } - template - class HistoContainer { + class HistoContainer : public OneToManyAssoc { public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; + using Base = OneToManyAssoc; + using View = typename Base::View; + using Counter = typename Base::Counter; + using index_type = typename Base::index_type; using UT = typename std::make_unsigned::type; static constexpr uint32_t ilog2(uint32_t v) { @@ -176,7 +124,8 @@ namespace cms { static constexpr uint32_t 
nhists() { return NHISTS; } static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + + // static_assert(int32_t(totbins())==Base::ctNOnes()); static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } @@ -186,91 +135,18 @@ namespace cms { return (t >> shift) & mask; } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } - - __host__ __device__ __forceinline__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { -#ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); -#else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; -#endif - } - } - - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a++; -#endif - } - - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { -#ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); -#else - auto &a = (std::atomic &)(x); - return a--; -#endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! 
- off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - __host__ __device__ __forceinline__ void count(T t) { uint32_t b = bin(t); assert(b < nbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j) { uint32_t b = bin(t); assert(b < nbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; + this->content[w - 1] = j; } __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { @@ -278,7 +154,7 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - atomicIncrement(off[b]); + Base::atomicIncrement(this->off[b]); } __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { @@ -286,37 +162,12 @@ namespace cms { assert(b < nbins()); b += histOff(nh); assert(b < totbins()); - auto w = atomicDecrement(off[b]); + auto w = Base::atomicDecrement(this->off[b]); assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); + this->content[w - 1] = j; } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - int32_t psws; // prefix-scan working space - index_type bins[capacity()]; }; - template - using OneToManyAssoc = HistoContainer; - } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h new file mode 100644 index 0000000000000..01f48bca94f4b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -0,0 +1,282 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h +#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h" + +namespace cms { + namespace cuda { + + template + struct OneToManyAssocView { + using Counter = typename Assoc::Counter; + using index_type = typename Assoc::index_type; + + Assoc *assoc = nullptr; + Counter *offStorage = nullptr; + index_type *contentStorage = nullptr; + int32_t offSize = -1; + int32_t contentSize = -1; + }; + + // this MUST BE DONE in a single block (or in two kernels!) 
+ template + __global__ void zeroAndInit(OneToManyAssocView view) { + auto h = view.assoc; + assert(1 == gridDim.x); + assert(0 == blockIdx.x); + + int first = threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + __syncthreads(); + for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) { + h->off[i] = 0; + } + } + + template + inline __attribute__((always_inline)) void launchZero(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchZero(view, stream); + } + template + inline __attribute__((always_inline)) void launchZero(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + + if constexpr (Assoc::ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + } + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + } +#ifdef __CUDACC__ + auto nthreads = 1024; + auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels); + zeroAndInit<<>>(view); + cudaCheck(cudaGetLastError()); +#else + auto h = view.assoc; + assert(h); + h->initStorage(view); + h->zero(); + h->psws = 0; +#endif + } + + template + inline __attribute__((always_inline)) void launchFinalize(Assoc *h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Assoc::View view = {h, nullptr, nullptr, -1, -1}; + launchFinalize(view, stream); + } + + template + inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView view, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + auto h = view.assoc; + assert(h); +#ifdef __CUDACC__ + using Counter = typename Assoc::Counter; + Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + assert(view.offStorage); + assert(view.offSize > 0); + nOnes = view.offSize; + poff = view.offStorage; + } + assert(nOnes > 0); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); + auto nthreads = 1024; + auto nblocks = (nOnes + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(poff, poff, nOnes, ppsws); + cudaCheck(cudaGetLastError()); +#else + h->finalize(); +#endif + } + + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } + + template + class OneToManyAssoc { + public: + using View = OneToManyAssocView>; + using Counter = uint32_t; + + using CountersOnly = OneToManyAssoc; + + using index_type = I; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr int32_t ctNOnes() { return ONES; } + constexpr auto totOnes() const { return off.capacity(); } + constexpr auto nOnes() const { return totOnes() - 1; } + static constexpr int32_t ctCapacity() { return SIZE; } + constexpr auto capacity() const { return content.capacity(); } + + __host__ __device__ void initStorage(View view) { + assert(view.assoc == this); + if constexpr (ctCapacity() < 0) { + assert(view.contentStorage); + assert(view.contentSize > 0); + content.init(view.contentStorage, view.contentSize); + } + if constexpr (ctNOnes() < 
0) { + assert(view.offStorage); + assert(view.offSize > 0); + off.init(view.offStorage, view.offSize); + } + } + + __host__ __device__ void zero() { + for (int32_t i = 0; i < totOnes(); ++i) { + off[i] = 0; + } + } + + __host__ __device__ __forceinline__ void add(CountersOnly const &co) { + for (int32_t i = 0; i < totOnes(); ++i) { +#ifdef __CUDA_ARCH__ + atomicAdd(off.data() + i, co.off[i]); +#else + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; +#endif + } + } + + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a++; +#endif + } + + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { +#ifdef __CUDA_ARCH__ + return atomicSub(&x, 1); +#else + auto &a = (std::atomic &)(x); + return a--; +#endif + } + + __host__ __device__ __forceinline__ void count(int32_t b) { + assert(b < nOnes()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) { + assert(b < nOnes()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + content[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (int(c.m) >= nOnes()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + content[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + int m = apc.get().m; + auto n = apc.get().n; + if (m >= nOnes()) { // overflow! + off[nOnes()] = uint32_t(off[nOnes() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totOnes() - 1] == 0); + blockPrefixScan(off.data(), totOnes(), ws); + assert(off[totOnes() - 1] == off[totOnes() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totOnes() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return content.data(); } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; } + constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; } + + FlexiStorage off; + int32_t psws; // prefix-scan working space + FlexiStorage content; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index c663d4f824a77..53d41efcf4236 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -40,6 +40,10 @@ + + + + @@ -48,6 +52,14 @@ + + + + + + + + @@ -76,6 +88,19 @@ + + + + + + + + + + + + + @@ -104,4 +129,4 @@ - \ No newline at end of file + diff --git a/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp b/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp new file mode 100644 index 0000000000000..d16c4e44cc97d --- /dev/null +++ 
b/HeterogeneousCore/CUDAUtilities/test/FlexiStorage_t.cpp @@ -0,0 +1,34 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h" + +#include + +using namespace cms::cuda; + +int main() { + FlexiStorage a; + + assert(a.capacity() == 1024); + + FlexiStorage v; + + v.init(a.data(), 20); + + assert(v.capacity() == 20); + + assert(v.data() == a.data()); + + a[4] = 42; + + assert(42 == a[4]); + assert(42 == v[4]); + + auto const& ac = a; + auto const& vc = v; + + assert(42 == ac[4]); + assert(42 == vc[4]); + + assert(ac.data() == vc.data()); + + return 0; +}; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu new file mode 100644 index 0000000000000..b49498990fcbe --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +using namespace cms::cuda; + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::ctCapacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + uint32_t mem[N]; + auto h_d = make_device_unique(1, nullptr); + auto h_s = make_device_unique(N, nullptr); + // auto h_s = make_device_unique(N, nullptr); + + auto off_d = make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + assert(offsets[10] <= N); + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, h_s.get(), 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(h.capacity() == offsets[10]); + // get content + cudaCheck(cudaMemcpy(mem, h_s.get(), N * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + typename Hist::View view = {&h, nullptr, mem, -1, N}; + // plug correct content + h.initStorage(view); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 577e9bbb69b57..2109e5a0d5b38 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -25,51 +25,73 @@ void go() { constexpr int N = 12000; T v[N]; + using HistR = HistoContainer; using Hist = HistoContainer; using Hist4 = HistoContainer; + std::cout << "HistoContainerR " << HistR::nbits() << ' ' << HistR::nbins() << ' ' << HistR::totbins() << ' ' + << HistR::ctNOnes() << ' ' << HistR::ctCapacity() << ' ' << (rmax - rmin) / HistR::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' - << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + << Hist4::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; for (auto nh = 0; nh < 4; ++nh) std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + uint32_t mem[N]; + HistR hr; + typename HistR::View view{&hr, nullptr, mem, -1, N}; + hr.initStorage(view); + std::cout << "HistoContainerR " << hr.capacity() << std::endl; + assert(hr.capacity() == N); Hist h; Hist4 h4; + assert(h.capacity() == N); + assert(h4.capacity() == N); + for (int it = 0; it < 5; ++it) { for (long long j = 0; j < N; j++) v[j] = rgen(eng); if (it == 2) for (long long j = N / 2; j < N / 2 + N / 4; j++) v[j] = 4; + hr.zero(); h.zero(); h4.zero(); + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); for (long long j = 0; j < N; j++) { + hr.count(v[j]); h.count(v[j]); if (j < 2000) h4.count(v[j], 2); else h4.count(v[j], j % 4); } + assert(hr.size() == 0); assert(h.size() == 0); assert(h4.size() == 0); + hr.finalize(); h.finalize(); h4.finalize(); assert(h.size() == N); assert(h4.size() == N); for (long long j = 0; j < N; j++) { + hr.fill(v[j], j); h.fill(v[j], j); if (j < 2000) h4.fill(v[j], j, 2); else h4.fill(v[j], j, j % 4); } + assert(hr.off[0] == 0); assert(h.off[0] == 0); assert(h4.off[0] == 0); + assert(hr.size() == N); assert(h.size() == N); assert(h4.size() == N); @@ -80,6 +102,11 @@ void go() { std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; }; + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + assert(h.size(i) == hr.size(i)); + assert(*h.begin(i) == *hr.begin(i)); + } + for (uint32_t i = 0; i < Hist::nbins(); ++i) { if (0 == h.size(i)) continue; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 
69974aeef0267..75f9cc0e626f5 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -28,9 +28,11 @@ void go() { using Hist = HistoContainer; std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' - << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' + << Hist::ctCapacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + assert(Hist::totbins() == Hist::ctNOnes()); + Hist h; auto h_d = make_device_unique(1, nullptr); @@ -55,6 +57,7 @@ void go() { offsets[8] = 256 * 11 + offsets[7]; offsets[9] = 44 + offsets[8]; offsets[10] = 3297 + offsets[9]; + assert(offsets[10] <= N); } cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); @@ -69,7 +72,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); - fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f08c768b43fa5..f2d74c8e7ce98 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -112,7 +112,7 @@ void go() { assert(v_d.get()); using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::ctCapacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 36ab026ce1eb6..8ba9158cb69b7 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -13,16 +13,23 @@ #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #endif -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" using cms::cuda::AtomicPairCounter; constexpr uint32_t MaxElem = 64000; constexpr uint32_t MaxTk = 8000; constexpr uint32_t MaxAssocs = 4 * MaxTk; +#ifdef RUNTIME_SIZE +using Assoc = cms::cuda::OneToManyAssoc; +using SmallAssoc = cms::cuda::OneToManyAssoc; +using Multiplicity = cms::cuda::OneToManyAssoc; +#else using Assoc = cms::cuda::OneToManyAssoc; using SmallAssoc = cms::cuda::OneToManyAssoc; using Multiplicity = cms::cuda::OneToManyAssoc; +#endif + using TK = std::array; __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { @@ -32,7 +39,7 @@ __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr if (threadIdx.x == 0) local.zero(); __syncthreads(); - local.countDirect(2 + i % 4); + local.count(2 + i % 4); __syncthreads(); if (threadIdx.x == 0) assoc->add(local); @@ -42,12 +49,12 @@ __global__ 
void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restr __global__ void countMulti(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < n; i += gridDim.x * blockDim.x) - assoc->countDirect(2 + i % 4); + assoc->count(2 + i % 4); } __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { auto first = blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + for (int i = first; i < m1->totOnes(); i += gridDim.x * blockDim.x) assert(m1->off[i] == m2->off[i]); } @@ -60,7 +67,7 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->countDirect(tk[k][j]); + assoc->count(tk[k][j]); } } @@ -73,11 +80,11 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 if (k >= n) return; if (tk[k][j] < MaxElem) - assoc->fillDirect(tk[k][j], k); + assoc->fill(tk[k][j], k); } } -__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } +__global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { @@ -90,9 +97,55 @@ __global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Asso template __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { - if (apc->get().m >= Assoc::nbins()) - printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); - assert(assoc->size() < Assoc::capacity()); + if (int(apc->get().m) >= assoc->nOnes()) + printf("Overflow %d %d\n", apc->get().m, assoc->nOnes()); + assert(int(assoc->size()) < assoc->capacity()); +} + +template +__global__ void verifyFill(Assoc const* __restrict__ la, int n) { + printf("assoc size %d\n", la->size()); + int imax = 0; + long long ave = 0; + int z = 0; + for (int i = 0; i < n; ++i) { + auto x = la->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(n)); + printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); +} + +template +__global__ void verifyFinal(Assoc const* __restrict__ la, int N) { + printf("assoc size %d\n", la->size()); + + int imax = 0; + long long ave = 0; + for (int i = 0; i < N; ++i) { + auto x = la->size(i); + if (!(x == 4 || x == 3)) + printf("%d %d\n", i, x); + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la->size(N)); + printf("found with ave occupancy %f %d\n", double(ave) / N, imax); +} + +template +auto make_unique(std::size_t size) { +#ifdef __CUDACC__ + return cms::cuda::make_device_unique(size, 0); +#else + return std::make_unique(size); +#endif } int main() { @@ -118,9 +171,9 @@ int main() { assert(gridDim.z == 1); #endif - std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; - std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' - << SmallAssoc::capacity() << std::endl; + std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::ctNOnes() << ' ' << Assoc::ctCapacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::ctNOnes() << ' ' + << SmallAssoc::ctCapacity() << std::endl; 
std::mt19937 eng; @@ -164,19 +217,36 @@ int main() { } std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + auto a_d = make_unique(1); + auto sa_d = make_unique(1); #ifdef __CUDACC__ auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); assert(v_d.get()); - auto a_d = cms::cuda::make_device_unique(1, nullptr); - auto sa_d = cms::cuda::make_device_unique(1, nullptr); cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else - auto a_d = std::make_unique(); - auto sa_d = std::make_unique(); auto v_d = tr.data(); #endif - launchZero(a_d.get(), 0); + Assoc::Counter* a_st = nullptr; + int a_n = MaxElem; + + Assoc::index_type* a_st2 = nullptr; + SmallAssoc::index_type* sa_st2 = nullptr; + int a_n2 = MaxAssocs; + +// storage +#ifdef RUNTIME_SIZE + auto a_st_d = make_unique(a_n); + auto a_st2_d = make_unique(a_n2); + auto sa_st2_d = make_unique(a_n2); + a_st = a_st_d.get(); + a_st2 = a_st2_d.get(); + sa_st2 = sa_st2_d.get(); +#endif + Assoc::View aView = {a_d.get(), a_st, a_st2, a_n, a_n2}; + launchZero(aView, 0); + SmallAssoc::View saView = {sa_d.get(), nullptr, sa_st2, -1, a_n2}; + launchZero(saView, 0); #ifdef __CUDACC__ auto nThreads = 256; @@ -184,41 +254,21 @@ int main() { count<<>>(v_d.get(), a_d.get(), N); - launchFinalize(a_d.get(), 0); + launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); fill<<>>(v_d.get(), a_d.get(), N); + verifyFill<<<1, 1>>>(a_d.get(), n); + #else count(v_d, a_d.get(), N); - launchFinalize(a_d.get()); + launchFinalize(aView); verify(a_d.get()); fill(v_d, a_d.get(), N); -#endif + verifyFill(a_d.get(), n); - Assoc la; - -#ifdef __CUDACC__ - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); -#else - memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier #endif - std::cout << la.size() << std::endl; - imax = 0; - ave = 0; - z = 0; - for (auto i = 0U; i < n; ++i) { - auto x = la.size(i); - if (x == 0) { - z++; - continue; - } - ave += x; - imax = std::max(imax, int(x)); - } - assert(0 == la.size(n)); - std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; - - // now the inverse map (actually this is the direct....) + // now the inverse map (actually this is the ....) 
AtomicPairCounter* dc_d; AtomicPairCounter dc(0); @@ -230,8 +280,8 @@ int main() { finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); - cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + verifyFinal<<<1, 1>>>(a_d.get(), N); cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); @@ -243,7 +293,8 @@ int main() { fillBulk(dc_d, v_d, a_d.get(), N); finalizeBulk(dc_d, a_d.get()); verifyBulk(a_d.get(), dc_d); - memcpy(&la, a_d.get(), sizeof(Assoc)); + + verifyFinal(a_d.get(), N); AtomicPairCounter sdc(0); fillBulk(&sdc, v_d, sa_d.get(), N); @@ -254,40 +305,35 @@ int main() { std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; - std::cout << la.size() << std::endl; - imax = 0; - ave = 0; - for (auto i = 0U; i < N; ++i) { - auto x = la.size(i); - if (!(x == 4 || x == 3)) - std::cout << i << ' ' << x << std::endl; - assert(x == 4 || x == 3); - ave += x; - imax = std::max(imax, int(x)); - } - assert(0 == la.size(N)); - std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; - // here verify use of block local counters -#ifdef __CUDACC__ - auto m1_d = cms::cuda::make_device_unique(1, nullptr); - auto m2_d = cms::cuda::make_device_unique(1, nullptr); -#else - auto m1_d = std::make_unique(); - auto m2_d = std::make_unique(); + auto m1_d = make_unique(1); + auto m2_d = make_unique(1); + + Multiplicity::index_type* m1_st = nullptr; + Multiplicity::index_type* m2_st = nullptr; + int m_n = 0; + +#ifdef RUNTIME_SIZE + m_n = MaxTk; + auto m1_st_d = make_unique(m_n); + auto m2_st_d = make_unique(m_n); + m1_st = m1_st_d.get(); + m2_st = m1_st_d.get(); #endif - launchZero(m1_d.get(), 0); - launchZero(m2_d.get(), 0); + Multiplicity::View view1 = {m1_d.get(), nullptr, m1_st, -1, m_n}; + Multiplicity::View view2 = {m2_d.get(), nullptr, m2_st, -1, m_n}; + launchZero(view1, 0); + launchZero(view2, 0); #ifdef __CUDACC__ nBlocks = (4 * N + nThreads - 1) / nThreads; countMulti<<>>(v_d.get(), m1_d.get(), N); countMultiLocal<<>>(v_d.get(), m2_d.get(), N); - verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get()); - launchFinalize(m1_d.get(), 0); - launchFinalize(m2_d.get(), 0); - verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + launchFinalize(view1, 0); + launchFinalize(view2, 0); + verifyMulti<<<1, Multiplicity::ctNOnes()>>>(m1_d.get(), m2_d.get()); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); @@ -296,8 +342,8 @@ int main() { countMultiLocal(v_d, m2_d.get(), N); verifyMulti(m1_d.get(), m2_d.get()); - launchFinalize(m1_d.get()); - launchFinalize(m2_d.get()); + launchFinalize(view1, 0); + launchFinalize(view2, 0); verifyMulti(m1_d.get(), m2_d.get()); #endif return 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index e7267343d3d01..6bebcd9ea7aab 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -500,12 +500,6 @@ namespace pixelgpudetails { printf("moduleStart %d %d\n", i, moduleStart[i]); } #endif - - // avoid overflow - auto constexpr maxNumClusters = gpuClustering::maxNumClusters; - for (int i = first, 
iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) { - moduleStart[i] = std::clamp(moduleStart[i], 0U, maxNumClusters); - } } // Interface to outside diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index f75d5e3b3bef7..61c228bea3c56 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -47,30 +47,29 @@ namespace pixelgpudetails { #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - if (blocks) // protect from empty events + // protect from empty events + if (blocks) { gpuPixelRecHits::getHits<<>>( cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } // assuming full warp of threads is better than a smaller number... if (nHits) { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - } - if (nHits) { - cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); + cms::cuda::fillManyFromVector( + hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); - } #ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); #endif + } return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 09b90526bf7db..65846cbb652d6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -78,12 +78,6 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons iEvent.getByToken(tBeamSpot, hbs); auto const& bs = ctx.get(*hbs); - auto nHits = clusters.nClusters(); - if (nHits >= TrackingRecHit2DSOAView::maxHits()) { - edm::LogWarning("PixelRecHitGPUKernel") - << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits(); - } - ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(ctx.stream()), ctx.stream())); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 790b0da51ecfb..e2f9e1263e429 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -146,8 +146,6 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e if (clust.originalId() >= nhits) continue; auto ij = jnd(clust.originalId()); - if (ij >= TrackingRecHit2DSOAView::maxHits()) - continue; // overflow... 
LocalPoint lp(xl[ij], yl[ij]); LocalError le(xe[ij], 0, ye[ij]); SiPixelRecHitQuality::QualWordType rqw = 0; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index b3dafcc45d029..fbf3384ce328c 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -245,7 +245,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv output->hitsLayerStart(), numberOfHits, 256, - nullptr); + output->phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets"; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 2401fed6c5171..da7d3de7336a8 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -166,7 +166,6 @@ namespace gpuPixelRecHits { for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory - assert(h < TrackingRecHit2DSOAView::maxHits()); assert(h < hits.nHits()); assert(h < clusters.clusModuleStart(me + 1)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index ee5065e81fc45..87e93e6f0bd05 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -45,7 +45,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n", foundNtuplets->nbins()); + printf("%d total Ntuple\n", foundNtuplets->nOnes()); printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); } #endif @@ -58,7 +58,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, // get it from the ntuple container (one to one to helix) auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - assert(tkid < foundNtuplets->nbins()); + assert(tkid < foundNtuplets->nOnes()); assert(foundNtuplets->size(tkid) == nHits); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index 5342141d2c9e4..8ea536465814c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -74,8 +74,7 @@ namespace caConstants { using OuterHitOfCell = cms::cuda::VecArray; using TuplesContainer = cms::cuda::OneToManyAssoc; - using HitToTuple = - cms::cuda::OneToManyAssoc; // 3.5 should be enough + using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough using TupleMultiplicity = cms::cuda::OneToManyAssoc; } // namespace caConstants diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 379aa5802f7dd..5a2157bf87b7c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -84,7 +84,6 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchZero(tuples_d, cudaStream); auto 
nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); // std::cout << "N hits " << nhits << std::endl; // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; @@ -172,7 +171,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 160be9142ea4c..ca334223f8fc7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -3,7 +3,7 @@ template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; - auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + auto numberOfBlocks = (HitContainer::ctCapacity() + blockSize - 1) / blockSize; kernel_fillHitDetIndices<<>>( &tracks_d->hitIndices, hv, &tracks_d->detIndices); @@ -24,10 +24,12 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); - assert(nhits <= pixelGPUConstants::maxNumberOfHits); - // std::cout << "N hits " << nhits << std::endl; - // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif // // applying conbinatoric cleaning such as fishbone at this stage is too expensive @@ -95,7 +97,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * #endif blockSize = 128; - numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) @@ -136,7 +138,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - auto nhits = hh.nHits(); + int32_t nhits = hh.nHits(); #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -148,7 +150,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1, nhits), stream); assert(device_isOuterHitOfCell_.get()); cellStorage_ = cms::cuda::make_device_unique( @@ -162,7 +164,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr { int threadsPerBlock = 128; // at least one block! 
- int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock; gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), nhits, device_theCellNeighbors_.get(), @@ -225,6 +227,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA auto const *tuples_d = &tracks_d->hitIndices; auto *quality_d = tracks_d->qualityData(); + int32_t nhits = hh.nHits(); + auto blockSize = 64; // classify tracks based on kinematics @@ -245,30 +249,38 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_fastDuplicateRemover<<>>( device_theCells_.get(), device_nCells_, tuples_d, tracks_d); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif if (params_.minHitsPerNtuplet_ < 4 || params_.doStats_) { // fill hit->track "map" + assert(hitToTupleView_.offSize > nhits); numberOfBlocks = nQuadrupletBlocks(blockSize); kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(device_hitToTuple_.get(), cudaStream); + assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && + (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); + cms::cuda::launchFinalize(hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif } if (params_.doSharedHitCut_) { // remove duplicates (tracks that share a hit) - numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_sharedHitCleaner<<>>( hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } if (params_.doStats_) { - auto nhits = hh.nHits(); - numberOfBlocks = (std::max(nhits, params_.maxNumberOfDoublets_) + blockSize - 1) / blockSize; + numberOfBlocks = (std::max(nhits, int(params_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows<<>>(tuples_d, device_tupleMultiplicity_.get(), device_hitToTuple_.get(), @@ -286,7 +298,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doStats_) { // counters (add flag???) 
@@ -286,7 +298,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA

   if (params_.doStats_) {
     // counters (add flag???)
-    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize;
     kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
     cudaCheck(cudaGetLastError());
     numberOfBlocks = (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
index f79119552e0bc..f5c2755ea8ac6 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -1,6 +1,8 @@
 #ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
 #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h

+// #define GPU_DEBUG
+
 #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"

 #include "GPUCACell.h"
@@ -179,7 +181,7 @@ class CAHitNtupletGeneratorKernels {
   void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream);

   void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream);
-  void allocateOnGPU(cudaStream_t stream);
+  void allocateOnGPU(int32_t nHits, cudaStream_t stream);
   void cleanup(cudaStream_t cudaStream);

   static void printCounters(Counters const* counters);
@@ -200,6 +202,9 @@ class CAHitNtupletGeneratorKernels {
   uint32_t* device_nCells_ = nullptr;

   unique_ptr device_hitToTuple_;
+  unique_ptr device_hitToTupleStorage_;
+  HitToTuple::View hitToTupleView_;
+
   cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr;

   cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr;
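The two new members pair the associator with externally allocated offset storage, and `HitToTuple::View` bundles the pieces that `allocateOnGPU` wires together below. A schematic of such a view (field names taken from this diff; `Assoc` and the offset type are stand-ins, not the real header):

```cpp
#include <cstdint>

// Sketch of a runtime-capacity association view: the associator itself plus
// caller-owned offset storage whose size is chosen per event at allocation
// time. Field names match the diff; the surrounding types are stand-ins.
template <typename Assoc, typename Offset = uint32_t>
struct AssocViewSketch {
  Assoc* assoc = nullptr;        // the one-to-many associator object
  Offset* offStorage = nullptr;  // offset array, one counter per key plus one
  int32_t offSize = -1;          // number of offsets provided at runtime
};
```

This is what allows the hit->tuple map to be dimensioned by the event's actual hit count instead of a hard-coded maximum.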
N hits " << nHits << std::endl; +#endif + + nHits++; // storage requires one more counter; + assert(nHits > 0); device_hitToTuple_ = Traits::template make_unique(stream); + device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + hitToTupleView_.assoc = device_hitToTuple_.get(); + hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); + hitToTupleView_.offSize = nHits; device_tupleMultiplicity_ = Traits::template make_unique(stream); @@ -25,11 +35,16 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); + // FIXME: consider collapsing these 3 in one adhoc kernel if constexpr (std::is_same::value) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; } cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index e55d7f124a9c5..550541fdca6fb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -3,6 +3,7 @@ // // #define NTUPLE_DEBUG +// #define GPU_DEBUG #include #include @@ -38,7 +39,7 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, gpuPixelDoublets::CellTracksVector const *cellTracks, GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, - uint32_t nHits, + int32_t nHits, uint32_t maxNumberOfDoublets, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -55,23 +56,24 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, #ifdef NTUPLE_DEBUG if (0 == first) { - printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n", *nCells, apc->get().m, apc->get().n, - nHits); + nHits, + hitToTuple->totOnes()); if (apc->get().m < caConstants::maxNumberOfQuadruplets()) { assert(foundNtuplets->size(apc->get().m) == 0); assert(foundNtuplets->size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nbins(); idx < nt; idx += gridDim.x * blockDim.x) { + for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { if (foundNtuplets->size(idx) > 5) printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); assert(foundNtuplets->size(idx) < 6); for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - assert(*ih < nHits); + assert(int(*ih) < nHits); } #endif @@ -84,6 +86,8 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, printf("cellNeighbors overflow\n"); if (cellTracks && cellTracks->full()) printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits); } for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -302,7 +306,7 
@@ -302,7 +306,7 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                          Quality const *__restrict__ quality,
                                          caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
@@ -312,7 +316,7 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
     if (nhits > 5)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits < 8);
-    tupleMultiplicity->countDirect(nhits);
+    tupleMultiplicity->count(nhits);
   }
 }

@@ -320,7 +324,7 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
   auto first = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int it = first, nt = foundNtuplets->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = foundNtuplets->size(it);
     if (nhits < 3)
       continue;
@@ -330,7 +334,7 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
     if (nhits > 5)
       printf("wrong mult %d %d\n", it, nhits);
     assert(nhits < 8);
-    tupleMultiplicity->fillDirect(nhits, it);
+    tupleMultiplicity->fill(nhits, it);
   }
 }

@@ -339,7 +343,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                       CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                       Quality *__restrict__ quality) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int it = first, nt = tuples->nbins(); it < nt; it += gridDim.x * blockDim.x) {
+  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
     auto nhits = tuples->size(it);
     if (nhits == 0)
       break;  // guard
@@ -377,7 +381,7 @@ __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
         (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3])));
     // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood)
     if (3.f * tracks->chi2(it) >= chi2Cut) {
-#ifdef NTUPLE_DEBUG
+#ifdef NTUPLE_FIT_DEBUG
       printf("Bad fit %d size %d pt %f eta %f chi2 %f\n",
              it,
              tuples->size(it),
@@ -406,7 +410,7 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                         Quality const *__restrict__ quality,
                                         CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  //guard
     if (quality[idx] != pixelTrack::Quality::loose)
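The pt-dependent chi2 cut in `kernel_classifyTracks` above is a cubic polynomial evaluated in Horner form (nested multiplications instead of explicit powers). Isolated for clarity, with placeholder coefficients rather than the tuned cuts:

```cpp
#include <array>

// chi2Cut(pt) = c0 + c1*pt + c2*pt^2 + c3*pt^3, in the nested Horner form
// used by kernel_classifyTracks; three multiplies, three adds, no pow().
float chi2CutFor(float pt, std::array<float, 4> const& c) {
  return c[0] + pt * (c[1] + pt * (c[2] + pt * c[3]));
}
```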
@@ -419,13 +423,13 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  // guard
     if (quality[idx] != pixelTrack::Quality::loose)
       continue;
     for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
-      hitToTuple->countDirect(*h);
+      hitToTuple->count(*h);
   }
 }

@@ -433,13 +437,13 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (tuples->size(idx) == 0)
       break;  // guard
     if (quality[idx] != pixelTrack::Quality::loose)
       continue;
     for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
-      hitToTuple->fillDirect(*h, idx);
+      hitToTuple->fill(*h, idx);
   }
 }

@@ -448,15 +452,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         HitContainer *__restrict__ hitDetIndices) {
   int first = blockDim.x * blockIdx.x + threadIdx.x;
   // copy offsets
-  for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     hitDetIndices->off[idx] = tuples->off[idx];
   }
   // fill hit indices
   auto const &hh = *hhp;
   auto nhits = hh.nHits();
   for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
-    assert(tuples->bins[idx] < nhits);
-    hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]);
+    assert(tuples->content[idx] < nhits);
+    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
   }
 }

@@ -464,7 +468,7 @@ __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
   auto &c = *counters;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (hitToTuple->size(idx) == 0)
       continue;  // SHALL NOT BE break
     atomicAdd(&c.nUsedHits, 1);
@@ -491,7 +495,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
   int l1end = hh.hitsLayerStart()[1];

   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
     if (hitToTuple.size(idx) < 2)
       continue;
@@ -540,12 +544,12 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
-                                           uint32_t maxPrint,
+                                           int32_t maxPrint,
                                            int iev) {
   auto const &foundNtuplets = *ptuples;
   auto const &tracks = *ptracks;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = first, np = std::min(maxPrint, foundNtuplets.nbins()); i < np; i += blockDim.x * gridDim.x) {
+  for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
     auto nh = foundNtuplets.size(i);
     if (nh < 3)
       continue;
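`kernel_fillHitDetIndices` above leans on the container's CSR layout: the offsets (`off`) are copied verbatim and only the flattened payload (`content`, renamed from `bins` in this PR) is remapped from hit indices to detector indices. A single-threaded sketch of that transformation (simplified types and a hypothetical helper, not the real `HitContainer`):

```cpp
#include <cstdint>
#include <vector>

// CSR-style container: off[i]..off[i+1] delimit tuple i's entries in content.
struct HitContainerSketch {
  std::vector<uint32_t> off;
  std::vector<uint32_t> content;
};

// Same offsets, remapped payload: hit index -> detector index of that hit.
HitContainerSketch fillHitDetIndices(HitContainerSketch const& tuples,
                                     std::vector<uint16_t> const& detIndexOfHit) {
  HitContainerSketch out;
  out.off = tuples.off;  // tuple boundaries are unchanged
  out.content.reserve(tuples.content.size());
  for (auto hit : tuples.content)
    out.content.push_back(detIndexOfHit[hit]);  // remap each entry
  return out;
}
```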
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
index 5a86ece3590f6..3f5ba1d04e7db 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -2,6 +2,8 @@
 // Original Author: Felice Pantaleo, CERN
 //

+// #define GPU_DEBUG
+
 #include
 #include
 #include
@@ -179,11 +181,11 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream));

   auto* soa = tracks.get();
+  assert(soa);

   CAHitNtupletGeneratorKernelsGPU kernels(m_params);
   kernels.setCounters(m_counters);
-
-  kernels.allocateOnGPU(stream);
+  kernels.allocateOnGPU(hits_d.nHits(), stream);

   kernels.buildDoublets(hits_d, stream);
   kernels.launchKernels(hits_d, soa, stream);
@@ -198,6 +200,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
   }
   kernels.classifyTuples(hits_d, soa, stream);

+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+  std::cout << "finished building pixel tracks on GPU" << std::endl;
+#endif
+
   return tracks;
 }

@@ -209,7 +217,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
   CAHitNtupletGeneratorKernelsCPU kernels(m_params);
   kernels.setCounters(m_counters);
-  kernels.allocateOnGPU(nullptr);
+  kernels.allocateOnGPU(hits_d.nHits(), nullptr);

   kernels.buildDoublets(hits_d, nullptr);
   kernels.launchKernels(hits_d, soa, nullptr);
@@ -230,5 +238,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC
   kernels.classifyTuples(hits_d, soa, nullptr);

+#ifdef GPU_DEBUG
+  std::cout << "finished building pixel tracks on CPU" << std::endl;
+#endif
+
   return tracks;
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
index 5b661bc3be028..926002d674b83 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h
@@ -51,7 +51,7 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets,

   // get it from the ntuple container (one to one to helix)
   auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
-  assert(tkid < foundNtuplets->nbins());
+  assert(tkid < foundNtuplets->nOnes());

   assert(foundNtuplets->size(tkid) == nHits);
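A closing note on the recurring `GPU_DEBUG` blocks added throughout this PR: kernel launches are asynchronous, so a failure normally surfaces at some unrelated later API call. Synchronizing and re-checking after each stage pins the error to the stage that produced it. A minimal sketch of the idiom, with a simplified stand-in for CMSSW's `cudaCheck` (compile as .cu so the CUDA runtime API is available):

```cuda
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for cms::cuda's cudaCheck: abort on any CUDA error.
#define cudaCheckSketch(call)                                      \
  do {                                                             \
    cudaError_t err = (call);                                      \
    if (err != cudaSuccess) {                                      \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                   cudaGetErrorString(err), __FILE__, __LINE__);   \
      std::abort();                                                \
    }                                                              \
  } while (0)

void debugSyncPoint() {
#ifdef GPU_DEBUG
  // Flush all queued work, then surface any error a kernel left behind.
  cudaCheckSketch(cudaDeviceSynchronize());
  cudaCheckSketch(cudaGetLastError());
#endif
}
```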