From 438200cb5a65eb8d37999f6c24761d61ec11c0b7 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Dec 2020 08:46:30 -0800 Subject: [PATCH] [cudacompat] Make clusterizer kernels independent of grid size (cms-patatrack/cmssw#588) --- .../gpuClusterChargeCut.h | 173 ++++---- .../plugin-SiPixelClusterizer/gpuClustering.h | 408 +++++++++--------- src/cudacompat/test/gpuClustering_t.h | 37 +- 3 files changed, 304 insertions(+), 314 deletions(-) diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h index d0dd93044..000a7d36e 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClusterChargeCut.h @@ -19,105 +19,106 @@ namespace gpuClustering { uint32_t const* __restrict__ moduleId, // module id of each module int32_t* __restrict__ clusterId, // modified: cluster id of each pixel uint32_t numElements) { - if (blockIdx.x >= moduleStart[0]) - return; - - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); - assert(thisModuleId == moduleId[blockIdx.x]); + __shared__ int32_t charge[MaxNumClustersPerModules]; + __shared__ uint8_t ok[MaxNumClustersPerModules]; + __shared__ uint16_t newclusId[MaxNumClustersPerModules]; - auto nclus = nClustersInModule[thisModuleId]; - if (nclus == 0) - return; + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId == moduleId[module]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus == 0) + continue; + + if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) + printf("Warning too many clusters in module %d in block %d: %d > %d\n", + thisModuleId, + blockIdx.x, + nclus, + MaxNumClustersPerModules); + + auto first = firstPixel + threadIdx.x; + + if (nclus > MaxNumClustersPerModules) { + // remove excess FIXME find a way to cut charge first.... + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + if (clusterId[i] >= MaxNumClustersPerModules) { + id[i] = InvId; + clusterId[i] = InvId; + } + } + nclus = MaxNumClustersPerModules; + } - if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules) - printf("Warning too many clusters in module %d in block %d: %d > %d\n", - thisModuleId, - blockIdx.x, - nclus, - MaxNumClustersPerModules); +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif - auto first = firstPixel + threadIdx.x; + assert(nclus <= MaxNumClustersPerModules); + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + charge[i] = 0; + } + __syncthreads(); - if (nclus > MaxNumClustersPerModules) { - // remove excess FIXME find a way to cut charge first.... for (auto i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != thisModuleId) break; // end of module - if (clusterId[i] >= MaxNumClustersPerModules) { - id[i] = InvId; - clusterId[i] = InvId; - } + atomicAdd(&charge[clusterId[i]], adc[i]); } - nclus = MaxNumClustersPerModules; - } + __syncthreads(); -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); -#endif + auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; + } - __shared__ int32_t charge[MaxNumClustersPerModules]; - __shared__ uint8_t ok[MaxNumClustersPerModules]; - __shared__ uint16_t newclusId[MaxNumClustersPerModules]; + __syncthreads(); + + // renumber + __shared__ uint16_t ws[32]; + cms::cuda::blockPrefixScan(newclusId, nclus, ws); + + assert(nclus >= newclusId[nclus - 1]); + + if (nclus == newclusId[nclus - 1]) + continue; + + nClustersInModule[thisModuleId] = newclusId[nclus - 1]; + __syncthreads(); + + // mark bad cluster again + for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { + if (0 == ok[i]) + newclusId[i] = InvId + 1; + } + __syncthreads(); + + // reassign id + for (auto i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) + continue; // not valid + if (id[i] != thisModuleId) + break; // end of module + clusterId[i] = newclusId[clusterId[i]] - 1; + if (clusterId[i] == InvId) + id[i] = InvId; + } - assert(nclus <= MaxNumClustersPerModules); - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - charge[i] = 0; - } - __syncthreads(); - - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - atomicAdd(&charge[clusterId[i]], adc[i]); - } - __syncthreads(); - - auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; - } - - __syncthreads(); - - // renumber - __shared__ uint16_t ws[32]; - cms::cuda::blockPrefixScan(newclusId, nclus, ws); - - assert(nclus >= newclusId[nclus - 1]); - - if (nclus == newclusId[nclus - 1]) - return; - - nClustersInModule[thisModuleId] = newclusId[nclus - 1]; - __syncthreads(); - - // mark bad cluster again - for (auto i = threadIdx.x; i < nclus; i += blockDim.x) { - if (0 == ok[i]) - newclusId[i] = InvId + 1; - } - __syncthreads(); - - // reassign id - for (auto i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) - continue; // not valid - if (id[i] != thisModuleId) - break; // end of module - clusterId[i] = newclusId[clusterId[i]] - 1; - if (clusterId[i] == InvId) - id[i] = InvId; - } - - //done + //done + } // loop on modules } } // namespace gpuClustering diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h index 84609bd10..e485a2331 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuClustering.h @@ -47,258 +47,260 @@ namespace gpuClustering { uint32_t* __restrict__ moduleId, // output: module id of each module int32_t* __restrict__ clusterId, // output: cluster id of each pixel int numElements) { - if (blockIdx.x >= moduleStart[0]) - return; + __shared__ int msize; - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); + auto firstModule = blockIdx.x; + auto endModule = moduleStart[0]; + for (auto module = firstModule; module < endModule; module += gridDim.x) { + auto firstPixel = moduleStart[1 + module]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); #endif - auto first = firstPixel + threadIdx.x; + auto first = firstPixel + threadIdx.x; - // find the index of the first pixel not belonging to this module (or invalid) - __shared__ int msize; - msize = numElements; - __syncthreads(); + // find the index of the first pixel not belonging to this module (or invalid) + msize = numElements; + __syncthreads(); - // skip threads not associated to an existing pixel - for (int i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (id[i] != thisModuleId) { // find the first pixel in a different module - atomicMin(&msize, i); - break; + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } } - } - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = cms::cuda::HistoContainer; - __shared__ Hist hist; - __shared__ typename Hist::Counter ws[32]; - for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { - hist.off[j] = 0; - } - __syncthreads(); + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); - assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); + assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) - if (0 == threadIdx.x) { - if (msize - firstPixel > maxPixInModule) { - printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); - msize = maxPixInModule + firstPixel; + // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + if (0 == threadIdx.x) { + if (msize - firstPixel > maxPixInModule) { + printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule); + msize = maxPixInModule + firstPixel; + } } - } - __syncthreads(); - assert(msize - firstPixel <= maxPixInModule); + __syncthreads(); + assert(msize - firstPixel <= maxPixInModule); #ifdef GPU_DEBUG - __shared__ uint32_t totGood; - totGood = 0; - __syncthreads(); + __shared__ uint32_t totGood; + totGood = 0; + __syncthreads(); #endif - // fill histo - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.count(y[i]); + // fill histo + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.count(y[i]); #ifdef GPU_DEBUG - atomicAdd(&totGood, 1); + atomicAdd(&totGood, 1); #endif - } - __syncthreads(); - if (threadIdx.x < 32) - ws[threadIdx.x] = 0; // used by prefix scan... - __syncthreads(); - hist.finalize(ws); - __syncthreads(); + } + __syncthreads(); + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + hist.finalize(ws); + __syncthreads(); #ifdef GPU_DEBUG - assert(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("histo size %d\n", hist.size()); + assert(hist.size() == totGood); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("histo size %d\n", hist.size()); #endif - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - hist.fill(y[i], i - firstPixel); - } + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + hist.fill(y[i], i - firstPixel); + } #ifdef __CUDA_ARCH__ - // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations - constexpr int maxiter = 16; + // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations + constexpr int maxiter = 16; #else - auto maxiter = hist.size(); + auto maxiter = hist.size(); #endif - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; - assert((hist.size() / blockDim.x) <= maxiter); - // nearest neighbour - uint16_t nn[maxiter][maxNeighbours]; - uint8_t nnn[maxiter]; // number of nn - for (uint32_t k = 0; k < maxiter; ++k) - nnn[k] = 0; + // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event + constexpr int maxNeighbours = 10; + assert((hist.size() / blockDim.x) <= maxiter); + // nearest neighbour + uint16_t nn[maxiter][maxNeighbours]; + uint8_t nnn[maxiter]; // number of nn + for (uint32_t k = 0; k < maxiter; ++k) + nnn[k] = 0; - __syncthreads(); // for hit filling! + __syncthreads(); // for hit filling! #ifdef GPU_DEBUG - // look for anomalous high occupancy - __shared__ uint32_t n40, n60; - n40 = n60 = 0; - __syncthreads(); - for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { - if (hist.size(j) > 60) - atomicAdd(&n60, 1); - if (hist.size(j) > 40) - atomicAdd(&n40, 1); - } - __syncthreads(); - if (0 == threadIdx.x) { - if (n60 > 0) - printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); - else if (n40 > 0) - printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); - } - __syncthreads(); + // look for anomalous high occupancy + __shared__ uint32_t n40, n60; + n40 = n60 = 0; + __syncthreads(); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) { + if (hist.size(j) > 60) + atomicAdd(&n60, 1); + if (hist.size(j) > 40) + atomicAdd(&n40, 1); + } + __syncthreads(); + if (0 == threadIdx.x) { + if (n60 > 0) + printf("columns with more than 60 px %d in %d\n", n60, thisModuleId); + else if (n40 > 0) + printf("columns with more than 40 px %d in %d\n", n40, thisModuleId); + } + __syncthreads(); #endif - // fill NN - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - assert(k < maxiter); - auto p = hist.begin() + j; - auto i = *p + firstPixel; - assert(id[i] != InvId); - assert(id[i] == thisModuleId); // same module - int be = Hist::bin(y[i] + 1); - auto e = hist.end(be); - ++p; - assert(0 == nnn[k]); - for (; p < e; ++p) { - auto m = (*p) + firstPixel; - assert(m != i); - assert(int(y[m]) - int(y[i]) >= 0); - assert(int(y[m]) - int(y[i]) <= 1); - if (std::abs(int(x[m]) - int(x[i])) > 1) - continue; - auto l = nnn[k]++; - assert(l < maxNeighbours); - nn[k][l] = *p; + // fill NN + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + assert(k < maxiter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + assert(id[i] != InvId); + assert(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + assert(0 == nnn[k]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + assert(m != i); + assert(int(y[m]) - int(y[i]) >= 0); + assert(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) > 1) + continue; + auto l = nnn[k]++; + assert(l < maxNeighbours); + nn[k][l] = *p; + } } - } - // for each pixel, look at all the pixels until the end of the module; - // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; - // after the loop, all the pixel in each cluster should have the id equeal to the lowest - // pixel in the cluster ( clus[i] == i ). - bool more = true; - int nloops = 0; - while (__syncthreads_or(more)) { - if (1 == nloops % 2) { - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - auto m = clusterId[i]; - while (m != clusterId[m]) - m = clusterId[m]; - clusterId[i] = m; + // for each pixel, look at all the pixels until the end of the module; + // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; + // after the loop, all the pixel in each cluster should have the id equeal to the lowest + // pixel in the cluster ( clus[i] == i ). + bool more = true; + int nloops = 0; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + } + } else { + more = false; + for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k]; ++kk) { + auto l = nn[k][kk]; + auto m = l + firstPixel; + assert(m != i); + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&clusterId[i], old); + } // nnloop + } // pixel loop } - } else { - more = false; - for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - for (int kk = 0; kk < nnn[k]; ++kk) { - auto l = nn[k][kk]; - auto m = l + firstPixel; - assert(m != i); - auto old = atomicMin(&clusterId[m], clusterId[i]); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - atomicMin(&clusterId[i], old); - } // nnloop - } // pixel loop - } - ++nloops; - } // end while + ++nloops; + } // end while #ifdef GPU_DEBUG - { - __shared__ int n0; - if (threadIdx.x == 0) - n0 = nloops; - __syncthreads(); - auto ok = n0 == nloops; - assert(__syncthreads_and(ok)); - if (thisModuleId % 100 == 1) + { + __shared__ int n0; if (threadIdx.x == 0) - printf("# loops %d\n", nloops); - } + n0 = nloops; + __syncthreads(); + auto ok = n0 == nloops; + assert(__syncthreads_and(ok)); + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("# loops %d\n", nloops); + } #endif - __shared__ unsigned int foundClusters; - foundClusters = 0; - __syncthreads(); + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); - // find the number of different clusters, identified by a pixels with clus[i] == i; - // mark these pixels with a negative id. - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] == i) { - auto old = atomicInc(&foundClusters, 0xffffffff); - clusterId[i] = -(old + 1); + // find the number of different clusters, identified by a pixels with clus[i] == i; + // mark these pixels with a negative id. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] == i) { + auto old = atomicInc(&foundClusters, 0xffffffff); + clusterId[i] = -(old + 1); + } } - } - __syncthreads(); + __syncthreads(); - // propagate the negative id to all the pixels in the cluster. - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (clusterId[i] >= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; + // propagate the negative id to all the pixels in the cluster. + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } } - } - __syncthreads(); + __syncthreads(); - // adjust the cluster id to be a positive value starting from 0 - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - continue; + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = -clusterId[i] - 1; } - clusterId[i] = -clusterId[i] - 1; - } - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) { - nClustersInModule[thisModuleId] = foundClusters; - moduleId[blockIdx.x] = thisModuleId; + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[module] = thisModuleId; #ifdef GPU_DEBUG - if (foundClusters > gMaxHit) { - gMaxHit = foundClusters; - if (foundClusters > 8) - printf("max hit %d in %d\n", foundClusters, thisModuleId); - } + if (foundClusters > gMaxHit) { + gMaxHit = foundClusters; + if (foundClusters > 8) + printf("max hit %d in %d\n", foundClusters, thisModuleId); + } #endif #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - printf("%d clusters in module %d\n", foundClusters, thisModuleId); + if (thisModuleId % 100 == 1) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); #endif - } + } + } // module loop } } // namespace gpuClustering diff --git a/src/cudacompat/test/gpuClustering_t.h b/src/cudacompat/test/gpuClustering_t.h index 5388e3499..55998dcda 100644 --- a/src/cudacompat/test/gpuClustering_t.h +++ b/src/cudacompat/test/gpuClustering_t.h @@ -15,7 +15,7 @@ #include "CUDACore/cudaCheck.h" #include "CUDACore/requireDevices.h" #include "CUDACore/launch.h" -#endif +#endif // __CUDACC__ // dirty, but works #include "plugin-SiPixelClusterizer/gpuClustering.h" @@ -24,7 +24,7 @@ int main(void) { #ifdef __CUDACC__ cms::cudatest::requireDevices(); -#endif +#endif // __CUDACC__ using namespace gpuClustering; @@ -46,7 +46,7 @@ int main(void) { auto d_moduleStart = cms::cuda::make_device_unique(MaxNumModules + 1, nullptr); auto d_clusInModule = cms::cuda::make_device_unique(MaxNumModules, nullptr); auto d_moduleId = cms::cuda::make_device_unique(MaxNumModules, nullptr); -#else +#else // __CUDACC__ auto h_moduleStart = std::make_unique(MaxNumModules + 1); auto h_clusInModule = std::make_unique(MaxNumModules); @@ -245,11 +245,11 @@ int main(void) { // size_t size8 = n * sizeof(uint8_t); cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice)); + // Launch CUDA Kernels int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -301,22 +301,13 @@ int main(void) { n); cudaDeviceSynchronize(); -#else +#else // __CUDACC__ h_moduleStart[0] = nModules; countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n); memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t)); - gridDim.x = MaxNumModules; //not needed in the kernel for this specific case; - assert(blockIdx.x == 0); - for (; blockIdx.x < gridDim.x; ++blockIdx.x) - findClus(h_id.get(), - h_x.get(), - h_y.get(), - h_moduleStart.get(), - h_clusInModule.get(), - h_moduleId.get(), - h_clus.get(), - n); - resetGrid(); + + findClus( + h_id.get(), h_x.get(), h_y.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); nModules = h_moduleStart[0]; auto nclus = h_clusInModule.get(); @@ -331,14 +322,10 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl; - gridDim.x = MaxNumModules; // no needed in the kernel for in this specific case - assert(blockIdx.x == 0); - for (; blockIdx.x < gridDim.x; ++blockIdx.x) - clusterChargeCut( - h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); - resetGrid(); + clusterChargeCut( + h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n); -#endif +#endif // __CUDACC__ std::cout << "found " << nModules << " Modules active" << std::endl; @@ -347,7 +334,7 @@ int main(void) { cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), MaxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost)); -#endif +#endif // __CUDACC__ std::set clids; for (int i = 0; i < n; ++i) {