cms-sw · cmsbuild · Jun 30, 2021 · Jun 25, 2021 · Jun 25, 2021 · Jun 25, 2021
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -39,6 +39,11 @@ namespace cms {
       return old;
     }
 
+    template <typename T1, typename T2>  // _block-suffixed name for atomicCAS; adds no logic of its own
+    T1 atomicCAS_block(T1* address, T1 compare, T2 val) {
+      return atomicCAS(address, compare, val);  // pure pass-through to atomicCAS
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicCAS_block - confirm
+
     template <typename T1, typename T2>
     T1 atomicInc(T1* a, T2 b) {
       auto ret = *a;
@@ -47,33 +52,59 @@ namespace cms {
       return ret;
     }
 
+    template <typename T1, typename T2>  // _block-suffixed name for atomicInc; adds no logic of its own
+    T1 atomicInc_block(T1* a, T2 b) {
+      return atomicInc(a, b);  // pure pass-through to atomicInc
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicInc_block - confirm
+
     template <typename T1, typename T2>
     T1 atomicAdd(T1* a, T2 b) {
       auto ret = *a;
       (*a) += b;
       return ret;
     }
 
+    template <typename T1, typename T2>  // _block-suffixed name for atomicAdd; adds no logic of its own
+    T1 atomicAdd_block(T1* a, T2 b) {
+      return atomicAdd(a, b);  // atomicAdd in this header returns *a as it was before b was added
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicAdd_block - confirm
+
     template <typename T1, typename T2>
     T1 atomicSub(T1* a, T2 b) {
       auto ret = *a;
       (*a) -= b;
       return ret;
     }
 
+    template <typename T1, typename T2>  // _block-suffixed name for atomicSub; adds no logic of its own
+    T1 atomicSub_block(T1* a, T2 b) {
+      return atomicSub(a, b);  // atomicSub in this header returns *a as it was before b was subtracted
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicSub_block - confirm
+
     template <typename T1, typename T2>
     T1 atomicMin(T1* a, T2 b) {
       auto ret = *a;
       *a = std::min(*a, T1(b));
       return ret;
     }
+
+    template <typename T1, typename T2>  // _block-suffixed name for atomicMin; adds no logic of its own
+    T1 atomicMin_block(T1* a, T2 b) {
+      return atomicMin(a, b);  // atomicMin in this header stores std::min(*a, T1(b)) and returns the old *a
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicMin_block - confirm
+
     template <typename T1, typename T2>
     T1 atomicMax(T1* a, T2 b) {
       auto ret = *a;
       *a = std::max(*a, T1(b));
       return ret;
     }
 
+    template <typename T1, typename T2>  // _block-suffixed name for atomicMax; adds no logic of its own
+    T1 atomicMax_block(T1* a, T2 b) {
+      return atomicMax(a, b);  // atomicMax in this header stores std::max(*a, T1(b)) and returns the old *a
+    }  // NOTE(review): presumably a CPU stand-in for CUDA's atomicMax_block - confirm
+
     inline void __syncthreads() {}
     inline void __threadfence() {}
     inline bool __syncthreads_or(bool x) { return x; }

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -218,12 +218,13 @@ namespace gpuClustering {
               auto l = nn[k][kk];
               auto m = l + firstPixel;
               assert(m != i);
-              auto old = atomicMin(&clusterId[m], clusterId[i]);
+              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
+              // do we need memory fence?
               if (old != clusterId[i]) {
                 // end the loop only if no changes were applied
                 more = true;
               }
-              atomicMin(&clusterId[i], old);
+              atomicMin_block(&clusterId[i], old);
             }  // nnloop
           }    // pixel loop
         }

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -350,7 +350,9 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
     auto const &thisCell = cells[idx];
     if (thisCell.isKilled())
       continue;  // cut by earlyFishbone
-
+    // we require at least three hits...
+    if (thisCell.outerNeighbors().empty())
+      continue;
     auto pid = thisCell.layerPairId();
     auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
     if (doit) {

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -63,8 +63,8 @@ namespace gpuVertexFinder {
       assert(iv[i] >= 0);
       assert(iv[i] < int(foundClusters));
       auto w = 1.f / ezt2[i];
-      atomicAdd(&zv[iv[i]], zt[i] * w);
-      atomicAdd(&wv[iv[i]], w);
+      atomicAdd_block(&zv[iv[i]], zt[i] * w);
+      atomicAdd_block(&wv[iv[i]], w);
     }
 
     __syncthreads();
@@ -87,8 +87,8 @@ namespace gpuVertexFinder {
         iv[i] = 9999;
         continue;
       }
-      atomicAdd(&chi2[iv[i]], c2);
-      atomicAdd(&nn[iv[i]], 1);
+      atomicAdd_block(&chi2[iv[i]], c2);
+      atomicAdd_block(&nn[iv[i]], 1);
     }
     __syncthreads();
     for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -46,7 +46,7 @@ namespace gpuVertexFinder {
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] > 9990)
         continue;
-      atomicAdd(&ptv2[iv[i]], ptt2[i]);
+      atomicAdd_block(&ptv2[iv[i]], ptt2[i]);
     }
     __syncthreads();
 

diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -128,7 +128,7 @@ namespace gpuVertexFinder {
 
 #ifdef __CUDACC__
     // Running too many thread lead to problems when printf is enabled.
-    constexpr int maxThreadsForPrint = 1024 - 256;
+    constexpr int maxThreadsForPrint = 1024 - 128;
     constexpr int numBlocks = 1024;
     constexpr int threadsPerBlock = 128;