
Speed up Patatrack CA #34250

Merged · 3 commits · Jun 30, 2021
31 changes: 31 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
@@ -39,6 +39,11 @@ namespace cms {
return old;
}

template <typename T1, typename T2>
T1 atomicCAS_block(T1* address, T1 compare, T2 val) {
return atomicCAS(address, compare, val);
Comment on lines +42 to +44 (Contributor): for my education, what do this and the other _block methods in this PR do, especially in the context of speeding things up?

}

template <typename T1, typename T2>
T1 atomicInc(T1* a, T2 b) {
auto ret = *a;
@@ -47,33 +52,59 @@ namespace cms {
return ret;
}

template <typename T1, typename T2>
T1 atomicInc_block(T1* a, T2 b) {
return atomicInc(a, b);
}

template <typename T1, typename T2>
T1 atomicAdd(T1* a, T2 b) {
auto ret = *a;
(*a) += b;
return ret;
}

template <typename T1, typename T2>
T1 atomicAdd_block(T1* a, T2 b) {
return atomicAdd(a, b);
}

template <typename T1, typename T2>
T1 atomicSub(T1* a, T2 b) {
auto ret = *a;
(*a) -= b;
return ret;
}

template <typename T1, typename T2>
T1 atomicSub_block(T1* a, T2 b) {
return atomicSub(a, b);
}

template <typename T1, typename T2>
T1 atomicMin(T1* a, T2 b) {
auto ret = *a;
*a = std::min(*a, T1(b));
return ret;
}

template <typename T1, typename T2>
T1 atomicMin_block(T1* a, T2 b) {
return atomicMin(a, b);
}

template <typename T1, typename T2>
T1 atomicMax(T1* a, T2 b) {
auto ret = *a;
*a = std::max(*a, T1(b));
return ret;
}

template <typename T1, typename T2>
T1 atomicMax_block(T1* a, T2 b) {
return atomicMax(a, b);
}

inline void __syncthreads() {}
inline void __threadfence() {}
inline bool __syncthreads_or(bool x) { return x; }
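On the question above about the `_block` atomics: in CUDA device code the `*_block` variants (available on compute capability 6.0 and newer) are atomic only with respect to the threads of the same block, which is typically cheaper than the default device-wide scope; this PR uses them where the memory a kernel updates is only ever touched from within one block. In the serial cudaCompat backend there is effectively a single thread, so the wrappers added here can simply forward to the existing emulations. A minimal, hypothetical sketch (not code from this PR) of a case where block scope is enough:

```cuda
__global__ void countPositivesPerBlock(int const* data, int n, int* countPerBlock) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && data[i] > 0) {
    // countPerBlock[blockIdx.x] is only ever updated by threads of this
    // block, so the atomic needs only block scope; the hardware does not
    // have to make the update atomically visible to other blocks.
    atomicAdd_block(&countPerBlock[blockIdx.x], 1);
  }
}
```

When the same code is compiled for the CPU, atomicAdd_block resolves to the serial emulation in cudaCompat.h, which is exactly what the forwarding wrappers in this hunk provide.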
5 changes: 3 additions & 2 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -218,12 +218,13 @@ namespace gpuClustering {
auto l = nn[k][kk];
auto m = l + firstPixel;
assert(m != i);
- auto old = atomicMin(&clusterId[m], clusterId[i]);
+ auto old = atomicMin_block(&clusterId[m], clusterId[i]);
Comment (Contributor Author): clusterId is in global memory, but "m" (and "i") below refer to digis on the module being reconstructed in this block (one module per block, one block per module).

// do we need memory fence?
if (old != clusterId[i]) {
// end the loop only if no changes were applied
more = true;
}
- atomicMin(&clusterId[i], old);
+ atomicMin_block(&clusterId[i], old);
} // nnloop
} // pixel loop
}
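A condensed sketch of the pattern in this hunk (names simplified, not the actual clusterizer): each block owns the cluster ids of one module, so the min-propagation can use block-scoped atomics, and the outer loop repeats until a full pass makes no change.

```cuda
// Label propagation for one module; assumes one block per module, as in the
// real kernel.  label[] lives in global memory but is only touched by this
// block; nnPairs lists (pixel, neighbour) index pairs for this module.
__global__ void propagateLabels(int* label, int2 const* nnPairs, int nPairs) {
  bool more = true;
  while (__syncthreads_or(more)) {
    more = false;
    for (int k = threadIdx.x; k < nPairs; k += blockDim.x) {
      int i = nnPairs[k].x;
      int m = nnPairs[k].y;
      // push the smaller cluster id across the link, in both directions
      int old = atomicMin_block(&label[m], label[i]);
      if (old != label[i])
        more = true;  // a label changed: do another pass
      atomicMin_block(&label[i], old);
    }
  }
}
```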
@@ -350,7 +350,9 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
auto const &thisCell = cells[idx];
if (thisCell.isKilled())
continue; // cut by earlyFishbone

// we require at least three hits...
if (thisCell.outerNeighbors().empty())
Comment (Contributor Author): simply avoids the whole lot below (in particular the function call, which, being recursive, cannot be inlined).

continue;
auto pid = thisCell.layerPairId();
auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
if (doit) {
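The reasoning above, in schematic form (the CellSketch type and the searchNtuplets name are illustrative placeholders, not the real GPUCACell interface): returning early when a doublet has no outer neighbours keeps the thread from entering the recursive search at all.

```cuda
struct CellSketch {
  bool killed;
  int nOuterNeighbors;
};

// stand-in for the recursive depth-first search; being recursive, the real
// function cannot be inlined, so even a call that does nothing has a cost
__device__ void searchNtuplets(CellSketch const* cells, int idx) {
  // ... recursive walk over outer neighbours elided ...
}

__global__ void findNtupletsSketch(CellSketch const* cells, int nCells) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= nCells)
    return;
  if (cells[idx].killed)
    return;  // cut by earlyFishbone
  // an ntuplet needs at least three hits, i.e. this doublet needs at least
  // one outer neighbour; bailing out here avoids the non-inlinable call
  if (cells[idx].nOuterNeighbors == 0)
    return;
  searchNtuplets(cells, idx);
}
```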
@@ -63,8 +63,8 @@ namespace gpuVertexFinder {
assert(iv[i] >= 0);
assert(iv[i] < int(foundClusters));
auto w = 1.f / ezt2[i];
- atomicAdd(&zv[iv[i]], zt[i] * w);
- atomicAdd(&wv[iv[i]], w);
+ atomicAdd_block(&zv[iv[i]], zt[i] * w);
Comment (Contributor Author): vertex reconstruction is performed in a single block.

+ atomicAdd_block(&wv[iv[i]], w);
}

__syncthreads();
@@ -87,8 +87,8 @@
iv[i] = 9999;
continue;
}
- atomicAdd(&chi2[iv[i]], c2);
- atomicAdd(&nn[iv[i]], 1);
+ atomicAdd_block(&chi2[iv[i]], c2);
+ atomicAdd_block(&nn[iv[i]], 1);
}
__syncthreads();
for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
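A condensed, self-contained sketch of the accumulation above, assuming (per the author's comment) that the whole vertex fit runs in a single block; the chi2/nn accumulation a few lines below follows the same pattern. The fitVerticesSketch name and signature are schematic, the variable names are taken from the hunk.

```cuda
// zv and wv are per-vertex accumulators that only this block updates, so
// block-scoped atomics are sufficient.
__global__ void fitVerticesSketch(float const* zt,    // track z at the beam line
                                  float const* ezt2,  // squared error on zt
                                  int const* iv,      // vertex index of each track
                                  int nt,             // number of tracks
                                  float* zv, float* wv) {
  for (int i = threadIdx.x; i < nt; i += blockDim.x) {
    float w = 1.f / ezt2[i];                 // inverse-variance weight
    atomicAdd_block(&zv[iv[i]], zt[i] * w);  // accumulate w * z
    atomicAdd_block(&wv[iv[i]], w);          // accumulate the weights
  }
  __syncthreads();
  // dividing zv[k] by wv[k] afterwards gives the weighted mean z of vertex k
}
```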
@@ -46,7 +46,7 @@ namespace gpuVertexFinder {
for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
if (iv[i] > 9990)
continue;
- atomicAdd(&ptv2[iv[i]], ptt2[i]);
+ atomicAdd_block(&ptv2[iv[i]], ptt2[i]);
Comment (Contributor Author): vertex sorting is performed in a single block.

}
__syncthreads();

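The same single-block argument applies here. A simplified sketch of ranking vertices by their summed pT^2 follows; the kernel name, nv, sortInd and the single-thread insertion sort are illustrative stand-ins for whatever the real kernel does, while the accumulation mirrors the hunk above.

```cuda
#include <cstdint>

// ptv2[k] accumulates the summed pT^2 of the tracks of vertex k; sortInd is
// filled with vertex indices ordered by decreasing ptv2.  Assumes a
// single-block launch and a modest number of vertices.
__global__ void sortByPt2Sketch(float const* ptt2, int const* iv, int nt,
                                int nv, float* ptv2, uint16_t* sortInd) {
  for (int i = threadIdx.x; i < nt; i += blockDim.x) {
    if (iv[i] > 9990)
      continue;  // track not associated to a valid vertex
    atomicAdd_block(&ptv2[iv[i]], ptt2[i]);
  }
  __syncthreads();
  if (threadIdx.x == 0) {  // naive ranking, fine for O(100) vertices
    for (int k = 0; k < nv; ++k)
      sortInd[k] = k;
    for (int k = 1; k < nv; ++k) {  // insertion sort by decreasing ptv2
      uint16_t v = sortInd[k];
      int j = k - 1;
      while (j >= 0 && ptv2[sortInd[j]] < ptv2[v]) {
        sortInd[j + 1] = sortInd[j];
        --j;
      }
      sortInd[j + 1] = v;
    }
  }
}
```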
@@ -128,7 +128,7 @@ namespace gpuVertexFinder {

#ifdef __CUDACC__
// Running too many thread lead to problems when printf is enabled.
- constexpr int maxThreadsForPrint = 1024 - 256;
+ constexpr int maxThreadsForPrint = 1024 - 128;
constexpr int numBlocks = 1024;
constexpr int threadsPerBlock = 128;
