Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor GPU clustering thresholds #608

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 9 additions & 18 deletions CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,25 @@
#include <cstdint>
#include <limits>

namespace pixelGPUConstants {
namespace gpuClusteringConstants {
#ifdef GPU_SMALL_EVENTS
// kept for testing and debugging
constexpr uint32_t maxHitsInIter = 64;
constexpr uint32_t maxNumberOfHits = 24 * 1024;
#else
// data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away
// tested on MC events with 55-75 pileup events
constexpr uint32_t maxNumberOfHits = 48 * 1024;
#endif
} // namespace pixelGPUConstants

namespace gpuClustering {
#ifdef GPU_SMALL_EVENTS
// kept for testing and debugging
constexpr uint32_t maxHitsInIter() { return 64; }
#else
// optimized for real data PU 50
// tested on MC events with 55-75 pileup events
constexpr uint32_t maxHitsInIter() { return 160; }
constexpr uint32_t maxHitsInIter = 160;
// data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away
constexpr uint32_t maxNumberOfHits = 48 * 1024;
#endif
constexpr uint32_t maxHitsInModule() { return 1024; }

constexpr uint32_t maxHitsInModule = 1024;
constexpr uint16_t maxNumModules = 2000;
constexpr int32_t maxNumClustersPerModules = maxHitsInModule();
constexpr uint32_t maxNumClusters = pixelGPUConstants::maxNumberOfHits;
constexpr int32_t maxNumClustersPerModules = maxHitsInModule;
constexpr uint32_t maxNumClusters = gpuClusteringConstants::maxNumberOfHits;
constexpr uint16_t invalidModuleId = std::numeric_limits<uint16_t>::max() - 1;
static_assert(invalidModuleId > maxNumModules); // invalidModuleId must be > maxNumModules

} // namespace gpuClustering
} // namespace gpuClusteringConstants

#endif // CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ namespace pixelCPEforGPU {

class TrackingRecHit2DSOAView {
public:
static constexpr uint32_t maxHits() { return gpuClustering::maxNumClusters; }
static constexpr uint32_t maxHits() { return gpuClusteringConstants::maxNumClusters; }
using hindex_type = uint32_t; // if above is <=2^32

using PhiBinner =
cms::cuda::HistoContainer<int16_t, 128, gpuClustering::maxNumClusters, 8 * sizeof(int16_t), hindex_type, 10>;
cms::cuda::HistoContainer<int16_t, 128, gpuClusteringConstants::maxNumClusters, 8 * sizeof(int16_t), hindex_type, 10>;

using AverageGeometry = phase1PixelTopology::AverageGeometry;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, stream);
auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClusteringConstants::maxNumModules + 1, stream);
cudaCheck(cudaMemcpyAsync(
ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream));
ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClusteringConstants::maxNumModules + 1), cudaMemcpyDefault, stream));
return ret;
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class SiPixelGainForHLTonGPU {
constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision_ + minPed_; }

DecodingStructure* v_pedestals_;
std::pair<Range, int> rangeAndCols_[gpuClustering::maxNumModules];
std::pair<Range, int> rangeAndCols_[gpuClusteringConstants::maxNumModules];

float minPed_, maxPed_, minGain_, maxGain_;
float pedPrecision_, gainPrecision_;
Expand Down
1 change: 0 additions & 1 deletion RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
<use name="FWCore/ParameterSet"/>
<use name="FWCore/Utilities"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="RecoLocalTracker/SiPixelClusterizer"/>
<use name="RecoTracker/Record"/>
<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelClusterizerPlugins">
<flags EDM_PLUGIN="1"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con

auto collection = std::make_unique<edm::DetSetVector<PixelDigi>>();
auto outputClusters = std::make_unique<SiPixelClusterCollectionNew>();
outputClusters->reserve(gpuClustering::maxNumModules, nDigis / 4);
outputClusters->reserve(gpuClusteringConstants::maxNumModules, nDigis / 4);

edm::DetSet<PixelDigi>* detDigis = nullptr;
for (uint32_t i = 0; i < nDigis; i++) {
Expand All @@ -67,7 +67,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con
}

int32_t nclus = -1;
std::vector<PixelClusterizerBase::AccretionCluster> aclusters(gpuClustering::maxNumClustersPerModules);
std::vector<PixelClusterizerBase::AccretionCluster> aclusters(gpuClusteringConstants::maxNumClustersPerModules);
#ifdef EDM_ML_DEBUG
auto totClustersFilled = 0;
#endif
Expand Down Expand Up @@ -128,7 +128,7 @@ void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, con
auto const& dig = (*detDigis).data.back();
// fill clusters
assert(digis.clus(i) >= 0);
assert(digis.clus(i) < gpuClustering::maxNumClustersPerModules);
assert(digis.clus(i) < gpuClusteringConstants::maxNumClustersPerModules);
nclus = std::max(digis.clus(i), nclus);
auto row = dig.row();
auto col = dig.column();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ namespace pixelgpudetails {
uint32_t roc = 1;
uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
if (rID_temp != gpuClustering::invalidModuleId)
if (rID_temp != gpuClusteringConstants::invalidModuleId)
rID = rID_temp;
break;
}
Expand Down Expand Up @@ -319,7 +319,7 @@ namespace pixelgpudetails {
uint32_t roc = 1;
uint32_t link = chanNmbr;
uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
if (rID_temp != gpuClustering::invalidModuleId)
if (rID_temp != gpuClusteringConstants::invalidModuleId)
rID = rID_temp;
break;
}
Expand All @@ -328,7 +328,7 @@ namespace pixelgpudetails {
uint32_t roc = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask;
uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
if (rID_temp != gpuClustering::invalidModuleId)
if (rID_temp != gpuClusteringConstants::invalidModuleId)
rID = rID_temp;
break;
}
Expand Down Expand Up @@ -370,7 +370,7 @@ namespace pixelgpudetails {
// initialize (too many coninue below)
pdigi[gIndex] = 0;
rawIdArr[gIndex] = 0;
moduleId[gIndex] = gpuClustering::invalidModuleId;
moduleId[gIndex] = gpuClusteringConstants::invalidModuleId;

uint32_t ww = word[gIndex]; // Array containing 32 bit raw data
if (ww == 0) {
Expand Down Expand Up @@ -463,48 +463,52 @@ namespace pixelgpudetails {
} // end of Raw to Digi kernel

__global__ void fillHitsModuleStart(uint32_t const *__restrict__ cluStart, uint32_t *__restrict__ moduleStart) {
assert(gpuClustering::maxNumModules < 2048); // easy to extend at least till 32*1024
assert(gpuClusteringConstants::maxNumModules < 2048); // easy to extend at least till 32*1024
assert(1 == gridDim.x);
assert(0 == blockIdx.x);

int first = threadIdx.x;

// limit to maxHitsInModule()
for (int i = first, iend = gpuClustering::maxNumModules; i < iend; i += blockDim.x) {
moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]);
// limit to maxHitsInModule
for (int i = first, iend = gpuClusteringConstants::maxNumModules; i < iend; i += blockDim.x) {
// use CUDA ::min() instead of std::min to allow using constexpr variables in device code
moduleStart[i + 1] = ::min(gpuClusteringConstants::maxHitsInModule, cluStart[i]);
}

__shared__ uint32_t ws[32];
cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClustering::maxNumModules - 1024, ws);
cms::cuda::blockPrefixScan(moduleStart + 1025, moduleStart + 1025, gpuClusteringConstants::maxNumModules - 1024, ws);

for (int i = first + 1025, iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) {
for (int i = first + 1025, iend = gpuClusteringConstants::maxNumModules + 1; i < iend; i += blockDim.x) {
moduleStart[i] += moduleStart[1024];
}
__syncthreads();

#ifdef GPU_DEBUG
assert(0 == moduleStart[0]);
auto c0 = std::min(gpuClustering::maxHitsInModule(), cluStart[0]);
auto c0 = ::min(gpuClusteringConstants::maxHitsInModule, cluStart[0]);
assert(c0 == moduleStart[1]);
assert(moduleStart[1024] >= moduleStart[1023]);
assert(moduleStart[1025] >= moduleStart[1024]);
assert(moduleStart[gpuClustering::maxNumModules] >= moduleStart[1025]);
assert(moduleStart[gpuClusteringConstants::maxNumModules] >= moduleStart[1025]);

for (int i = first, iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) {
for (int i = first, iend = gpuClusteringConstants::maxNumModules + 1; i < iend; i += blockDim.x) {
if (0 != i)
assert(moduleStart[i] >= moduleStart[i - i]);
// [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID]
// [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856]
if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::maxNumModules)
if (i == 96 || i == 1184 || i == 1744 || i == gpuClusteringConstants::maxNumModules)
printf("moduleStart %d %d\n", i, moduleStart[i]);
}
#endif

// avoid overflow
auto constexpr maxNumClusters = gpuClustering::maxNumClusters;
for (int i = first, iend = gpuClustering::maxNumModules + 1; i < iend; i += blockDim.x) {
moduleStart[i] = std::clamp(moduleStart[i], 0U, maxNumClusters);
auto constexpr maxNumClusters = gpuClusteringConstants::maxNumClusters;
for (int i = first, iend = gpuClusteringConstants::maxNumModules + 1; i < iend; i += blockDim.x) {
// clamp moduleStart[i] to boundaries 0 and maxNumClusters (uint32_t)
// there is not CUDA ::clamp(), replace with ::min() and ::max()
moduleStart[i] = ::max(0U, moduleStart[i]);
moduleStart[i] = ::min(moduleStart[i], maxNumClusters);
}
}

Expand All @@ -531,7 +535,7 @@ namespace pixelgpudetails {
if (includeErrors) {
digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream);
}
clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream);
clusters_d = SiPixelClustersCUDA(gpuClusteringConstants::maxNumModules, stream);

nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);

Expand Down Expand Up @@ -581,10 +585,10 @@ namespace pixelgpudetails {

{
// clusterizer ...
using namespace gpuClustering;
using namespace gpuClusteringConstants;
int threadsPerBlock = 256;
int blocks =
(std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;
(std::max(int(wordCounter), int(gpuClusteringConstants::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;

gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(isRun2,
digis_d.moduleInd(),
Expand Down Expand Up @@ -654,7 +658,7 @@ namespace pixelgpudetails {

// last element holds the number of all clusters
cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]),
clusters_d.clusModuleStart() + gpuClustering::maxNumModules,
clusters_d.clusModuleStart() + gpuClusteringConstants::maxNumModules,
sizeof(uint32_t),
cudaMemcpyDefault,
stream));
Expand Down
4 changes: 2 additions & 2 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace gpuCalibPixel {

using gpuClustering::invalidModuleId;
using gpuClusteringConstants::invalidModuleId;

// valid for run2
constexpr float VCaltoElectronGain = 47; // L2-4: 47 +- 4.7
Expand All @@ -34,7 +34,7 @@ namespace gpuCalibPixel {
// zero for next kernels...
if (0 == first)
clusModuleStart[0] = moduleStart[0] = 0;
for (int i = first; i < gpuClustering::maxNumModules; i += gridDim.x * blockDim.x) {
for (int i = first; i < gpuClusteringConstants::maxNumModules; i += gridDim.x * blockDim.x) {
nClustersInModule[i] = 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

namespace gpuClustering {
namespace gpuClusteringConstants {

__global__ void clusterChargeCut(
uint16_t* __restrict__ id, // module id of each pixel (modified if bad cluster)
Expand Down Expand Up @@ -120,6 +120,6 @@ namespace gpuClustering {
} // loop on modules
}

} // namespace gpuClustering
} // namespace gpuClusteringConstants

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
4 changes: 2 additions & 2 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {
namespace gpuClusteringConstants {

#ifdef GPU_DEBUG
__device__ uint32_t gMaxHit = 0;
Expand Down Expand Up @@ -298,6 +298,6 @@ namespace gpuClustering {
}
} // module loop
}
} // namespace gpuClustering
} // namespace gpuClusteringConstants

#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
2 changes: 1 addition & 1 deletion RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ int main(void) {
cms::cudatest::requireDevices();
#endif // __CUDACC__

using namespace gpuClustering;
using namespace gpuClusteringConstants;

constexpr int numElements = 256 * maxNumModules;

Expand Down
2 changes: 1 addition & 1 deletion RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ namespace pixelCPEforGPU {
int16_t ysize[N];
};

constexpr int32_t MaxHitsInIter = gpuClustering::maxHitsInIter();
constexpr int32_t MaxHitsInIter = gpuClusteringConstants::maxHitsInIter;
using ClusParams = ClusParamsT<MaxHitsInIter>;

constexpr inline void computeAnglesFromDet(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,29 +191,29 @@ namespace cms {
const edmNew::DetSetVector<SiPixelCluster>& input = *inputhandle;

// allocate a buffer for the indices of the clusters
auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::maxNumModules + 1);
auto hmsp = std::make_unique<uint32_t[]>(gpuClusteringConstants::maxNumModules + 1);
// hitsModuleStart is a non-owning pointer to the buffer
auto hitsModuleStart = hmsp.get();
// fill cluster arrays
std::array<uint32_t, gpuClustering::maxNumModules + 1> clusInModule{};
std::array<uint32_t, gpuClusteringConstants::maxNumModules + 1> clusInModule{};
for (auto const& dsv : input) {
unsigned int detid = dsv.detId();
DetId detIdObject(detid);
const GeomDetUnit* genericDet = geom.idToDetUnit(detIdObject);
auto gind = genericDet->index();
// FIXME to be changed to support Phase2
if (gind >= int(gpuClustering::maxNumModules))
if (gind >= int(gpuClusteringConstants::maxNumModules))
continue;
auto const nclus = dsv.size();
assert(nclus > 0);
clusInModule[gind] = nclus;
numberOfClusters += nclus;
}
hitsModuleStart[0] = 0;
assert(clusInModule.size() > gpuClustering::maxNumModules);
assert(clusInModule.size() > gpuClusteringConstants::maxNumModules);
for (int i = 1, n = clusInModule.size(); i < n; ++i)
hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule[i - 1];
assert(numberOfClusters == int(hitsModuleStart[gpuClustering::maxNumModules]));
assert(numberOfClusters == int(hitsModuleStart[gpuClusteringConstants::maxNumModules]));

// wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart
iEvent.emplace(tHost_, std::move(hmsp));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent,

void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
// allocate a buffer for the indices of the clusters
auto hmsp = std::make_unique<uint32_t[]>(gpuClustering::maxNumModules + 1);
std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + gpuClustering::maxNumModules + 1, hmsp.get());
auto hmsp = std::make_unique<uint32_t[]>(gpuClusteringConstants::maxNumModules + 1);
std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + gpuClusteringConstants::maxNumModules + 1, hmsp.get());
// wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart
iEvent.emplace(hostPutToken_, std::move(hmsp));

Expand All @@ -105,7 +105,7 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e
edm::Handle<SiPixelClusterCollectionNew> hclusters = iEvent.getHandle(clusterToken_);
auto const& input = *hclusters;

constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule();
constexpr uint32_t maxHitsInModule = gpuClusteringConstants::maxHitsInModule;

int numberOfDetUnits = 0;
int numberOfClusters = 0;
Expand Down
Loading