diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc new file mode 100644 index 000000000..06f753528 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.cc @@ -0,0 +1,16 @@ +#include "CUDADataFormats/SiPixelClustersSoA.h" + +SiPixelClustersSoA::SiPixelClustersSoA(size_t maxClusters) { + moduleStart_d = std::make_unique(maxClusters + 1); + clusInModule_d = std::make_unique(maxClusters); + moduleId_d = std::make_unique(maxClusters); + clusModuleStart_d = std::make_unique(maxClusters + 1); + + auto view = std::make_unique(); + view->moduleStart_ = moduleStart_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); + + view_d = std::move(view); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h new file mode 100644 index 000000000..24d0d209d --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelClustersSoA.h @@ -0,0 +1,56 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersSoA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersSoA_h + +#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelClustersCUDA.h" + +#include + +class SiPixelClustersSoA { +public: + SiPixelClustersSoA() = default; + explicit SiPixelClustersSoA(size_t maxClusters); + ~SiPixelClustersSoA() = default; + + SiPixelClustersSoA(const SiPixelClustersSoA &) = delete; + SiPixelClustersSoA &operator=(const SiPixelClustersSoA &) = delete; + SiPixelClustersSoA(SiPixelClustersSoA &&) = default; + SiPixelClustersSoA &operator=(SiPixelClustersSoA &&) = default; + + void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; } + + uint32_t nClusters() const { return nClusters_h; } + + uint32_t *moduleStart() { return moduleStart_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } + + using DeviceConstView = SiPixelClustersCUDA::DeviceConstView; + + DeviceConstView *view() const { return view_d.get(); } + +private: + std::unique_ptr moduleStart_d; // index of the first pixel of each module + std::unique_ptr clusInModule_d; // number of clusters found in each module + std::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + std::unique_ptr clusModuleStart_d; // index of the first cluster of each module + + std::unique_ptr view_d; // "me" pointer + + uint32_t nClusters_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc new file mode 100644 index 000000000..76c0e8655 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.cc @@ -0,0 +1,17 @@ +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" + +#include +#include + +SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t maxFedWords, PixelFormatterErrors errors) + : formatterErrors_h(std::move(errors)) { + error_d = std::make_unique>(); + data_d = std::make_unique(maxFedWords); + + std::memset(data_d.get(), 0x00, maxFedWords); + + error_d = std::make_unique>(); + cms::cuda::make_SimpleVector(error_d.get(), maxFedWords, data_d.get()); + assert(error_d->empty()); + assert(error_d->capacity() == static_cast(maxFedWords)); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h new file mode 100644 index 000000000..50c5a3e86 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigiErrorsSoA.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h + +#include + +#include "CUDACore/SimpleVector.h" +#include "DataFormats/PixelErrors.h" + +class SiPixelDigiErrorsSoA { +public: + SiPixelDigiErrorsSoA() = default; + explicit SiPixelDigiErrorsSoA(size_t maxFedWords, PixelFormatterErrors errors); + ~SiPixelDigiErrorsSoA() = default; + + SiPixelDigiErrorsSoA(const SiPixelDigiErrorsSoA&) = delete; + SiPixelDigiErrorsSoA& operator=(const SiPixelDigiErrorsSoA&) = delete; + SiPixelDigiErrorsSoA(SiPixelDigiErrorsSoA&&) = default; + SiPixelDigiErrorsSoA& operator=(SiPixelDigiErrorsSoA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + cms::cuda::SimpleVector* error() { return error_d.get(); } + cms::cuda::SimpleVector const* error() const { return error_d.get(); } + cms::cuda::SimpleVector const* c_error() const { return error_d.get(); } + +private: + std::unique_ptr data_d; + std::unique_ptr> error_d; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc new file mode 100644 index 000000000..265f69064 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.cc @@ -0,0 +1,21 @@ +#include "CUDADataFormats/SiPixelDigisSoA.h" + +SiPixelDigisSoA::SiPixelDigisSoA(size_t maxFedWords) { + xx_d = std::make_unique(maxFedWords); + yy_d = std::make_unique(maxFedWords); + adc_d = std::make_unique(maxFedWords); + moduleInd_d = std::make_unique(maxFedWords); + clus_d = std::make_unique(maxFedWords); + + pdigi_d = std::make_unique(maxFedWords); + rawIdArr_d = std::make_unique(maxFedWords); + + auto view = std::make_unique(); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); + + view_d = std::move(view); +} diff --git a/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h new file mode 100644 index 000000000..5a0727f97 --- /dev/null +++ b/src/cudacompat/CUDADataFormats/SiPixelDigisSoA.h @@ -0,0 +1,74 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h + +#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelDigisCUDA.h" + +#include + +class SiPixelDigisSoA { +public: + SiPixelDigisSoA() = default; + explicit SiPixelDigisSoA(size_t maxFedWords); + ~SiPixelDigisSoA() = default; + + SiPixelDigisSoA(const SiPixelDigisSoA &) = delete; + SiPixelDigisSoA &operator=(const SiPixelDigisSoA &) = delete; + SiPixelDigisSoA(SiPixelDigisSoA &&) = default; + SiPixelDigisSoA &operator=(SiPixelDigisSoA &&) = default; + + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; } + uint32_t nDigis() const { return nDigis_h; } + + uint16_t *xx() { return xx_d.get(); } + uint16_t *yy() { return yy_d.get(); } + uint16_t *adc() { return adc_d.get(); } + uint16_t *moduleInd() { return moduleInd_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *pdigi() { return pdigi_d.get(); } + uint32_t *rawIdArr() { return rawIdArr_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + using DeviceConstView = SiPixelDigisCUDA::DeviceConstView; + + const DeviceConstView *view() const { return view_d.get(); } + +private: + // These are consumed by downstream device code + std::unique_ptr xx_d; // local coordinates of each pixel + std::unique_ptr yy_d; // + std::unique_ptr adc_d; // ADC of each pixel + std::unique_ptr moduleInd_d; // module id of each pixel + std::unique_ptr clus_d; // cluster id of each pixel + std::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? + std::unique_ptr pdigi_d; + std::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; +}; + +#endif diff --git a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 027e7d25c..c037faa67 100644 --- a/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cudacompat/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -20,9 +20,11 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + const SiPixelFedCablingMapGPU *getCPUProduct() const { return cablingMapHost; } // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; + const unsigned char *getModToUnpAll() const { return modToUnpDefault.data(); } private: std::vector> modToUnpDefault; diff --git a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc index 76e64e8f3..6e308da8f 100644 --- a/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc +++ b/src/cudacompat/CondFormats/SiPixelGainCalibrationForHLTGPU.cc @@ -7,11 +7,20 @@ SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain, std::vector gainData) : gainData_(std::move(gainData)) { + /* cudaCheck(cudaMallocHost(&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU))); *gainForHLTonHost_ = gain; + */ + gainForHLTonHost_ = new SiPixelGainForHLTonGPU(gain); + gainForHLTonHost_->v_pedestals = reinterpret_cast(gainData_.data()); } -SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); } +SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { + /* + cudaCheck(cudaFreeHost(gainForHLTonHost_)); + */ + delete gainForHLTonHost_; +} SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() { cudaCheck(cudaFree(gainForHLTonGPU)); diff --git a/src/cudacompat/bin/main.cc b/src/cudacompat/bin/main.cc index c8a76eee5..966f2a1f6 100644 --- a/src/cudacompat/bin/main.cc +++ b/src/cudacompat/bin/main.cc @@ -17,16 +17,15 @@ namespace { void print_help(std::string const& name) { std::cout << name - << ": [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] [--transfer] [--validation] " + << ": [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] [--validation] " "[--histogram] [--empty]\n\n" << "Options\n" << " --numberOfThreads Number of threads to use (default 1)\n" << " --numberOfStreams Number of concurrent events (default 0=numberOfThreads)\n" << " --maxEvents Number of events to process (default -1 for all events in the input file)\n" << " --data Path to the 'data' directory (default 'data' in the directory of the executable)\n" - << " --transfer Transfer results from GPU to CPU (default is to leave them on GPU)\n" - << " --validation Run (rudimentary) validation at the end (implies --transfer)\n" - << " --histogram Produce histograms at the end (implies --transfer)\n" + << " --validation Run (rudimentary) validation at the end\n" + << " --histogram Produce histograms at the end\n" << " --empty Ignore all producers (for testing only)\n" << std::endl; } @@ -39,7 +38,6 @@ int main(int argc, char** argv) { int numberOfStreams = 0; int maxEvents = -1; std::filesystem::path datadir; - bool transfer = false; bool validation = false; bool histogram = false; bool empty = false; @@ -59,13 +57,9 @@ int main(int argc, char** argv) { } else if (*i == "--data") { ++i; datadir = *i; - } else if (*i == "--transfer") { - transfer = true; } else if (*i == "--validation") { - transfer = true; validation = true; } else if (*i == "--histogram") { - transfer = true; histogram = true; } else if (*i == "--empty") { empty = true; @@ -85,6 +79,8 @@ int main(int argc, char** argv) { std::cout << "Data directory '" << datadir << "' does not exist" << std::endl; return EXIT_FAILURE; } + + // TODO: remove when can run without a GPU int numberOfDevices; auto status = cudaGetDeviceCount(&numberOfDevices); if (cudaSuccess != status) { @@ -98,19 +94,11 @@ int main(int argc, char** argv) { std::vector esmodules; if (not empty) { edmodules = { - "BeamSpotToCUDA", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; + "BeamSpotToPOD", "SiPixelRawToClusterCUDA", "SiPixelRecHitCUDA", "CAHitNtupletCUDA", "PixelVertexProducerCUDA"}; esmodules = {"BeamSpotESProducer", "SiPixelFedCablingMapGPUWrapperESProducer", "SiPixelGainCalibrationForHLTGPUESProducer", "PixelCPEFastESProducer"}; - if (transfer) { - auto capos = std::find(edmodules.begin(), edmodules.end(), "CAHitNtupletCUDA"); - assert(capos != edmodules.end()); - edmodules.insert(capos + 1, "PixelTrackSoAFromCUDA"); - auto vertpos = std::find(edmodules.begin(), edmodules.end(), "PixelVertexProducerCUDA"); - assert(vertpos != edmodules.end()); - edmodules.insert(vertpos + 1, "PixelVertexSoAFromCUDA"); - } if (validation) { edmodules.emplace_back("CountValidator"); } diff --git a/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc new file mode 100644 index 000000000..209479c54 --- /dev/null +++ b/src/cudacompat/plugin-BeamSpotProducer/BeamSpotToPOD.cc @@ -0,0 +1,24 @@ +#include "DataFormats/BeamSpotPOD.h" +#include "Framework/EDProducer.h" +#include "Framework/Event.h" +#include "Framework/EventSetup.h" +#include "Framework/PluginFactory.h" + +class BeamSpotToPOD : public edm::EDProducer { +public: + explicit BeamSpotToPOD(edm::ProductRegistry& reg); + ~BeamSpotToPOD() override = default; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + +private: + const edm::EDPutTokenT bsPutToken_; +}; + +BeamSpotToPOD::BeamSpotToPOD(edm::ProductRegistry& reg) : bsPutToken_{reg.produces()} {} + +void BeamSpotToPOD::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + iEvent.emplace(bsPutToken_, iSetup.get()); +} + +DEFINE_FWK_MODULE(BeamSpotToPOD); diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 57baea007..ae0efa96c 100644 --- a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -1,6 +1,3 @@ -#include - -#include "CUDACore/Product.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" @@ -10,7 +7,7 @@ #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" class CAHitNtupletCUDA : public edm::EDProducer { public: @@ -20,25 +17,23 @@ class CAHitNtupletCUDA : public edm::EDProducer { private: void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT tokenHitCPU_; + edm::EDPutTokenT tokenTrackCPU_; CAHitNtupletGeneratorOnGPU gpuAlgo_; }; CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) - : tokenHitGPU_{reg.consumes>()}, - tokenTrackGPU_{reg.produces>()}, + : tokenHitCPU_{reg.consumes()}, + tokenTrackCPU_{reg.produces()}, gpuAlgo_(reg) {} void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { auto bf = 0.0114256972711507; // 1/fieldInGeV - auto const& phits = iEvent.get(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{phits}; - auto const& hits = ctx.get(phits); + auto const& hits = iEvent.get(tokenHitCPU_); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); } DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index d0e428da6..d91c1cc9f 100644 --- a/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cudacompat/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -43,7 +43,7 @@ namespace { using namespace std; CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(edm::ProductRegistry& reg) - : m_params(true, // onGPU + : m_params(false, // onGPU 3, // minHitsPerNtuplet, 458752, // maxNumberOfDoublets false, //useRiemannFit diff --git a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 15e3c486e..557329c9e 100644 --- a/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudacompat/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -32,7 +32,7 @@ class PixelVertexProducerCUDA : public edm::EDProducer { }; PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) - : m_OnGPU(true), + : m_OnGPU(false), m_gpuAlgo(true, // oneKernel true, // useDensity false, // useDBSCAN diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 06624744e..090cb9e55 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -1,7 +1,6 @@ -#include "CUDACore/Product.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" #include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" #include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" #include "CondFormats/SiPixelFedIds.h" @@ -22,23 +21,20 @@ #include #include -class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +class SiPixelRawToClusterCUDA : public edm::EDProducer { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); ~SiPixelRawToClusterCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; cms::cuda::ContextState ctxState_; edm::EDGetTokenT rawGetToken_; - edm::EDPutTokenT> digiPutToken_; - edm::EDPutTokenT> digiErrorPutToken_; - edm::EDPutTokenT> clusterPutToken_; + edm::EDPutTokenT digiPutToken_; + edm::EDPutTokenT digiErrorPutToken_; + edm::EDPutTokenT clusterPutToken_; pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; std::unique_ptr wordFedAppender_; @@ -51,35 +47,31 @@ class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) : rawGetToken_(reg.consumes()), - digiPutToken_(reg.produces>()), - clusterPutToken_(reg.produces>()), + digiPutToken_(reg.produces()), + clusterPutToken_(reg.produces()), isRun2_(true), includeErrors_(true), useQuality_(true) { if (includeErrors_) { - digiErrorPutToken_ = reg.produces>(); + digiErrorPutToken_ = reg.produces(); } wordFedAppender_ = std::make_unique(); } -void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error("UseQuality of the module (" + std::to_string(useQuality_) + ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."); } // get the GPU product already here so that the async transfer can begin - const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); - const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAllAsync(ctx.stream()); + const auto* gpuMap = hgpuMap.getCPUProduct(); + const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAll(); auto const& hgains = iSetup.get(); // get the GPU product already here so that the async transfer can begin - const auto* gpuGains = hgains.getGPUProductAsync(ctx.stream()); + const auto* gpuGains = hgains.getCPUProduct(); auto const& fedIds_ = iSetup.get().fedIds(); @@ -147,28 +139,23 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, } // end of for loop - gpuAlgo_.makeClustersAsync(isRun2_, - gpuMap, - gpuModulesToUnpack, - gpuGains, - *wordFedAppender_, - std::move(errors_), - wordCounterGPU, - fedCounter, - useQuality_, - includeErrors_, - false, // debug - ctx.stream()); -} - -void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - cms::cuda::ScopedContextProduce ctx{ctxState_}; + gpuAlgo_.makeClusters(isRun2_, + gpuMap, + gpuModulesToUnpack, + gpuGains, + *wordFedAppender_, + std::move(errors_), + wordCounterGPU, + fedCounter, + useQuality_, + includeErrors_, + false); // debug auto tmp = gpuAlgo_.getResults(); - ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); - ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + iEvent.emplace(digiPutToken_, std::move(tmp.first)); + iEvent.emplace(clusterPutToken_, std::move(tmp.second)); if (includeErrors_) { - ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + iEvent.emplace(digiErrorPutToken_, gpuAlgo_.getErrors()); } } diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc similarity index 74% rename from src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu rename to src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc index f5070130a..b8ad06ed0 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cc @@ -17,15 +17,9 @@ #include #include -// CUDA includes -#include -#include - // CMSSW includes +#include "CUDACore/cudaCompat.h" #include "CUDADataFormats/gpuClusteringConstants.h" -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" -#include "CUDACore/host_unique_ptr.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" // local includes @@ -40,8 +34,8 @@ namespace pixelgpudetails { constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { - word_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); - fedId_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + word_ = std::make_unique(MAX_FED_WORDS); + fedId_ = std::make_unique(MAX_FED_WORDS); } void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, @@ -519,140 +513,85 @@ namespace pixelgpudetails { } // Interface to outside - void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU *cablingMap, - const unsigned char *modToUnp, - const SiPixelGainForHLTonGPU *gains, - const WordFedAppender &wordFed, - PixelFormatterErrors &&errors, - const uint32_t wordCounter, - const uint32_t fedCounter, - bool useQualityInfo, - bool includeErrors, - bool debug, - cudaStream_t stream) { - nDigis = wordCounter; - + void SiPixelRawToClusterGPUKernel::makeClusters(bool isRun2, + const SiPixelFedCablingMapGPU *cablingMap, + const unsigned char *modToUnp, + const SiPixelGainForHLTonGPU *gains, + const WordFedAppender &wordFed, + PixelFormatterErrors &&errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug) { #ifdef GPU_DEBUG std::cout << "decoding " << wordCounter << " digis. Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; #endif - digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + digis_d = SiPixelDigisSoA(pixelgpudetails::MAX_FED_WORDS); if (includeErrors) { - digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + digiErrors_d = SiPixelDigiErrorsSoA(pixelgpudetails::MAX_FED_WORDS, std::move(errors)); } - clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); - - nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); + clusters_d = SiPixelClustersSoA(gpuClustering::MaxNumModules); if (wordCounter) // protect in case of empty event.... { - const int threadsPerBlock = 512; - const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock; // fill it all - assert(0 == wordCounter % 2); - // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cms::cuda::make_device_unique(wordCounter, stream); - auto fedId_d = cms::cuda::make_device_unique(wordCounter, stream); - - cudaCheck( - cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync( - fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream)); - // Launch rawToDigi kernel - RawToDigi_kernel<<>>( - cablingMap, - modToUnp, - wordCounter, - word_d.get(), - fedId_d.get(), - digis_d.xx(), - digis_d.yy(), - digis_d.adc(), - digis_d.pdigi(), - digis_d.rawIdArr(), - digis_d.moduleInd(), - digiErrors_d.error(), // returns nullptr if default-constructed - useQualityInfo, - includeErrors, - debug); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - if (includeErrors) { - digiErrors_d.copyErrorToHostAsync(stream); - } + RawToDigi_kernel(cablingMap, + modToUnp, + wordCounter, + wordFed.word(), + wordFed.fedId(), + digis_d.xx(), + digis_d.yy(), + digis_d.adc(), + digis_d.pdigi(), + digis_d.rawIdArr(), + digis_d.moduleInd(), + digiErrors_d.error(), // returns nullptr if default-constructed + useQualityInfo, + includeErrors, + debug); } // End of Raw2Digi and passing data for clustering { // clusterizer ... using namespace gpuClustering; - int threadsPerBlock = 256; - int blocks = - (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlock - 1) / threadsPerBlock; - - gpuCalibPixel::calibDigis<<>>(isRun2, - digis_d.moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - digis_d.adc(), - gains, - wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - -#ifdef GPU_DEBUG - std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock - << " threads\n"; -#endif - - countModules<<>>( - digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); - cudaCheck(cudaGetLastError()); + gpuCalibPixel::calibDigis(isRun2, + digis_d.moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + digis_d.adc(), + gains, + wordCounter, + clusters_d.moduleStart(), + clusters_d.clusInModule(), + clusters_d.clusModuleStart()); + + countModules(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + digis_d.setNModulesDigis(clusters_d.moduleStart()[0], wordCounter); - threadsPerBlock = 256; - blocks = MaxNumModules; -#ifdef GPU_DEBUG - std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; -#endif - findClus<<>>(digis_d.c_moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.clus(), - wordCounter); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif + findClus(digis_d.c_moduleInd(), + digis_d.c_xx(), + digis_d.c_yy(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.moduleId(), + digis_d.clus(), + wordCounter); // apply charge cut - clusterChargeCut<<>>(digis_d.moduleInd(), - digis_d.c_adc(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.c_moduleId(), - digis_d.clus(), - wordCounter); - cudaCheck(cudaGetLastError()); + clusterChargeCut(digis_d.moduleInd(), + digis_d.c_adc(), + clusters_d.c_moduleStart(), + clusters_d.clusInModule(), + clusters_d.c_moduleId(), + digis_d.clus(), + wordCounter); // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made @@ -660,20 +599,10 @@ namespace pixelgpudetails { // synchronization/ExternalWork // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); + fillHitsModuleStart(clusters_d.c_clusInModule(), clusters_d.clusModuleStart()); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), - clusters_d.clusModuleStart() + gpuClustering::MaxNumModules, - sizeof(uint32_t), - cudaMemcpyDefault, - stream)); - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - + clusters_d.setNClusters(clusters_d.clusModuleStart()[gpuClustering::MaxNumModules]); } // end clusterizer scope } } // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index 3cbce9e71..8360f70c9 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -2,14 +2,13 @@ #define RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h #include -#include +#include -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" +#include "CUDACore/cudaCompat.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/SiPixelDigiErrorsSoA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" #include "CUDACore/SimpleVector.h" -#include "CUDACore/host_unique_ptr.h" -#include "CUDACore/host_noncached_unique_ptr.h" #include "DataFormats/PixelErrors.h" struct SiPixelFedCablingMapGPU; @@ -155,8 +154,8 @@ namespace pixelgpudetails { const unsigned char* fedId() const { return fedId_.get(); } private: - cms::cuda::host::noncached::unique_ptr word_; - cms::cuda::host::noncached::unique_ptr fedId_; + std::unique_ptr word_; + std::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -167,42 +166,29 @@ namespace pixelgpudetails { SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete; SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete; - void makeClustersAsync(bool isRun2, - const SiPixelFedCablingMapGPU* cablingMap, - const unsigned char* modToUnp, - const SiPixelGainForHLTonGPU* gains, - const WordFedAppender& wordFed, - PixelFormatterErrors&& errors, - const uint32_t wordCounter, - const uint32_t fedCounter, - bool useQualityInfo, - bool includeErrors, - bool debug, - cudaStream_t stream); - - std::pair getResults() { - digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); - clusters_d.setNClusters(nModules_Clusters_h[1]); - // need to explicitly deallocate while the associated CUDA - // stream is still alive - // - // technically the statement above is not true anymore now that - // the CUDA streams are cached within the cms::cuda::StreamCache, but it is - // still better to release as early as possible - nModules_Clusters_h.reset(); + void makeClusters(bool isRun2, + const SiPixelFedCablingMapGPU* cablingMap, + const unsigned char* modToUnp, + const SiPixelGainForHLTonGPU* gains, + const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, + const uint32_t wordCounter, + const uint32_t fedCounter, + bool useQualityInfo, + bool includeErrors, + bool debug); + + std::pair getResults() { return std::make_pair(std::move(digis_d), std::move(clusters_d)); } - SiPixelDigiErrorsCUDA&& getErrors() { return std::move(digiErrors_d); } + SiPixelDigiErrorsSoA&& getErrors() { return std::move(digiErrors_d); } private: - uint32_t nDigis = 0; - // Data to be put in the event - cms::cuda::host::unique_ptr nModules_Clusters_h; - SiPixelDigisCUDA digis_d; - SiPixelClustersCUDA clusters_d; - SiPixelDigiErrorsCUDA digiErrors_d; + SiPixelDigisSoA digis_d; + SiPixelClustersSoA clusters_d; + SiPixelDigiErrorsSoA digiErrors_d; }; // see RecoLocalTracker/SiPixelClusterizer diff --git a/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h index da36be6c4..72ad36e1f 100644 --- a/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h +++ b/src/cudacompat/plugin-SiPixelClusterizer/gpuCalibPixel.h @@ -35,7 +35,7 @@ namespace gpuCalibPixel { // zero for next kernels... if (0 == first) clusModuleStart[0] = moduleStart[0] = 0; - for (int i = first; i < gpuClustering::MaxNumModules; i += gridDim.x * blockDim.x) { + for (int i = first; i < static_cast(gpuClustering::MaxNumModules); i += gridDim.x * blockDim.x) { nClustersInModule[i] = 0; } diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc new file mode 100644 index 000000000..e3ce1fb01 --- /dev/null +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cc @@ -0,0 +1,56 @@ +// C++ headers +#include +#include + +// CMSSW headers +#include "CUDACore/cudaCompat.h" + +#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! +#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! + +#include "PixelRecHits.h" +#include "gpuPixelRecHits.h" + +namespace { + __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t* hitsLayerStart) { + assert(0 == hitsModuleStart[0]); + + int begin = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int end = 11; + for (int i = begin; i < end; i += blockDim.x * gridDim.x) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; +#ifdef GPU_DEBUG + printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); +#endif + } + } +} // namespace + +namespace pixelgpudetails { + + TrackingRecHit2DCPU PixelRecHitGPUKernel::makeHits(SiPixelDigisSoA const& digis_d, + SiPixelClustersSoA const& clusters_d, + BeamSpotPOD const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams) const { + auto nHits = clusters_d.nClusters(); + TrackingRecHit2DCPU hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), nullptr); + + if (digis_d.nModules()) // protect from empty events + gpuPixelRecHits::getHits(cpeParams, &bs_d, digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cudaCheck(cudaGetLastError()); + + // assuming full warp of threads is better than a smaller number... + if (nHits) { + setHitsLayerStart(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + } + + if (nHits) { + cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256); + } + + return hits_d; + } + +} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu deleted file mode 100644 index 4cd3fc152..000000000 --- a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.cu +++ /dev/null @@ -1,78 +0,0 @@ -// C++ headers -#include -#include - -// CUDA runtime -#include - -// CMSSW headers -#include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" -#include "plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h" // ! -#include "plugin-SiPixelClusterizer/gpuClusteringConstants.h" // ! - -#include "PixelRecHits.h" -#include "gpuPixelRecHits.h" - -namespace { - __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t* hitsLayerStart) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - - assert(0 == hitsModuleStart[0]); - - if (i < 11) { - hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; -#ifdef GPU_DEBUG - printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); -#endif - } - } -} // namespace - -namespace pixelgpudetails { - - TrackingRecHit2DCUDA PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const { - auto nHits = clusters_d.nClusters(); - TrackingRecHit2DCUDA hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream); - - int threadsPerBlock = 128; - int blocks = digis_d.nModules(); // active modules (with digis) - -#ifdef GPU_DEBUG - std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; -#endif - if (blocks) // protect from empty events - gpuPixelRecHits::getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - // assuming full warp of threads is better than a smaller number... - if (nHits) { - setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); - cudaCheck(cudaGetLastError()); - } - - if (nHits) { - cms::cuda::fillManyFromVector(hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); - cudaCheck(cudaGetLastError()); - } - -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif - - return hits_d; - } - -} // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h index 8f5653fbd..68c02230d 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h +++ b/src/cudacompat/plugin-SiPixelRecHits/PixelRecHits.h @@ -3,12 +3,10 @@ #include -#include - -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/BeamSpotPOD.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" namespace pixelgpudetails { @@ -22,11 +20,10 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - TrackingRecHit2DCUDA makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - cudaStream_t stream) const; + TrackingRecHit2DCPU makeHits(SiPixelDigisSoA const& digis_d, + SiPixelClustersSoA const& clusters_d, + BeamSpotPOD const& bs_d, + pixelCPEforGPU::ParamsOnGPU const* cpeParams) const; }; } // namespace pixelgpudetails diff --git a/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index a82e23eab..d5b9dc687 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudacompat/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -1,10 +1,9 @@ #include -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDACore/Product.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" +#include "DataFormats/BeamSpotPOD.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" @@ -23,39 +22,34 @@ class SiPixelRecHitCUDA : public edm::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; + edm::EDGetTokenT tBeamSpot; + edm::EDGetTokenT token_; + edm::EDGetTokenT tokenDigi_; - edm::EDPutTokenT> tokenHit_; + edm::EDPutTokenT tokenHit_; pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) - : tBeamSpot(reg.consumes>()), - token_(reg.consumes>()), - tokenDigi_(reg.consumes>()), - tokenHit_(reg.produces>()) {} + : tBeamSpot(reg.consumes()), + token_(reg.consumes()), + tokenDigi_(reg.consumes()), + tokenHit_(reg.produces()) {} void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { PixelCPEFast const& fcpe = es.get(); - auto const& pclusters = iEvent.get(token_); - cms::cuda::ScopedContextProduce ctx{pclusters}; - - auto const& clusters = ctx.get(pclusters); - auto const& digis = ctx.get(iEvent, tokenDigi_); - auto const& bs = ctx.get(iEvent, tBeamSpot); + auto const& clusters = iEvent.get(token_); + auto const& digis = iEvent.get(tokenDigi_); + auto const& bs = iEvent.get(tBeamSpot); auto nHits = clusters.nClusters(); if (nHits >= TrackingRecHit2DSOAView::maxHits()) { std::cout << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits() << std::endl; } - ctx.emplace(iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); + iEvent.emplace(tokenHit_, gpuAlgo_.makeHits(digis, clusters, bs, &fcpe.getCPUProduct())); } DEFINE_FWK_MODULE(SiPixelRecHitCUDA); diff --git a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h index 433d3b012..d58984893 100644 --- a/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cudacompat/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -5,8 +5,8 @@ #include #include -#include "CUDADataFormats/BeamSpotCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "DataFormats/BeamSpotPOD.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "DataFormats/approx_atan2.h" #include "CUDACore/cuda_assert.h" #include "CondFormats/pixelCPEforGPU.h" diff --git a/src/cudacompat/plugin-Validation/CountValidator.cc b/src/cudacompat/plugin-Validation/CountValidator.cc index 23352f5ba..92a6c148d 100644 --- a/src/cudacompat/plugin-Validation/CountValidator.cc +++ b/src/cudacompat/plugin-Validation/CountValidator.cc @@ -1,8 +1,6 @@ -#include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "DataFormats/DigiClusterCount.h" #include "DataFormats/TrackCount.h" @@ -38,8 +36,8 @@ class CountValidator : public edm::EDProducer { edm::EDGetTokenT trackCountToken_; edm::EDGetTokenT vertexCountToken_; - edm::EDGetTokenT> digiToken_; - edm::EDGetTokenT> clusterToken_; + edm::EDGetTokenT digiToken_; + edm::EDGetTokenT clusterToken_; edm::EDGetTokenT trackToken_; edm::EDGetTokenT vertexToken_; }; @@ -48,8 +46,8 @@ CountValidator::CountValidator(edm::ProductRegistry& reg) : digiClusterCountToken_(reg.consumes()), trackCountToken_(reg.consumes()), vertexCountToken_(reg.consumes()), - digiToken_(reg.consumes>()), - clusterToken_(reg.consumes>()), + digiToken_(reg.consumes()), + clusterToken_(reg.consumes()), trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} @@ -63,11 +61,9 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) ss << "Event " << iEvent.eventID() << " "; { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextProduce ctx{pdigis}; auto const& count = iEvent.get(digiClusterCountToken_); - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); + auto const& digis = iEvent.get(digiToken_); + auto const& clusters = iEvent.get(clusterToken_); if (digis.nModules() != count.nModules()) { ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); diff --git a/src/cudacompat/plugin-Validation/HistoValidator.cc b/src/cudacompat/plugin-Validation/HistoValidator.cc index d7b11d4b2..47f0159f4 100644 --- a/src/cudacompat/plugin-Validation/HistoValidator.cc +++ b/src/cudacompat/plugin-Validation/HistoValidator.cc @@ -1,9 +1,7 @@ -#include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/SiPixelClustersCUDA.h" -#include "CUDADataFormats/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/SiPixelClustersSoA.h" +#include "CUDADataFormats/SiPixelDigisSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" @@ -15,34 +13,20 @@ #include #include -class HistoValidator : public edm::EDProducerExternalWork { +class HistoValidator : public edm::EDProducer { public: explicit HistoValidator(edm::ProductRegistry& reg); private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; void endJob() override; - edm::EDGetTokenT> digiToken_; - edm::EDGetTokenT> clusterToken_; - edm::EDGetTokenT> hitToken_; + edm::EDGetTokenT digiToken_; + edm::EDGetTokenT clusterToken_; + edm::EDGetTokenT hitToken_; edm::EDGetTokenT trackToken_; edm::EDGetTokenT vertexToken_; - uint32_t nDigis; - uint32_t nModules; - uint32_t nClusters; - uint32_t nHits; - cms::cuda::host::unique_ptr h_adc; - cms::cuda::host::unique_ptr h_clusInModule; - cms::cuda::host::unique_ptr h_localCoord; - cms::cuda::host::unique_ptr h_globalCoord; - cms::cuda::host::unique_ptr h_charge; - cms::cuda::host::unique_ptr h_size; - static std::map histos; }; @@ -82,69 +66,49 @@ std::map HistoValidator::histos = { {"vertex_pt2", SimpleAtomicHisto(100, 0, 4000)}}; HistoValidator::HistoValidator(edm::ProductRegistry& reg) - : digiToken_(reg.consumes>()), - clusterToken_(reg.consumes>()), - hitToken_(reg.consumes>()), + : digiToken_(reg.consumes()), + clusterToken_(reg.consumes()), + hitToken_(reg.consumes()), trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} -void HistoValidator::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); - auto const& hits = ctx.get(iEvent, hitToken_); - - nDigis = digis.nDigis(); - nModules = digis.nModules(); - h_adc = digis.adcToHostAsync(ctx.stream()); - - nClusters = clusters.nClusters(); - h_clusInModule = cms::cuda::make_host_unique(nModules, ctx.stream()); - cudaCheck(cudaMemcpyAsync( - h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream())); - - nHits = hits.nHits(); - h_localCoord = hits.localCoordToHostAsync(ctx.stream()); - h_globalCoord = hits.globalCoordToHostAsync(ctx.stream()); - h_charge = hits.chargeToHostAsync(ctx.stream()); - h_size = hits.sizeToHostAsync(ctx.stream()); -} - void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + auto const& digis = iEvent.get(digiToken_); + auto const& clusters = iEvent.get(clusterToken_); + + auto const nDigis = digis.nDigis(); + auto const nModules = digis.nModules(); + + auto const nClusters = clusters.nClusters(); + + auto const* hits = iEvent.get(hitToken_).view(); + histos["digi_n"].fill(nDigis); for (uint32_t i = 0; i < nDigis; ++i) { - histos["digi_adc"].fill(h_adc[i]); + histos["digi_adc"].fill(digis.adc()[i]); } - h_adc.reset(); histos["module_n"].fill(nModules); histos["cluster_n"].fill(nClusters); for (uint32_t i = 0; i < nModules; ++i) { - histos["cluster_per_module_n"].fill(h_clusInModule[i]); + histos["cluster_per_module_n"].fill(clusters.clusInModule()[i]); } - h_clusInModule.reset(); + auto const nHits = hits->nHits(); histos["hit_n"].fill(nHits); for (uint32_t i = 0; i < nHits; ++i) { - histos["hit_lx"].fill(h_localCoord[i]); - histos["hit_ly"].fill(h_localCoord[i + nHits]); - histos["hit_lex"].fill(h_localCoord[i + 2 * nHits]); - histos["hit_ley"].fill(h_localCoord[i + 3 * nHits]); - histos["hit_gx"].fill(h_globalCoord[i]); - histos["hit_gy"].fill(h_globalCoord[i + nHits]); - histos["hit_gz"].fill(h_globalCoord[i + 2 * nHits]); - histos["hit_gr"].fill(h_globalCoord[i + 3 * nHits]); - histos["hit_charge"].fill(h_charge[i]); - histos["hit_sizex"].fill(h_size[i]); - histos["hit_sizey"].fill(h_size[i + nHits]); + histos["hit_lx"].fill(hits->xLocal(i)); + histos["hit_ly"].fill(hits->yLocal(i)); + histos["hit_lex"].fill(hits->xerrLocal(i)); + histos["hit_ley"].fill(hits->yerrLocal(i)); + histos["hit_gx"].fill(hits->xGlobal(i)); + histos["hit_gy"].fill(hits->yGlobal(i)); + histos["hit_gz"].fill(hits->zGlobal(i)); + histos["hit_gr"].fill(hits->rGlobal(i)); + histos["hit_charge"].fill(hits->charge(i)); + histos["hit_sizex"].fill(hits->clusterSizeX(i)); + histos["hit_sizey"].fill(hits->clusterSizeY(i)); } - h_localCoord.reset(); - h_globalCoord.reset(); - h_charge.reset(); - h_size.reset(); { auto const& tracks = iEvent.get(trackToken_); @@ -183,7 +147,7 @@ void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) } void HistoValidator::endJob() { - std::ofstream out("histograms_cuda.txt"); + std::ofstream out("histograms_cudacompat.txt"); for (auto const& elem : histos) { out << elem.first << " " << elem.second << "\n"; } diff --git a/src/cudacompat/plugins.txt b/src/cudacompat/plugins.txt index 52dfe3102..bab0d03f6 100644 --- a/src/cudacompat/plugins.txt +++ b/src/cudacompat/plugins.txt @@ -1,5 +1,6 @@ BeamSpotESProducer pluginBeamSpotProducer.so BeamSpotToCUDA pluginBeamSpotProducer.so +BeamSpotToPOD pluginBeamSpotProducer.so CAHitNtupletCUDA pluginPixelTriplets.so CountValidator pluginValidation.so HistoValidator pluginValidation.so