Skip to content

Commit

Permalink
Next prototype of the framework integration (#100)
Browse files Browse the repository at this point in the history
Provide a mechanism for a chain of modules to share a resource, which can be e.g. CUDA device memory or a CUDA stream.
Minimize data movements between the CPU and the device, and support multiple devices.
Allow the same job configuration to be used on all hardware combinations.

See HeterogeneousCore/CUDACore/README.md for a more detailed description and examples.
  • Loading branch information
makortel authored and fwyzard committed Mar 13, 2019
1 parent 96828db commit 2ebb643
Show file tree
Hide file tree
Showing 114 changed files with 3,928 additions and 1,107 deletions.
7 changes: 7 additions & 0 deletions CUDADataFormats/Common/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<use name="cuda-api-wrappers"/>
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>

<export>
<lib name="1"/>
</export>
51 changes: 51 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#ifndef CUDADataFormats_Common_CUDAProduct_h
#define CUDADataFormats_Common_CUDAProduct_h

#include <memory>

#include <cuda/api_wrappers.h>

#include "CUDADataFormats/Common/interface/CUDAProductBase.h"

// Forward declaration only: edm::Wrapper is befriended below without
// pulling in the full DataFormats/Common/interface/Wrapper.h here.
namespace edm {
template <typename T> class Wrapper;
}

/**
 * The purpose of this class is to wrap CUDA data to edm::Event in a
 * way which forces correct use of various utilities.
 *
 * The non-default construction has to be done with CUDAScopedContext
 * (in order to properly register the CUDA event).
 *
 * The default constructor is needed only for the ROOT dictionary generation.
 *
 * The CUDA event is in practice needed only for stream-stream
 * synchronization, but something with a long-enough lifetime has to own
 * it. This is a somewhat natural place. If the overhead is too much, we
 * can e.g. make CUDAService own them (creating them on demand) and
 * use them only where synchronization between streams is needed.
 */
template <typename T>
class CUDAProduct: public CUDAProductBase {
public:
CUDAProduct() = default; // Needed only for ROOT dictionary generation

// Copying is forbidden: the product represents uniquely-owned device-side
// data; moves are allowed so it can be put into / taken from the event.
CUDAProduct(const CUDAProduct&) = delete;
CUDAProduct& operator=(const CUDAProduct&) = delete;
CUDAProduct(CUDAProduct&&) = default;
CUDAProduct& operator=(CUDAProduct&&) = default;

private:
// Only CUDAScopedContext may construct a valid product (it registers the
// CUDA event via the CUDAProductBase constructor); edm::Wrapper needs
// access for the framework's product wrapping.
friend class CUDAScopedContext;
friend class edm::Wrapper<CUDAProduct<T>>;

// device: CUDA device id the data lives on; stream: the CUDA stream the
// producing work was queued to; data: the wrapped payload (moved in).
explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, T data):
CUDAProductBase(device, std::move(stream)),
data_(std::move(data))
{}

T data_; //! transient: excluded from ROOT persistency (see classes_def.xml, persistent="false")
};

#endif
40 changes: 40 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProductBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef CUDADataFormats_Common_CUDAProductBase_h
#define CUDADataFormats_Common_CUDAProductBase_h

#include <memory>

#include <cuda/api_wrappers.h>

/**
 * Base class for all instantiations of CUDA<T> to hold the
 * non-T-dependent members: the CUDA device id, the CUDA stream the
 * producing work was queued to, and the CUDA event recorded on that
 * stream (used for stream-stream synchronization).
 */
class CUDAProductBase {
public:
CUDAProductBase() = default; // Needed only for ROOT dictionary generation

// A default-constructed product has no stream; only products made via the
// protected constructor (i.e. through CUDAScopedContext) are valid.
bool isValid() const { return stream_.get() != nullptr; }

// CUDA device id the data lives on (-1 for a default-constructed product).
int device() const { return device_; }

// Accessors for the CUDA stream; streamPtr() exposes the shared_ptr so
// that ownership of the stream can be shared with other products.
const cuda::stream_t<>& stream() const { return *stream_; }
cuda::stream_t<>& stream() { return *stream_; }
const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }

// The CUDA event recorded at construction time; it becomes "occurred"
// once all preceding work in the stream has finished.
const cuda::event_t& event() const { return *event_; }
cuda::event_t& event() { return *event_; }

protected:
// Called by derived classes (via CUDAScopedContext); obtains an event
// from CUDAService and records it on the given stream (see the .cc file).
explicit CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream);

private:
// The cuda::stream_t is really shared among edm::Event products, so
// using shared_ptr also here
std::shared_ptr<cuda::stream_t<>> stream_; //!
// shared_ptr because of caching in CUDAService
std::shared_ptr<cuda::event_t> event_; //!

int device_ = -1; //! -1 means "no device", i.e. default-constructed
};

#endif
16 changes: 0 additions & 16 deletions CUDADataFormats/Common/interface/device_unique_ptr.h

This file was deleted.

16 changes: 0 additions & 16 deletions CUDADataFormats/Common/interface/host_unique_ptr.h

This file was deleted.

19 changes: 19 additions & 0 deletions CUDADataFormats/Common/src/CUDAProductBase.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "CUDADataFormats/Common/interface/CUDAProductBase.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

// Constructs a valid product: stores the device id and the (shared) stream,
// then obtains a CUDA event from CUDAService and records it on the stream.
CUDAProductBase::CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream):
stream_(std::move(stream)),
device_(device)
{
// The event is obtained from (and cached by) CUDAService rather than
// created here directly — hence the shared_ptr (see header).
edm::Service<CUDAService> cs;
event_ = cs->getCUDAEvent();

// Record CUDA event to the CUDA stream. The event will become
// "occurred" after all work queued to the stream before this
// point has been finished.
event_->record(stream_->id());
}


5 changes: 5 additions & 0 deletions CUDADataFormats/Common/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<bin file="test*.cc" name="testCUDADataFormatsCommon">
<use name="HeterogeneousCore/CUDACore"/>
<use name="catch2"/>
<use name="cuda"/>
</bin>
63 changes: 63 additions & 0 deletions CUDADataFormats/Common/test/test_CUDAProduct.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#include "catch.hpp"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"

#include <cuda_runtime_api.h>

// Test-only helper: exposes the CUDAScopedContext constructor taking an
// explicit device and stream (presumably private and befriended to this
// class — confirm against CUDAScopedContext.h) so the test below can make
// a context without a real framework event.
namespace cudatest {
class TestCUDAScopedContext {
public:
static
CUDAScopedContext make(int dev) {
// implicitly_synchronizes_with_default_stream: the created stream syncs
// with the legacy default stream, keeping the test deterministic.
auto device = cuda::device::get(dev);
return CUDAScopedContext(dev, std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)));
}
};
}

// Exercises CUDAProduct<T>: default (invalid) construction, construction
// through CUDAScopedContext, and move semantics. The device-dependent
// sections are skipped on machines without a CUDA device.
TEST_CASE("Use of CUDAProduct template", "[CUDACore]") {
  SECTION("Default constructed") {
    auto foo = CUDAProduct<int>();
    REQUIRE(!foo.isValid());

    // An invalid product must still be movable.
    auto bar = std::move(foo);
  }

  // Bail out (cleanly) of the remaining, device-dependent checks when no
  // CUDA device is available.
  exitSansCUDADevices();

  constexpr int defaultDevice = 0;
  {
    auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice);
    std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
    auto& data = *dataPtr;

    SECTION("Construct from CUDAScopedContext") {
      REQUIRE(data.isValid());
      REQUIRE(data.device() == defaultDevice);
      REQUIRE(data.stream().id() == ctx.stream().id());
      // NOTE(review): the original check here was
      //   REQUIRE(&data.event() != nullptr);
      // which is a tautology — the address of a reference can never be
      // null (compilers warn about it). Verify instead that the event is
      // accessible without throwing.
      REQUIRE_NOTHROW(data.event());
    }

    SECTION("Move constructor") {
      auto data2 = CUDAProduct<int>(std::move(data));
      REQUIRE(data2.isValid());
      REQUIRE(!data.isValid());
    }

    SECTION("Move assignment") {
      CUDAProduct<int> data2;
      data2 = std::move(data);
      REQUIRE(data2.isValid());
      REQUIRE(!data.isValid());
    }
  }

  // Destroy and clean up all resources so that the next test can
  // assume to start from a clean state.
  cudaCheck(cudaSetDevice(defaultDevice));
  cudaCheck(cudaDeviceSynchronize());
  // cudaDeviceReset() also returns a cudaError_t; check it like the
  // calls above instead of silently discarding a possible error.
  cudaCheck(cudaDeviceReset());
}
2 changes: 2 additions & 0 deletions CUDADataFormats/Common/test/test_main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
1 change: 1 addition & 0 deletions CUDADataFormats/SiPixelCluster/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
Expand Down
29 changes: 16 additions & 13 deletions CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
Original file line number Diff line number Diff line change
@@ -1,35 +1,39 @@
#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <cuda/api_wrappers.h>

class SiPixelClustersCUDA {
public:
SiPixelClustersCUDA() = default;
explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream);
~SiPixelClustersCUDA() = default;

SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;

void setNClusters(uint32_t nClusters) {
nClusters_h = nClusters;
}

uint32_t nClusters() const { return nClusters_h; }

uint32_t *moduleStart() { return moduleStart_d.get(); }
int32_t *clus() { return clus_d.get(); }
uint32_t *clusInModule() { return clusInModule_d.get(); }
uint32_t *moduleId() { return moduleId_d.get(); }
uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }

uint32_t const *moduleStart() const { return moduleStart_d.get(); }
int32_t const *clus() const { return clus_d.get(); }
uint32_t const *clusInModule() const { return clusInModule_d.get(); }
uint32_t const *moduleId() const { return moduleId_d.get(); }
uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }

uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
int32_t const *c_clus() const { return clus_d.get(); }
uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
uint32_t const *c_moduleId() const { return moduleId_d.get(); }
uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
Expand All @@ -40,7 +44,6 @@ class SiPixelClustersCUDA {

#ifdef __CUDACC__
__device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
__device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
__device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
__device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
__device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
Expand All @@ -50,7 +53,6 @@ class SiPixelClustersCUDA {

private:
uint32_t const *moduleStart_;
int32_t const *clus_;
uint32_t const *clusInModule_;
uint32_t const *moduleId_;
uint32_t const *clusModuleStart_;
Expand All @@ -59,15 +61,16 @@ class SiPixelClustersCUDA {
DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
edm::cuda::device::unique_ptr<int32_t[]> clus_d; // cluster id of each pixel
edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module
cudautils::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
cudautils::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
cudautils::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module

// originally from rechits
edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;
cudautils::device::unique_ptr<uint32_t[]> clusModuleStart_d;

cudautils::device::unique_ptr<DeviceConstView> view_d; // "me" pointer

edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
uint32_t nClusters_h;
};

#endif
15 changes: 7 additions & 8 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,22 @@

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
moduleStart_d = cs->make_device_unique<uint32_t[]>(maxClusters+1, stream);
clusInModule_d = cs->make_device_unique<uint32_t[]>(maxClusters, stream);
moduleId_d = cs->make_device_unique<uint32_t[]>(maxClusters, stream);
clusModuleStart_d = cs->make_device_unique<uint32_t[]>(maxClusters+1, stream);

auto view = cs->make_host_unique<DeviceConstView>(stream);
view->moduleStart_ = moduleStart_d.get();
view->clus_ = clus_d.get();
view->clusInModule_ = clusInModule_d.get();
view->moduleId_ = moduleId_d.get();
view->clusModuleStart_ = clusModuleStart_d.get();

view_d = cs->make_device_unique<DeviceConstView>(stream);
cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
cudautils::copyAsync(view_d, view, stream);
}
8 changes: 8 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_SiPixelCluster_classes_h
#define CUDADataFormats_SiPixelCluster_classes_h

// Pulls in the headers needed for the ROOT dictionary generation of the
// classes selected in classes_def.xml (CUDAProduct<SiPixelClustersCUDA>
// and its edm::Wrapper). No declarations of its own.
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
4 changes: 4 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<lcgdict>
<class name="CUDAProduct<SiPixelClustersCUDA>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<SiPixelClustersCUDA>>" persistent="false"/>
</lcgdict>
2 changes: 2 additions & 0 deletions CUDADataFormats/SiPixelDigi/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
<use name="DataFormats/SiPixelRawData"/>
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
Expand Down
Loading

0 comments on commit 2ebb643

Please sign in to comment.