diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..e8f8e590 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,7 @@ +[submodule "extern/pybind11"] + path = extern/pybind11 + url = https://github.com/sbaldu/pybind11.git + branch = master +[submodule "extern/alpaka"] + path = extern/alpaka + url = https://github.com/cms-patatrack/alpaka.git diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h b/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h new file mode 100644 index 00000000..33ecf3f4 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h @@ -0,0 +1,31 @@ +#ifndef AlpakaCore_AllocatorConfig_h +#define AlpakaCore_AllocatorConfig_h + +#include + +namespace cms::alpakatools { + + namespace config { + + // bin growth factor (bin_growth in cub::CachingDeviceAllocator) + constexpr unsigned int binGrowth = 2; + + // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator + constexpr unsigned int minBin = 8; // 256 bytes + + // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; // 1 GB + + // total storage for the allocator; 0 means no limit. + constexpr size_t maxCachedBytes = 0; + + // fraction of total device memory taken for the allocator; 0 means no limit. + constexpr double maxCachedFraction = 0.8; + + // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used. 
+ + } // namespace config + +} // namespace cms::alpakatools + +#endif // AlpakaCore_AllocatorConfig_h diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h new file mode 100644 index 00000000..4561d91f --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h @@ -0,0 +1,51 @@ +#ifndef AlpakaCore_AllocatorPolicy_h +#define AlpakaCore_AllocatorPolicy_h + +#include + +namespace cms::alpakatools { + + // Which memory allocator to use + // - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost + // - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2) + // - Caching: (device and host) caching allocator + enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; + + template + constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; + +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + +#if defined ALPAKA_ACC_GPU_CUDA_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace cms::alpakatools + +#endif // 
AlpakaCore_AllocatorPolicy_h diff --git a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h new file mode 100644 index 00000000..a912024a --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h @@ -0,0 +1,145 @@ +#ifndef AlpakaCore_CachedBufAlloc_h +#define AlpakaCore_CachedBufAlloc_h + +#include + +#include "getDeviceCachingAllocator.h" +#include "getHostCachingAllocator.h" + +namespace cms::alpakatools { + + namespace traits { + + //! The caching memory allocator trait. + template + struct CachedBufAlloc { + static_assert(alpaka::meta::DependentFalseType::value, "This device does not support a caching allocator"); + }; + + //! The caching memory allocator implementation for the CPU device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCpu { + // non-cached host-only memory + return alpaka::allocAsyncBuf(queue, extent); + } + }; + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueCudaRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! 
The caching memory allocator implementation for the CUDA device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCudaRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCudaRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! 
The caching memory allocator implementation for the ROCm/HIP device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufHipRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufHipRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace traits + + template + ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { + return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_CachedBufAlloc_h diff --git a/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h new file mode 100644 index 00000000..1cda3e9d --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h @@ -0,0 +1,421 @@ +#ifndef AlpakaCore_CachingAllocator_h +#define AlpakaCore_CachingAllocator_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "alpakaDevices.h" + +// Inspired by cub::CachingDeviceAllocator + +namespace cms::alpakatools { + + namespace detail { + + inline constexpr unsigned int power(unsigned int base, unsigned int exponent) { + unsigned int power = 1; + while (exponent > 0) { + if 
(exponent & 1) { + power = power * base; + } + base = base * base; + exponent = exponent >> 1; + } + return power; + } + + // format a memory size in B/kB/MB/GB + inline std::string as_bytes(size_t value) { + if (value == std::numeric_limits::max()) { + return "unlimited"; + } else if (value >= (1 << 30) and value % (1 << 30) == 0) { + return std::to_string(value >> 30) + " GB"; + } else if (value >= (1 << 20) and value % (1 << 20) == 0) { + return std::to_string(value >> 20) + " MB"; + } else if (value >= (1 << 10) and value % (1 << 10) == 0) { + return std::to_string(value >> 10) + " kB"; + } else { + return std::to_string(value) + " B"; + } + } + + } // namespace detail + + /* + * The "memory device" identifies the memory space, i.e. the device where the memory is allocated. + * A caching allocator object is associated to a single memory `Device`, set at construction time, and unchanged for + * the lifetime of the allocator. + * + * Each allocation is associated to an event on a queue, that identifies the "synchronisation device" according to + * which the synchronisation occurs. + * The `Event` type depends only on the synchronisation `Device` type. + * The `Queue` type depends on the synchronisation `Device` type and the queue properties, either `Sync` or `Async`. + * + * **Note**: how to handle different queue and event types in a single allocator ? store and access type-punned + * queues and events ? or template the internal structures on them, but with a common base class ? + * alpaka does rely on the compile-time type for dispatch. + * + * Common use case #1: accelerator's memory allocations + * - the "memory device" is the accelerator device (e.g. a GPU); + * - the "synchronisation device" is the same accelerator device; + * - the `Queue` type is usually always the same (either `Sync` or `Async`). + * + * Common use case #2: pinned host memory allocations + * - the "memory device" is the host device (e.g. 
system memory); + * - the "synchronisation device" is the accelerator device (e.g. a GPU) whose work queue will access the host; + * memory (direct memory access from the accelerator, or scheduling `alpaka::memcpy`/`alpaka::memset`), and can + * be different for each allocation; + * - the synchronisation `Device` _type_ could potentially be different, but memory pinning is currently tied to + * the accelerator's platform (CUDA, HIP, etc.), so the device type needs to be fixed to benefit from caching; + * - the `Queue` type can be either `Sync` _or_ `Async` on any allocation. + */ + + template + class CachingAllocator { + public: + using Device = TDevice; // the "memory device", where the memory will be allocated + using Queue = TQueue; // the queue used to submit the memory operations + using Event = alpaka::Event; // the events used to synchronise the operations + using Buffer = alpaka::Buf, size_t>; + + // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. 
+ static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " + "host CPU."); + + struct CachedBytes { + size_t free = 0; // total bytes freed and cached on this device + size_t live = 0; // total bytes currently in use oin this device + size_t requested = 0; // total bytes requested and currently in use on this device + }; + + explicit CachingAllocator( + Device const& device, + unsigned int binGrowth, // bin growth factor; + unsigned int minBin, // smallest bin, corresponds to binGrowth^minBin bytes; + // smaller allocations are rounded to this value; + unsigned int maxBin, // largest bin, corresponds to binGrowth^maxBin bytes; + // larger allocations will fail; + size_t maxCachedBytes, // total storage for the allocator (0 means no limit); + double maxCachedFraction, // fraction of total device memory taken for the allocator (0 means no limit); + // if both maxCachedBytes and maxCachedFraction are non-zero, + // the smallest resulting value is used. 
+ bool reuseSameQueueAllocations, // reuse non-ready allocations if they are in the same queue as the new one; + // this is safe only if all memory operations are scheduled in the same queue + bool debug) + : device_(device), + binGrowth_(binGrowth), + minBin_(minBin), + maxBin_(maxBin), + minBinBytes_(detail::power(binGrowth, minBin)), + maxBinBytes_(detail::power(binGrowth, maxBin)), + maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)), + reuseSameQueueAllocations_(reuseSameQueueAllocations), + debug_(debug) { + if (debug_) { + std::ostringstream out; + out << "CachingAllocator settings\n" + << " bin growth " << binGrowth_ << "\n" + << " min bin " << minBin_ << "\n" + << " max bin " << maxBin_ << "\n" + << " resulting bins:\n"; + for (auto bin = minBin_; bin <= maxBin_; ++bin) { + auto binSize = detail::power(binGrowth, bin); + out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n'; + } + out << " maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_); + std::cout << out.str() << std::endl; + } + } + + ~CachingAllocator() { + { + // this should never be called while some memory blocks are still live + std::scoped_lock lock(mutex_); + assert(liveBlocks_.empty()); + assert(cachedBytes_.live == 0); + } + + freeAllCached(); + } + + // return a copy of the cache allocation status, for monitoring purposes + CachedBytes cacheStatus() const { + std::scoped_lock lock(mutex_); + return cachedBytes_; + } + + // Allocate given number of bytes on the current device associated to given queue + void* allocate(size_t bytes, Queue queue) { + // create a block descriptor for the requested allocation + BlockDescriptor block; + block.queue = std::move(queue); + block.requested = bytes; + std::tie(block.bin, block.bytes) = findBin(bytes); + + // try to re-use a cached block, or allocate a new buffer + if (not tryReuseCachedBlock(block)) { + allocateNewBlock(block); + } + + return block.buffer->data(); + } + + // frees an allocation 
+ void free(void* ptr) { + std::scoped_lock lock(mutex_); + + auto iBlock = liveBlocks_.find(ptr); + if (iBlock == liveBlocks_.end()) { + std::stringstream ss; + ss << "Trying to free a non-live block at " << ptr; + throw std::runtime_error(ss.str()); + } + // remove the block from the list of live blocks + BlockDescriptor block = std::move(iBlock->second); + liveBlocks_.erase(iBlock); + cachedBytes_.live -= block.bytes; + cachedBytes_.requested -= block.requested; + + bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_); + if (recache) { + alpaka::enqueue(*(block.queue), *(block.event)); + cachedBytes_.free += block.bytes; + // after the call to insert(), cachedBlocks_ shares ownership of the buffer + // TODO use std::move ? + cachedBlocks_.insert(std::make_pair(block.bin, block)); + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; + std::cout << out.str() << std::endl; + } + } else { + // if the buffer is not recached, it is automatically freed when block goes out of scope + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." 
<< std::endl; + std::cout << out.str() << std::endl; + } + } + } + + private: + struct BlockDescriptor { + std::optional buffer; + std::optional queue; + std::optional event; + size_t bytes = 0; + size_t requested = 0; // for monitoring only + unsigned int bin = 0; + + // the "synchronisation device" for this block + auto device() { return alpaka::getDev(*queue); } + }; + + private: + // return the maximum amount of memory that should be cached on this device + size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const { + // note that getMemBytes() returns 0 if the platform does not support querying the device memory + size_t totalMemory = alpaka::getMemBytes(device_); + size_t memoryFraction = static_cast(maxCachedFraction * totalMemory); + size_t size = std::numeric_limits::max(); + if (maxCachedBytes > 0 and maxCachedBytes < size) { + size = maxCachedBytes; + } + if (memoryFraction > 0 and memoryFraction < size) { + size = memoryFraction; + } + return size; + } + + // return (bin, bin size) + std::tuple findBin(size_t bytes) const { + if (bytes < minBinBytes_) { + return std::make_tuple(minBin_, minBinBytes_); + } + if (bytes > maxBinBytes_) { + throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + + " bytes is too large for the caching detail with maximum bin " + + std::to_string(maxBinBytes_) + + " bytes. 
You might want to increase the maximum bin size"); + } + unsigned int bin = minBin_; + size_t binBytes = minBinBytes_; + while (binBytes < bytes) { + ++bin; + binBytes *= binGrowth_; + } + return std::make_tuple(bin, binBytes); + } + + bool tryReuseCachedBlock(BlockDescriptor& block) { + std::scoped_lock lock(mutex_); + + // iterate through the range of cached blocks in the same bin + const auto [begin, end] = cachedBlocks_.equal_range(block.bin); + for (auto iBlock = begin; iBlock != end; ++iBlock) { + if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or + alpaka::isComplete(*(iBlock->second.event))) { + // associate the cached buffer to the new queue + auto queue = std::move(*(block.queue)); + // TODO cache (or remove) the debug information and use std::move() + block = iBlock->second; + block.queue = std::move(queue); + + // if the new queue is on different device than the old event, create a new event + if (block.device() != alpaka::getDev(*(block.event))) { + block.event = Event{block.device()}; + } + + // insert the cached block into the live blocks + // TODO cache (or remove) the debug information and use std::move() + liveBlocks_[block.buffer->data()] = block; + + // update the accounting information + cachedBytes_.free -= block.bytes; + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at " + << block.buffer->data() << " (" << block.bytes << " bytes) for queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() + << " (previously associated with stream " << iBlock->second.queue->m_spQueueImpl.get() << " , event " + << iBlock->second.event->m_spEventImpl.get() << ")." 
<< std::endl; + std::cout << out.str() << std::endl; + } + + // remove the reused block from the list of cached blocks + cachedBlocks_.erase(iBlock); + return true; + } + } + + return false; + } + + Buffer allocateBuffer(size_t bytes, Queue const& queue) { + if constexpr (std::is_same_v>) { + // allocate device memory + return alpaka::allocBuf(device_, bytes); + } else if constexpr (std::is_same_v) { + // allocate pinned host memory accessible by the queue's platform + return alpaka::allocMappedBuf>, std::byte, size_t>(device_, bytes); + } else { + // unsupported combination + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be " + "the host CPU."); + } + } + + void allocateNewBlock(BlockDescriptor& block) { + try { + block.buffer = allocateBuffer(block.bytes, *block.queue); + } catch (std::runtime_error const& e) { + // the allocation attempt failed: free all cached blocks on the device and retry + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes + << " bytes for queue " << block.queue->m_spQueueImpl.get() + << ", retrying after freeing cached allocations" << std::endl; + std::cout << out.str() << std::endl; + } + // TODO implement a method that frees only up to block.bytes bytes + freeAllCached(); + + // throw an exception if it fails again + block.buffer = allocateBuffer(block.bytes, *block.queue); + } + + // create a new event associated to the "synchronisation device" + block.event = Event{block.device()}; + + { + std::scoped_lock lock(mutex_); + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + // TODO use std::move() ? 
+ liveBlocks_[block.buffer->data()] = block; + } + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at " + << block.buffer->data() << " (" << block.bytes << " bytes associated with queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl; + std::cout << out.str() << std::endl; + } + } + + void freeAllCached() { + std::scoped_lock lock(mutex_); + + while (not cachedBlocks_.empty()) { + auto iBlock = cachedBlocks_.begin(); + cachedBytes_.free -= iBlock->second.bytes; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes + << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free + << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding." + << std::endl; + std::cout << out.str() << std::endl; + } + + cachedBlocks_.erase(iBlock); + } + } + + // TODO replace with a tbb::concurrent_multimap ? + using CachedBlocks = std::multimap; // ordered by the allocation bin + // TODO replace with a tbb::concurrent_map ? 
+ using BusyBlocks = std::map; // ordered by the address of the allocated memory + + inline static const std::string deviceType_ = boost::core::demangle(typeid(Device).name()); + + mutable std::mutex mutex_; + Device device_; // the device where the memory is allocated + + CachedBytes cachedBytes_; + CachedBlocks cachedBlocks_; // Set of cached device allocations available for reuse + BusyBlocks liveBlocks_; // map of pointers to the live device allocations currently in use + + const unsigned int binGrowth_; // Geometric growth factor for bin-sizes + const unsigned int minBin_; + const unsigned int maxBin_; + + const size_t minBinBytes_; + const size_t maxBinBytes_; + const size_t maxCachedBytes_; // Maximum aggregate cached bytes per device + + const bool reuseSameQueueAllocations_; + const bool debug_; + }; + +} // namespace cms::alpakatools + +#endif // AlpakaCore_CachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h new file mode 100644 index 00000000..d010dc3e --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h @@ -0,0 +1,67 @@ +#ifndef AlpakaCore_HostOnlyTask_h +#define AlpakaCore_HostOnlyTask_h + +#include +#include + +#include + +namespace alpaka { + + class HostOnlyTask { + public: + HostOnlyTask(std::function task) : task_(std::move(task)) {} + + void operator()() const { task_(); } + + private: + std::function task_; + }; + + namespace trait { + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + //! 
The CUDA async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiCudaRt; + + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + //! The HIP async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiHipRt; + + static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace trait + +} // namespace alpaka + +#endif // AlpakaCore_HostOnlyTask_h diff --git a/CLUEstering/alpaka/AlpakaCore/StreamCache.h b/CLUEstering/alpaka/AlpakaCore/StreamCache.h new file mode 100644 index 00000000..682bb7e7 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/StreamCache.h @@ -0,0 +1,56 @@ +#ifndef AlpakaCore_StreamCache_h +#define AlpakaCore_StreamCache_h + +#include +#include + +#include "alpakaConfig.h" +#include "getDeviceIndex.h" +#include "Framework/ReusableObjectHolder.h" + +namespace cms::alpakatools { + + template + class StreamCache { + using Device = alpaka::Dev; + using Platform = alpaka::Pltf; + + 
public: + // StreamCache should be constructed by the first call to + // getStreamCache() only if we have CUDA devices present + StreamCache() : cache_(alpaka::getDevCount()) {} + + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + // This function is thread safe + ALPAKA_FN_HOST std::shared_ptr get(Device const& dev) { + return cache_[cms::alpakatools::getDeviceIndex(dev)].makeOrGet([dev]() { return std::make_unique(dev); }); + } + + private: + // not thread safe, intended to be called only from CUDAService destructor + void clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // StreamCache lives through multiple tests (and go through + // multiple shutdowns of the framework). + cache_.clear(); + cache_.resize(alpaka::getDevCount()); + } + + std::vector> cache_; + }; + + // Gets the global instance of a StreamCache + // This function is thread safe + template + StreamCache& getStreamCache() { + // the public interface is thread safe + static StreamCache cache; + return cache; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_StreamCache_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc b/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc new file mode 100644 index 00000000..08ccbb1a --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc @@ -0,0 +1,32 @@ +#include + +#include + +#include "../alpakaConfig.h" +#include "../alpakaDevices.h" +#include "../initialise.h" +#include "../demangle.h" + +namespace cms::alpakatools { + + template + void initialise() { + constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; + + if (devices.empty()) { + devices = enumerate(); + auto size = devices.size(); + //std::cout << edm::demangle << " platform succesfully initialised." 
<< std::endl; + std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; + for (auto const& device : devices) { + std::cout << " - " << alpaka::getName(device) << std::endl; + } + } else { + //std::cout << edm::demangle << " platform already initialised." << std::endl; + } + } + + // explicit template instantiation definition + template void initialise(); + +} // namespace cms::alpakatools diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h new file mode 100644 index 00000000..3f94e9cf --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h @@ -0,0 +1,160 @@ +#ifndef AlpakaCore_alpakaConfig_h +#define AlpakaCore_alpakaConfig_h + +#include "alpakaFwd.h" + +namespace alpaka_common { + + // common types and dimensions + using Idx = uint32_t; + using Extent = uint32_t; + using Offsets = Extent; + + using Dim0D = alpaka::DimInt<0u>; + using Dim1D = alpaka::DimInt<1u>; + using Dim2D = alpaka::DimInt<2u>; + using Dim3D = alpaka::DimInt<3u>; + + template + using Vec = alpaka::Vec; + using Vec1D = Vec; + using Vec2D = Vec; + using Vec3D = Vec; + using Scalar = Vec; + + template + using WorkDiv = alpaka::WorkDivMembers; + using WorkDiv1D = WorkDiv; + using WorkDiv2D = WorkDiv; + using WorkDiv3D = WorkDiv; + + // host types + using DevHost = alpaka::DevCpu; + using PltfHost = alpaka::PltfCpu; + +} // namespace alpaka_common + +// trick to force expanding ALPAKA_ACCELERATOR_NAMESPACE before stringification inside DEFINE_FWK_MODULE +#define DEFINE_FWK_ALPAKA_MODULE2(name) DEFINE_FWK_MODULE(name) +#define DEFINE_FWK_ALPAKA_MODULE(name) DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) + +#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(name) DEFINE_FWK_EVENTSETUP_MODULE(name) +#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \ + DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) + +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT +namespace alpaka_cuda_async { + using 
namespace alpaka_common; + + using Platform = alpaka::PltfCudaRt; + using Device = alpaka::DevCudaRt; + using Queue = alpaka::QueueCudaRtNonBlocking; + using Event = alpaka::EventCudaRt; + + template + using Acc = alpaka::AccGpuCudaRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_cuda_async + +#endif // ALPAKA_ACC_GPU_CUDA_PRESENT + +#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async +#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT +namespace alpaka_rocm_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::QueueHipRtNonBlocking; + using Event = alpaka::EventHipRt; + + template + using Acc = alpaka::AccGpuHipRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_rocm_async + +#endif // ALPAKA_ACC_GPU_HIP_PRESENT + +#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT +namespace alpaka_serial_sync { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuSerial; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_serial_sync + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT +namespace alpaka_tbb_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuNonBlocking; + using Event = alpaka::EventCpu; + + template + 
using Acc = alpaka::AccCpuTbbBlocks; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_tbb_async + +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT +namespace alpaka_omp2_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuOmp2Blocks; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_omp2_async + +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND + +#endif // AlpakaCore_alpakaConfig_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h new file mode 100644 index 00000000..d980bf38 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h @@ -0,0 +1,40 @@ +#ifndef AlpakaCore_alpakaDevices_h +#define AlpakaCore_alpakaDevices_h + +#include +#include + +#include + +#include "alpakaConfig.h" +#include "getDeviceIndex.h" + +namespace cms::alpakatools { + + // alpaka host device + inline const alpaka_common::DevHost host = alpaka::getDevByIdx(0u); + + // alpaka accelerator devices + template + inline std::vector> devices; + + template + std::vector> enumerate() { + assert(getDeviceIndex(host) == 0u); + + using Device = alpaka::Dev; + using Platform = TPlatform; + + std::vector devices; + uint32_t n = alpaka::getDevCount(); + devices.reserve(n); + for (uint32_t i = 0; i < n; ++i) { + devices.push_back(alpaka::getDevByIdx(i)); + assert(getDeviceIndex(devices.back()) == static_cast(i)); + } + return devices; + } + 
+} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaDevices_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h new file mode 100644 index 00000000..2234e440 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h @@ -0,0 +1,96 @@ +#ifndef AlpakaCore_alpakaFwd_h +#define AlpakaCore_alpakaFwd_h + +#include +#include +#include + +/** + * This file forward declares specific types defined in Alpaka + * (depending on the backend-enabling macros) so that these types + * would be available throughout CMSSW without a direct dependence on + * Alpaka in order to avoid the constraints that would impose + * (primarily the device compiler) + * + * This is a little bit brittle, but let's see how it goes. + */ +namespace alpaka { + + // miscellanea + template + using DimInt = std::integral_constant; + + template + class Vec; + + template + class WorkDivMembers; + + // API + struct ApiCudaRt; + struct ApiHipRt; + + // Platforms + class PltfCpu; + template + class PltfUniformCudaHipRt; + using PltfCudaRt = PltfUniformCudaHipRt; + using PltfHipRt = PltfUniformCudaHipRt; + + // Devices + class DevCpu; + template + class DevUniformCudaHipRt; + using DevCudaRt = DevUniformCudaHipRt; + using DevHipRt = DevUniformCudaHipRt; + + // Queues + template + class QueueGenericThreadsBlocking; + using QueueCpuBlocking = QueueGenericThreadsBlocking; + + template + class QueueGenericThreadsNonBlocking; + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; + + namespace uniform_cuda_hip::detail { + template + class QueueUniformCudaHipRt; + } + using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + + // Events + template + class EventGenericThreads; + using EventCpu = 
EventGenericThreads; + + template + class EventUniformCudaHipRt; + using EventCudaRt = EventUniformCudaHipRt; + using EventHipRt = EventUniformCudaHipRt; + + // Accelerators + template + class AccGpuUniformCudaHipRt; + + template + using AccGpuCudaRt = AccGpuUniformCudaHipRt; + + template + using AccGpuHipRt = AccGpuUniformCudaHipRt; + + template + class AccCpuSerial; + + template + class AccCpuTbbBlocks; + + template + class AccCpuOmp2Blocks; + +} // namespace alpaka + +#endif // AlpakaCore_alpakaFwd_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h new file mode 100644 index 00000000..6bec9424 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h @@ -0,0 +1,237 @@ +#ifndef AlpakaCore_alpakaMemory_h +#define AlpakaCore_alpakaMemory_h + +#include +#include "initialise.h" + +#if __cplusplus >= 202002L +namespace cms { + using std::is_bounded_array; + using std::is_unbounded_array; +} // namespace cms +#else +#include +#include +namespace cms { + using boost::is_bounded_array; + using boost::is_unbounded_array; +} // namespace cms +#endif + +namespace cms { + template + inline constexpr bool is_bounded_array_v = is_bounded_array::value; + + template + inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; +} // namespace cms + +#include + +#include "AllocatorPolicy.h" +#include "CachedBufAlloc.h" +#include "alpakaConfig.h" +#include "alpakaDevices.h" + +namespace cms::alpakatools { + + // for Extent, Dim1D, Idx + using namespace alpaka_common; + + // type deduction helpers + namespace detail { + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + 
using type = alpaka::ViewPlainPtr; + }; + + } // namespace detail + + // scalar and 1-dimensional host buffers + + template + using host_buffer = typename detail::buffer_type::type; + + // non-cached, non-pinned, scalar and 1-dimensional host buffers + + template + std::enable_if_t, host_buffer> make_host_buffer() { + return alpaka::allocBuf(host, Scalar{}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(Extent extent) { + return alpaka::allocBuf, Idx>(host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer() { + return alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); + } + + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue + // the memory is pinned according to the device associated to the queue + + template + std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(host, queue, Scalar{}); + } else { + return alpaka::allocMappedBuf(host, alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host, queue, Vec1D{extent}); + } else { + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host, queue, Vec1D{std::extent_v}); + } else { + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional host views + + template + using host_view = typename detail::view_type::type; + + template + 
std::enable_if_t, host_view> make_host_view(T& data) { + return alpaka::ViewPlainPtr(&data, host, Scalar{}); + } + + template + host_view make_host_view(T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host, Vec1D{std::extent_v}); + } + + // scalar and 1-dimensional device buffers + + template + using device_buffer = typename detail::buffer_type::type; + + template + std::enable_if_t, device_buffer, T>> make_device_buffer( + TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf(queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf(alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(alpaka::getDev(queue), queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, 
Idx>(alpaka::getDev(queue), queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional device views + + template + using device_view = typename detail::view_type::type; + + template + std::enable_if_t, device_view> make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr(&data, device, Scalar{}); + } + + template + device_view make_device_view(TDev const& device, T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{std::extent_v}); + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaMemory_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h new file mode 100644 index 00000000..b94c89a4 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h @@ -0,0 +1,385 @@ +#ifndef AlpakaCore_alpakaWorkDiv_h +#define AlpakaCore_alpakaWorkDiv_h + +#include +#include + +#include + +#include "alpakaConfig.h" + +using namespace alpaka_common; + +namespace cms::alpakatools { + + /********************************************* + * WORKDIV CREATION + ********************************************/ + + /* + * If the first argument is not a multiple of the second argument, round it up to the next multiple. 
+ */ + inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; } + + /* + * Return the integer division of the first argument by the second argument, rounded up to the next integer. + */ + inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } + + /* + * Creates the accelerator-dependent workdiv for 1-dimensional operations. + */ + template + inline WorkDiv make_workdiv(Idx blocksPerGrid, Idx threadsPerBlockOrElementsPerThread) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - threadsPerBlock is always 1; + // - threadsPerBlockOrElementsPerThread is the number of elements per thread. + const auto threadsPerBlock = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + } + } + + /* + * Creates the accelerator-dependent workdiv for N-dimensional operations. 
+ */ + template + inline WorkDiv> make_workdiv(const Vec>& blocksPerGrid, + const Vec>& threadsPerBlockOrElementsPerThread) { + using Dim = alpaka::Dim; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - threadsPerBlock is always 1; + // - threadsPerBlockOrElementsPerThread is the number of elements per thread. + const auto threadsPerBlock = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + } + } + + /********************************************* + * RANGE COMPUTATION + ********************************************/ + + /* + * Computes the range of the elements indexes, local to the block. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the thread index in block. 
+ const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Compute the elements indexes in block. + // Obviously relevant for CPU only. + // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. + const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; + const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! + const Idx endElementIdxUncut = firstElementIdx + threadDimension; + + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdx, endElementIdxUncut}; + } + + /* + * Computes the range of the elements indexes, local to the block. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(alpaka::Dim::value == Dim1D::value, + // "Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocal, endElementIdxLocal}; + } + + /* + * Computes the range of the elements indexes in grid. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the block index in grid. 
+ const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift += blockIdxInGrid * blockDimension; + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift, dimIndex); + } + + /* + * Computes the range of the elements indexes in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(dimIndex <= alpaka::Dim::value, + //"Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxGlobal, endElementIdxGlobal] = element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxGlobal = std::min(endElementIdxGlobal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxGlobal, endElementIdxGlobal}; + } + + /* + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const unsigned int dimIndex = 0u) { + Idx elementIdxShift = 0u; + return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + } + + /********************************************* + * LOOP ON ALL ELEMENTS + ********************************************/ + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. 
+ */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + const auto& [firstElementIdx, endElementIdx] = + element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + + for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { + func(elementIdx); + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Take into account the block index in grid to compute the element indices. 
+ const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + elementIdxShift += blockIdxInGrid * blockDimension; + + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_grid(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** + * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS + **************************************************************/ + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Stride = block size. + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Stride = grid size. + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** + * LOOP ON ALL ELEMENTS WITH ONE LOOP + **************************************************************/ + + /* + * Case where the input index i has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops + * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, + * because of the additional loop over elements in Alpaka model). + * This allows to keep the 'continue' and 'break' statements as-is from legacy code, + * and hence avoids a lot of legacy code reshuffling. + * NB 2: Modifies i, firstElementIdx and endElementIdx. 
+ */ + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( + Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; + } + } + return isNextStrideElementValid; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaWorkDiv_h diff --git a/CLUEstering/alpaka/AlpakaCore/backend.h b/CLUEstering/alpaka/AlpakaCore/backend.h new file mode 100644 index 00000000..8d58953c --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/backend.h @@ -0,0 +1,17 @@ +#ifndef AlpakaCore_backend_h +#define AlpakaCore_backend_h + +enum class Backend { SERIAL, TBB, CUDA, HIP }; + +inline std::string const& name(Backend backend) { + static const std::string names[] = {"serial_sync", "tbb_async", "cuda_async", "rocm_async"}; + return names[static_cast(backend)]; +} + +template +inline T& operator<<(T& out, Backend backend) { + out << name(backend); + return out; +} + +#endif // AlpakaCore_backend_h diff --git a/CLUEstering/alpaka/AlpakaCore/chooseDevice.h b/CLUEstering/alpaka/AlpakaCore/chooseDevice.h new file mode 100644 index 00000000..5cdd4141 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/chooseDevice.h @@ -0,0 +1,24 @@ +#ifndef HeterogeneousCore_AlpakaCore_chooseDevice_h +#define HeterogeneousCore_AlpakaCore_chooseDevice_h + +#include "alpakaConfig.h" +#include "alpakaDevices.h" +#include "Framework/Event.h" + +namespace cms::alpakatools { + + template + alpaka::Dev const& chooseDevice(edm::StreamID id) { + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
+ + // TODO: improve the "assignment" logic + auto const& devices = cms::alpakatools::devices; + return devices[id % devices.size()]; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaCore_chooseDevice_h diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h new file mode 100644 index 00000000..4bcac1ec --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h @@ -0,0 +1,69 @@ +#ifndef AlpakaCore_getDeviceCachingAllocator_h +#define AlpakaCore_getDeviceCachingAllocator_h + +#include +#include +#include + +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "alpakaDevices.h" +#include "alpakaFwd.h" +#include "getDeviceIndex.h" + +namespace cms::alpakatools { + + namespace detail { + + template + auto allocate_device_allocators() { + using Allocator = CachingAllocator; + auto const& devices = cms::alpakatools::enumerate>(); + auto const size = devices.size(); + + // allocate the storage for the objects + auto ptr = std::allocator().allocate(size); + + // construct the objects in the storage + for (size_t index = 0; index < size; ++index) { + new (ptr + index) Allocator(devices[index], + config::binGrowth, + config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + true, // reuseSameQueueAllocations + false); // debug + } + + // use a custom deleter to destroy all objects and deallocate the memory + auto deleter = [size](Allocator* ptr) { + for (size_t i = size; i > 0; --i) { + (ptr + i - 1)->~Allocator(); + } + std::allocator().deallocate(ptr, size); + }; + + return std::unique_ptr(ptr, deleter); + } + + } // namespace detail + + template + inline CachingAllocator& getDeviceCachingAllocator(TDevice const& device) { + // initialise all allocators, one per device + static auto allocators = detail::allocate_device_allocators(); + + size_t const index = getDeviceIndex(device); + + std::vector devs = 
alpaka::getDevs>(); + + assert(index < cms::alpakatools::enumerate>().size()); + + // the public interface is thread safe + return allocators[index]; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getDeviceCachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h new file mode 100644 index 00000000..5abbeaa0 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h @@ -0,0 +1,29 @@ +#ifndef AlpakaCore_getDeviceIndex_h +#define AlpakaCore_getDeviceIndex_h + +#include + +namespace cms::alpakatools { + + // generic interface, for DevOacc and DevOmp5 + template + inline int getDeviceIndex(Device const& device) { + return device.iDevice(); + } + + // overload for DevCpu + inline int getDeviceIndex(alpaka::DevCpu const& device) { return 0; } + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // overload for DevCudaRt + inline int getDeviceIndex(alpaka::DevCudaRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + // overload for DevHipRt + inline int getDeviceIndex(alpaka::DevHipRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getDeviceIndex_h diff --git a/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h new file mode 100644 index 00000000..3103a7bf --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h @@ -0,0 +1,28 @@ +#ifndef AlpakaCore_getHostCachingAllocator_h +#define AlpakaCore_getHostCachingAllocator_h + +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "alpakaDevices.h" + +namespace cms::alpakatools { + + template + inline CachingAllocator& getHostCachingAllocator() { + // thread safe initialisation of the host allocator + static CachingAllocator allocator(host, + config::binGrowth, 
+ config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + false, // reuseSameQueueAllocations + false); // debug + + // the public interface is thread safe + return allocator; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getHostCachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/initialise.h b/CLUEstering/alpaka/AlpakaCore/initialise.h new file mode 100644 index 00000000..cd4125a0 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/initialise.h @@ -0,0 +1,27 @@ +#ifndef AlpakaCore_initialise_h +#define AlpakaCore_initialise_h + +#include "alpakaConfig.h" + +namespace cms::alpakatools { + + template + void initialise(); + + // explicit template instantiation declaration +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + extern template void initialise(); +#endif + +} // namespace cms::alpakatools + +#endif // AlpakaCore_initialise_h diff --git a/CLUEstering/alpaka/BindingModules/CLUEstering.py b/CLUEstering/alpaka/BindingModules/CLUEstering.py new file mode 100644 index 00000000..19cdfbbe --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/CLUEstering.py @@ -0,0 +1,925 @@ +""" +Density based clustering algorithm developed at CERN. 
+""" + +from dataclasses import dataclass +from glob import glob +import random as rnd +from math import sqrt +import time +import types +from typing import Union +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.datasets import make_blobs +from sklearn.preprocessing import StandardScaler +import CLUE_Convolutional_Kernels as clue_kernels +import CLUE_CPU_Serial as cpu_serial +tbb_found = False +if len(glob('./CLUE_CPU_TBB*.so')) != 0: + import CLUE_CPU_TBB as cpu_tbb + tbb_found = True +cuda_found = False +if len(glob('./CLUE_GPU_CUDA*.so')) != 0: + import CLUE_GPU_CUDA as gpu_cuda + cuda_found = True +hip_found = False +if len(glob('./CLUE_GPU_HIP*.so')) != 0: + import CLUE_GPU_HIP as gpu_hip + hip_found = True + +def test_blobs(n_samples: int, n_dim: int , n_blobs: int = 4, mean: float = 0, + sigma: float = 0.5, x_max: float = 30, y_max: float = 30) -> pd.DataFrame: + """ + Returns a dataframe containing randomly generated 2-dimensional or 3-dimensional blobs. + + This functions serves as a tool for generating a random dataset to test the library. + + Parameters + ---------- + n_samples : int + The number of points in the dataset. + n_dim : int + The number of dimensions. + n_blobs : int, optional + The number of blobs that should be produced. By default it is set to 4. + mean : float, optional + The mean of the gaussian distribution of the z values. + sigma : float, optional + The standard deviation of the gaussian distribution of the z values. + x_max : float, optional + Limit of the space where the blobs are created in the x direction. + y_max : float, optional + Limit of the space where the blobs are created in the y direction. + + Returns + ------- + Pandas DataFrame + DataFrame containing n_blobs gaussian blobs. + """ + + if n_blobs < 0: + raise ValueError('Wrong parameter value. The number of blobs must be positive.') + if sigma < 0.: + raise ValueError("Wrong parameter value. 
The mean and sigma of the blobs" + + " cannot be negative.") + if n_dim > 3: + raise ValueError("Wrong number of dimensions. Blobs can only be generated" + + " in 2 or 3 dimensions.") + centers = [] + if n_dim == 2: + data = {'x0': np.array([]), 'x1': np.array([]), 'weight': np.array([])} + centers = [[x_max * rnd.random(), y_max * rnd.random()] for _ in range(n_blobs)] + blob_data = make_blobs(n_samples=n_samples, centers=np.array(centers))[0] + + data['x0'] = blob_data.T[0] + data['x1'] = blob_data.T[1] + data['weight'] = np.full(shape=len(blob_data.T[0]), fill_value=1) + + + return pd.DataFrame(data) + if n_dim == 3: + data = {'x0': [], 'x1': [], 'x2': [], 'weight': []} + sqrt_samples = int(sqrt(n_samples)) + z_values = np.random.normal(mean, sigma,sqrt_samples) + centers = [[x_max * rnd.random(), y_max * rnd.random()] for _ in range(n_blobs)] + + for value in z_values: # for every z value, a layer is generated. + blob_data = make_blobs(n_samples=sqrt_samples, centers=np.array(centers))[0] + data['x0'] = np.concatenate([data['x0'], blob_data.T[0]]) + data['x1'] = np.concatenate([data['x1'], blob_data.T[1]]) + data['x2'] = np.concatenate([data['x2'], np.full(shape=sqrt_samples, + fill_value=value)]) + data['weight'] = np.concatenate([data['weight'], np.full(shape=sqrt_samples, + fill_value=1)]) + + return pd.DataFrame(data) + +@dataclass() +class clustering_data: + """ + Container characterizing the data used for clustering. + + Attributes + ---------- + coords : np.ndarray + Spatially normalized data coordinates in the coordinate system used for clustering. + original_coords : np.ndarray + Data coordinates in the original coordinate system provided by the user. + weight : np.ndarray + Weight values of the data points. + domain_ranges : list + List containing the ranges of the domains for every coordinate. + n_dim : int + Number of dimensions. + n_points : int + Number of points in the clustering data. 
+ """ + + coords : np.ndarray + original_coords : np.ndarray + weight : np.ndarray + n_dim : int + n_points : int + +@dataclass(eq=False) +class cluster_properties: + """ + Container of the data resulting from the clusterization of the input data. + + Attributes + ---------- + n_clusters : int + Number of clusters constructed. + cluster_ids : np.ndarray + Array containing the cluster_id of each point. + is_seed : np.ndarray + Array of integers containing '1' if a point is a seed and '0' if it isn't + cluster_points : np.ndarray + Array containing, for each cluster, the list of point_ids corresponding to the + clusters belonging to that cluster. + points_per_cluster : np.ndarray + Array containing the number of points belonging to each cluster. + output_df : pd.DataFrame + Dataframe containing is_seed and cluster_ids as columns. + """ + + n_clusters : int + cluster_ids : np.ndarray + is_seed : np.ndarray + cluster_points : np.ndarray + points_per_cluster : np.ndarray + output_df : pd.DataFrame + + def __eq__(self, other): + if self.n_clusters != other.n_clusters: + return False + if not (self.cluster_ids == other.cluster_ids).all(): + return False + if not (self.is_seed == other.is_seed).all(): + return False + + return True + + +class clusterer: + """ + Class representing a wrapper for the methods used in the process of clustering using + the CLUE algorithm. + + Attributes + ---------- + dc_ : float + Spatial parameter indicating how large is the region over which the local density of + each point is calculated. + rhoc : float + Parameter representing the density threshold value which divides seeds and + outliers. + + Points with a density lower than rhoc can't be seeds, can only be followers + or outliers. + outlier : float + Multiplicative increment of dc_ for getting the region over which the followers of a + point are searched. 
+ + While dc_ determines the size of the search box in which the neighbors of a point are + searched when calculating its local density, when looking for followers while trying + to find potential seeds the size of the search box is given by dm = dc_ * outlier. + ppbin : int + Average number of points to be found in each tile. + kernel : Algo.kernel + Convolution kernel used to calculate the local density of the points. + clust_data : clustering_data + Container of the data used by the clustering algorithm. + clust_prop : cluster_properties + Container of the data produced as output of the algorithm. + elapsed_time : int + Execution time of the algorithm, expressed in nanoseconds. + """ + + def __init__(self, dc_: float, rhoc_: float, outlier_: float, ppbin: int = 10): + self.dc_ = dc_ + self.rhoc = rhoc_ + self.outlier = outlier_ + self.ppbin = ppbin + + # Initialize attributes + ## Data containers + self.clust_data = None + self.scaler = StandardScaler() + + ## Kernel for calculation of local density + self.kernel = clue_kernels.FlatKernel(0.5) + + ## Output attributes + self.clust_prop = None + self.elapsed_time = 0. + + def _read_array(self, input_data: Union[list,np.ndarray]) -> None: + """ + Reads data provided with lists or np.ndarrays + + Attributes + ---------- + input_data : list, np.ndarray + The coordinates and weight values of the data points + + Modified attributes + ------------------- + clust_data : clustering_data + Properties of the input data + + Returns + ------- + None + """ + + if len(input_data) < 2 or len(input_data) > 10: + raise ValueError("Inadequate data. 
The data must contain at least one coordinate" + + " and the weight.") + self.clust_data = clustering_data(np.asarray(input_data[:-1]), + np.copy(np.asarray(input_data[:-1])), + np.asarray(input_data[-1]), + len(input_data[:-1]), + len(input_data[-1])) + + def _read_string(self, input_data: str) -> Union[pd.DataFrame,None]: + """ + Reads data provided by passing a string containing the path to a csv file + + Attributes + ---------- + input_data : str + The path to the csv file containing the input data + + Modified attributes + ------------------- + None + + Returns + ------------------- + pd.DataFrame + Dataframe containing the input data + """ + + if not input_data.endswith('.csv'): + raise ValueError('Wrong type of file. The file is not a csv file.') + df_ = pd.read_csv(input_data) + return df_ + + def _read_dict_df(self, input_data: Union[dict,pd.DataFrame]) -> pd.DataFrame: + """ + Reads data provided using dictionaries or pandas dataframes + + Attributes + ---------- + input_data : dict, pd.DataFrame + The coordinates and weight values of the data points + + Modified attributes + ------------------- + None + + Returns + ------------------- + pd.DataFrame + Dataframe containing the input data + """ + + df_ = pd.DataFrame(input_data, copy=False) + return df_ + + def _handle_dataframe(self, df_: pd.DataFrame) -> None: + """ + Constructs the clust_data attribute from the dataframe produced by the + _read_string or _read_dict_df methods + + Modified attributes + ------------------- + clust_data : clustering_data + Properties of the input data + + Returns + ------- + None + """ + + # Check that the user provided the weights + if 'weight' not in df_.columns: + raise ValueError("Inadequate data. The input dataframe must" + + " contain a weight column.") + + coordinate_columns = [col for col in df_.columns if col[0] == 'x'] + + # Check that the dimensionality of the dataset is adequate + if len(df_.columns) < 2: + raise ValueError("Inadequate data. 
The data must contain" + + " at least one coordinate and the weight.") + if len(coordinate_columns) > 10: + raise ValueError("Inadequate data. The maximum number of" + + " dimensions supported is 10.") + n_dim = len(coordinate_columns) + n_points = len(df_.index) + coords = df_.iloc[:, 0:-1].to_numpy() + + self.clust_data = clustering_data(coords, + np.copy(coords), + np.asarray(df_['weight']), + n_dim, + n_points) + + def _rescale(self) -> None: + """ + Normalizes the input data using a standard scaler + + Modified attributes + ------------------- + clust_data.coords : np.ndarray + Array containing the coordinates and weight values of the data points + + Returns + ------- + None + """ + + for dim in range(self.clust_data.n_dim): + self.clust_data.coords.T[dim] = \ + self.scaler.fit_transform(self.clust_data.coords.T[dim].reshape(-1, 1)).reshape(1, -1)[0] + + def read_data(self, + input_data: Union[pd.DataFrame,str,dict,list,np.ndarray]) -> None: + """ + Reads the data in input and fills the class members containing the coordinates + of the points, the weight, the number of dimensions and the number of points. + + Parameters + ---------- + input_data : pandas dataframe + The dataframe should contain one column for every + coordinate, each one called 'x*', and one column for the weight. + input_data : string + The string should contain the full path to a csv file containing + the data. + input_data : dict + input_data : array_like + The list or numpy array should contain a list of lists for the + coordinates and a list for the weight. + kwargs : tuples + Tuples corresponding to the domain of any periodic variables. The + keyword should be the keyword of the corresponding variable. + + Modified attributes + ------------------- + coords : ndarray + Point coordinates used for clustering, spatially normalized. + original_coords : ndarray + Point coordinates in the original coordinate system used by the user. + weight : ndarray + Weights of all the points. 
+ domain_ranges : list of Algo.domain_t + List of the domains for each coordinate. + n_dim : int + The number of dimensions in which we are calculating the clusters. + n_points : int + The number of points in the dataset. + + Returns + ------- + None + """ + + # lists and np ndarrays + if isinstance(input_data, (list, np.ndarray)): + self._read_array(input_data) + + # path to .csv file or pandas dataframe + if isinstance(input_data, (str)): + df = self._read_string(input_data) + self._handle_dataframe(df) + + if isinstance(input_data, (dict, pd.DataFrame)): + df = self._read_dict_df(input_data) + self._handle_dataframe(df) + + # Rescale the coordinates with a standard scaler + self._rescale() + + def change_coordinates(self, **kwargs: types.FunctionType) -> None: + """ + Change the coordinate system + + Parameters + ---------- + kwargs : function objects + The functions for the change of coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). + + Modifies attributes + ------------------- + coords : ndarray + Coordinates used for clustering converted in the chosen coordinate system. + + Returns + ------- + None + """ + + # Change the coordinate system + for coord, func in kwargs.items(): + self.clust_data.coords[int(coord[1])] = func(self.clust_data.original_coords) + + # Normalize the coordinate with a standard scaler + self.clust_data.coords[int(coord[1])] = \ + self.scaler.fit_transform( + self.clust_data.coords[int(coord[1])].reshape(-1, 1) + ).reshape(1, -1)[0] + + def choose_kernel(self, + choice: str, + parameters: Union[list,None] = None, + function: types.FunctionType = lambda: 0) -> None: + """ + Changes the kernel used in the calculation of local density. The default kernel + is a flat kernel with parameter 0.5 + + Parameters + ---------- + choice : string + The type of kernel that you want to choose (flat, exp, gaus or custom). + parameters : array_like, optional + List of the parameters needed by the kernels. 
+ The flat kernel requires one, the exponential requires two + (amplitude and mean), the gaussian requires three (amplitude, + mean and standard deviation) and the custom doesn't require any. + function : function object, optional + Function that should be used as kernel when the custom kernel is chosen. + + Modified attributes + ------------------- + kernel : Algo.kernel + + Return + ------ + None + """ + + if choice == "flat": + if len(parameters) != 1: + raise ValueError("Wrong number of parameters. The flat kernel" + + " requires 1 parameter.") + self.kernel = CLUE_Convolutional_Kernels.FlatKernel(parameters[0]) + elif choice == "exp": + if len(parameters) != 2: + raise ValueError("Wrong number of parameters. The exponential" + + " kernel requires 2 parameters.") + self.kernel = CLUE_Convolutional_Kernels.ExponentialKernel(parameters[0], parameters[1]) + elif choice == "gaus": + if len(parameters) != 3: + raise ValueError("Wrong number of parameters. The gaussian" + + " kernel requires 3 parameters.") + self.kernel = CLUE_Convolutional_Kernels.GaussinKernel(parameters[0], parameters[1], parameters[2]) + elif choice == "custom": + if len(parameters) != 0: + raise ValueError("Wrong number of parameters. Custom kernels" + + " requires 0 parameters.") + else: + raise ValueError("Invalid kernel. The allowed choices for the" + + " kernels are: flat, exp, gaus and custom.") + + def list_devices(self, backend: str = "all") -> None: + """ + Lists the devices available for the chosen backend. + + Parameters + ---------- + backend : string, optional + The backend for which the devices are listed. The allowed values are + 'all', 'cpu serial', 'cpu tbb' and 'gpu cuda'. + The default value is 'all'. + + Raises + ------ + ValueError : If the backend is not valid. 
+ """ + + if backend == "all": + cpu_serial.listDevices('cpu serial') + if tbb_found: + cpu_tbb.listDevices('cpu tbb') + if cuda_found: + gpu_cuda.listDevices('gpu cuda') + if hip_found: + gpu_hip.listDevices('gpu hip') + elif backend == "cpu serial": + cpu_serial.listDevices(backend) + elif backend == "cpu tbb": + if tbb_found: + cpu_tbb.listDevices(backend) + else: + print("TBB module not found. Please re-compile the library and try again.") + elif backend == "gpu cuda": + if cuda_found: + gpu_cuda.listDevices(backend) + else: + print("CUDA module not found. Please re-compile the library and try again.") + elif backend == "gpu hip": + if hip_found: + gpu_hip.listDevices(backend) + else: + print("HIP module not found. Please re-compile the library and try again.") + else: + raise ValueError("Invalid backend. The allowed choices for the" + + " backend are: all, cpu serial, cpu tbb and gpu cuda.") + + def run_clue(self, + backend: str = "cpu serial", + block_size: int = 1024, + device_id: int = 0, + verbose: bool = False) -> None: + """ + Executes the CLUE clustering algorithm. + + Parameters + ---------- + verbose : bool, optional + The verbose option prints the execution time of runCLUE and the number + of clusters found. + + Modified attributes + ------------------- + cluster_ids : ndarray + Contains the cluster_id corresponding to every point. + is_seed : ndarray + For every point the value is 1 if the point is a seed or an + outlier and 0 if it isn't. + n_clusters : int + Number of clusters reconstructed. + cluster_points : ndarray of lists + Contains, for every cluster, the list of points associated to id. + points_per_cluster : ndarray + Contains the number of points associated to every cluster. 
+ + Return + ------ + None + """ + + start = time.time_ns() + if backend == "cpu serial": + cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin, + self.clust_data.coords, self.clust_data.weight, + self.kernel, self.clust_data.n_dim, block_size, + device_id) + elif backend == "cpu tbb": + if tbb_found: + cluster_id_is_seed = cpu_tbb.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("TBB module not found. Please re-compile the library and try again.") + + elif backend == "gpu cuda": + if cuda_found: + cluster_id_is_seed = gpu_cuda.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("CUDA module not found. Please re-compile the library and try again.") + + elif backend == "gpu hip": + if hip_found: + cluster_id_is_seed = gpu_hip.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("HIP module not found. 
Please re-compile the library and try again.") + + finish = time.time_ns() + cluster_ids = np.array(cluster_id_is_seed[0]) + is_seed = np.array(cluster_id_is_seed[1]) + n_clusters = len(np.unique(cluster_ids)) + + cluster_points = [[] for _ in range(n_clusters)] + for i in range(self.clust_data.n_points): + cluster_points[cluster_ids[i]].append(i) + + points_per_cluster = np.array([len(clust) for clust in cluster_points]) + + data = {'cluster_ids': cluster_ids, 'is_seed': is_seed} + output_df = pd.DataFrame(data) + + self.clust_prop = cluster_properties(n_clusters, + cluster_ids, + is_seed, + np.asarray(cluster_points, dtype=object), + points_per_cluster, + output_df) + + self.elapsed_time = (finish - start)/(10**6) + if verbose: + print(f'CLUE executed in {self.elapsed_time} ms') + print(f'Number of clusters found: {self.clust_prop.n_clusters}') + + def input_plotter(self, plot_title: str='', title_size: float = 16, + x_label: str = 'x', y_label: str = 'y', z_label: str = 'z', + label_size: float = 16, pt_size: float = 1, pt_colour: str = 'b', + grid: bool = True, grid_style: str = '--', grid_size: float = 0.2, + x_ticks=None, y_ticks=None, z_ticks=None, + **kwargs) -> None: + """ + Plots the points in input. + + Parameters + ---------- + plot_title : string, optional + Title of the plot. + title_size : float, optional + Size of the plot title. + x_label : string, optional + Label on x-axis. + y_label : string, optional + Label on y-axis. + z_label : string, optional + Label on z-axis. + label_size : int, optional + Fontsize of the axis labels. + pt_size : int, optional + Size of the points in the plot. + pt_colour : string, optional + Colour of the points in the plot. + grid : bool, optional + If true displays grids in the plot. + grid_style : string, optional + Style of the grid. + grid_size : float, optional + Linewidth of the plot grid. + x_ticks : list, optional + List of ticks for the x axis. + y_ticks : list, optional + List of ticks for the y axis. 
z_ticks : list, optional + List of ticks for the z axis. + kwargs : function objects, optional + Functions for converting the used coordinates to cartesian coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). + + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + # Convert the used coordinates to cartesian coordinates + cartesian_coords = np.copy(self.clust_data.original_coords) + for coord, func in kwargs.items(): + cartesian_coords[int(coord[1])] = func(self.clust_data.original_coords) + + if self.clust_data.n_dim == 2: + plt.scatter(cartesian_coords[0], + cartesian_coords[1], + s=pt_size, + color=pt_colour) + + # Customization of the plot title + plt.title(plot_title, fontsize=title_size) + + # Customization of axis labels + plt.xlabel(x_label, fontsize=label_size) + plt.ylabel(y_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + plt.xticks(x_ticks) + if y_ticks is not None: + plt.yticks(y_ticks) + + plt.show() + if self.clust_data.n_dim >= 3: + fig = plt.figure() + ax_ = fig.add_subplot(projection='3d') + ax_.scatter(cartesian_coords[0], + cartesian_coords[1], + cartesian_coords[2], + s=pt_size, + color=pt_colour) + + # Customization of the plot title + ax_.set_title(plot_title, fontsize=title_size) + + # Customization of axis labels + ax_.set_xlabel(x_label, fontsize=label_size) + ax_.set_ylabel(y_label, fontsize=label_size) + ax_.set_zlabel(z_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + ax_.set_xticks(x_ticks) + if y_ticks is not None: + ax_.set_yticks(y_ticks) + if z_ticks is not None: + ax_.set_zticks(z_ticks) + + plt.show() + + def cluster_plotter(self, plot_title: str = '', title_size: float = 16, + x_label: str 
= 'x', y_label: str = 'y', z_label: str = 'z', + label_size: float = 16, outl_size: float = 10, pt_size: float = 10, + seed_size: float = 25, grid: bool = True, grid_style: str = '--', + grid_size: float = 0.2, x_ticks=None, y_ticks=None, z_ticks=None, + **kwargs) -> None: + """ + Plots the clusters with a different colour for every cluster. + + The points assigned to a cluster are printed as points, the seeds + as stars and the outliers as little grey crosses. + + Parameters + ---------- + plot_title : string, optional + Title of the plot + title_size : float, optional + Size of the plot title + x_label : string, optional + Label on x-axis + y_label : string, optional + Label on y-axis + z_label : string, optional + Label on z-axis + label_size : int, optional + Fontsize of the axis labels + outl_size : int, optional + Size of the outliers in the plot + pt_size : int, optional + Size of the points in the plot + seed_size : int, optional + Size of the seeds in the plot + grid : bool, optional + If true displays grids in the plot + grid_style : string, optional + Style of the grid + grid_size : float, optional + Linewidth of the plot grid + x_ticks : list, optional + List of ticks for the x axis. + y_ticks : list, optional + List of ticks for the y axis. + z_ticks : list, optional + List of ticks for the z axis. + kwargs : function objects, optional + Functions for converting the used coordinates to cartesian coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). 
+ + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + # Convert the used coordinates to cartesian coordinates + cartesian_coords = np.copy(self.clust_data.original_coords.T) + for coord, func in kwargs.items(): + cartesian_coords[int(coord[1])] = func(self.clust_data.original_coords) + + if self.clust_data.n_dim == 2: + data = {'x0': cartesian_coords[0], + 'x1': cartesian_coords[1], + 'cluster_ids': self.clust_prop.cluster_ids, + 'isSeed': self.clust_prop.is_seed} + df_ = pd.DataFrame(data) + + max_clusterid = max(df_["cluster_ids"]) + + df_out = df_[df_.cluster_ids == -1] # Outliers + plt.scatter(df_out.x0, df_out.x1, s=outl_size, marker='x', color='0.4') + for i in range(0, max_clusterid+1): + dfi = df_[df_.cluster_ids == i] # ith cluster + plt.scatter(dfi.x0, dfi.x1, s=pt_size, marker='.') + df_seed = df_[df_.isSeed == 1] # Only Seeds + plt.scatter(df_seed.x0, df_seed.x1, s=seed_size, color='r', marker='*') + + # Customization of the plot title + plt.title(plot_title, fontsize=title_size) + + # Customization of axis labels + plt.xlabel(x_label, fontsize=label_size) + plt.ylabel(y_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + plt.xticks(x_ticks) + if y_ticks is not None: + plt.yticks(y_ticks) + + plt.show() + if self.clust_data.n_dim == 3: + data = {'x0': cartesian_coords[0], + 'x1': cartesian_coords[1], + 'x2': cartesian_coords[2], + 'cluster_ids': self.clust_prop.cluster_ids, + 'isSeed': self.clust_prop.is_seed} + df_ = pd.DataFrame(data) + + max_clusterid = max(df_["cluster_ids"]) + fig = plt.figure() + ax_ = fig.add_subplot(projection='3d') + + df_out = df_[df_.cluster_ids == -1] + ax_.scatter(df_out.x0, df_out.x1, df_out.x2, s=outl_size, color = 'grey', marker = 'x') + for i in range(0, max_clusterid+1): + dfi = df_[df_.cluster_ids == i] + ax_.scatter(dfi.x0, dfi.x1, dfi.x2, 
s=pt_size, marker = '.') + + df_seed = df_[df_.isSeed == 1] # Only Seeds + ax_.scatter(df_seed.x0, df_seed.x1, df_seed.x2, s=seed_size, color = 'r', marker = '*') + + # Customization of the plot title + ax_.set_title(plot_title, fontsize=title_size) + + # Customization of axis labels + ax_.set_xlabel(x_label, fontsize=label_size) + ax_.set_ylabel(y_label, fontsize=label_size) + ax_.set_zlabel(z_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + ax_.set_xticks(x_ticks) + if y_ticks is not None: + ax_.set_yticks(y_ticks) + if z_ticks is not None: + ax_.set_zticks(z_ticks) + + plt.show() + + def to_csv(self, output_folder: str, file_name: str) -> None: + """ + Creates a file containing the coordinates of all the points, their cluster_ids and is_seed. + + Parameters + ---------- + output_folder : string + Full path to the desired output folder. + file_name : string + Name of the file, with the '.csv' suffix. + + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + out_path = output_folder + file_name + data = {} + for i in range(self.clust_data.n_dim): + data['x' + str(i)] = self.clust_data.coords.T[i] + data['cluster_ids'] = self.clust_prop.cluster_ids + data['is_seed'] = self.clust_prop.is_seed + + df_ = pd.DataFrame(data) + df_.to_csv(out_path,index=False) + +if __name__ == "__main__": + c = clusterer(0.4,5,1.) 
+ c.read_data('./sissa.csv') + c.run_clue(backend="cpu serial", verbose=True) + # c.run_clue(backend="cpu tbb", verbose=True) + # c.run_clue(backend="gpu cuda", verbose=True) + # c.run_clue(backend="gpu hip", verbose=True) + c.cluster_plotter() + # c.to_csv('./','sissa_output_tbb.csv') + c.list_devices('cpu serial') + c.list_devices('cpu tbb') + c.list_devices('gpu cuda') + c.list_devices('gpu hip') + c.list_devices() diff --git a/CLUEstering/alpaka/BindingModules/Makefile b/CLUEstering/alpaka/BindingModules/Makefile new file mode 100644 index 00000000..ef93d8e7 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/Makefile @@ -0,0 +1,103 @@ + +# Compilers +export CXX := g++ +export CUDA := nvcc +export HIP := hipcc + +# define supported cuda architectures +export CUDA_ARCH := 50 60 61 62 70 + +# Compiler flags +CXX_FLAGS = -std=c++17 -g -O2 +# Cuda flags +export CUDA_FLAGS = -x cu --expt-relaxed-constexpr -gencode arch=compute_61,code=[sm_61,compute_61] -G +export CUDA_CXXFLAGS := -I$(CUDA_BASE)/include +# TBB flags +TBB_FLAGS = -ltbb +# Amd flags +export HIP_BASE = /opt/rocm/ +export HIP_FLAGS := -I$(HIP_BASE)/include \ + -I$(HIP_BASE)/hiprand/include \ + -I$(HIP_BASE)/rocrand/include + +# Dependencies flags +ALPAKA_PATH = ../../../extern/alpaka/include +BOOST_PATH = /usr/include/boost + +# Alpaka backend compilation flags +ALPAKA_SERIAL_FLAGS = -DALPAKA_HOST_ONLY \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND +ALPAKA_TBB_FLAGS = -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT \ + -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED \ + -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND +ALPAKA_CUDA_FLAGS = -DALPAKA_ACC_GPU_CUDA_PRESENT \ + -DALPAKA_ACC_GPU_CUDA_ENABLED \ + -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +ALPAKA_HIP_FLAGS = -DALPAKA_ACC_GPU_HIP_PRESENT \ + -DALPAKA_ACC_GPU_HIP_ENABLED \ + -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + +# Binding flags +PYTHON_VERS := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' 
| sed 's/\.//g' ) +PYTHON_VERS_DOT := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' ) +PYTHON_PATH := /usr/include/python$(PYTHON_VERS_DOT) +PYBIND_PATH := ../../../extern/pybind11/include +BINDING_FLAGS := -Wall -shared -fPIC -I$(PYTHON_PATH) -I$(PYBIND_PATH) +CUDA_BINDING_FLAGS := -shared --compiler-options '-fPIC' -I$(PYTHON_PATH) -I$(PYBIND_PATH) + +# Backend module names +KERNELS_MODULE_NAME := CLUE_Convolutional_Kernels.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +SERIAL_MODULE_NAME := CLUE_CPU_Serial.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +TBB_MODULE_NAME := CLUE_CPU_TBB.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +CUDA_MODULE_NAME := CLUE_GPU_CUDA.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +HIP_MODULE_NAME := CLUE_GPU_HIP.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so + +all: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_cpu.cc -o $(SERIAL_MODULE_NAME) + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME) + $(CXX) $(CXX_FLAGS) $(TBB_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_TBB_FLAGS) \ + $(BINDING_FLAGS) binding_cpu_tbb.cc -o $(TBB_MODULE_NAME) +ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0) + $(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) \ + $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME) +else + echo "No CUDA compiler found, skipping CUDA backend" +endif +ifeq ($(call,$(shell which hipcc))$(.SHELLSTATUS),0) + $(HIP) $(CXX_FLAGS) $(HIP_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) \ + $(BINDING_FLAGS) binding_gpu_hip.cc -o $(HIP_MODULE_NAME) +else + echo "No HIP compiler found, skipping HIP backend" +endif + +serial: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_cpu.cc -o $(SERIAL_MODULE_NAME) + +tbb: + $(CXX) $(CXX_FLAGS) $(TBB_FLAGS) -I$(BOOST_PATH) 
-I$(ALPAKA_PATH) $(ALPAKA_TBB_FLAGS) \ + $(BINDING_FLAGS) binding_cpu_tbb.cc -o $(TBB_MODULE_NAME) + +cuda: +ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0) + $(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) \ + $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME) +else + echo "No CUDA compiler found, skipping CUDA backend" +endif + +hip: +ifeq ($(call,$(shell which hipcc))$(.SHELLSTATUS),0) + $(HIP) $(CXX_FLAGS) $(HIP_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) \ + $(BINDING_FLAGS) binding_gpu_hip.cc -o $(HIP_MODULE_NAME) +else + echo "No HIP compiler found, skipping HIP backend" +endif + +kernel: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME) diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu.cc b/CLUEstering/alpaka/BindingModules/binding_cpu.cc new file mode 100644 index 00000000..a870d6f4 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_cpu.cc @@ -0,0 +1,200 @@ + +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" + +#include +#include +#include +#include + +namespace alpaka_serial_sync { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." 
<< std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const 
ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) 
: + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_CPU_Serial, m) { + m.doc() = "Binding of the CLUE algorithm running serially on CPU"; + + m.def("listDevices", &listDevices, "List the available devices for the CPU serial backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_serial_sync diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc new file mode 100644 index 00000000..42214d98 --- /dev/null +++ 
b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc @@ -0,0 +1,200 @@ + +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" + +#include +#include +#include +#include + +namespace alpaka_tbb_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." << std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "Device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, 
coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + 
} + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_CPU_TBB, m) { + m.doc() = "Binding of the CLUE algorithm running on CPU with TBB"; + + m.def("listDevices", &listDevices, "List the available devices for the TBB backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + 
pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_tbb_async diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc new file mode 100644 index 00000000..da464650 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc @@ -0,0 +1,204 @@ +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../AlpakaCore/initialise.h" + +#include +#include +#include +#include + +using cms::alpakatools::initialise; + +namespace alpaka_cuda_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." 
<< std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + /* initialise(); */ + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& 
weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + 
[[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_GPU_CUDA, m) { + m.doc() = "Binding of the CLUE algorithm running on CUDA GPUs"; + + m.def("listDevices", &listDevices, "List the available devices for the CUDA backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_cuda_async diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc new file mode 100644 index 00000000..479c3c20 --- /dev/null +++ 
b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc @@ -0,0 +1,200 @@ +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../AlpakaCore/initialise.h" + +#include +#include +#include +#include + +namespace alpaka_rocm_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." << std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + 
return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up 
to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_GPU_HIP, m) { + m.doc() = "Binding of the CLUE algorithm running on AMD GPUs"; + + m.def("listDevices", &listDevices, "List the available devices for the HIP/ROCm backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + 
"mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_rocm_async diff --git a/CLUEstering/alpaka/BindingModules/binding_kernels.cc b/CLUEstering/alpaka/BindingModules/binding_kernels.cc new file mode 100644 index 00000000..c61170d9 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_kernels.cc @@ -0,0 +1,17 @@ + +#include + +#include "../CLUE/ConvolutionalKernel.h" + +#include +#include +#include +#include + +PYBIND11_MODULE(CLUE_Convolutional_Kernels, m) { + m.doc() = "Binding of the convolutional kernels used in the CLUE algorithm."; + + pybind11::class_(m, "FlatKernel").def(pybind11::init()); + pybind11::class_(m, "ExponentialKernel").def(pybind11::init()); + pybind11::class_(m, "GaussianKernel").def(pybind11::init()); +} diff --git a/CLUEstering/alpaka/BindingModules/sissa.csv b/CLUEstering/alpaka/BindingModules/sissa.csv new file mode 100644 index 00000000..6cb12d84 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/sissa.csv @@ -0,0 +1,1000 @@ +x0,x1,weight +34.84,-40.85,1.0 +37.44,-39.57,1.0 +30.43,-40.06,1.0 +34.57,-37.18,1.0 +41.46,-52.92,1.0 +44.23,-52.01,1.0 +32.13,-42.11,1.0 +37.01,-42.12,1.0 +36.0,-35.2,1.0 +41.23,-54.89,1.0 +33.8,-33.8,1.0 +36.81,-51.74,1.0 +43.59,-47.38,1.0 +27.63,-41.56,1.0 +40.48,-41.47,1.0 +36.97,-44.03,1.0 +26.27,-42.45,1.0 +27.72,-36.5,1.0 +48.27,-50.86,1.0 +40.79,-46.45,1.0 +38.22,-46.54,1.0 +42.17,-41.67,1.0 +31.54,-33.86,1.0 +29.38,-35.03,1.0 +26.13,-37.42,1.0 +35.69,-52.4,1.0 +26.04,-37.42,1.0 +31.93,-46.03,1.0 +27.2,-43.51,1.0 +49.11,-52.63,1.0 +47.99,-45.54,1.0 +26.73,-37.94,1.0 +47.62,-56.66,1.0 +30.11,-32.93,1.0 +44.45,-44.34,1.0 +25.1,-37.11,1.0 +45.26,-45.39,1.0 +31.11,-32.51,1.0 +31.65,-48.42,1.0 +24.33,-37.42,1.0 
+50.76,-53.68,1.0 +51.41,-46.09,1.0 +43.3,-43.55,1.0 +50.1,-45.02,1.0 +50.65,-50.86,1.0 +23.94,-43.42,1.0 +50.39,-51.61,1.0 +51.32,-53.0,1.0 +26.2,-28.39,1.0 +36.67,-31.21,1.0 +45.14,-39.85,1.0 +52.09,-43.1,1.0 +53.14,-45.03,1.0 +46.36,-57.8,1.0 +45.27,-36.84,1.0 +21.08,-36.36,1.0 +42.44,-34.17,1.0 +52.28,-51.59,1.0 +20.25,-33.77,1.0 +30.15,-29.41,1.0 +55.51,-55.56,1.0 +21.65,-39.47,1.0 +44.72,-34.92,1.0 +31.52,-50.53,1.0 +54.24,-52.46,1.0 +36.86,-56.06,1.0 +21.58,-28.72,1.0 +35.17,-56.21,1.0 +54.12,-52.25,1.0 +47.77,-58.61,1.0 +21.26,-28.09,1.0 +20.97,-28.23,1.0 +44.6,-34.03,1.0 +42.94,-33.49,1.0 +31.97,-52.64,1.0 +55.14,-47.78,1.0 +20.79,-40.55,1.0 +43.67,-33.38,1.0 +47.31,-38.22,1.0 +48.8,-60.25,1.0 +21.4,-25.72,1.0 +18.47,-36.56,1.0 +54.75,-60.86,1.0 +16.48,-27.36,1.0 +28.12,-24.78,1.0 +20.99,-23.29,1.0 +48.13,-38.6,1.0 +19.45,-39.61,1.0 +41.97,-30.3,1.0 +46.73,-33.85,1.0 +11.78,-23.7,1.0 +48.19,-37.13,1.0 +14.77,-34.82,1.0 +38.9,-58.31,1.0 +57.95,-46.06,1.0 +13.25,-33.33,1.0 +56.73,-45.05,1.0 +17.05,-23.3,1.0 +14.47,-29.77,1.0 +39.48,-59.1,1.0 +20.16,-21.32,1.0 +11.79,-28.31,1.0 +28.19,-52.92,1.0 +57.57,-45.08,1.0 +20.64,-44.94,1.0 +30.23,-54.55,1.0 +12.81,-31.53,1.0 +24.98,-23.58,1.0 +48.33,-64.15,1.0 +49.78,-37.26,1.0 +38.03,-27.21,1.0 +62.2,-51.31,1.0 +22.13,-21.74,1.0 +61.08,-52.17,1.0 +37.97,-26.07,1.0 +57.98,-62.1,1.0 +9.92,-20.63,1.0 +49.41,-35.17,1.0 +39.01,-27.29,1.0 +18.34,-21.45,1.0 +46.58,-65.54,1.0 +39.29,-60.71,1.0 +9.54,-25.86,1.0 +21.99,-20.43,1.0 +13.32,-17.53,1.0 +60.35,-47.52,1.0 +12.49,-21.36,1.0 +53.67,-65.87,1.0 +52.83,-66.94,1.0 +59.09,-64.18,1.0 +19.14,-17.72,1.0 +64.87,-51.24,1.0 +58.37,-64.98,1.0 +15.04,-16.66,1.0 +16.5,-16.66,1.0 +57.36,-66.87,1.0 +61.77,-48.53,1.0 +6.6,-24.89,1.0 +64.76,-58.7,1.0 +62.47,-65.41,1.0 +60.02,-67.3,1.0 +2.95,-14.56,1.0 +64.41,-63.72,1.0 +36.38,-60.41,1.0 +56.15,-40.21,1.0 +4.56,-31.44,1.0 +67.48,-60.2,1.0 +4.73,-15.74,1.0 +52.08,-33.73,1.0 +10.5,-17.06,1.0 +65.64,-62.47,1.0 +5.56,-32.34,1.0 
+28.3,-20.01,1.0 +10.19,-14.41,1.0 +9.17,-35.68,1.0 +68.77,-58.81,1.0 +8.72,-16.82,1.0 +20.95,-49.88,1.0 +16.73,-44.92,1.0 +7.85,-13.65,1.0 +36.34,-20.91,1.0 +7.87,-11.35,1.0 +15.97,-43.6,1.0 +45.8,-26.44,1.0 +-3.38,-18.31,1.0 +7.95,-38.36,1.0 +57.95,-40.17,1.0 +61.29,-69.89,1.0 +35.81,-19.5,1.0 +62.99,-67.77,1.0 +11.71,-14.79,1.0 +67.88,-61.87,1.0 +30.01,-18.69,1.0 +46.9,-68.4,1.0 +6.78,-35.97,1.0 +69.04,-53.99,1.0 +-1.72,-10.21,1.0 +9.87,-12.14,1.0 +68.46,-61.86,1.0 +17.48,-49.8,1.0 +62.2,-43.37,1.0 +53.89,-71.6,1.0 +-0.86,-21.58,1.0 +65.59,-67.51,1.0 +-5.73,-19.79,1.0 +5.58,-36.85,1.0 +59.7,-38.44,1.0 +73.62,-55.84,1.0 +22.28,-15.17,1.0 +13.16,-10.43,1.0 +-3.44,-20.71,1.0 +14.92,-46.27,1.0 +0.76,-32.96,1.0 +16.73,-12.3,1.0 +-6.21,-18.2,1.0 +-6.53,-19.2,1.0 +66.85,-67.79,1.0 +70.36,-66.84,1.0 +-6.88,-12.95,1.0 +75.99,-56.77,1.0 +60.69,-73.7,1.0 +57.32,-34.69,1.0 +-4.16,-22.67,1.0 +61.26,-73.77,1.0 +-9.82,-5.65,1.0 +-7.88,-17.73,1.0 +-8.38,-8.66,1.0 +27.79,-61.16,1.0 +-1.49,-5.81,1.0 +29.78,-15.19,1.0 +-5.07,-8.04,1.0 +22.06,-57.63,1.0 +10.42,-44.85,1.0 +21.43,-57.34,1.0 +-3.72,-5.43,1.0 +66.7,-73.64,1.0 +14.11,-7.18,1.0 +-9.03,-14.4,1.0 +44.99,-73.67,1.0 +-2.64,-2.57,1.0 +14.08,-5.97,1.0 +58.45,-32.55,1.0 +-5.53,-27.3,1.0 +18.34,-8.32,1.0 +-12.49,-1.24,1.0 +41.06,-70.4,1.0 +-7.29,-22.33,1.0 +40.15,-71.65,1.0 +63.33,-75.26,1.0 +80.96,-67.43,1.0 +0.01,0.48,1.0 +63.64,-75.5,1.0 +74.17,-69.97,1.0 +61.41,-76.19,1.0 +2.02,0.53,1.0 +61.83,-75.26,1.0 +75.77,-66.01,1.0 +81.37,-71.45,1.0 +74.91,-70.36,1.0 +81.42,-72.02,1.0 +-3.77,1.86,1.0 +82.23,-67.03,1.0 +77.0,-52.91,1.0 +34.48,-68.97,1.0 +-14.43,-12.09,1.0 +-0.32,-37.78,1.0 +-16.35,-2.27,1.0 +-14.32,-18.22,1.0 +87.19,-72.99,1.0 +71.71,-77.69,1.0 +42.55,-74.04,1.0 +88.19,-75.01,1.0 +-14.77,-14.38,1.0 +87.76,-71.9,1.0 +68.91,-78.82,1.0 +68.88,-78.7,1.0 +60.78,-78.16,1.0 +3.97,1.02,1.0 +36.08,-14.52,1.0 +85.42,-65.92,1.0 +-17.0,-12.63,1.0 +39.33,-73.62,1.0 +58.82,-78.89,1.0 +61.29,-30.77,1.0 +81.09,-79.38,1.0 
+-19.26,-1.66,1.0 +-12.6,2.9,1.0 +74.21,-77.21,1.0 +68.92,-39.55,1.0 +14.89,-55.01,1.0 +-16.41,0.25,1.0 +-15.61,1.01,1.0 +-16.24,-16.22,1.0 +44.93,-19.27,1.0 +-10.97,4.07,1.0 +44.82,-76.01,1.0 +-19.16,-0.99,1.0 +15.8,-1.46,1.0 +-16.92,-16.31,1.0 +27.59,-65.87,1.0 +-19.47,-12.93,1.0 +-21.61,1.94,1.0 +-18.76,3.42,1.0 +86.92,-63.79,1.0 +17.28,-1.22,1.0 +-12.34,6.01,1.0 +4.14,3.31,1.0 +-25.46,-10.12,1.0 +-5.31,5.52,1.0 +-1.88,5.51,1.0 +-25.49,-11.25,1.0 +32.62,-72.15,1.0 +-127.66,73.23,1.0 +-126.05,78.02,1.0 +-25.72,4.63,1.0 +-16.28,-22.98,1.0 +14.21,1.33,1.0 +-124.14,76.28,1.0 +1.44,7.08,1.0 +-25.93,7.11,1.0 +82.46,-88.2,1.0 +-128.02,71.97,1.0 +-12.72,9.33,1.0 +-131.42,74.44,1.0 +-122.3,75.95,1.0 +-133.01,68.27,1.0 +-128.47,66.6,1.0 +-127.53,66.01,1.0 +-4.84,-38.56,1.0 +-125.62,69.73,1.0 +-124.87,70.77,1.0 +65.43,-87.56,1.0 +93.13,-76.9,1.0 +-139.86,69.24,1.0 +117.78,62.85,1.0 +119.76,47.72,1.0 +125.81,44.54,1.0 +-132.58,78.32,1.0 +76.55,-89.56,1.0 +122.78,47.09,1.0 +126.04,44.26,1.0 +135.26,55.03,1.0 +130.04,55.62,1.0 +113.68,59.59,1.0 +89.92,-87.5,1.0 +121.09,46.18,1.0 +94.72,-80.31,1.0 +113.02,64.32,1.0 +8.46,6.96,1.0 +-125.23,84.88,1.0 +112.23,42.9,1.0 +-114.84,73.8,1.0 +127.31,57.42,1.0 +117.26,56.92,1.0 +-116.52,81.74,1.0 +122.45,65.91,1.0 +40.61,-81.13,1.0 +117.3,46.31,1.0 +-53.71,-159.34,1.0 +133.9,57.51,1.0 +-123.36,86.67,1.0 +-33.56,-2.93,1.0 +118.07,55.4,1.0 +125.69,53.63,1.0 +137.47,49.89,1.0 +-121.56,64.3,1.0 +-20.26,13.45,1.0 +-78.06,-138.69,1.0 +-35.32,-3.29,1.0 +108.26,59.07,1.0 +-53.15,-168.72,1.0 +-20.03,15.51,1.0 +-124.34,86.19,1.0 +132.4,46.03,1.0 +109.23,60.09,1.0 +132.87,45.7,1.0 +117.58,37.79,1.0 +134.32,60.99,1.0 +115.32,51.86,1.0 +-124.18,62.52,1.0 +-140.55,66.12,1.0 +-141.77,61.26,1.0 +61.35,-88.32,1.0 +96.13,-67.72,1.0 +98.5,-72.28,1.0 +-111.4,73.23,1.0 +110.27,54.14,1.0 +-32.82,-13.12,1.0 +-56.35,-167.13,1.0 +109.45,44.84,1.0 +-108.41,78.4,1.0 +112.19,53.63,1.0 +-143.96,73.08,1.0 +-82.4,-123.74,1.0 +-49.33,-166.96,1.0 +-52.88,-167.48,1.0 
+101.09,48.39,1.0 +-111.0,74.6,1.0 +-134.64,55.03,1.0 +-122.34,93.54,1.0 +-59.26,-151.25,1.0 +-35.81,1.63,1.0 +-112.83,84.22,1.0 +-35.72,2.22,1.0 +99.39,-84.06,1.0 +-148.48,64.2,1.0 +134.05,68.49,1.0 +-74.87,-145.16,1.0 +-125.94,55.76,1.0 +-61.37,-162.44,1.0 +-83.07,-133.74,1.0 +-86.56,-132.69,1.0 +-80.1,-138.03,1.0 +-97.64,-117.16,1.0 +122.26,69.93,1.0 +-146.58,69.42,1.0 +-37.47,9.44,1.0 +-73.82,-128.81,1.0 +-82.67,-130.46,1.0 +-17.62,18.33,1.0 +-131.94,90.68,1.0 +89.41,-53.83,1.0 +-104.25,-120.92,1.0 +-47.6,-171.77,1.0 +-46.83,-175.13,1.0 +-89.53,-132.43,1.0 +-102.44,-117.91,1.0 +-114.85,90.87,1.0 +-103.03,-116.12,1.0 +-95.81,-119.96,1.0 +-67.02,-146.62,1.0 +139.31,61.48,1.0 +-139.51,82.21,1.0 +-118.61,89.61,1.0 +97.91,46.78,1.0 +-121.71,94.09,1.0 +-54.18,-150.66,1.0 +116.77,34.01,1.0 +-71.11,-147.46,1.0 +-19.07,-25.16,1.0 +125.39,38.23,1.0 +-69.92,-146.52,1.0 +-109.32,-120.5,1.0 +-62.44,-167.18,1.0 +-95.25,-114.03,1.0 +-43.68,-176.18,1.0 +84.58,-92.17,1.0 +-34.63,7.38,1.0 +-145.84,53.83,1.0 +-88.06,-117.41,1.0 +101.35,41.64,1.0 +91.87,-54.13,1.0 +-52.29,-147.29,1.0 +-144.07,57.02,1.0 +-154.07,34.26,1.0 +144.41,54.22,1.0 +-110.87,-109.86,1.0 +-147.27,51.34,1.0 +-110.36,-113.16,1.0 +-33.36,11.08,1.0 +-144.96,51.96,1.0 +94.26,-57.94,1.0 +104.12,-79.63,1.0 +-71.19,-139.16,1.0 +-68.4,-142.21,1.0 +-138.21,49.23,1.0 +-109.97,88.44,1.0 +84.99,-94.15,1.0 +104.45,-89.7,1.0 +-137.75,45.6,1.0 +1.22,189.35,1.0 +-60.7,-138.24,1.0 +-59.6,-141.99,1.0 +-95.92,-133.09,1.0 +-152.5,26.33,1.0 +-39.7,8.55,1.0 +-72.26,-153.39,1.0 +-99.42,-136.81,1.0 +98.52,59.01,1.0 +104.75,55.72,1.0 +-119.7,56.49,1.0 +-104.05,-130.54,1.0 +-31.92,18.86,1.0 +0.6,14.59,1.0 +-1.29,183.33,1.0 +-103.51,79.19,1.0 +-80.86,-141.27,1.0 +-94.34,-132.68,1.0 +-142.42,83.41,1.0 +-120.41,94.57,1.0 +-75.23,-149.11,1.0 +-147.66,30.94,1.0 +-120.21,94.97,1.0 +-45.91,-156.82,1.0 +-113.18,64.32,1.0 +-37.76,-168.93,1.0 +101.43,-91.6,1.0 +-53.25,-179.13,1.0 +-154.69,23.96,1.0 +-77.25,-156.72,1.0 +-91.23,-112.83,1.0 
+-0.67,182.68,1.0 +-100.58,-134.24,1.0 +-104.57,-98.96,1.0 +107.18,-73.45,1.0 +104.41,-84.71,1.0 +-110.45,-122.26,1.0 +-150.62,54.48,1.0 +-150.33,56.19,1.0 +103.03,-87.23,1.0 +-86.3,-118.51,1.0 +-150.21,41.42,1.0 +2.45,193.22,1.0 +-143.03,80.64,1.0 +2.29,193.0,1.0 +6.71,184.08,1.0 +-7.11,194.24,1.0 +-62.9,-136.68,1.0 +-5.93,184.97,1.0 +105.43,-74.64,1.0 +-100.06,-138.12,1.0 +114.8,30.82,1.0 +138.28,69.49,1.0 +105.64,-74.73,1.0 +-109.93,-125.02,1.0 +7.94,187.81,1.0 +-9.15,187.02,1.0 +137.07,35.82,1.0 +-58.04,-176.93,1.0 +-41.88,-185.41,1.0 +-32.3,-175.38,1.0 +-146.26,33.83,1.0 +-118.28,-108.51,1.0 +-149.11,43.89,1.0 +100.31,-94.86,1.0 +-150.58,39.95,1.0 +109.77,73.36,1.0 +-114.4,94.2,1.0 +-44.24,-156.41,1.0 +-148.74,9.84,1.0 +47.3,-88.45,1.0 +-158.16,51.82,1.0 +12.71,188.05,1.0 +-2.42,-49.64,1.0 +-31.77,-175.57,1.0 +-32.83,-181.81,1.0 +-142.52,90.8,1.0 +-119.29,-105.08,1.0 +-137.68,36.15,1.0 +32.41,-1.36,1.0 +-148.11,39.73,1.0 +-58.49,-175.98,1.0 +-52.11,-184.33,1.0 +8.44,-60.34,1.0 +-119.55,-107.49,1.0 +12.39,196.7,1.0 +100.01,-98.36,1.0 +-113.87,-125.13,1.0 +-159.96,17.69,1.0 +-115.87,-118.48,1.0 +14.0,191.59,1.0 +-58.55,-130.35,1.0 +-140.28,26.7,1.0 +-49.84,-137.28,1.0 +9.56,199.91,1.0 +78.44,-38.58,1.0 +115.29,-95.22,1.0 +-9.76,190.38,1.0 +-114.34,-88.71,1.0 +-156.79,14.79,1.0 +-102.99,70.32,1.0 +-80.57,-160.86,1.0 +-142.8,7.19,1.0 +-74.66,-170.95,1.0 +-57.55,-185.25,1.0 +90.84,-96.07,1.0 +-68.01,-175.46,1.0 +17.67,183.47,1.0 +-30.33,-181.82,1.0 +110.0,-101.48,1.0 +16.83,192.52,1.0 +-148.43,-2.33,1.0 +-113.86,-91.64,1.0 +17.15,185.01,1.0 +30.54,-78.75,1.0 +-135.92,27.96,1.0 +11.43,190.47,1.0 +-108.92,60.47,1.0 +148.88,53.84,1.0 +144.85,64.85,1.0 +-138.48,17.67,1.0 +-91.68,-103.39,1.0 +53.89,-92.19,1.0 +-48.56,-138.11,1.0 +-168.45,33.35,1.0 +113.29,79.23,1.0 +-45.76,-189.75,1.0 +1.53,174.53,1.0 +-51.2,-189.64,1.0 +-117.01,98.07,1.0 +-80.26,-160.96,1.0 +129.83,29.6,1.0 +-30.71,-168.85,1.0 +-164.59,17.52,1.0 +-137.66,22.2,1.0 +100.85,-55.84,1.0 +-43.21,16.37,1.0 
+-33.45,-161.0,1.0 +-161.88,13.12,1.0 +-106.37,103.34,1.0 +-10.91,196.1,1.0 +-137.24,10.84,1.0 +-11.73,194.86,1.0 +19.13,174.96,1.0 +-152.06,-1.13,1.0 +137.21,32.38,1.0 +-63.54,-180.26,1.0 +105.47,-61.5,1.0 +-129.58,32.77,1.0 +99.0,78.27,1.0 +-105.95,-85.41,1.0 +148.82,68.16,1.0 +102.22,78.9,1.0 +-121.61,-94.99,1.0 +-127.72,-106.97,1.0 +-65.23,-183.45,1.0 +96.4,-105.5,1.0 +-161.02,9.57,1.0 +95.56,-103.22,1.0 +-121.48,-125.59,1.0 +-44.8,13.98,1.0 +-114.97,-128.25,1.0 +-167.2,20.85,1.0 +148.88,67.12,1.0 +-122.4,-93.31,1.0 +-121.76,-95.92,1.0 +94.5,73.08,1.0 +-36.37,-153.89,1.0 +106.34,78.96,1.0 +-96.02,88.34,1.0 +-4.88,171.54,1.0 +-29.56,-177.13,1.0 +-147.33,86.94,1.0 +132.5,31.05,1.0 +-127.52,-96.71,1.0 +-29.31,-166.07,1.0 +-89.84,-155.65,1.0 +-116.29,-83.58,1.0 +110.65,-65.83,1.0 +20.28,195.1,1.0 +100.07,-108.84,1.0 +-124.27,-116.28,1.0 +-97.93,-150.41,1.0 +7.37,18.22,1.0 +-44.89,21.41,1.0 +-169.12,42.79,1.0 +-121.35,-76.85,1.0 +158.27,64.29,1.0 +-128.6,-84.6,1.0 +161.17,72.32,1.0 +-166.06,41.13,1.0 +-112.46,-79.61,1.0 +-127.94,-75.52,1.0 +21.86,196.14,1.0 +155.44,58.71,1.0 +-108.67,-69.61,1.0 +-168.77,11.39,1.0 +-88.07,-159.94,1.0 +155.55,65.14,1.0 +13.09,205.04,1.0 +-166.96,1.81,1.0 +-108.33,-69.23,1.0 +-110.8,-81.12,1.0 +154.84,64.86,1.0 +-53.48,-129.85,1.0 +-172.4,18.05,1.0 +-170.36,17.81,1.0 +-111.54,109.03,1.0 +108.27,82.55,1.0 +118.47,-102.91,1.0 +-90.1,-107.58,1.0 +-78.55,-177.12,1.0 +-103.55,-83.6,1.0 +-175.4,34.5,1.0 +103.18,29.78,1.0 +98.2,83.25,1.0 +-111.47,50.58,1.0 +90.84,73.5,1.0 +-173.23,17.25,1.0 +-124.32,-77.94,1.0 +89.71,37.28,1.0 +158.25,72.76,1.0 +-36.05,26.67,1.0 +-58.63,-124.63,1.0 +-127.29,7.34,1.0 +-54.28,-191.08,1.0 +158.53,72.31,1.0 +115.97,82.77,1.0 +-152.47,82.11,1.0 +-172.15,4.26,1.0 +-50.63,-129.85,1.0 +-144.74,-10.69,1.0 +-113.03,108.14,1.0 +-18.17,191.7,1.0 +-141.08,-4.83,1.0 +19.87,199.35,1.0 +-89.66,-105.97,1.0 +-132.88,-106.13,1.0 +-102.63,-69.5,1.0 +90.59,75.79,1.0 +77.85,-101.28,1.0 +92.66,33.84,1.0 +118.8,-76.61,1.0 
+92.97,32.94,1.0 +142.41,27.19,1.0 +-101.45,-73.78,1.0 +1.37,168.87,1.0 +-136.53,-1.69,1.0 +137.48,24.59,1.0 +-171.98,54.64,1.0 +-34.35,-149.96,1.0 +-13.73,203.23,1.0 +-51.7,12.3,1.0 +-40.06,-194.36,1.0 +-126.64,22.01,1.0 +-53.25,12.01,1.0 +-40.58,-195.37,1.0 +81.91,46.8,1.0 +-182.31,5.6,1.0 +89.63,31.25,1.0 +-131.19,-0.89,1.0 +170.14,81.86,1.0 +-20.31,174.65,1.0 +89.2,90.67,1.0 +-96.22,109.16,1.0 +-135.38,-5.84,1.0 +88.9,81.25,1.0 +-123.97,-7.98,1.0 +-127.73,-71.18,1.0 +76.68,-104.13,1.0 +91.05,81.25,1.0 +-6.77,170.08,1.0 +-167.14,67.67,1.0 +-174.17,46.84,1.0 +-25.48,191.01,1.0 +-35.0,31.27,1.0 +120.01,-112.91,1.0 +-56.96,19.79,1.0 +-31.17,-151.32,1.0 +85.13,79.42,1.0 +-133.9,-12.5,1.0 +-88.45,97.67,1.0 +-94.7,-82.11,1.0 +76.29,47.32,1.0 +105.21,-114.65,1.0 +90.17,27.69,1.0 +-88.97,-174.7,1.0 +76.57,42.37,1.0 +76.75,39.09,1.0 +-120.21,-0.41,1.0 +38.95,191.37,1.0 +-16.61,169.63,1.0 +-83.75,105.92,1.0 +152.59,83.59,1.0 +-44.22,-129.13,1.0 +128.1,-99.92,1.0 +80.93,38.75,1.0 +128.39,-98.37,1.0 +73.97,-107.35,1.0 +124.94,20.74,1.0 +-93.43,118.77,1.0 +-14.66,210.52,1.0 +-103.78,125.49,1.0 +33.79,4.74,1.0 +143.81,16.29,1.0 +122.19,-117.22,1.0 +-141.88,-21.99,1.0 +178.17,89.47,1.0 +108.35,-53.2,1.0 +-69.43,-105.54,1.0 +-134.74,109.53,1.0 +-35.52,34.17,1.0 +-2.49,166.13,1.0 +147.86,14.69,1.0 +124.79,-114.03,1.0 +73.07,85.62,1.0 +-55.53,-117.48,1.0 +-17.27,-178.42,1.0 +142.13,14.04,1.0 +-85.3,109.81,1.0 +-19.5,-189.8,1.0 +178.36,89.71,1.0 +-118.15,28.28,1.0 +181.36,74.08,1.0 +-88.78,64.83,1.0 +-56.56,32.21,1.0 +-97.5,121.06,1.0 +-132.51,-20.91,1.0 +116.09,-119.39,1.0 +134.4,-103.69,1.0 +-81.92,-95.36,1.0 +-115.16,34.44,1.0 +185.54,91.77,1.0 +-146.13,-92.81,1.0 +-121.92,-53.86,1.0 +-88.75,120.73,1.0 +-113.72,124.99,1.0 +183.88,72.62,1.0 +-100.83,-55.66,1.0 +-165.27,-17.99,1.0 +-83.69,114.16,1.0 +-54.54,-114.85,1.0 +-101.56,-167.51,1.0 +134.13,-93.06,1.0 +-173.34,-8.23,1.0 +30.63,207.53,1.0 +-192.41,23.22,1.0 +119.19,91.51,1.0 +-170.19,-15.39,1.0 +-99.19,-49.44,1.0 
+-42.67,38.51,1.0 +126.82,-118.69,1.0 +-90.25,-53.48,1.0 +187.47,73.6,1.0 +185.77,71.49,1.0 +-33.08,-203.35,1.0 +-144.34,-24.69,1.0 +-85.63,126.18,1.0 +84.95,102.16,1.0 +-184.08,51.08,1.0 +-110.99,35.95,1.0 +-84.95,-57.46,1.0 +-113.06,-27.43,1.0 +-112.15,-47.72,1.0 +-11.14,214.12,1.0 +113.85,99.19,1.0 +-138.8,-71.62,1.0 +-78.9,86.08,1.0 +-106.19,-157.61,1.0 +196.55,94.58,1.0 +145.94,5.0,1.0 +199.02,92.95,1.0 +11.2,-77.94,1.0 +176.28,102.81,1.0 +-108.52,-1.67,1.0 +-89.59,-39.01,1.0 +-71.02,101.17,1.0 +201.85,96.03,1.0 +-195.77,18.03,1.0 +88.32,104.46,1.0 +-59.51,42.02,1.0 +-175.57,-10.95,1.0 +146.9,-3.72,1.0 +149.88,-2.27,1.0 +-107.99,31.03,1.0 +3.72,159.94,1.0 +69.86,104.73,1.0 +-45.46,45.89,1.0 +-151.74,-115.27,1.0 +-16.58,83.32,1.0 +165.44,4.82,1.0 +-68.45,-96.8,1.0 +-9.45,-193.64,1.0 +-90.37,54.49,1.0 +150.25,-95.28,1.0 +-98.38,-8.48,1.0 +58.98,33.97,1.0 +-17.37,80.34,1.0 +-131.35,129.83,1.0 +-3.55,63.23,1.0 +-152.69,-107.4,1.0 +189.2,114.16,1.0 +-126.22,-46.18,1.0 +-99.82,-20.51,1.0 +-199.58,15.43,1.0 +64.94,58.76,1.0 +-124.2,130.86,1.0 +-22.9,70.98,1.0 +101.46,-130.87,1.0 +-89.56,-22.93,1.0 +-59.79,96.14,1.0 +-25.44,69.28,1.0 +-7.63,101.82,1.0 +-18.33,218.82,1.0 +-10.05,57.69,1.0 +-90.2,-193.2,1.0 +48.24,197.6,1.0 +161.49,-23.1,1.0 +-211.32,199.59,1.0 +56.32,95.85,1.0 +73.55,117.9,1.0 +-157.22,-33.54,1.0 +152.9,-193.79,1.0 +-55.82,97.45,1.0 +82.37,-154.18,1.0 +-144.03,110.43,1.0 +165.13,-181.03,1.0 +-30.61,95.87,1.0 +169.61,-178.41,1.0 +139.89,-76.15,1.0 +-133.64,143.67,1.0 +50.57,-217.21,1.0 +83.75,-165.61,1.0 +167.3,159.8,1.0 +-71.62,-68.71,1.0 +-63.06,132.28,1.0 +-206.06,207.94,1.0 +21.83,34.6,1.0 +-90.55,174.5,1.0 +176.2,-20.95,1.0 +154.01,-200.93,1.0 +89.94,-175.25,1.0 +156.96,147.98,1.0 +11.74,53.49,1.0 +-70.23,75.82,1.0 +-72.92,52.3,1.0 +-153.83,199.29,1.0 +-107.26,-183.61,1.0 +-218.5,217.82,1.0 +42.42,173.1,1.0 +-26.56,230.0,1.0 +-225.45,213.66,1.0 +-208.42,44.23,1.0 +-50.46,139.03,1.0 +110.59,173.8,1.0 +206.69,-54.09,1.0 +134.68,168.44,1.0 
+-197.99,180.01,1.0 +165.08,182.24,1.0 +-173.08,188.31,1.0 +-205.48,-60.18,1.0 +38.82,40.86,1.0 +84.47,-179.68,1.0 +-135.44,160.1,1.0 +-46.29,139.92,1.0 +185.52,-13.35,1.0 +62.53,-224.14,1.0 +-10.58,-137.83,1.0 +-90.77,197.17,1.0 +180.35,-177.28,1.0 +107.82,223.36,1.0 +173.79,159.79,1.0 +-114.92,167.81,1.0 +47.25,74.7,1.0 +104.78,-140.51,1.0 +138.77,151.97,1.0 +68.95,-218.09,1.0 +183.49,-44.48,1.0 +46.87,-178.43,1.0 +50.55,102.89,1.0 +179.91,-43.13,1.0 +28.72,-218.23,1.0 +-85.01,176.34,1.0 +151.61,-207.9,1.0 +-151.0,-125.41,1.0 +-165.85,212.54,1.0 +-62.73,-26.17,1.0 +-30.01,-82.16,1.0 +-198.84,120.93,1.0 +-1.38,-107.19,1.0 +124.88,223.88,1.0 +11.02,-109.6,1.0 +219.38,44.14,1.0 +85.29,222.39,1.0 +-222.51,-3.68,1.0 +176.43,193.78,1.0 +185.85,217.29,1.0 +221.69,45.76,1.0 +-195.01,-80.14,1.0 +-222.27,41.0,1.0 +227.34,102.66,1.0 +-136.73,188.82,1.0 +5.76,119.62,1.0 +15.38,90.69,1.0 +-8.29,-215.48,1.0 +-215.86,135.65,1.0 +217.93,-94.74,1.0 +230.0,-117.35,1.0 +-61.5,-72.0,1.0 +215.97,-122.79,1.0 +-28.78,-67.4,1.0 +25.69,-203.08,1.0 +202.44,225.5,1.0 +-79.99,-226.8,1.0 +54.55,-152.15,1.0 +164.86,-97.89,1.0 +27.91,-162.19,1.0 +-89.83,-230.0,1.0 +-217.3,-49.78,1.0 +223.38,-92.1,1.0 +-207.69,-14.73,1.0 +221.29,-34.45,1.0 +46.87,-180.27,1.0 +84.93,166.13,1.0 +63.32,129.43,1.0 +-230.0,-217.98,1.0 +-142.46,-195.96,1.0 +-192.39,85.64,1.0 +-170.09,-66.53,1.0 +-140.8,-162.65,1.0 +-174.47,-209.6,1.0 +-227.04,-98.18,1.0 +-214.12,-127.14,1.0 +-53.37,214.91,1.0 +229.54,-147.83,1.0 +98.48,-210.49,1.0 +47.68,-126.85,1.0 +-122.78,-213.57,1.0 +97.43,196.05,1.0 +208.11,-195.95,1.0 +-228.87,69.29,1.0 +137.95,195.99,1.0 +169.87,-122.36,1.0 +-191.6,-37.74,1.0 +172.93,-66.66,1.0 +191.85,33.12,1.0 +-103.49,219.02,1.0 +188.1,-221.79,1.0 +-199.87,-184.58,1.0 diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h new file mode 100644 index 00000000..356233dd --- /dev/null +++ b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h @@ -0,0 +1,213 @@ +#ifndef 
CLUE_Algo_Alpaka_h +#define CLUE_Algo_Alpaka_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "CLUEAlpakaKernels.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../DataFormats/alpaka/TilesAlpaka.h" +#include "ConvolutionalKernel.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class CLUEAlgoAlpaka { + public: + CLUEAlgoAlpaka() = delete; + explicit CLUEAlgoAlpaka(float dc, float rhoc, float outlierDeltaFactor, int pPBin, Queue queue_) + : dc_{dc}, rhoc_{rhoc}, outlierDeltaFactor_{outlierDeltaFactor}, pointsPerTile_{pPBin} { + init_device(queue_); + } + + TilesAlpaka* m_tiles; + VecArray* m_seeds; + VecArray* m_followers; + + template + std::vector> make_clusters(Points& h_points, + PointsAlpaka& d_points, + const KernelType& kernel, + Queue queue_, + size_t block_size); + + private: + float dc_; + float rhoc_; + float outlierDeltaFactor_; + // average number of points found in a tile + int pointsPerTile_; + + /* domain_t m_domains; */ + + // Buffers + std::optional>> d_tiles; + std::optional>> + d_seeds; + std::optional< + cms::alpakatools::device_buffer[]>> + d_followers; + + // Private methods + void init_device(Queue queue_); + void setup(const Points& h_points, PointsAlpaka& d_points, Queue queue_, size_t block_size); + + // Construction of the tiles + void calculate_tile_size(TilesAlpaka& h_tiles, const Points& h_points); + }; + + // Private methods + template + void CLUEAlgoAlpaka::calculate_tile_size(TilesAlpaka& h_tiles, + const Points& h_points) { + for (int i{}; i != Ndim; ++i) { + float tileSize; + float dimMax{*std::max_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + float dimMin{*std::min_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + + VecArray temp; + temp.push_back_unsafe(dimMin); + temp.push_back_unsafe(dimMax); + 
h_tiles.min_max[i] = temp; + tileSize = (dimMax - dimMin) / h_tiles.nPerDim(); + + h_tiles.tile_size[i] = tileSize; + } + } + + template + void CLUEAlgoAlpaka::init_device(Queue queue_) { + d_tiles = cms::alpakatools::make_device_buffer>(queue_); + d_seeds = cms::alpakatools::make_device_buffer>(queue_); + d_followers = + cms::alpakatools::make_device_buffer[]>(queue_, + reserve); + + // Copy to the public pointers + m_seeds = (*d_seeds).data(); + m_followers = (*d_followers).data(); + } + + template + void CLUEAlgoAlpaka::setup(const Points& h_points, + PointsAlpaka& d_points, + Queue queue_, + size_t block_size) { + // Create temporary tiles object + TilesAlpaka temp; + calculate_tile_size(temp, h_points); + temp.resizeTiles(); + + alpaka::memcpy(queue_, *d_tiles, cms::alpakatools::make_host_view(temp)); + m_tiles = (*d_tiles).data(); + alpaka::memcpy( + queue_, d_points.coords, cms::alpakatools::make_host_view(h_points.m_coords.data(), h_points.n)); + alpaka::memcpy( + queue_, d_points.weight, cms::alpakatools::make_host_view(h_points.m_weight.data(), h_points.n)); + alpaka::memset(queue_, (*d_seeds), 0x00); + + // Define the working division + Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::enqueue( + queue_, + alpaka::createTaskKernel(working_div, KernelResetFollowers{}, m_followers, h_points.n)); + } + + // Public methods + template + template + std::vector> CLUEAlgoAlpaka::make_clusters(Points& h_points, + PointsAlpaka& d_points, + const KernelType& kernel, + Queue queue_, + size_t block_size) { + setup(h_points, d_points, queue_, block_size); + + const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::enqueue(queue_, + alpaka::createTaskKernel( + working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n)); + + 
alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelCalculateLocalDensity(), + m_tiles, + d_points.view(), + kernel, + /* m_domains.data(), */ + dc_, + h_points.n)); + alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelCalculateNearestHigher(), + m_tiles, + d_points.view(), + /* m_domains.data(), */ + outlierDeltaFactor_, + dc_, + h_points.n)); + alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelFindClusters(), + m_seeds, + m_followers, + d_points.view(), + outlierDeltaFactor_, + dc_, + rhoc_, + h_points.n)); + + // We change the working division when assigning the clusters + const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, block_size); + auto working_div_seeds = cms::alpakatools::make_workdiv(grid_size_seeds, block_size); + alpaka::enqueue( + queue_, + alpaka::createTaskKernel( + working_div_seeds, KernelAssignClusters(), m_seeds, m_followers, d_points.view())); + + // Wait for all the operations in the queue to finish + alpaka::wait(queue_); + + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_rho.data(), h_points.n), + d_points.rho, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_delta.data(), h_points.n), + d_points.delta, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_nearestHigher.data(), h_points.n), + d_points.nearest_higher, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_clusterIndex.data(), h_points.n), + d_points.cluster_index, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_isSeed.data(), h_points.n), + d_points.is_seed, + static_cast(h_points.n)); + + // Wait for all the operations in the queue to finish + alpaka::wait(queue_); + + return {h_points.m_clusterIndex, h_points.m_isSeed}; + } +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif diff 
--git a/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h new file mode 100644 index 00000000..40c82271 --- /dev/null +++ b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h @@ -0,0 +1,310 @@ +#ifndef CLUE_Alpaka_Kernels_h +#define CLUE_Alpaka_Kernels_h + +#include +#include + +#include "../AlpakaCore/alpakaWorkDiv.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../DataFormats/alpaka/TilesAlpaka.h" +#include "../DataFormats/alpaka/AlpakaVecArray.h" +#include "ConvolutionalKernel.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + constexpr int32_t max_followers{100}; + constexpr int32_t max_seeds{100}; + constexpr int32_t reserve{1000000}; + + template + using PointsView = typename PointsAlpaka::PointsAlpakaView; + + struct KernelResetFollowers { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* d_followers, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid( + acc, n_points, [&](uint32_t i) { d_followers[i].reset(); }); + } + }; + + struct KernelFillTiles { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + PointsView* points, + TilesAlpaka* tiles, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid( + acc, n_points, [&](uint32_t i) { tiles->fill(acc, points->coords[i], i); }); + } + }; + + template + ALPAKA_FN_HOST_ACC void for_recursion(const TAcc& acc, + VecArray& base_vec, + const VecArray, Ndim>& search_box, + TilesAlpaka* tiles, + PointsView* dev_points, + const KernelType& kernel, + /* const VecArray, Ndim>& domains, */ + const VecArray& coords_i, + float* rho_i, + float dc, + uint32_t point_id) { + if constexpr (N_ == 0) { + int binId{tiles->getGlobalBinByBin(acc, base_vec)}; + // get the size of this bin + int binSize{static_cast((*tiles)[binId].size())}; + + // iterate inside this bin + for (int binIter{}; binIter < binSize; ++binIter) { + uint32_t j{(*tiles)[binId][binIter]}; + // query 
N_{dc_}(i) + + VecArray coords_j{dev_points->coords[j]}; + float dist_ij_sq{0.f}; + for (int dim{}; dim != Ndim; ++dim) { + dist_ij_sq += (coords_j[dim] - coords_i[dim]) * (coords_j[dim] - coords_i[dim]); + } + + if (dist_ij_sq <= dc * dc) { + *rho_i += kernel(acc, alpaka::math::sqrt(acc, dist_ij_sq), point_id, j); + } + + } // end of interate inside this bin + + return; + } else { + for (unsigned int i{search_box[search_box.capacity() - N_][0]}; + i <= search_box[search_box.capacity() - N_][1]; + ++i) { + base_vec[base_vec.capacity() - N_] = i; + for_recursion( + acc, base_vec, search_box, tiles, dev_points, kernel, coords_i, rho_i, dc, point_id); + } + } + } + + struct KernelCalculateLocalDensity { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TilesAlpaka* dev_tiles, + PointsView* dev_points, + const KernelType& kernel, + /* const VecArray, Ndim>& domains, */ + float dc, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + float rho_i{0.f}; + VecArray coords_i{dev_points->coords[i]}; + + // Get the extremes of the search box + VecArray, Ndim> searchbox_extremes; + for (int dim{}; dim != Ndim; ++dim) { + VecArray dim_extremes; + dim_extremes.push_back_unsafe(coords_i[dim] - dc); + dim_extremes.push_back_unsafe(coords_i[dim] + dc); + + searchbox_extremes.push_back_unsafe(dim_extremes); + } + + // Calculate the search box + VecArray, Ndim> search_box; + dev_tiles->searchBox(acc, searchbox_extremes, &search_box); + + VecArray base_vec; + for_recursion( + acc, base_vec, search_box, dev_tiles, dev_points, kernel, coords_i, &rho_i, dc, i); + + dev_points->rho[i] = rho_i; + }); + } + }; + + template + ALPAKA_FN_HOST_ACC void for_recursion_nearest_higher( + const TAcc& acc, + VecArray& base_vec, + const VecArray, Ndim>& s_box, + TilesAlpaka* tiles, + PointsView* dev_points, + /* const VecArray, Ndim>& domains, */ + const VecArray& coords_i, + float rho_i, + float* delta_i, + int* nh_i, + float dm_sq, 
+ uint32_t point_id) { + if constexpr (N_ == 0) { + int binId{tiles->getGlobalBinByBin(acc, base_vec)}; + // get the size of this bin + int binSize{(*tiles)[binId].size()}; + + // iterate inside this bin + for (int binIter{}; binIter < binSize; ++binIter) { + unsigned int j{(*tiles)[binId][binIter]}; + // query N'_{dm}(i) + float rho_j{dev_points->rho[j]}; + bool found_higher{(rho_j > rho_i)}; + // in the rare case where rho is the same, use detid + found_higher = found_higher || ((rho_j == rho_i) && (j > point_id)); + + // Calculate the distance between the two points + VecArray coords_j{dev_points->coords[j]}; + float dist_ij_sq{0.f}; + for (int dim{}; dim != Ndim; ++dim) { + dist_ij_sq += (coords_j[dim] - coords_i[dim]) * (coords_j[dim] - coords_i[dim]); + } + + if (found_higher && dist_ij_sq <= dm_sq) { + // find the nearest point within N'_{dm}(i) + if (dist_ij_sq < *delta_i) { + // update delta_i and nearestHigher_i + *delta_i = dist_ij_sq; + *nh_i = j; + } + } + } // end of interate inside this bin + + return; + } else { + for (unsigned int i{s_box[s_box.capacity() - N_][0]}; i <= s_box[s_box.capacity() - N_][1]; ++i) { + base_vec[base_vec.capacity() - N_] = i; + for_recursion_nearest_higher( + acc, base_vec, s_box, tiles, dev_points, coords_i, rho_i, delta_i, nh_i, dm_sq, point_id); + } + } + } + + struct KernelCalculateNearestHigher { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TilesAlpaka* dev_tiles, + PointsView* dev_points, + /* const VecArray, Ndim>& domains, */ + float outlier_delta_factor, + float dc, + uint32_t n_points) const { + float dm{outlier_delta_factor * dc}; + float dm_squared{dm * dm}; + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + float delta_i{std::numeric_limits::max()}; + int nh_i{-1}; + VecArray coords_i{dev_points->coords[i]}; + float rho_i{dev_points->rho[i]}; + + // Get the extremes of the search box + VecArray, Ndim> searchbox_extremes; + for (int dim{}; dim != Ndim; ++dim) { + 
VecArray dim_extremes; + dim_extremes.push_back_unsafe(coords_i[dim] - dc); + dim_extremes.push_back_unsafe(coords_i[dim] + dc); + + searchbox_extremes.push_back_unsafe(dim_extremes); + } + + // Calculate the search box + VecArray, Ndim> search_box; + dev_tiles->searchBox(acc, searchbox_extremes, &search_box); + + VecArray base_vec{}; + for_recursion_nearest_higher(acc, + base_vec, + search_box, + dev_tiles, + dev_points, + coords_i, + rho_i, + &delta_i, + &nh_i, + dm_squared, + i); + + dev_points->delta[i] = alpaka::math::sqrt(acc, delta_i); + dev_points->nearest_higher[i] = nh_i; + }); + } + }; + + template + struct KernelFindClusters { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* seeds, + VecArray* followers, + PointsView* dev_points, + float outlier_delta_factor, + float d_c, + float rho_c, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + // initialize cluster_index + dev_points->cluster_index[i] = -1; + + float delta_i{dev_points->delta[i]}; + float rho_i{dev_points->rho[i]}; + + // Determine whether the point is a seed or an outlier + bool is_seed{(delta_i > d_c) && (rho_i >= rho_c)}; + bool is_outlier{(delta_i > outlier_delta_factor * d_c) && (rho_i < rho_c)}; + + if (is_seed) { + dev_points->is_seed[i] = 1; + seeds[0].push_back(acc, i); + } else { + if (!is_outlier) { + followers[dev_points->nearest_higher[i]].push_back(acc, i); + } + dev_points->is_seed[i] = 0; + } + }); + } + }; + + template + struct KernelAssignClusters { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* seeds, + VecArray* followers, + PointsView* dev_points) const { + const auto& seeds_0{seeds[0]}; + const auto n_seeds{seeds_0.size()}; + cms::alpakatools::for_each_element_in_grid(acc, n_seeds, [&](uint32_t idx_cls) { + int local_stack[256] = {-1}; + int local_stack_size{}; + + int idx_this_seed{seeds_0[idx_cls]}; + dev_points->cluster_index[idx_this_seed] = idx_cls; + // 
push_back idThisSeed to localStack + local_stack[local_stack_size] = idx_this_seed; + ++local_stack_size; + // process all elements in localStack + while (local_stack_size > 0) { + // get last element of localStack + int idx_end_of_local_stack{local_stack[local_stack_size - 1]}; + int temp_cluster_index{dev_points->cluster_index[idx_end_of_local_stack]}; + // pop_back last element of localStack + local_stack[local_stack_size - 1] = -1; + --local_stack_size; + const auto& followers_ies{followers[idx_end_of_local_stack]}; + const auto followers_size{followers[idx_end_of_local_stack].size()}; + // loop over followers of last element of localStack + for (int j{}; j != followers_size; ++j) { + // pass id to follower + int follower{followers_ies[j]}; + dev_points->cluster_index[follower] = temp_cluster_index; + // push_back follower to localStack + local_stack[local_stack_size] = follower; + ++local_stack_size; + } + } + }); + } + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h new file mode 100644 index 00000000..2e6eb3c6 --- /dev/null +++ b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h @@ -0,0 +1,80 @@ +#ifndef convolutional_kernels_h +#define convolutional_kernels_h + +#include +#include +#include +#include +#include +#include + +#include + +class FlatKernel { +private: + float m_flat; + +public: + // Constructors + FlatKernel() = delete; + FlatKernel(float flat) : m_flat{flat} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return m_flat; + } + } +}; + +class GaussianKernel { +private: + float m_gaus_avg; + float m_gaus_std; + float m_gaus_amplitude; + +public: + // Constructors + GaussianKernel() = delete; + GaussianKernel(float gaus_avg, float gaus_std, float gaus_amplitude) + : m_gaus_avg{gaus_avg}, 
m_gaus_std{gaus_std}, m_gaus_amplitude{gaus_amplitude} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return (m_gaus_amplitude * + alpaka::math::exp( + acc, -(dist_ij - m_gaus_avg) * (dist_ij - m_gaus_avg) / (2 * m_gaus_std * m_gaus_std))); + } + } +}; + +class ExponentialKernel { +private: + float m_exp_avg; + float m_exp_amplitude; + +public: + // Constructors + ExponentialKernel() = delete; + ExponentialKernel(float exp_avg, float exp_amplitude) + : m_exp_avg{exp_avg}, m_exp_amplitude{exp_amplitude} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return (m_exp_amplitude * alpaka::math::exp(acc, -m_exp_avg * dist_ij)); + } + } +}; + +#endif diff --git a/CLUEstering/alpaka/CLUE/Run.h b/CLUEstering/alpaka/CLUE/Run.h new file mode 100644 index 00000000..cf1502fc --- /dev/null +++ b/CLUEstering/alpaka/CLUE/Run.h @@ -0,0 +1,552 @@ +#ifndef run_h +#define run_h + +#include +#include "CLUEAlgoAlpaka.h" +#include "ConvolutionalKernel.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka 
algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create 
the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, 
weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, 
weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, 
d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> 
run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int 
pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + +}; // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/DataFormats/Points.h 
b/CLUEstering/alpaka/DataFormats/Points.h new file mode 100644 index 00000000..e522976c --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/Points.h @@ -0,0 +1,48 @@ +#ifndef points_h +#define points_h + +#include +#include +#include +#include +#include +#include +#include "alpaka/PointsAlpaka.h" +#include "alpaka/AlpakaVecArray.h" + +using cms::alpakatools::VecArray; + +template +struct Points { + Points() = default; + Points(const std::vector>& coords, const std::vector& weight) + : m_coords{coords}, m_weight{weight}, n{weight.size()} {} + Points(const std::vector>& coords, const std::vector& weight) + : m_weight{weight}, n{weight.size()} { + for (const auto& x : coords) { + VecArray temp_vecarray; + for (auto value : x) { + temp_vecarray.push_back_unsafe(value); + } + m_coords.push_back(temp_vecarray); + } + + m_rho.resize(n); + m_delta.resize(n); + m_nearestHigher.resize(n); + m_clusterIndex.resize(n); + m_isSeed.resize(n); + } + + std::vector> m_coords; + std::vector m_weight; + std::vector m_rho; + std::vector m_delta; + std::vector m_nearestHigher; + std::vector m_clusterIndex; + std::vector m_isSeed; + + size_t n; +}; + +#endif diff --git a/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h new file mode 100644 index 00000000..f2e3ed95 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h @@ -0,0 +1,100 @@ +#ifndef AlpakaVecArray_h +#define AlpakaVecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +namespace cms::alpakatools { + + template + struct VecArray { + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) 
T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { + auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + assert(("Too few elemets reserved")); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { + auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC inline T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int capacity() const { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + int 
m_size = 0; + + T m_data[maxSize]; + }; + +} // end namespace cms::alpakatools + +#endif // AlpakaVecArray_h diff --git a/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h new file mode 100644 index 00000000..9fe71ef4 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h @@ -0,0 +1,76 @@ +#ifndef Points_Alpaka_h +#define Points_Alpaka_h + +#include +#include + +#include "../../AlpakaCore/alpakaConfig.h" +#include "../../AlpakaCore/alpakaMemory.h" +#include "AlpakaVecArray.h" +#include "../Points.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class PointsAlpaka { + public: + PointsAlpaka() = delete; + explicit PointsAlpaka(Queue stream, int n_points) + : coords{cms::alpakatools::make_device_buffer[]>(stream, n_points)}, + weight{cms::alpakatools::make_device_buffer(stream, n_points)}, + rho{cms::alpakatools::make_device_buffer(stream, n_points)}, + delta{cms::alpakatools::make_device_buffer(stream, n_points)}, + nearest_higher{cms::alpakatools::make_device_buffer(stream, n_points)}, + cluster_index{cms::alpakatools::make_device_buffer(stream, n_points)}, + is_seed{cms::alpakatools::make_device_buffer(stream, n_points)}, + view_dev{cms::alpakatools::make_device_buffer(stream)} { + auto view_host = cms::alpakatools::make_host_buffer(stream); + view_host->coords = coords.data(); + view_host->weight = weight.data(); + view_host->rho = rho.data(); + view_host->delta = delta.data(); + view_host->nearest_higher = nearest_higher.data(); + view_host->cluster_index = cluster_index.data(); + view_host->is_seed = is_seed.data(); + + // Copy memory inside the host view to device + alpaka::memcpy(stream, view_dev, view_host); + } + // Copy constructor/assignment operator + PointsAlpaka(const PointsAlpaka&) = delete; + PointsAlpaka& operator=(const PointsAlpaka&) = delete; + // Move constructor/assignment operator + PointsAlpaka(PointsAlpaka&&) = 
default; + PointsAlpaka& operator=(PointsAlpaka&&) = default; + // Destructor + ~PointsAlpaka() = default; + + cms::alpakatools::device_buffer[]> coords; + cms::alpakatools::device_buffer weight; + cms::alpakatools::device_buffer rho; + cms::alpakatools::device_buffer delta; + cms::alpakatools::device_buffer nearest_higher; + cms::alpakatools::device_buffer cluster_index; + cms::alpakatools::device_buffer is_seed; + + class PointsAlpakaView { + public: + VecArray* coords; + float* weight; + float* rho; + float* delta; + int* nearest_higher; + int* cluster_index; + int* is_seed; + }; + + PointsAlpakaView* view() { return view_dev.data(); } + + private: + cms::alpakatools::device_buffer view_dev; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h new file mode 100644 index 00000000..9f5812a1 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h @@ -0,0 +1,111 @@ +#ifndef Tiles_Alpaka_h +#define Tiles_Alpaka_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../AlpakaCore/alpakaConfig.h" +#include "../../AlpakaCore/alpakaMemory.h" +#include "AlpakaVecArray.h" + +using cms::alpakatools::VecArray; + +constexpr uint32_t max_tile_depth{1 << 10}; +constexpr uint32_t max_n_tiles{1 << 10}; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class TilesAlpaka { + public: + TilesAlpaka() : n_tiles{1000}, n_tiles_per_dim{static_cast(std::pow(1000, 1. 
/ Ndim))} {}; + + // Public member + VecArray, Ndim> min_max; + VecArray tile_size; + + // Public methods + void resizeTiles() { m_tiles.resize(n_tiles); } + + // getter + int nPerDim() const { return n_tiles_per_dim; } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getBin(const TAcc& acc, float coord_, int dim_) const { + int coord_Bin{(int)((coord_ - min_max[dim_][0]) / tile_size[dim_])}; + + // Address the cases of underflow and overflow and underflow + coord_Bin = alpaka::math::min(acc, coord_Bin, n_tiles_per_dim - 1); + coord_Bin = alpaka::math::max(acc, coord_Bin, 0); + + return coord_Bin; + } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBin(const TAcc& acc, + const VecArray& coords) const { + int globalBin{getBin(acc, coords[0], 0)}; + for (int i{1}; i != Ndim; ++i) { + globalBin += n_tiles_per_dim * getBin(acc, coords[i], i); + } + return globalBin; + } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBinByBin(const TAcc& acc, + const VecArray& Bins) const { + uint32_t globalBin{Bins[0]}; + for (int i{1}; i != Ndim; ++i) { + globalBin += n_tiles_per_dim * Bins[i]; + } + return globalBin; + } + + template + ALPAKA_FN_ACC inline constexpr void fill(const TAcc& acc, const VecArray& coords, int i) { + m_tiles[getGlobalBin(acc, coords)].push_back(acc, i); + } + + template + ALPAKA_FN_ACC inline void searchBox(const TAcc& acc, + const VecArray, Ndim>& sb_extremes, + VecArray, Ndim>* search_box) { + for (int dim{}; dim != Ndim; ++dim) { + VecArray dim_sb; + dim_sb.push_back_unsafe(getBin(acc, sb_extremes[dim][0], dim)); + dim_sb.push_back_unsafe(getBin(acc, sb_extremes[dim][1], dim)); + + search_box->push_back_unsafe(dim_sb); + } + } + + ALPAKA_FN_HOST_ACC inline constexpr auto size() { return n_tiles; } + + ALPAKA_FN_HOST_ACC inline constexpr void clear() { + for (int i{}; i < n_tiles; ++i) { + m_tiles[i].reset(); + } + } + + ALPAKA_FN_HOST_ACC inline constexpr VecArray& operator[](int globalBinId) { + return 
m_tiles[globalBinId]; + } + + private: + size_t n_tiles; + int n_tiles_per_dim; + VecArray, max_n_tiles> m_tiles; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/include/Clustering.h b/CLUEstering/serial/include/Clustering.h similarity index 100% rename from CLUEstering/include/Clustering.h rename to CLUEstering/serial/include/Clustering.h diff --git a/CLUEstering/include/Kernels.h b/CLUEstering/serial/include/Kernels.h similarity index 100% rename from CLUEstering/include/Kernels.h rename to CLUEstering/serial/include/Kernels.h diff --git a/CLUEstering/include/Point.h b/CLUEstering/serial/include/Point.h similarity index 100% rename from CLUEstering/include/Point.h rename to CLUEstering/serial/include/Point.h diff --git a/CLUEstering/include/Tiles.h b/CLUEstering/serial/include/Tiles.h similarity index 100% rename from CLUEstering/include/Tiles.h rename to CLUEstering/serial/include/Tiles.h diff --git a/CLUEstering/include/deltaPhi.h b/CLUEstering/serial/include/deltaPhi.h similarity index 100% rename from CLUEstering/include/deltaPhi.h rename to CLUEstering/serial/include/deltaPhi.h diff --git a/CLUEstering/include/test/Makefile b/CLUEstering/serial/include/test/Makefile similarity index 100% rename from CLUEstering/include/test/Makefile rename to CLUEstering/serial/include/test/Makefile diff --git a/CLUEstering/include/test/doctest.h b/CLUEstering/serial/include/test/doctest.h similarity index 100% rename from CLUEstering/include/test/doctest.h rename to CLUEstering/serial/include/test/doctest.h diff --git a/CLUEstering/include/test/test_deltaphi.cc b/CLUEstering/serial/include/test/test_deltaphi.cc similarity index 100% rename from CLUEstering/include/test/test_deltaphi.cc rename to CLUEstering/serial/include/test/test_deltaphi.cc diff --git a/CLUEstering/include/test/test_distance.cc b/CLUEstering/serial/include/test/test_distance.cc similarity index 100% rename from CLUEstering/include/test/test_distance.cc rename to 
CLUEstering/serial/include/test/test_distance.cc diff --git a/CLUEstering/include/test/test_kernels.cc b/CLUEstering/serial/include/test/test_kernels.cc similarity index 100% rename from CLUEstering/include/test/test_kernels.cc rename to CLUEstering/serial/include/test/test_kernels.cc diff --git a/extern/alpaka b/extern/alpaka new file mode 160000 index 00000000..5573fc35 --- /dev/null +++ b/extern/alpaka @@ -0,0 +1 @@ +Subproject commit 5573fc351cc1de8d4ffe0db092ad1b3eb9eae177 diff --git a/extern/pybind11 b/extern/pybind11 new file mode 160000 index 00000000..104f921b --- /dev/null +++ b/extern/pybind11 @@ -0,0 +1 @@ +Subproject commit 104f921b0cd0fd0fc90718acd0b40975c5f7f703