diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..e8f8e590 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,7 @@ +[submodule "extern/pybind11"] + path = extern/pybind11 + url = https://github.com/sbaldu/pybind11.git + branch = master +[submodule "extern/alpaka"] + path = extern/alpaka + url = https://github.com/cms-patatrack/alpaka.git diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h b/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h new file mode 100644 index 00000000..33ecf3f4 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h @@ -0,0 +1,31 @@ +#ifndef AlpakaCore_AllocatorConfig_h +#define AlpakaCore_AllocatorConfig_h + +#include + +namespace cms::alpakatools { + + namespace config { + + // bin growth factor (bin_growth in cub::CachingDeviceAllocator) + constexpr unsigned int binGrowth = 2; + + // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator + constexpr unsigned int minBin = 8; // 256 bytes + + // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; // 1 GB + + // total storage for the allocator; 0 means no limit. + constexpr size_t maxCachedBytes = 0; + + // fraction of total device memory taken for the allocator; 0 means no limit. + constexpr double maxCachedFraction = 0.8; + + // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used. 
+ + } // namespace config + +} // namespace cms::alpakatools + +#endif // AlpakaCore_AllocatorConfig_h diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h new file mode 100644 index 00000000..4561d91f --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h @@ -0,0 +1,51 @@ +#ifndef AlpakaCore_AllocatorPolicy_h +#define AlpakaCore_AllocatorPolicy_h + +#include + +namespace cms::alpakatools { + + // Which memory allocator to use + // - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost + // - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2) + // - Caching: (device and host) caching allocator + enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; + + template + constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; + +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + +#if defined ALPAKA_ACC_GPU_CUDA_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace cms::alpakatools + +#endif // 
AlpakaCore_AllocatorPolicy_h diff --git a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h new file mode 100644 index 00000000..a912024a --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h @@ -0,0 +1,145 @@ +#ifndef AlpakaCore_CachedBufAlloc_h +#define AlpakaCore_CachedBufAlloc_h + +#include + +#include "getDeviceCachingAllocator.h" +#include "getHostCachingAllocator.h" + +namespace cms::alpakatools { + + namespace traits { + + //! The caching memory allocator trait. + template + struct CachedBufAlloc { + static_assert(alpaka::meta::DependentFalseType::value, "This device does not support a caching allocator"); + }; + + //! The caching memory allocator implementation for the CPU device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCpu { + // non-cached host-only memory + return alpaka::allocAsyncBuf(queue, extent); + } + }; + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueCudaRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! 
The caching memory allocator implementation for the CUDA device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCudaRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCudaRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! 
The caching memory allocator implementation for the ROCm/HIP device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufHipRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufHipRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace traits + + template + ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { + return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_CachedBufAlloc_h diff --git a/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h new file mode 100644 index 00000000..1cda3e9d --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h @@ -0,0 +1,421 @@ +#ifndef AlpakaCore_CachingAllocator_h +#define AlpakaCore_CachingAllocator_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "alpakaDevices.h" + +// Inspired by cub::CachingDeviceAllocator + +namespace cms::alpakatools { + + namespace detail { + + inline constexpr unsigned int power(unsigned int base, unsigned int exponent) { + unsigned int power = 1; + while (exponent > 0) { + if 
(exponent & 1) { + power = power * base; + } + base = base * base; + exponent = exponent >> 1; + } + return power; + } + + // format a memory size in B/kB/MB/GB + inline std::string as_bytes(size_t value) { + if (value == std::numeric_limits::max()) { + return "unlimited"; + } else if (value >= (1 << 30) and value % (1 << 30) == 0) { + return std::to_string(value >> 30) + " GB"; + } else if (value >= (1 << 20) and value % (1 << 20) == 0) { + return std::to_string(value >> 20) + " MB"; + } else if (value >= (1 << 10) and value % (1 << 10) == 0) { + return std::to_string(value >> 10) + " kB"; + } else { + return std::to_string(value) + " B"; + } + } + + } // namespace detail + + /* + * The "memory device" identifies the memory space, i.e. the device where the memory is allocated. + * A caching allocator object is associated to a single memory `Device`, set at construction time, and unchanged for + * the lifetime of the allocator. + * + * Each allocation is associated to an event on a queue, that identifies the "synchronisation device" according to + * which the synchronisation occurs. + * The `Event` type depends only on the synchronisation `Device` type. + * The `Queue` type depends on the synchronisation `Device` type and the queue properties, either `Sync` or `Async`. + * + * **Note**: how to handle different queue and event types in a single allocator ? store and access type-punned + * queues and events ? or template the internal structures on them, but with a common base class ? + * alpaka does rely on the compile-time type for dispatch. + * + * Common use case #1: accelerator's memory allocations + * - the "memory device" is the accelerator device (e.g. a GPU); + * - the "synchronisation device" is the same accelerator device; + * - the `Queue` type is usually always the same (either `Sync` or `Async`). + * + * Common use case #2: pinned host memory allocations + * - the "memory device" is the host device (e.g. 
system memory); + * - the "synchronisation device" is the accelerator device (e.g. a GPU) whose work queue will access the host; + * memory (direct memory access from the accelerator, or scheduling `alpaka::memcpy`/`alpaka::memset`), and can + * be different for each allocation; + * - the synchronisation `Device` _type_ could potentially be different, but memory pinning is currently tied to + * the accelerator's platform (CUDA, HIP, etc.), so the device type needs to be fixed to benefit from caching; + * - the `Queue` type can be either `Sync` _or_ `Async` on any allocation. + */ + + template + class CachingAllocator { + public: + using Device = TDevice; // the "memory device", where the memory will be allocated + using Queue = TQueue; // the queue used to submit the memory operations + using Event = alpaka::Event; // the events used to synchronise the operations + using Buffer = alpaka::Buf, size_t>; + + // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. 
+ static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " + "host CPU."); + + struct CachedBytes { + size_t free = 0; // total bytes freed and cached on this device + size_t live = 0; // total bytes currently in use oin this device + size_t requested = 0; // total bytes requested and currently in use on this device + }; + + explicit CachingAllocator( + Device const& device, + unsigned int binGrowth, // bin growth factor; + unsigned int minBin, // smallest bin, corresponds to binGrowth^minBin bytes; + // smaller allocations are rounded to this value; + unsigned int maxBin, // largest bin, corresponds to binGrowth^maxBin bytes; + // larger allocations will fail; + size_t maxCachedBytes, // total storage for the allocator (0 means no limit); + double maxCachedFraction, // fraction of total device memory taken for the allocator (0 means no limit); + // if both maxCachedBytes and maxCachedFraction are non-zero, + // the smallest resulting value is used. 
+ bool reuseSameQueueAllocations, // reuse non-ready allocations if they are in the same queue as the new one; + // this is safe only if all memory operations are scheduled in the same queue + bool debug) + : device_(device), + binGrowth_(binGrowth), + minBin_(minBin), + maxBin_(maxBin), + minBinBytes_(detail::power(binGrowth, minBin)), + maxBinBytes_(detail::power(binGrowth, maxBin)), + maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)), + reuseSameQueueAllocations_(reuseSameQueueAllocations), + debug_(debug) { + if (debug_) { + std::ostringstream out; + out << "CachingAllocator settings\n" + << " bin growth " << binGrowth_ << "\n" + << " min bin " << minBin_ << "\n" + << " max bin " << maxBin_ << "\n" + << " resulting bins:\n"; + for (auto bin = minBin_; bin <= maxBin_; ++bin) { + auto binSize = detail::power(binGrowth, bin); + out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n'; + } + out << " maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_); + std::cout << out.str() << std::endl; + } + } + + ~CachingAllocator() { + { + // this should never be called while some memory blocks are still live + std::scoped_lock lock(mutex_); + assert(liveBlocks_.empty()); + assert(cachedBytes_.live == 0); + } + + freeAllCached(); + } + + // return a copy of the cache allocation status, for monitoring purposes + CachedBytes cacheStatus() const { + std::scoped_lock lock(mutex_); + return cachedBytes_; + } + + // Allocate given number of bytes on the current device associated to given queue + void* allocate(size_t bytes, Queue queue) { + // create a block descriptor for the requested allocation + BlockDescriptor block; + block.queue = std::move(queue); + block.requested = bytes; + std::tie(block.bin, block.bytes) = findBin(bytes); + + // try to re-use a cached block, or allocate a new buffer + if (not tryReuseCachedBlock(block)) { + allocateNewBlock(block); + } + + return block.buffer->data(); + } + + // frees an allocation 
+ void free(void* ptr) { + std::scoped_lock lock(mutex_); + + auto iBlock = liveBlocks_.find(ptr); + if (iBlock == liveBlocks_.end()) { + std::stringstream ss; + ss << "Trying to free a non-live block at " << ptr; + throw std::runtime_error(ss.str()); + } + // remove the block from the list of live blocks + BlockDescriptor block = std::move(iBlock->second); + liveBlocks_.erase(iBlock); + cachedBytes_.live -= block.bytes; + cachedBytes_.requested -= block.requested; + + bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_); + if (recache) { + alpaka::enqueue(*(block.queue), *(block.event)); + cachedBytes_.free += block.bytes; + // after the call to insert(), cachedBlocks_ shares ownership of the buffer + // TODO use std::move ? + cachedBlocks_.insert(std::make_pair(block.bin, block)); + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; + std::cout << out.str() << std::endl; + } + } else { + // if the buffer is not recached, it is automatically freed when block goes out of scope + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at " + << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached (" + << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." 
<< std::endl; + std::cout << out.str() << std::endl; + } + } + } + + private: + struct BlockDescriptor { + std::optional buffer; + std::optional queue; + std::optional event; + size_t bytes = 0; + size_t requested = 0; // for monitoring only + unsigned int bin = 0; + + // the "synchronisation device" for this block + auto device() { return alpaka::getDev(*queue); } + }; + + private: + // return the maximum amount of memory that should be cached on this device + size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const { + // note that getMemBytes() returns 0 if the platform does not support querying the device memory + size_t totalMemory = alpaka::getMemBytes(device_); + size_t memoryFraction = static_cast(maxCachedFraction * totalMemory); + size_t size = std::numeric_limits::max(); + if (maxCachedBytes > 0 and maxCachedBytes < size) { + size = maxCachedBytes; + } + if (memoryFraction > 0 and memoryFraction < size) { + size = memoryFraction; + } + return size; + } + + // return (bin, bin size) + std::tuple findBin(size_t bytes) const { + if (bytes < minBinBytes_) { + return std::make_tuple(minBin_, minBinBytes_); + } + if (bytes > maxBinBytes_) { + throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + + " bytes is too large for the caching detail with maximum bin " + + std::to_string(maxBinBytes_) + + " bytes. 
You might want to increase the maximum bin size"); + } + unsigned int bin = minBin_; + size_t binBytes = minBinBytes_; + while (binBytes < bytes) { + ++bin; + binBytes *= binGrowth_; + } + return std::make_tuple(bin, binBytes); + } + + bool tryReuseCachedBlock(BlockDescriptor& block) { + std::scoped_lock lock(mutex_); + + // iterate through the range of cached blocks in the same bin + const auto [begin, end] = cachedBlocks_.equal_range(block.bin); + for (auto iBlock = begin; iBlock != end; ++iBlock) { + if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or + alpaka::isComplete(*(iBlock->second.event))) { + // associate the cached buffer to the new queue + auto queue = std::move(*(block.queue)); + // TODO cache (or remove) the debug information and use std::move() + block = iBlock->second; + block.queue = std::move(queue); + + // if the new queue is on different device than the old event, create a new event + if (block.device() != alpaka::getDev(*(block.event))) { + block.event = Event{block.device()}; + } + + // insert the cached block into the live blocks + // TODO cache (or remove) the debug information and use std::move() + liveBlocks_[block.buffer->data()] = block; + + // update the accounting information + cachedBytes_.free -= block.bytes; + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at " + << block.buffer->data() << " (" << block.bytes << " bytes) for queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() + << " (previously associated with stream " << iBlock->second.queue->m_spQueueImpl.get() << " , event " + << iBlock->second.event->m_spEventImpl.get() << ")." 
<< std::endl; + std::cout << out.str() << std::endl; + } + + // remove the reused block from the list of cached blocks + cachedBlocks_.erase(iBlock); + return true; + } + } + + return false; + } + + Buffer allocateBuffer(size_t bytes, Queue const& queue) { + if constexpr (std::is_same_v>) { + // allocate device memory + return alpaka::allocBuf(device_, bytes); + } else if constexpr (std::is_same_v) { + // allocate pinned host memory accessible by the queue's platform + return alpaka::allocMappedBuf>, std::byte, size_t>(device_, bytes); + } else { + // unsupported combination + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be " + "the host CPU."); + } + } + + void allocateNewBlock(BlockDescriptor& block) { + try { + block.buffer = allocateBuffer(block.bytes, *block.queue); + } catch (std::runtime_error const& e) { + // the allocation attempt failed: free all cached blocks on the device and retry + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes + << " bytes for queue " << block.queue->m_spQueueImpl.get() + << ", retrying after freeing cached allocations" << std::endl; + std::cout << out.str() << std::endl; + } + // TODO implement a method that frees only up to block.bytes bytes + freeAllCached(); + + // throw an exception if it fails again + block.buffer = allocateBuffer(block.bytes, *block.queue); + } + + // create a new event associated to the "synchronisation device" + block.event = Event{block.device()}; + + { + std::scoped_lock lock(mutex_); + cachedBytes_.live += block.bytes; + cachedBytes_.requested += block.requested; + // TODO use std::move() ? 
+ liveBlocks_[block.buffer->data()] = block; + } + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at " + << block.buffer->data() << " (" << block.bytes << " bytes associated with queue " + << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl; + std::cout << out.str() << std::endl; + } + } + + void freeAllCached() { + std::scoped_lock lock(mutex_); + + while (not cachedBlocks_.empty()) { + auto iBlock = cachedBlocks_.begin(); + cachedBytes_.free -= iBlock->second.bytes; + + if (debug_) { + std::ostringstream out; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes + << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free + << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding." + << std::endl; + std::cout << out.str() << std::endl; + } + + cachedBlocks_.erase(iBlock); + } + } + + // TODO replace with a tbb::concurrent_multimap ? + using CachedBlocks = std::multimap; // ordered by the allocation bin + // TODO replace with a tbb::concurrent_map ? 
+ using BusyBlocks = std::map; // ordered by the address of the allocated memory + + inline static const std::string deviceType_ = boost::core::demangle(typeid(Device).name()); + + mutable std::mutex mutex_; + Device device_; // the device where the memory is allocated + + CachedBytes cachedBytes_; + CachedBlocks cachedBlocks_; // Set of cached device allocations available for reuse + BusyBlocks liveBlocks_; // map of pointers to the live device allocations currently in use + + const unsigned int binGrowth_; // Geometric growth factor for bin-sizes + const unsigned int minBin_; + const unsigned int maxBin_; + + const size_t minBinBytes_; + const size_t maxBinBytes_; + const size_t maxCachedBytes_; // Maximum aggregate cached bytes per device + + const bool reuseSameQueueAllocations_; + const bool debug_; + }; + +} // namespace cms::alpakatools + +#endif // AlpakaCore_CachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h new file mode 100644 index 00000000..d010dc3e --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h @@ -0,0 +1,67 @@ +#ifndef AlpakaCore_HostOnlyTask_h +#define AlpakaCore_HostOnlyTask_h + +#include +#include + +#include + +namespace alpaka { + + class HostOnlyTask { + public: + HostOnlyTask(std::function task) : task_(std::move(task)) {} + + void operator()() const { task_(); } + + private: + std::function task_; + }; + + namespace trait { + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + //! 
The CUDA async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiCudaRt; + + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + //! The HIP async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + using TApi = ApiHipRt; + + static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace trait + +} // namespace alpaka + +#endif // AlpakaCore_HostOnlyTask_h diff --git a/CLUEstering/alpaka/AlpakaCore/StreamCache.h b/CLUEstering/alpaka/AlpakaCore/StreamCache.h new file mode 100644 index 00000000..682bb7e7 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/StreamCache.h @@ -0,0 +1,56 @@ +#ifndef AlpakaCore_StreamCache_h +#define AlpakaCore_StreamCache_h + +#include +#include + +#include "alpakaConfig.h" +#include "getDeviceIndex.h" +#include "Framework/ReusableObjectHolder.h" + +namespace cms::alpakatools { + + template + class StreamCache { + using Device = alpaka::Dev; + using Platform = alpaka::Pltf; + + 
public: + // StreamCache should be constructed by the first call to + // getStreamCache() only if we have CUDA devices present + StreamCache() : cache_(alpaka::getDevCount()) {} + + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + // This function is thread safe + ALPAKA_FN_HOST std::shared_ptr get(Device const& dev) { + return cache_[cms::alpakatools::getDeviceIndex(dev)].makeOrGet([dev]() { return std::make_unique(dev); }); + } + + private: + // not thread safe, intended to be called only from CUDAService destructor + void clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // StreamCache lives through multiple tests (and go through + // multiple shutdowns of the framework). + cache_.clear(); + cache_.resize(alpaka::getDevCount()); + } + + std::vector> cache_; + }; + + // Gets the global instance of a StreamCache + // This function is thread safe + template + StreamCache& getStreamCache() { + // the public interface is thread safe + static StreamCache cache; + return cache; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_StreamCache_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc b/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc new file mode 100644 index 00000000..08ccbb1a --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpaka/initialise.cc @@ -0,0 +1,32 @@ +#include + +#include + +#include "../alpakaConfig.h" +#include "../alpakaDevices.h" +#include "../initialise.h" +#include "../demangle.h" + +namespace cms::alpakatools { + + template + void initialise() { + constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; + + if (devices.empty()) { + devices = enumerate(); + auto size = devices.size(); + //std::cout << edm::demangle << " platform succesfully initialised." 
<< std::endl; + std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; + for (auto const& device : devices) { + std::cout << " - " << alpaka::getName(device) << std::endl; + } + } else { + //std::cout << edm::demangle << " platform already initialised." << std::endl; + } + } + + // explicit template instantiation definition + template void initialise(); + +} // namespace cms::alpakatools diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h new file mode 100644 index 00000000..3f94e9cf --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h @@ -0,0 +1,160 @@ +#ifndef AlpakaCore_alpakaConfig_h +#define AlpakaCore_alpakaConfig_h + +#include "alpakaFwd.h" + +namespace alpaka_common { + + // common types and dimensions + using Idx = uint32_t; + using Extent = uint32_t; + using Offsets = Extent; + + using Dim0D = alpaka::DimInt<0u>; + using Dim1D = alpaka::DimInt<1u>; + using Dim2D = alpaka::DimInt<2u>; + using Dim3D = alpaka::DimInt<3u>; + + template + using Vec = alpaka::Vec; + using Vec1D = Vec; + using Vec2D = Vec; + using Vec3D = Vec; + using Scalar = Vec; + + template + using WorkDiv = alpaka::WorkDivMembers; + using WorkDiv1D = WorkDiv; + using WorkDiv2D = WorkDiv; + using WorkDiv3D = WorkDiv; + + // host types + using DevHost = alpaka::DevCpu; + using PltfHost = alpaka::PltfCpu; + +} // namespace alpaka_common + +// trick to force expanding ALPAKA_ACCELERATOR_NAMESPACE before stringification inside DEFINE_FWK_MODULE +#define DEFINE_FWK_ALPAKA_MODULE2(name) DEFINE_FWK_MODULE(name) +#define DEFINE_FWK_ALPAKA_MODULE(name) DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) + +#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(name) DEFINE_FWK_EVENTSETUP_MODULE(name) +#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \ + DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) + +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT +namespace alpaka_cuda_async { + using 
namespace alpaka_common; + + using Platform = alpaka::PltfCudaRt; + using Device = alpaka::DevCudaRt; + using Queue = alpaka::QueueCudaRtNonBlocking; + using Event = alpaka::EventCudaRt; + + template + using Acc = alpaka::AccGpuCudaRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_cuda_async + +#endif // ALPAKA_ACC_GPU_CUDA_PRESENT + +#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async +#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT +namespace alpaka_rocm_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::QueueHipRtNonBlocking; + using Event = alpaka::EventHipRt; + + template + using Acc = alpaka::AccGpuHipRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_rocm_async + +#endif // ALPAKA_ACC_GPU_HIP_PRESENT + +#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT +namespace alpaka_serial_sync { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuSerial; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_serial_sync + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT +namespace alpaka_tbb_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuNonBlocking; + using Event = alpaka::EventCpu; + + template + 
using Acc = alpaka::AccCpuTbbBlocks; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_tbb_async + +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND + +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT +namespace alpaka_omp2_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::QueueCpuBlocking; + using Event = alpaka::EventCpu; + + template + using Acc = alpaka::AccCpuOmp2Blocks; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_omp2_async + +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT + +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND + +#endif // AlpakaCore_alpakaConfig_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h new file mode 100644 index 00000000..d980bf38 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h @@ -0,0 +1,40 @@ +#ifndef AlpakaCore_alpakaDevices_h +#define AlpakaCore_alpakaDevices_h + +#include +#include + +#include + +#include "alpakaConfig.h" +#include "getDeviceIndex.h" + +namespace cms::alpakatools { + + // alpaka host device + inline const alpaka_common::DevHost host = alpaka::getDevByIdx(0u); + + // alpaka accelerator devices + template + inline std::vector> devices; + + template + std::vector> enumerate() { + assert(getDeviceIndex(host) == 0u); + + using Device = alpaka::Dev; + using Platform = TPlatform; + + std::vector devices; + uint32_t n = alpaka::getDevCount(); + devices.reserve(n); + for (uint32_t i = 0; i < n; ++i) { + devices.push_back(alpaka::getDevByIdx(i)); + assert(getDeviceIndex(devices.back()) == static_cast(i)); + } + return devices; + } + 
+} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaDevices_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h new file mode 100644 index 00000000..2234e440 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h @@ -0,0 +1,96 @@ +#ifndef AlpakaCore_alpakaFwd_h +#define AlpakaCore_alpakaFwd_h + +#include +#include +#include + +/** + * This file forward declares specific types defined in Alpaka + * (depending on the backend-enabling macros) so that these types + * would be available throughout CMSSW without a direct dependence on + * Alpaka in order to avoid the constraints that would impose + * (primarily the device compiler) + * + * This is a little bit brittle, but let's see how it goes. + */ +namespace alpaka { + + // miscellanea + template + using DimInt = std::integral_constant; + + template + class Vec; + + template + class WorkDivMembers; + + // API + struct ApiCudaRt; + struct ApiHipRt; + + // Platforms + class PltfCpu; + template + class PltfUniformCudaHipRt; + using PltfCudaRt = PltfUniformCudaHipRt; + using PltfHipRt = PltfUniformCudaHipRt; + + // Devices + class DevCpu; + template + class DevUniformCudaHipRt; + using DevCudaRt = DevUniformCudaHipRt; + using DevHipRt = DevUniformCudaHipRt; + + // Queues + template + class QueueGenericThreadsBlocking; + using QueueCpuBlocking = QueueGenericThreadsBlocking; + + template + class QueueGenericThreadsNonBlocking; + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; + + namespace uniform_cuda_hip::detail { + template + class QueueUniformCudaHipRt; + } + using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + + // Events + template + class EventGenericThreads; + using EventCpu = 
EventGenericThreads; + + template + class EventUniformCudaHipRt; + using EventCudaRt = EventUniformCudaHipRt; + using EventHipRt = EventUniformCudaHipRt; + + // Accelerators + template + class AccGpuUniformCudaHipRt; + + template + using AccGpuCudaRt = AccGpuUniformCudaHipRt; + + template + using AccGpuHipRt = AccGpuUniformCudaHipRt; + + template + class AccCpuSerial; + + template + class AccCpuTbbBlocks; + + template + class AccCpuOmp2Blocks; + +} // namespace alpaka + +#endif // AlpakaCore_alpakaFwd_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h new file mode 100644 index 00000000..6bec9424 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h @@ -0,0 +1,237 @@ +#ifndef AlpakaCore_alpakaMemory_h +#define AlpakaCore_alpakaMemory_h + +#include +#include "initialise.h" + +#if __cplusplus >= 202002L +namespace cms { + using std::is_bounded_array; + using std::is_unbounded_array; +} // namespace cms +#else +#include +#include +namespace cms { + using boost::is_bounded_array; + using boost::is_unbounded_array; +} // namespace cms +#endif + +namespace cms { + template + inline constexpr bool is_bounded_array_v = is_bounded_array::value; + + template + inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; +} // namespace cms + +#include + +#include "AllocatorPolicy.h" +#include "CachedBufAlloc.h" +#include "alpakaConfig.h" +#include "alpakaDevices.h" + +namespace cms::alpakatools { + + // for Extent, Dim1D, Idx + using namespace alpaka_common; + + // type deduction helpers + namespace detail { + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct buffer_type { + using type = alpaka::Buf; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + using type = alpaka::ViewPlainPtr; + }; + + template + struct view_type { + 
using type = alpaka::ViewPlainPtr; + }; + + } // namespace detail + + // scalar and 1-dimensional host buffers + + template + using host_buffer = typename detail::buffer_type::type; + + // non-cached, non-pinned, scalar and 1-dimensional host buffers + + template + std::enable_if_t, host_buffer> make_host_buffer() { + return alpaka::allocBuf(host, Scalar{}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(Extent extent) { + return alpaka::allocBuf, Idx>(host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer() { + return alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); + } + + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue + // the memory is pinned according to the device associated to the queue + + template + std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(host, queue, Scalar{}); + } else { + return alpaka::allocMappedBuf(host, alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host, queue, Vec1D{extent}); + } else { + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and not std::is_array_v>, host_buffer> + make_host_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(host, queue, Vec1D{std::extent_v}); + } else { + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional host views + + template + using host_view = typename detail::view_type::type; + + template + 
std::enable_if_t, host_view> make_host_view(T& data) { + return alpaka::ViewPlainPtr(&data, host, Scalar{}); + } + + template + host_view make_host_view(T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, host_view> + make_host_view(T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host, Vec1D{std::extent_v}); + } + + // scalar and 1-dimensional device buffers + + template + using device_buffer = typename detail::buffer_type::type; + + template + std::enable_if_t, device_buffer, T>> make_device_buffer( + TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf(queue, Scalar{}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf(alpaka::getDev(queue), Scalar{}); + } + } + + template + std::enable_if_t and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue, Extent extent) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, Idx>(alpaka::getDev(queue), queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{extent}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{extent}); + } + } + + template + std::enable_if_t and not std::is_array_v>, + device_buffer, T>> + make_device_buffer(TQueue const& queue) { + if constexpr (allocator_policy> == AllocatorPolicy::Caching) { + return allocCachedBuf, 
Idx>(alpaka::getDev(queue), queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{std::extent_v}); + } + if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{std::extent_v}); + } + } + + // scalar and 1-dimensional device views + + template + using device_view = typename detail::view_type::type; + + template + std::enable_if_t, device_view> make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr(&data, device, Scalar{}); + } + + template + device_view make_device_view(TDev const& device, T* data, Extent extent) { + return alpaka::ViewPlainPtr(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data, Extent extent) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); + } + + template + std::enable_if_t and not std::is_array_v>, device_view> + make_device_view(TDev const& device, T& data) { + return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{std::extent_v}); + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaMemory_h diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h new file mode 100644 index 00000000..b94c89a4 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h @@ -0,0 +1,385 @@ +#ifndef AlpakaCore_alpakaWorkDiv_h +#define AlpakaCore_alpakaWorkDiv_h + +#include +#include + +#include + +#include "alpakaConfig.h" + +using namespace alpaka_common; + +namespace cms::alpakatools { + + /********************************************* + * WORKDIV CREATION + ********************************************/ + + /* + * If the first argument is not a multiple of the second argument, round it up to the next multiple. 
+ */ + inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; } + + /* + * Return the integer division of the first argument by the second argument, rounded up to the next integer. + */ + inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } + + /* + * Creates the accelerator-dependent workdiv for 1-dimensional operations. + */ + template + inline WorkDiv make_workdiv(Idx blocksPerGrid, Idx threadsPerBlockOrElementsPerThread) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - threadsPerBlock is always 1; + // - threadsPerBlockOrElementsPerThread is the number of elements per thread. + const auto threadsPerBlock = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + } + } + + /* + * Creates the accelerator-dependent workdiv for N-dimensional operations. 
+ */ + template + inline WorkDiv> make_workdiv(const Vec>& blocksPerGrid, + const Vec>& threadsPerBlockOrElementsPerThread) { + using Dim = alpaka::Dim; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + { + // On CPU backends, run serially with a single thread per block: + // - threadsPerBlock is always 1; + // - threadsPerBlockOrElementsPerThread is the number of elements per thread. + const auto threadsPerBlock = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + } + } + + /********************************************* + * RANGE COMPUTATION + ********************************************/ + + /* + * Computes the range of the elements indexes, local to the block. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the thread index in block. 
+ const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Compute the elements indexes in block. + // Obviously relevant for CPU only. + // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. + const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; + const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! + const Idx endElementIdxUncut = firstElementIdx + threadDimension; + + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdx, endElementIdxUncut}; + } + + /* + * Computes the range of the elements indexes, local to the block. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(alpaka::Dim::value == Dim1D::value, + // "Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocal, endElementIdxLocal}; + } + + /* + * Computes the range of the elements indexes in grid. + * Warning: the max index is not truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the block index in grid. 
+ const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift += blockIdxInGrid * blockDimension; + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift, dimIndex); + } + + /* + * Computes the range of the elements indexes in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(dimIndex <= alpaka::Dim::value, + //"Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxGlobal, endElementIdxGlobal] = element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxGlobal = std::min(endElementIdxGlobal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxGlobal, endElementIdxGlobal}; + } + + /* + * Computes the range of the element(s) index(es) in grid. + * Truncated by the max number of elements of interest. + */ + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const unsigned int dimIndex = 0u) { + Idx elementIdxShift = 0u; + return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + } + + /********************************************* + * LOOP ON ALL ELEMENTS + ********************************************/ + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are local to the BLOCK. 
+ */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + const auto& [firstElementIdx, endElementIdx] = + element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + + for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { + func(elementIdx); + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * Loop on all (CPU) elements. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Indexes are expressed in GRID 'frame-of-reference'. + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Take into account the block index in grid to compute the element indices. 
+ const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + elementIdxShift += blockIdxInGrid * blockDimension; + + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_grid(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** + * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS + **************************************************************/ + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by BLOCK size. + * Indexes are local to the BLOCK. + */ + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Stride = block size. + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* + * (CPU) Loop on all elements + (CPU/GPU) Strided access. + * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. + * Stride to full problem size, by GRID size. + * Indexes are local to the GRID. + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Stride = grid size. + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); + } + } + } + + /* + * Overload for elementIdxShift = 0 + */ + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** + * LOOP ON ALL ELEMENTS WITH ONE LOOP + **************************************************************/ + + /* + * Case where the input index i has reached the end of threadDimension: strides the input index. + * Otherwise: do nothing. + * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops + * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example, + * because of the additional loop over elements in Alpaka model). + * This allows to keep the 'continue' and 'break' statements as-is from legacy code, + * and hence avoids a lot of legacy code reshuffling. + * NB 2: Modifies i, firstElementIdx and endElementIdx. 
+ */ + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( + Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; + } + } + return isNextStrideElementValid; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_alpakaWorkDiv_h diff --git a/CLUEstering/alpaka/AlpakaCore/backend.h b/CLUEstering/alpaka/AlpakaCore/backend.h new file mode 100644 index 00000000..8d58953c --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/backend.h @@ -0,0 +1,17 @@ +#ifndef AlpakaCore_backend_h +#define AlpakaCore_backend_h + +enum class Backend { SERIAL, TBB, CUDA, HIP }; + +inline std::string const& name(Backend backend) { + static const std::string names[] = {"serial_sync", "tbb_async", "cuda_async", "rocm_async"}; + return names[static_cast(backend)]; +} + +template +inline T& operator<<(T& out, Backend backend) { + out << name(backend); + return out; +} + +#endif // AlpakaCore_backend_h diff --git a/CLUEstering/alpaka/AlpakaCore/chooseDevice.h b/CLUEstering/alpaka/AlpakaCore/chooseDevice.h new file mode 100644 index 00000000..5cdd4141 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/chooseDevice.h @@ -0,0 +1,24 @@ +#ifndef HeterogeneousCore_AlpakaCore_chooseDevice_h +#define HeterogeneousCore_AlpakaCore_chooseDevice_h + +#include "alpakaConfig.h" +#include "alpakaDevices.h" +#include "Framework/Event.h" + +namespace cms::alpakatools { + + template + alpaka::Dev const& chooseDevice(edm::StreamID id) { + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
+ + // TODO: improve the "assignment" logic + auto const& devices = cms::alpakatools::devices; + return devices[id % devices.size()]; + } + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaCore_chooseDevice_h diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h new file mode 100644 index 00000000..4bcac1ec --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h @@ -0,0 +1,69 @@ +#ifndef AlpakaCore_getDeviceCachingAllocator_h +#define AlpakaCore_getDeviceCachingAllocator_h + +#include +#include +#include + +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "alpakaDevices.h" +#include "alpakaFwd.h" +#include "getDeviceIndex.h" + +namespace cms::alpakatools { + + namespace detail { + + template + auto allocate_device_allocators() { + using Allocator = CachingAllocator; + auto const& devices = cms::alpakatools::enumerate>(); + auto const size = devices.size(); + + // allocate the storage for the objects + auto ptr = std::allocator().allocate(size); + + // construct the objects in the storage + for (size_t index = 0; index < size; ++index) { + new (ptr + index) Allocator(devices[index], + config::binGrowth, + config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + true, // reuseSameQueueAllocations + false); // debug + } + + // use a custom deleter to destroy all objects and deallocate the memory + auto deleter = [size](Allocator* ptr) { + for (size_t i = size; i > 0; --i) { + (ptr + i - 1)->~Allocator(); + } + std::allocator().deallocate(ptr, size); + }; + + return std::unique_ptr(ptr, deleter); + } + + } // namespace detail + + template + inline CachingAllocator& getDeviceCachingAllocator(TDevice const& device) { + // initialise all allocators, one per device + static auto allocators = detail::allocate_device_allocators(); + + size_t const index = getDeviceIndex(device); + + std::vector devs = 
alpaka::getDevs>(); + + assert(index < cms::alpakatools::enumerate>().size()); + + // the public interface is thread safe + return allocators[index]; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getDeviceCachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h new file mode 100644 index 00000000..5abbeaa0 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h @@ -0,0 +1,29 @@ +#ifndef AlpakaCore_getDeviceIndex_h +#define AlpakaCore_getDeviceIndex_h + +#include + +namespace cms::alpakatools { + + // generic interface, for DevOacc and DevOmp5 + template + inline int getDeviceIndex(Device const& device) { + return device.iDevice(); + } + + // overload for DevCpu + inline int getDeviceIndex(alpaka::DevCpu const& device) { return 0; } + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // overload for DevCudaRt + inline int getDeviceIndex(alpaka::DevCudaRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + // overload for DevHipRt + inline int getDeviceIndex(alpaka::DevHipRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getDeviceIndex_h diff --git a/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h new file mode 100644 index 00000000..3103a7bf --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h @@ -0,0 +1,28 @@ +#ifndef AlpakaCore_getHostCachingAllocator_h +#define AlpakaCore_getHostCachingAllocator_h + +#include "AllocatorConfig.h" +#include "CachingAllocator.h" +#include "alpakaDevices.h" + +namespace cms::alpakatools { + + template + inline CachingAllocator& getHostCachingAllocator() { + // thread safe initialisation of the host allocator + static CachingAllocator allocator(host, + config::binGrowth, 
+ config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + false, // reuseSameQueueAllocations + false); // debug + + // the public interface is thread safe + return allocator; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_getHostCachingAllocator_h diff --git a/CLUEstering/alpaka/AlpakaCore/initialise.h b/CLUEstering/alpaka/AlpakaCore/initialise.h new file mode 100644 index 00000000..cd4125a0 --- /dev/null +++ b/CLUEstering/alpaka/AlpakaCore/initialise.h @@ -0,0 +1,27 @@ +#ifndef AlpakaCore_initialise_h +#define AlpakaCore_initialise_h + +#include "alpakaConfig.h" + +namespace cms::alpakatools { + + template + void initialise(); + + // explicit template instantiation declaration +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + extern template void initialise(); +#endif + +} // namespace cms::alpakatools + +#endif // AlpakaCore_initialise_h diff --git a/CLUEstering/alpaka/BindingModules/CLUEstering.py b/CLUEstering/alpaka/BindingModules/CLUEstering.py new file mode 100644 index 00000000..19cdfbbe --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/CLUEstering.py @@ -0,0 +1,925 @@ +""" +Density based clustering algorithm developed at CERN. 
+""" + +from dataclasses import dataclass +from glob import glob +import random as rnd +from math import sqrt +import time +import types +from typing import Union +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.datasets import make_blobs +from sklearn.preprocessing import StandardScaler +import CLUE_Convolutional_Kernels as clue_kernels +import CLUE_CPU_Serial as cpu_serial +tbb_found = False +if len(glob('./CLUE_CPU_TBB*.so')) != 0: + import CLUE_CPU_TBB as cpu_tbb + tbb_found = True +cuda_found = False +if len(glob('./CLUE_GPU_CUDA*.so')) != 0: + import CLUE_GPU_CUDA as gpu_cuda + cuda_found = True +hip_found = False +if len(glob('./CLUE_GPU_HIP*.so')) != 0: + import CLUE_GPU_HIP as gpu_hip + hip_found = True + +def test_blobs(n_samples: int, n_dim: int , n_blobs: int = 4, mean: float = 0, + sigma: float = 0.5, x_max: float = 30, y_max: float = 30) -> pd.DataFrame: + """ + Returns a dataframe containing randomly generated 2-dimensional or 3-dimensional blobs. + + This functions serves as a tool for generating a random dataset to test the library. + + Parameters + ---------- + n_samples : int + The number of points in the dataset. + n_dim : int + The number of dimensions. + n_blobs : int, optional + The number of blobs that should be produced. By default it is set to 4. + mean : float, optional + The mean of the gaussian distribution of the z values. + sigma : float, optional + The standard deviation of the gaussian distribution of the z values. + x_max : float, optional + Limit of the space where the blobs are created in the x direction. + y_max : float, optional + Limit of the space where the blobs are created in the y direction. + + Returns + ------- + Pandas DataFrame + DataFrame containing n_blobs gaussian blobs. + """ + + if n_blobs < 0: + raise ValueError('Wrong parameter value. The number of blobs must be positive.') + if sigma < 0.: + raise ValueError("Wrong parameter value. 
The mean and sigma of the blobs" + + " cannot be negative.") + if n_dim > 3: + raise ValueError("Wrong number of dimensions. Blobs can only be generated" + + " in 2 or 3 dimensions.") + centers = [] + if n_dim == 2: + data = {'x0': np.array([]), 'x1': np.array([]), 'weight': np.array([])} + centers = [[x_max * rnd.random(), y_max * rnd.random()] for _ in range(n_blobs)] + blob_data = make_blobs(n_samples=n_samples, centers=np.array(centers))[0] + + data['x0'] = blob_data.T[0] + data['x1'] = blob_data.T[1] + data['weight'] = np.full(shape=len(blob_data.T[0]), fill_value=1) + + + return pd.DataFrame(data) + if n_dim == 3: + data = {'x0': [], 'x1': [], 'x2': [], 'weight': []} + sqrt_samples = int(sqrt(n_samples)) + z_values = np.random.normal(mean, sigma,sqrt_samples) + centers = [[x_max * rnd.random(), y_max * rnd.random()] for _ in range(n_blobs)] + + for value in z_values: # for every z value, a layer is generated. + blob_data = make_blobs(n_samples=sqrt_samples, centers=np.array(centers))[0] + data['x0'] = np.concatenate([data['x0'], blob_data.T[0]]) + data['x1'] = np.concatenate([data['x1'], blob_data.T[1]]) + data['x2'] = np.concatenate([data['x2'], np.full(shape=sqrt_samples, + fill_value=value)]) + data['weight'] = np.concatenate([data['weight'], np.full(shape=sqrt_samples, + fill_value=1)]) + + return pd.DataFrame(data) + +@dataclass() +class clustering_data: + """ + Container characterizing the data used for clustering. + + Attributes + ---------- + coords : np.ndarray + Spatially normalized data coordinates in the coordinate system used for clustering. + original_coords : np.ndarray + Data coordinates in the original coordinate system provided by the user. + weight : np.ndarray + Weight values of the data points. + domain_ranges : list + List containing the ranges of the domains for every coordinate. + n_dim : int + Number of dimensions. + n_points : int + Number of points in the clustering data. 
+ """ + + coords : np.ndarray + original_coords : np.ndarray + weight : np.ndarray + n_dim : int + n_points : int + +@dataclass(eq=False) +class cluster_properties: + """ + Container of the data resulting from the clusterization of the input data. + + Attributes + ---------- + n_clusters : int + Number of clusters constructed. + cluster_ids : np.ndarray + Array containing the cluster_id of each point. + is_seed : np.ndarray + Array of integers containing '1' if a point is a seed and '0' if it isn't + cluster_points : np.ndarray + Array containing, for each cluster, the list of point_ids corresponding to the + clusters belonging to that cluster. + points_per_cluster : np.ndarray + Array containing the number of points belonging to each cluster. + output_df : pd.DataFrame + Dataframe containing is_seed and cluster_ids as columns. + """ + + n_clusters : int + cluster_ids : np.ndarray + is_seed : np.ndarray + cluster_points : np.ndarray + points_per_cluster : np.ndarray + output_df : pd.DataFrame + + def __eq__(self, other): + if self.n_clusters != other.n_clusters: + return False + if not (self.cluster_ids == other.cluster_ids).all(): + return False + if not (self.is_seed == other.is_seed).all(): + return False + + return True + + +class clusterer: + """ + Class representing a wrapper for the methods used in the process of clustering using + the CLUE algorithm. + + Attributes + ---------- + dc_ : float + Spatial parameter indicating how large is the region over which the local density of + each point is calculated. + rhoc : float + Parameter representing the density threshold value which divides seeds and + outliers. + + Points with a density lower than rhoc can't be seeds, can only be followers + or outliers. + outlier : float + Multiplicative increment of dc_ for getting the region over which the followers of a + point are searched. 
+ + While dc_ determines the size of the search box in which the neighbors of a point are + searched when calculating its local density, when looking for followers while trying + to find potential seeds the size of the search box is given by dm = dc_ * outlier. + ppbin : int + Average number of points to be found in each tile. + kernel : Algo.kernel + Convolution kernel used to calculate the local density of the points. + clust_data : clustering_data + Container of the data used by the clustering algorithm. + clust_prop : cluster_properties + Container of the data produced as output of the algorithm. + elapsed_time : int + Execution time of the algorithm, expressed in nanoseconds. + """ + + def __init__(self, dc_: float, rhoc_: float, outlier_: float, ppbin: int = 10): + self.dc_ = dc_ + self.rhoc = rhoc_ + self.outlier = outlier_ + self.ppbin = ppbin + + # Initialize attributes + ## Data containers + self.clust_data = None + self.scaler = StandardScaler() + + ## Kernel for calculation of local density + self.kernel = clue_kernels.FlatKernel(0.5) + + ## Output attributes + self.clust_prop = None + self.elapsed_time = 0. + + def _read_array(self, input_data: Union[list,np.ndarray]) -> None: + """ + Reads data provided with lists or np.ndarrays + + Attributes + ---------- + input_data : list, np.ndarray + The coordinates and weight values of the data points + + Modified attributes + ------------------- + clust_data : clustering_data + Properties of the input data + + Returns + ------- + None + """ + + if len(input_data) < 2 or len(input_data) > 10: + raise ValueError("Inadequate data. 
The data must contain at least one coordinate" + + " and the weight.") + self.clust_data = clustering_data(np.asarray(input_data[:-1]), + np.copy(np.asarray(input_data[:-1])), + np.asarray(input_data[-1]), + len(input_data[:-1]), + len(input_data[-1])) + + def _read_string(self, input_data: str) -> Union[pd.DataFrame,None]: + """ + Reads data provided by passing a string containing the path to a csv file + + Attributes + ---------- + input_data : str + The path to the csv file containing the input data + + Modified attributes + ------------------- + None + + Returns + ------------------- + pd.DataFrame + Dataframe containing the input data + """ + + if not input_data.endswith('.csv'): + raise ValueError('Wrong type of file. The file is not a csv file.') + df_ = pd.read_csv(input_data) + return df_ + + def _read_dict_df(self, input_data: Union[dict,pd.DataFrame]) -> pd.DataFrame: + """ + Reads data provided using dictionaries or pandas dataframes + + Attributes + ---------- + input_data : dict, pd.DataFrame + The coordinates and weight values of the data points + + Modified attributes + ------------------- + None + + Returns + ------------------- + pd.DataFrame + Dataframe containing the input data + """ + + df_ = pd.DataFrame(input_data, copy=False) + return df_ + + def _handle_dataframe(self, df_: pd.DataFrame) -> None: + """ + Constructs the clust_data attribute from the dataframe produced by the + _read_string or _read_dict_df methods + + Modified attributes + ------------------- + clust_data : clustering_data + Properties of the input data + + Returns + ------- + None + """ + + # Check that the user provided the weights + if 'weight' not in df_.columns: + raise ValueError("Inadequate data. The input dataframe must" + + " contain a weight column.") + + coordinate_columns = [col for col in df_.columns if col[0] == 'x'] + + # Check that the dimensionality of the dataset is adequate + if len(df_.columns) < 2: + raise ValueError("Inadequate data. 
The data must contain" + + " at least one coordinate and the weight.") + if len(coordinate_columns) > 10: + raise ValueError("Inadequate data. The maximum number of" + + " dimensions supported is 10.") + n_dim = len(coordinate_columns) + n_points = len(df_.index) + coords = df_.iloc[:, 0:-1].to_numpy() + + self.clust_data = clustering_data(coords, + np.copy(coords), + np.asarray(df_['weight']), + n_dim, + n_points) + + def _rescale(self) -> None: + """ + Normalizes the input data using a standard scaler + + Modified attributes + ------------------- + clust_data.coords : np.ndarray + Array containing the coordinates and weight values of the data points + + Returns + ------- + None + """ + + for dim in range(self.clust_data.n_dim): + self.clust_data.coords.T[dim] = \ + self.scaler.fit_transform(self.clust_data.coords.T[dim].reshape(-1, 1)).reshape(1, -1)[0] + + def read_data(self, + input_data: Union[pd.DataFrame,str,dict,list,np.ndarray]) -> None: + """ + Reads the data in input and fills the class members containing the coordinates + of the points, the weight, the number of dimensions and the number of points. + + Parameters + ---------- + input_data : pandas dataframe + The dataframe should contain one column for every + coordinate, each one called 'x*', and one column for the weight. + input_data : string + The string should contain the full path to a csv file containing + the data. + input_data : dict + input_data : array_like + The list or numpy array should contain a list of lists for the + coordinates and a list for the weight. + kwargs : tuples + Tuples corresponding to the domain of any periodic variables. The + keyword should be the keyword of the corresponding variable. + + Modified attributes + ------------------- + coords : ndarray + Point coordinates used for clustering, spatially normalized. + original_coords : ndarray + Point coordinates in the original coordinate system used by the user. + weight : ndarray + Weights of all the points. 
+ domain_ranges : list of Algo.domain_t + List of the domains for each coordinate. + n_dim : int + The number of dimensions in which we are calculating the clusters. + n_points : int + The number of points in the dataset. + + Returns + ------- + None + """ + + # lists and np ndarrays + if isinstance(input_data, (list, np.ndarray)): + self._read_array(input_data) + + # path to .csv file or pandas dataframe + if isinstance(input_data, (str)): + df = self._read_string(input_data) + self._handle_dataframe(df) + + if isinstance(input_data, (dict, pd.DataFrame)): + df = self._read_dict_df(input_data) + self._handle_dataframe(df) + + # Rescale the coordinates with a standard scaler + self._rescale() + + def change_coordinates(self, **kwargs: types.FunctionType) -> None: + """ + Change the coordinate system + + Parameters + ---------- + kwargs : function objects + The functions for the change of coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). + + Modifies attributes + ------------------- + coords : ndarray + Coordinates used for clustering converted in the chosen coordinate system. + + Returns + ------- + None + """ + + # Change the coordinate system + for coord, func in kwargs.items(): + self.clust_data.coords[int(coord[1])] = func(self.clust_data.original_coords) + + # Normalize the coordinate with a standard scaler + self.clust_data.coords[int(coord[1])] = \ + self.scaler.fit_transform( + self.clust_data.coords[int(coord[1])].reshape(-1, 1) + ).reshape(1, -1)[0] + + def choose_kernel(self, + choice: str, + parameters: Union[list,None] = None, + function: types.FunctionType = lambda: 0) -> None: + """ + Changes the kernel used in the calculation of local density. The default kernel + is a flat kernel with parameter 0.5 + + Parameters + ---------- + choice : string + The type of kernel that you want to choose (flat, exp, gaus or custom). + parameters : array_like, optional + List of the parameters needed by the kernels. 
+ The flat kernel requires one, the exponential requires two + (amplitude and mean), the gaussian requires three (amplitude, + mean and standard deviation) and the custom doesn't require any. + function : function object, optional + Function that should be used as kernel when the custom kernel is chosen. + + Modified attributes + ------------------- + kernel : Algo.kernel + + Return + ------ + None + """ + + if choice == "flat": + if len(parameters) != 1: + raise ValueError("Wrong number of parameters. The flat kernel" + + " requires 1 parameter.") + self.kernel = CLUE_Convolutional_Kernels.FlatKernel(parameters[0]) + elif choice == "exp": + if len(parameters) != 2: + raise ValueError("Wrong number of parameters. The exponential" + + " kernel requires 2 parameters.") + self.kernel = CLUE_Convolutional_Kernels.ExponentialKernel(parameters[0], parameters[1]) + elif choice == "gaus": + if len(parameters) != 3: + raise ValueError("Wrong number of parameters. The gaussian" + + " kernel requires 3 parameters.") + self.kernel = CLUE_Convolutional_Kernels.GaussinKernel(parameters[0], parameters[1], parameters[2]) + elif choice == "custom": + if len(parameters) != 0: + raise ValueError("Wrong number of parameters. Custom kernels" + + " requires 0 parameters.") + else: + raise ValueError("Invalid kernel. The allowed choices for the" + + " kernels are: flat, exp, gaus and custom.") + + def list_devices(self, backend: str = "all") -> None: + """ + Lists the devices available for the chosen backend. + + Parameters + ---------- + backend : string, optional + The backend for which the devices are listed. The allowed values are + 'all', 'cpu serial', 'cpu tbb' and 'gpu cuda'. + The default value is 'all'. + + Raises + ------ + ValueError : If the backend is not valid. 
+ """ + + if backend == "all": + cpu_serial.listDevices('cpu serial') + if tbb_found: + cpu_tbb.listDevices('cpu tbb') + if cuda_found: + gpu_cuda.listDevices('gpu cuda') + if hip_found: + gpu_hip.listDevices('gpu hip') + elif backend == "cpu serial": + cpu_serial.listDevices(backend) + elif backend == "cpu tbb": + if tbb_found: + cpu_tbb.listDevices(backend) + else: + print("TBB module not found. Please re-compile the library and try again.") + elif backend == "gpu cuda": + if cuda_found: + gpu_cuda.listDevices(backend) + else: + print("CUDA module not found. Please re-compile the library and try again.") + elif backend == "gpu hip": + if hip_found: + gpu_hip.listDevices(backend) + else: + print("HIP module not found. Please re-compile the library and try again.") + else: + raise ValueError("Invalid backend. The allowed choices for the" + + " backend are: all, cpu serial, cpu tbb and gpu cuda.") + + def run_clue(self, + backend: str = "cpu serial", + block_size: int = 1024, + device_id: int = 0, + verbose: bool = False) -> None: + """ + Executes the CLUE clustering algorithm. + + Parameters + ---------- + verbose : bool, optional + The verbose option prints the execution time of runCLUE and the number + of clusters found. + + Modified attributes + ------------------- + cluster_ids : ndarray + Contains the cluster_id corresponding to every point. + is_seed : ndarray + For every point the value is 1 if the point is a seed or an + outlier and 0 if it isn't. + n_clusters : int + Number of clusters reconstructed. + cluster_points : ndarray of lists + Contains, for every cluster, the list of points associated to id. + points_per_cluster : ndarray + Contains the number of points associated to every cluster. 
+ + Return + ------ + None + """ + + start = time.time_ns() + if backend == "cpu serial": + cluster_id_is_seed = cpu_serial.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin, + self.clust_data.coords, self.clust_data.weight, + self.kernel, self.clust_data.n_dim, block_size, + device_id) + elif backend == "cpu tbb": + if tbb_found: + cluster_id_is_seed = cpu_tbb.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("TBB module not found. Please re-compile the library and try again.") + + elif backend == "gpu cuda": + if cuda_found: + cluster_id_is_seed = gpu_cuda.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("CUDA module not found. Please re-compile the library and try again.") + + elif backend == "gpu hip": + if hip_found: + cluster_id_is_seed = gpu_hip.mainRun(self.dc_, self.rhoc, self.outlier, + self.ppbin, self.clust_data.coords, + self.clust_data.weight, self.kernel, + self.clust_data.n_dim, block_size, + device_id) + else: + print("HIP module not found. 
Please re-compile the library and try again.") + + finish = time.time_ns() + cluster_ids = np.array(cluster_id_is_seed[0]) + is_seed = np.array(cluster_id_is_seed[1]) + n_clusters = len(np.unique(cluster_ids)) + + cluster_points = [[] for _ in range(n_clusters)] + for i in range(self.clust_data.n_points): + cluster_points[cluster_ids[i]].append(i) + + points_per_cluster = np.array([len(clust) for clust in cluster_points]) + + data = {'cluster_ids': cluster_ids, 'is_seed': is_seed} + output_df = pd.DataFrame(data) + + self.clust_prop = cluster_properties(n_clusters, + cluster_ids, + is_seed, + np.asarray(cluster_points, dtype=object), + points_per_cluster, + output_df) + + self.elapsed_time = (finish - start)/(10**6) + if verbose: + print(f'CLUE executed in {self.elapsed_time} ms') + print(f'Number of clusters found: {self.clust_prop.n_clusters}') + + def input_plotter(self, plot_title: str='', title_size: float = 16, + x_label: str = 'x', y_label: str = 'y', z_label: str = 'z', + label_size: float = 16, pt_size: float = 1, pt_colour: str = 'b', + grid: bool = True, grid_style: str = '--', grid_size: float = 0.2, + x_ticks=None, y_ticks=None, z_ticks=None, + **kwargs) -> None: + """ + Plots the points in input. + + Parameters + ---------- + plot_title : string, optional + Title of the plot. + title_size : float, optional + Size of the plot title. + x_label : string, optional + Label on x-axis. + y_label : string, optional + Label on y-axis. + z_label : string, optional + Label on z-axis. + label_size : int, optional + Fontsize of the axis labels. + pt_size : int, optional + Size of the points in the plot. + pt_colour : string, optional + Colour of the points in the plot. + grid : bool, optional + If true displays grids in the plot. + grid_style : string, optional + Style of the grid. + grid_size : float, optional + Linewidth of the plot grid. + x_ticks : list, optional + List of ticks for the x axis. + y_ticks : list, optional + List of ticks for the y axis. 
z_ticks : list, optional + List of ticks for the z axis. + kwargs : function objects, optional + Functions for converting the used coordinates to cartesian coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). + + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + # Convert the used coordinates to cartesian coordinates + cartesian_coords = np.copy(self.clust_data.original_coords) + for coord, func in kwargs.items(): + cartesian_coords[int(coord[1])] = func(self.clust_data.original_coords) + + if self.clust_data.n_dim == 2: + plt.scatter(cartesian_coords[0], + cartesian_coords[1], + s=pt_size, + color=pt_colour) + + # Customization of the plot title + plt.title(plot_title, fontsize=title_size) + + # Customization of axis labels + plt.xlabel(x_label, fontsize=label_size) + plt.ylabel(y_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + plt.xticks(x_ticks) + if y_ticks is not None: + plt.yticks(y_ticks) + + plt.show() + if self.clust_data.n_dim >= 3: + fig = plt.figure() + ax_ = fig.add_subplot(projection='3d') + ax_.scatter(cartesian_coords[0], + cartesian_coords[1], + cartesian_coords[2], + s=pt_size, + color=pt_colour) + + # Customization of the plot title + ax_.set_title(plot_title, fontsize=title_size) + + # Customization of axis labels + ax_.set_xlabel(x_label, fontsize=label_size) + ax_.set_ylabel(y_label, fontsize=label_size) + ax_.set_zlabel(z_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + ax_.set_xticks(x_ticks) + if y_ticks is not None: + ax_.set_yticks(y_ticks) + if z_ticks is not None: + ax_.set_zticks(z_ticks) + + plt.show() + + def cluster_plotter(self, plot_title: str = '', title_size: float = 16, + x_label: str 
= 'x', y_label: str = 'y', z_label: str = 'z', + label_size: float = 16, outl_size: float = 10, pt_size: float = 10, + seed_size: float = 25, grid: bool = True, grid_style: str = '--', + grid_size: float = 0.2, x_ticks=None, y_ticks=None, z_ticks=None, + **kwargs) -> None: + """ + Plots the clusters with a different colour for every cluster. + + The points assigned to a cluster are printed as points, the seeds + as stars and the outliers as little grey crosses. + + Parameters + ---------- + plot_title : string, optional + Title of the plot + title_size : float, optional + Size of the plot title + x_label : string, optional + Label on x-axis + y_label : string, optional + Label on y-axis + z_label : string, optional + Label on z-axis + label_size : int, optional + Fontsize of the axis labels + outl_size : int, optional + Size of the outliers in the plot + pt_size : int, optional + Size of the points in the plot + seed_size : int, optional + Size of the seeds in the plot + grid : bool, optional + If true displays grids in the plot + grid_style : string, optional + Style of the grid + grid_size : float, optional + Linewidth of the plot grid + x_ticks : list, optional + List of ticks for the x axis. + y_ticks : list, optional + List of ticks for the y axis. + z_ticks : list, optional + List of ticks for the z axis. + kwargs : function objects, optional + Functions for converting the used coordinates to cartesian coordinates. + The keywords of the arguments are the coordinates names (x0, x1, ...). 
+ + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + # Convert the used coordinates to cartesian coordinates + cartesian_coords = np.copy(self.clust_data.original_coords.T) + for coord, func in kwargs.items(): + cartesian_coords[int(coord[1])] = func(self.clust_data.original_coords) + + if self.clust_data.n_dim == 2: + data = {'x0': cartesian_coords[0], + 'x1': cartesian_coords[1], + 'cluster_ids': self.clust_prop.cluster_ids, + 'isSeed': self.clust_prop.is_seed} + df_ = pd.DataFrame(data) + + max_clusterid = max(df_["cluster_ids"]) + + df_out = df_[df_.cluster_ids == -1] # Outliers + plt.scatter(df_out.x0, df_out.x1, s=outl_size, marker='x', color='0.4') + for i in range(0, max_clusterid+1): + dfi = df_[df_.cluster_ids == i] # ith cluster + plt.scatter(dfi.x0, dfi.x1, s=pt_size, marker='.') + df_seed = df_[df_.isSeed == 1] # Only Seeds + plt.scatter(df_seed.x0, df_seed.x1, s=seed_size, color='r', marker='*') + + # Customization of the plot title + plt.title(plot_title, fontsize=title_size) + + # Customization of axis labels + plt.xlabel(x_label, fontsize=label_size) + plt.ylabel(y_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + plt.xticks(x_ticks) + if y_ticks is not None: + plt.yticks(y_ticks) + + plt.show() + if self.clust_data.n_dim == 3: + data = {'x0': cartesian_coords[0], + 'x1': cartesian_coords[1], + 'x2': cartesian_coords[2], + 'cluster_ids': self.clust_prop.cluster_ids, + 'isSeed': self.clust_prop.is_seed} + df_ = pd.DataFrame(data) + + max_clusterid = max(df_["cluster_ids"]) + fig = plt.figure() + ax_ = fig.add_subplot(projection='3d') + + df_out = df_[df_.cluster_ids == -1] + ax_.scatter(df_out.x0, df_out.x1, df_out.x2, s=outl_size, color = 'grey', marker = 'x') + for i in range(0, max_clusterid+1): + dfi = df_[df_.cluster_ids == i] + ax_.scatter(dfi.x0, dfi.x1, dfi.x2, 
s=pt_size, marker = '.') + + df_seed = df_[df_.isSeed == 1] # Only Seeds + ax_.scatter(df_seed.x0, df_seed.x1, df_seed.x2, s=seed_size, color = 'r', marker = '*') + + # Customization of the plot title + ax_.set_title(plot_title, fontsize=title_size) + + # Customization of axis labels + ax_.set_xlabel(x_label, fontsize=label_size) + ax_.set_ylabel(y_label, fontsize=label_size) + ax_.set_zlabel(z_label, fontsize=label_size) + + # Customization of the grid + if grid: + plt.grid(linestyle=grid_style, linewidth=grid_size) + + # Customization of axis ticks + if x_ticks is not None: + ax_.set_xticks(x_ticks) + if y_ticks is not None: + ax_.set_yticks(y_ticks) + if z_ticks is not None: + ax_.set_zticks(z_ticks) + + plt.show() + + def to_csv(self, output_folder: str, file_name: str) -> None: + """ + Creates a file containing the coordinates of all the points, their cluster_ids and is_seed. + + Parameters + ---------- + output_folder : string + Full path to the desired output folder. + file_name : string + Name of the file, with the '.csv' suffix. + + Modified attributes + ------------------- + None + + Returns + ------- + None + """ + + out_path = output_folder + file_name + data = {} + for i in range(self.clust_data.n_dim): + data['x' + str(i)] = self.clust_data.coords.T[i] + data['cluster_ids'] = self.clust_prop.cluster_ids + data['is_seed'] = self.clust_prop.is_seed + + df_ = pd.DataFrame(data) + df_.to_csv(out_path,index=False) + +if __name__ == "__main__": + c = clusterer(0.4,5,1.) 
+ c.read_data('./sissa.csv') + c.run_clue(backend="cpu serial", verbose=True) + # c.run_clue(backend="cpu tbb", verbose=True) + # c.run_clue(backend="gpu cuda", verbose=True) + # c.run_clue(backend="gpu hip", verbose=True) + c.cluster_plotter() + # c.to_csv('./','sissa_output_tbb.csv') + c.list_devices('cpu serial') + c.list_devices('cpu tbb') + c.list_devices('gpu cuda') + c.list_devices('gpu hip') + c.list_devices() diff --git a/CLUEstering/alpaka/BindingModules/Makefile b/CLUEstering/alpaka/BindingModules/Makefile new file mode 100644 index 00000000..ef93d8e7 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/Makefile @@ -0,0 +1,103 @@ + +# Compilers +export CXX := g++ +export CUDA := nvcc +export HIP := hipcc + +# define supported cuda architectures +export CUDA_ARCH := 50 60 61 62 70 + +# Compiler flags +CXX_FLAGS = -std=c++17 -g -O2 +# Cuda flags +export CUDA_FLAGS = -x cu --expt-relaxed-constexpr -gencode arch=compute_61,code=[sm_61,compute_61] -G +export CUDA_CXXFLAGS := -I$(CUDA_BASE)/include +# TBB flags +TBB_FLAGS = -ltbb +# Amd flags +export HIP_BASE = /opt/rocm/ +export HIP_FLAGS := -I$(HIP_BASE)/include \ + -I$(HIP_BASE)/hiprand/include \ + -I$(HIP_BASE)/rocrand/include + +# Dependencies flags +ALPAKA_PATH = ../../../extern/alpaka/include +BOOST_PATH = /usr/include/boost + +# Alpaka backend compilation flags +ALPAKA_SERIAL_FLAGS = -DALPAKA_HOST_ONLY \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \ + -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND +ALPAKA_TBB_FLAGS = -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT \ + -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED \ + -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND +ALPAKA_CUDA_FLAGS = -DALPAKA_ACC_GPU_CUDA_PRESENT \ + -DALPAKA_ACC_GPU_CUDA_ENABLED \ + -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +ALPAKA_HIP_FLAGS = -DALPAKA_ACC_GPU_HIP_PRESENT \ + -DALPAKA_ACC_GPU_HIP_ENABLED \ + -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + +# Binding flags +PYTHON_VERS := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' 
| sed 's/\.//g' ) +PYTHON_VERS_DOT := $(shell python3 -V | awk -F '' '{print $$8$$9$$10$$11}' ) +PYTHON_PATH := /usr/include/python$(PYTHON_VERS_DOT) +PYBIND_PATH := ../../../extern/pybind11/include +BINDING_FLAGS := -Wall -shared -fPIC -I$(PYTHON_PATH) -I$(PYBIND_PATH) +CUDA_BINDING_FLAGS := -shared --compiler-options '-fPIC' -I$(PYTHON_PATH) -I$(PYBIND_PATH) + +# Backend module names +KERNELS_MODULE_NAME := CLUE_Convolutional_Kernels.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +SERIAL_MODULE_NAME := CLUE_CPU_Serial.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +TBB_MODULE_NAME := CLUE_CPU_TBB.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +CUDA_MODULE_NAME := CLUE_GPU_CUDA.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so +HIP_MODULE_NAME := CLUE_GPU_HIP.cpython-$(PYTHON_VERS)-x86_64-linux-gnu.so + +all: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_cpu.cc -o $(SERIAL_MODULE_NAME) + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME) + $(CXX) $(CXX_FLAGS) $(TBB_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_TBB_FLAGS) \ + $(BINDING_FLAGS) binding_cpu_tbb.cc -o $(TBB_MODULE_NAME) +ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0) + $(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) \ + $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME) +else + echo "No CUDA compiler found, skipping CUDA backend" +endif +ifeq ($(call,$(shell which hipcc))$(.SHELLSTATUS),0) + $(HIP) $(CXX_FLAGS) $(HIP_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) \ + $(BINDING_FLAGS) binding_gpu_hip.cc -o $(HIP_MODULE_NAME) +else + echo "No HIP compiler found, skipping HIP backend" +endif + +serial: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_cpu.cc -o $(SERIAL_MODULE_NAME) + +tbb: + $(CXX) $(CXX_FLAGS) $(TBB_FLAGS) -I$(BOOST_PATH) 
-I$(ALPAKA_PATH) $(ALPAKA_TBB_FLAGS) \ + $(BINDING_FLAGS) binding_cpu_tbb.cc -o $(TBB_MODULE_NAME) + +cuda: +ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0) + $(CUDA) $(CUDA_FLAGS) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_CUDA_FLAGS) \ + $(CUDA_BINDING_FLAGS) binding_gpu_cuda.cc -o $(CUDA_MODULE_NAME) +else + echo "No CUDA compiler found, skipping CUDA backend" +endif + +hip: +ifeq ($(call,$(shell which hipcc))$(.SHELLSTATUS),0) + $(HIP) $(CXX_FLAGS) $(HIP_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_HIP_FLAGS) \ + $(BINDING_FLAGS) binding_gpu_hip.cc -o $(HIP_MODULE_NAME) +else + echo "No HIP compiler found, skipping HIP backend" +endif + +kernel: + $(CXX) $(CXX_FLAGS) -I$(BOOST_PATH) -I$(ALPAKA_PATH) $(ALPAKA_SERIAL_FLAGS) \ + $(BINDING_FLAGS) binding_kernels.cc -o $(KERNELS_MODULE_NAME) diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu.cc b/CLUEstering/alpaka/BindingModules/binding_cpu.cc new file mode 100644 index 00000000..a870d6f4 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_cpu.cc @@ -0,0 +1,200 @@ + +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" + +#include +#include +#include +#include + +namespace alpaka_serial_sync { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." 
<< std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const 
ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) 
: + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_CPU_Serial, m) { + m.doc() = "Binding of the CLUE algorithm running serially on CPU"; + + m.def("listDevices", &listDevices, "List the available devices for the CPU serial backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_serial_sync diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc new file mode 100644 index 00000000..42214d98 --- /dev/null +++ 
b/CLUEstering/alpaka/BindingModules/binding_cpu_tbb.cc @@ -0,0 +1,200 @@ + +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" + +#include +#include +#include +#include + +namespace alpaka_tbb_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." << std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "Device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, 
coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + 
} + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_CPU_TBB, m) { + m.doc() = "Binding of the CLUE algorithm running on CPU with TBB"; + + m.def("listDevices", &listDevices, "List the available devices for the TBB backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + 
pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_tbb_async diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc new file mode 100644 index 00000000..da464650 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_gpu_cuda.cc @@ -0,0 +1,204 @@ +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../AlpakaCore/initialise.h" + +#include +#include +#include +#include + +using cms::alpakatools::initialise; + +namespace alpaka_cuda_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." 
<< std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + /* initialise(); */ + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& 
weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + 
[[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_GPU_CUDA, m) { + m.doc() = "Binding of the CLUE algorithm running on CUDA GPUs"; + + m.def("listDevices", &listDevices, "List the available devices for the CUDA backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_cuda_async diff --git a/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc new file mode 100644 index 00000000..479c3c20 --- /dev/null +++ 
b/CLUEstering/alpaka/BindingModules/binding_gpu_hip.cc @@ -0,0 +1,200 @@ +#include +#include + +#include "../CLUE/CLUEAlgoAlpaka.h" +#include "../CLUE/Run.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../AlpakaCore/initialise.h" + +#include +#include +#include +#include + +namespace alpaka_rocm_async { + void listDevices(const std::string& backend) { + const char tab = '\t'; + const std::vector devices = alpaka::getDevs(); + if (devices.empty()) { + std::cout << "No devices found for the " << backend << " backend." << std::endl; + return; + } else { + std::cout << backend << " devices found: \n"; + for (size_t i{}; i < devices.size(); ++i) { + std::cout << tab << "device " << i << ": " << alpaka::getName(devices[i]) << '\n'; + } + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const FlatKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + 
return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const ExponentialKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up 
to 10 dimensions\n"; + return {}; + } + } + + std::vector> mainRun(float dc, + float rhoc, + float outlier, + int pPBin, + const std::vector>& coords, + const std::vector& weights, + const GaussianKernel& kernel, + int Ndim, + size_t block_size, + size_t device_id) { + auto const dev_acc = alpaka::getDevByIdx(device_id); + + // Create the queue + Queue queue_(dev_acc); + + // Running the clustering algorithm // + switch (Ndim) { + [[unlikely]] case (1) : + return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2) : + return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3) : + return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4) : + return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5) : + return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6) : + return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7) : + return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8) : + return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9) : + return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10) : + return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] default: + std::cout << "This library only works up to 10 dimensions\n"; + return {}; + } + } + + PYBIND11_MODULE(CLUE_GPU_HIP, m) { + m.doc() = "Binding of the CLUE algorithm running on AMD GPUs"; + + m.def("listDevices", &listDevices, "List the available devices for the HIP/ROCm backend"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const FlatKernel&, + int, + size_t, + size_t>(&mainRun), + 
"mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const ExponentialKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + m.def("mainRun", + pybind11::overload_cast>&, + const std::vector&, + const GaussianKernel&, + int, + size_t, + size_t>(&mainRun), + "mainRun"); + } +}; // namespace alpaka_rocm_async diff --git a/CLUEstering/alpaka/BindingModules/binding_kernels.cc b/CLUEstering/alpaka/BindingModules/binding_kernels.cc new file mode 100644 index 00000000..c61170d9 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/binding_kernels.cc @@ -0,0 +1,17 @@ + +#include + +#include "../CLUE/ConvolutionalKernel.h" + +#include +#include +#include +#include + +PYBIND11_MODULE(CLUE_Convolutional_Kernels, m) { + m.doc() = "Binding of the convolutional kernels used in the CLUE algorithm."; + + pybind11::class_(m, "FlatKernel").def(pybind11::init()); + pybind11::class_(m, "ExponentialKernel").def(pybind11::init()); + pybind11::class_(m, "GaussianKernel").def(pybind11::init()); +} diff --git a/CLUEstering/alpaka/BindingModules/sissa.csv b/CLUEstering/alpaka/BindingModules/sissa.csv new file mode 100644 index 00000000..6cb12d84 --- /dev/null +++ b/CLUEstering/alpaka/BindingModules/sissa.csv @@ -0,0 +1,1000 @@ +x0,x1,weight +34.84,-40.85,1.0 +37.44,-39.57,1.0 +30.43,-40.06,1.0 +34.57,-37.18,1.0 +41.46,-52.92,1.0 +44.23,-52.01,1.0 +32.13,-42.11,1.0 +37.01,-42.12,1.0 +36.0,-35.2,1.0 +41.23,-54.89,1.0 +33.8,-33.8,1.0 +36.81,-51.74,1.0 +43.59,-47.38,1.0 +27.63,-41.56,1.0 +40.48,-41.47,1.0 +36.97,-44.03,1.0 +26.27,-42.45,1.0 +27.72,-36.5,1.0 +48.27,-50.86,1.0 +40.79,-46.45,1.0 +38.22,-46.54,1.0 +42.17,-41.67,1.0 +31.54,-33.86,1.0 +29.38,-35.03,1.0 +26.13,-37.42,1.0 +35.69,-52.4,1.0 +26.04,-37.42,1.0 +31.93,-46.03,1.0 +27.2,-43.51,1.0 +49.11,-52.63,1.0 +47.99,-45.54,1.0 +26.73,-37.94,1.0 +47.62,-56.66,1.0 +30.11,-32.93,1.0 +44.45,-44.34,1.0 +25.1,-37.11,1.0 +45.26,-45.39,1.0 +31.11,-32.51,1.0 +31.65,-48.42,1.0 +24.33,-37.42,1.0 
+50.76,-53.68,1.0 +51.41,-46.09,1.0 +43.3,-43.55,1.0 +50.1,-45.02,1.0 +50.65,-50.86,1.0 +23.94,-43.42,1.0 +50.39,-51.61,1.0 +51.32,-53.0,1.0 +26.2,-28.39,1.0 +36.67,-31.21,1.0 +45.14,-39.85,1.0 +52.09,-43.1,1.0 +53.14,-45.03,1.0 +46.36,-57.8,1.0 +45.27,-36.84,1.0 +21.08,-36.36,1.0 +42.44,-34.17,1.0 +52.28,-51.59,1.0 +20.25,-33.77,1.0 +30.15,-29.41,1.0 +55.51,-55.56,1.0 +21.65,-39.47,1.0 +44.72,-34.92,1.0 +31.52,-50.53,1.0 +54.24,-52.46,1.0 +36.86,-56.06,1.0 +21.58,-28.72,1.0 +35.17,-56.21,1.0 +54.12,-52.25,1.0 +47.77,-58.61,1.0 +21.26,-28.09,1.0 +20.97,-28.23,1.0 +44.6,-34.03,1.0 +42.94,-33.49,1.0 +31.97,-52.64,1.0 +55.14,-47.78,1.0 +20.79,-40.55,1.0 +43.67,-33.38,1.0 +47.31,-38.22,1.0 +48.8,-60.25,1.0 +21.4,-25.72,1.0 +18.47,-36.56,1.0 +54.75,-60.86,1.0 +16.48,-27.36,1.0 +28.12,-24.78,1.0 +20.99,-23.29,1.0 +48.13,-38.6,1.0 +19.45,-39.61,1.0 +41.97,-30.3,1.0 +46.73,-33.85,1.0 +11.78,-23.7,1.0 +48.19,-37.13,1.0 +14.77,-34.82,1.0 +38.9,-58.31,1.0 +57.95,-46.06,1.0 +13.25,-33.33,1.0 +56.73,-45.05,1.0 +17.05,-23.3,1.0 +14.47,-29.77,1.0 +39.48,-59.1,1.0 +20.16,-21.32,1.0 +11.79,-28.31,1.0 +28.19,-52.92,1.0 +57.57,-45.08,1.0 +20.64,-44.94,1.0 +30.23,-54.55,1.0 +12.81,-31.53,1.0 +24.98,-23.58,1.0 +48.33,-64.15,1.0 +49.78,-37.26,1.0 +38.03,-27.21,1.0 +62.2,-51.31,1.0 +22.13,-21.74,1.0 +61.08,-52.17,1.0 +37.97,-26.07,1.0 +57.98,-62.1,1.0 +9.92,-20.63,1.0 +49.41,-35.17,1.0 +39.01,-27.29,1.0 +18.34,-21.45,1.0 +46.58,-65.54,1.0 +39.29,-60.71,1.0 +9.54,-25.86,1.0 +21.99,-20.43,1.0 +13.32,-17.53,1.0 +60.35,-47.52,1.0 +12.49,-21.36,1.0 +53.67,-65.87,1.0 +52.83,-66.94,1.0 +59.09,-64.18,1.0 +19.14,-17.72,1.0 +64.87,-51.24,1.0 +58.37,-64.98,1.0 +15.04,-16.66,1.0 +16.5,-16.66,1.0 +57.36,-66.87,1.0 +61.77,-48.53,1.0 +6.6,-24.89,1.0 +64.76,-58.7,1.0 +62.47,-65.41,1.0 +60.02,-67.3,1.0 +2.95,-14.56,1.0 +64.41,-63.72,1.0 +36.38,-60.41,1.0 +56.15,-40.21,1.0 +4.56,-31.44,1.0 +67.48,-60.2,1.0 +4.73,-15.74,1.0 +52.08,-33.73,1.0 +10.5,-17.06,1.0 +65.64,-62.47,1.0 +5.56,-32.34,1.0 
+28.3,-20.01,1.0 +10.19,-14.41,1.0 +9.17,-35.68,1.0 +68.77,-58.81,1.0 +8.72,-16.82,1.0 +20.95,-49.88,1.0 +16.73,-44.92,1.0 +7.85,-13.65,1.0 +36.34,-20.91,1.0 +7.87,-11.35,1.0 +15.97,-43.6,1.0 +45.8,-26.44,1.0 +-3.38,-18.31,1.0 +7.95,-38.36,1.0 +57.95,-40.17,1.0 +61.29,-69.89,1.0 +35.81,-19.5,1.0 +62.99,-67.77,1.0 +11.71,-14.79,1.0 +67.88,-61.87,1.0 +30.01,-18.69,1.0 +46.9,-68.4,1.0 +6.78,-35.97,1.0 +69.04,-53.99,1.0 +-1.72,-10.21,1.0 +9.87,-12.14,1.0 +68.46,-61.86,1.0 +17.48,-49.8,1.0 +62.2,-43.37,1.0 +53.89,-71.6,1.0 +-0.86,-21.58,1.0 +65.59,-67.51,1.0 +-5.73,-19.79,1.0 +5.58,-36.85,1.0 +59.7,-38.44,1.0 +73.62,-55.84,1.0 +22.28,-15.17,1.0 +13.16,-10.43,1.0 +-3.44,-20.71,1.0 +14.92,-46.27,1.0 +0.76,-32.96,1.0 +16.73,-12.3,1.0 +-6.21,-18.2,1.0 +-6.53,-19.2,1.0 +66.85,-67.79,1.0 +70.36,-66.84,1.0 +-6.88,-12.95,1.0 +75.99,-56.77,1.0 +60.69,-73.7,1.0 +57.32,-34.69,1.0 +-4.16,-22.67,1.0 +61.26,-73.77,1.0 +-9.82,-5.65,1.0 +-7.88,-17.73,1.0 +-8.38,-8.66,1.0 +27.79,-61.16,1.0 +-1.49,-5.81,1.0 +29.78,-15.19,1.0 +-5.07,-8.04,1.0 +22.06,-57.63,1.0 +10.42,-44.85,1.0 +21.43,-57.34,1.0 +-3.72,-5.43,1.0 +66.7,-73.64,1.0 +14.11,-7.18,1.0 +-9.03,-14.4,1.0 +44.99,-73.67,1.0 +-2.64,-2.57,1.0 +14.08,-5.97,1.0 +58.45,-32.55,1.0 +-5.53,-27.3,1.0 +18.34,-8.32,1.0 +-12.49,-1.24,1.0 +41.06,-70.4,1.0 +-7.29,-22.33,1.0 +40.15,-71.65,1.0 +63.33,-75.26,1.0 +80.96,-67.43,1.0 +0.01,0.48,1.0 +63.64,-75.5,1.0 +74.17,-69.97,1.0 +61.41,-76.19,1.0 +2.02,0.53,1.0 +61.83,-75.26,1.0 +75.77,-66.01,1.0 +81.37,-71.45,1.0 +74.91,-70.36,1.0 +81.42,-72.02,1.0 +-3.77,1.86,1.0 +82.23,-67.03,1.0 +77.0,-52.91,1.0 +34.48,-68.97,1.0 +-14.43,-12.09,1.0 +-0.32,-37.78,1.0 +-16.35,-2.27,1.0 +-14.32,-18.22,1.0 +87.19,-72.99,1.0 +71.71,-77.69,1.0 +42.55,-74.04,1.0 +88.19,-75.01,1.0 +-14.77,-14.38,1.0 +87.76,-71.9,1.0 +68.91,-78.82,1.0 +68.88,-78.7,1.0 +60.78,-78.16,1.0 +3.97,1.02,1.0 +36.08,-14.52,1.0 +85.42,-65.92,1.0 +-17.0,-12.63,1.0 +39.33,-73.62,1.0 +58.82,-78.89,1.0 +61.29,-30.77,1.0 +81.09,-79.38,1.0 
+-19.26,-1.66,1.0 +-12.6,2.9,1.0 +74.21,-77.21,1.0 +68.92,-39.55,1.0 +14.89,-55.01,1.0 +-16.41,0.25,1.0 +-15.61,1.01,1.0 +-16.24,-16.22,1.0 +44.93,-19.27,1.0 +-10.97,4.07,1.0 +44.82,-76.01,1.0 +-19.16,-0.99,1.0 +15.8,-1.46,1.0 +-16.92,-16.31,1.0 +27.59,-65.87,1.0 +-19.47,-12.93,1.0 +-21.61,1.94,1.0 +-18.76,3.42,1.0 +86.92,-63.79,1.0 +17.28,-1.22,1.0 +-12.34,6.01,1.0 +4.14,3.31,1.0 +-25.46,-10.12,1.0 +-5.31,5.52,1.0 +-1.88,5.51,1.0 +-25.49,-11.25,1.0 +32.62,-72.15,1.0 +-127.66,73.23,1.0 +-126.05,78.02,1.0 +-25.72,4.63,1.0 +-16.28,-22.98,1.0 +14.21,1.33,1.0 +-124.14,76.28,1.0 +1.44,7.08,1.0 +-25.93,7.11,1.0 +82.46,-88.2,1.0 +-128.02,71.97,1.0 +-12.72,9.33,1.0 +-131.42,74.44,1.0 +-122.3,75.95,1.0 +-133.01,68.27,1.0 +-128.47,66.6,1.0 +-127.53,66.01,1.0 +-4.84,-38.56,1.0 +-125.62,69.73,1.0 +-124.87,70.77,1.0 +65.43,-87.56,1.0 +93.13,-76.9,1.0 +-139.86,69.24,1.0 +117.78,62.85,1.0 +119.76,47.72,1.0 +125.81,44.54,1.0 +-132.58,78.32,1.0 +76.55,-89.56,1.0 +122.78,47.09,1.0 +126.04,44.26,1.0 +135.26,55.03,1.0 +130.04,55.62,1.0 +113.68,59.59,1.0 +89.92,-87.5,1.0 +121.09,46.18,1.0 +94.72,-80.31,1.0 +113.02,64.32,1.0 +8.46,6.96,1.0 +-125.23,84.88,1.0 +112.23,42.9,1.0 +-114.84,73.8,1.0 +127.31,57.42,1.0 +117.26,56.92,1.0 +-116.52,81.74,1.0 +122.45,65.91,1.0 +40.61,-81.13,1.0 +117.3,46.31,1.0 +-53.71,-159.34,1.0 +133.9,57.51,1.0 +-123.36,86.67,1.0 +-33.56,-2.93,1.0 +118.07,55.4,1.0 +125.69,53.63,1.0 +137.47,49.89,1.0 +-121.56,64.3,1.0 +-20.26,13.45,1.0 +-78.06,-138.69,1.0 +-35.32,-3.29,1.0 +108.26,59.07,1.0 +-53.15,-168.72,1.0 +-20.03,15.51,1.0 +-124.34,86.19,1.0 +132.4,46.03,1.0 +109.23,60.09,1.0 +132.87,45.7,1.0 +117.58,37.79,1.0 +134.32,60.99,1.0 +115.32,51.86,1.0 +-124.18,62.52,1.0 +-140.55,66.12,1.0 +-141.77,61.26,1.0 +61.35,-88.32,1.0 +96.13,-67.72,1.0 +98.5,-72.28,1.0 +-111.4,73.23,1.0 +110.27,54.14,1.0 +-32.82,-13.12,1.0 +-56.35,-167.13,1.0 +109.45,44.84,1.0 +-108.41,78.4,1.0 +112.19,53.63,1.0 +-143.96,73.08,1.0 +-82.4,-123.74,1.0 +-49.33,-166.96,1.0 +-52.88,-167.48,1.0 
+101.09,48.39,1.0 +-111.0,74.6,1.0 +-134.64,55.03,1.0 +-122.34,93.54,1.0 +-59.26,-151.25,1.0 +-35.81,1.63,1.0 +-112.83,84.22,1.0 +-35.72,2.22,1.0 +99.39,-84.06,1.0 +-148.48,64.2,1.0 +134.05,68.49,1.0 +-74.87,-145.16,1.0 +-125.94,55.76,1.0 +-61.37,-162.44,1.0 +-83.07,-133.74,1.0 +-86.56,-132.69,1.0 +-80.1,-138.03,1.0 +-97.64,-117.16,1.0 +122.26,69.93,1.0 +-146.58,69.42,1.0 +-37.47,9.44,1.0 +-73.82,-128.81,1.0 +-82.67,-130.46,1.0 +-17.62,18.33,1.0 +-131.94,90.68,1.0 +89.41,-53.83,1.0 +-104.25,-120.92,1.0 +-47.6,-171.77,1.0 +-46.83,-175.13,1.0 +-89.53,-132.43,1.0 +-102.44,-117.91,1.0 +-114.85,90.87,1.0 +-103.03,-116.12,1.0 +-95.81,-119.96,1.0 +-67.02,-146.62,1.0 +139.31,61.48,1.0 +-139.51,82.21,1.0 +-118.61,89.61,1.0 +97.91,46.78,1.0 +-121.71,94.09,1.0 +-54.18,-150.66,1.0 +116.77,34.01,1.0 +-71.11,-147.46,1.0 +-19.07,-25.16,1.0 +125.39,38.23,1.0 +-69.92,-146.52,1.0 +-109.32,-120.5,1.0 +-62.44,-167.18,1.0 +-95.25,-114.03,1.0 +-43.68,-176.18,1.0 +84.58,-92.17,1.0 +-34.63,7.38,1.0 +-145.84,53.83,1.0 +-88.06,-117.41,1.0 +101.35,41.64,1.0 +91.87,-54.13,1.0 +-52.29,-147.29,1.0 +-144.07,57.02,1.0 +-154.07,34.26,1.0 +144.41,54.22,1.0 +-110.87,-109.86,1.0 +-147.27,51.34,1.0 +-110.36,-113.16,1.0 +-33.36,11.08,1.0 +-144.96,51.96,1.0 +94.26,-57.94,1.0 +104.12,-79.63,1.0 +-71.19,-139.16,1.0 +-68.4,-142.21,1.0 +-138.21,49.23,1.0 +-109.97,88.44,1.0 +84.99,-94.15,1.0 +104.45,-89.7,1.0 +-137.75,45.6,1.0 +1.22,189.35,1.0 +-60.7,-138.24,1.0 +-59.6,-141.99,1.0 +-95.92,-133.09,1.0 +-152.5,26.33,1.0 +-39.7,8.55,1.0 +-72.26,-153.39,1.0 +-99.42,-136.81,1.0 +98.52,59.01,1.0 +104.75,55.72,1.0 +-119.7,56.49,1.0 +-104.05,-130.54,1.0 +-31.92,18.86,1.0 +0.6,14.59,1.0 +-1.29,183.33,1.0 +-103.51,79.19,1.0 +-80.86,-141.27,1.0 +-94.34,-132.68,1.0 +-142.42,83.41,1.0 +-120.41,94.57,1.0 +-75.23,-149.11,1.0 +-147.66,30.94,1.0 +-120.21,94.97,1.0 +-45.91,-156.82,1.0 +-113.18,64.32,1.0 +-37.76,-168.93,1.0 +101.43,-91.6,1.0 +-53.25,-179.13,1.0 +-154.69,23.96,1.0 +-77.25,-156.72,1.0 +-91.23,-112.83,1.0 
+-0.67,182.68,1.0 +-100.58,-134.24,1.0 +-104.57,-98.96,1.0 +107.18,-73.45,1.0 +104.41,-84.71,1.0 +-110.45,-122.26,1.0 +-150.62,54.48,1.0 +-150.33,56.19,1.0 +103.03,-87.23,1.0 +-86.3,-118.51,1.0 +-150.21,41.42,1.0 +2.45,193.22,1.0 +-143.03,80.64,1.0 +2.29,193.0,1.0 +6.71,184.08,1.0 +-7.11,194.24,1.0 +-62.9,-136.68,1.0 +-5.93,184.97,1.0 +105.43,-74.64,1.0 +-100.06,-138.12,1.0 +114.8,30.82,1.0 +138.28,69.49,1.0 +105.64,-74.73,1.0 +-109.93,-125.02,1.0 +7.94,187.81,1.0 +-9.15,187.02,1.0 +137.07,35.82,1.0 +-58.04,-176.93,1.0 +-41.88,-185.41,1.0 +-32.3,-175.38,1.0 +-146.26,33.83,1.0 +-118.28,-108.51,1.0 +-149.11,43.89,1.0 +100.31,-94.86,1.0 +-150.58,39.95,1.0 +109.77,73.36,1.0 +-114.4,94.2,1.0 +-44.24,-156.41,1.0 +-148.74,9.84,1.0 +47.3,-88.45,1.0 +-158.16,51.82,1.0 +12.71,188.05,1.0 +-2.42,-49.64,1.0 +-31.77,-175.57,1.0 +-32.83,-181.81,1.0 +-142.52,90.8,1.0 +-119.29,-105.08,1.0 +-137.68,36.15,1.0 +32.41,-1.36,1.0 +-148.11,39.73,1.0 +-58.49,-175.98,1.0 +-52.11,-184.33,1.0 +8.44,-60.34,1.0 +-119.55,-107.49,1.0 +12.39,196.7,1.0 +100.01,-98.36,1.0 +-113.87,-125.13,1.0 +-159.96,17.69,1.0 +-115.87,-118.48,1.0 +14.0,191.59,1.0 +-58.55,-130.35,1.0 +-140.28,26.7,1.0 +-49.84,-137.28,1.0 +9.56,199.91,1.0 +78.44,-38.58,1.0 +115.29,-95.22,1.0 +-9.76,190.38,1.0 +-114.34,-88.71,1.0 +-156.79,14.79,1.0 +-102.99,70.32,1.0 +-80.57,-160.86,1.0 +-142.8,7.19,1.0 +-74.66,-170.95,1.0 +-57.55,-185.25,1.0 +90.84,-96.07,1.0 +-68.01,-175.46,1.0 +17.67,183.47,1.0 +-30.33,-181.82,1.0 +110.0,-101.48,1.0 +16.83,192.52,1.0 +-148.43,-2.33,1.0 +-113.86,-91.64,1.0 +17.15,185.01,1.0 +30.54,-78.75,1.0 +-135.92,27.96,1.0 +11.43,190.47,1.0 +-108.92,60.47,1.0 +148.88,53.84,1.0 +144.85,64.85,1.0 +-138.48,17.67,1.0 +-91.68,-103.39,1.0 +53.89,-92.19,1.0 +-48.56,-138.11,1.0 +-168.45,33.35,1.0 +113.29,79.23,1.0 +-45.76,-189.75,1.0 +1.53,174.53,1.0 +-51.2,-189.64,1.0 +-117.01,98.07,1.0 +-80.26,-160.96,1.0 +129.83,29.6,1.0 +-30.71,-168.85,1.0 +-164.59,17.52,1.0 +-137.66,22.2,1.0 +100.85,-55.84,1.0 +-43.21,16.37,1.0 
+-33.45,-161.0,1.0 +-161.88,13.12,1.0 +-106.37,103.34,1.0 +-10.91,196.1,1.0 +-137.24,10.84,1.0 +-11.73,194.86,1.0 +19.13,174.96,1.0 +-152.06,-1.13,1.0 +137.21,32.38,1.0 +-63.54,-180.26,1.0 +105.47,-61.5,1.0 +-129.58,32.77,1.0 +99.0,78.27,1.0 +-105.95,-85.41,1.0 +148.82,68.16,1.0 +102.22,78.9,1.0 +-121.61,-94.99,1.0 +-127.72,-106.97,1.0 +-65.23,-183.45,1.0 +96.4,-105.5,1.0 +-161.02,9.57,1.0 +95.56,-103.22,1.0 +-121.48,-125.59,1.0 +-44.8,13.98,1.0 +-114.97,-128.25,1.0 +-167.2,20.85,1.0 +148.88,67.12,1.0 +-122.4,-93.31,1.0 +-121.76,-95.92,1.0 +94.5,73.08,1.0 +-36.37,-153.89,1.0 +106.34,78.96,1.0 +-96.02,88.34,1.0 +-4.88,171.54,1.0 +-29.56,-177.13,1.0 +-147.33,86.94,1.0 +132.5,31.05,1.0 +-127.52,-96.71,1.0 +-29.31,-166.07,1.0 +-89.84,-155.65,1.0 +-116.29,-83.58,1.0 +110.65,-65.83,1.0 +20.28,195.1,1.0 +100.07,-108.84,1.0 +-124.27,-116.28,1.0 +-97.93,-150.41,1.0 +7.37,18.22,1.0 +-44.89,21.41,1.0 +-169.12,42.79,1.0 +-121.35,-76.85,1.0 +158.27,64.29,1.0 +-128.6,-84.6,1.0 +161.17,72.32,1.0 +-166.06,41.13,1.0 +-112.46,-79.61,1.0 +-127.94,-75.52,1.0 +21.86,196.14,1.0 +155.44,58.71,1.0 +-108.67,-69.61,1.0 +-168.77,11.39,1.0 +-88.07,-159.94,1.0 +155.55,65.14,1.0 +13.09,205.04,1.0 +-166.96,1.81,1.0 +-108.33,-69.23,1.0 +-110.8,-81.12,1.0 +154.84,64.86,1.0 +-53.48,-129.85,1.0 +-172.4,18.05,1.0 +-170.36,17.81,1.0 +-111.54,109.03,1.0 +108.27,82.55,1.0 +118.47,-102.91,1.0 +-90.1,-107.58,1.0 +-78.55,-177.12,1.0 +-103.55,-83.6,1.0 +-175.4,34.5,1.0 +103.18,29.78,1.0 +98.2,83.25,1.0 +-111.47,50.58,1.0 +90.84,73.5,1.0 +-173.23,17.25,1.0 +-124.32,-77.94,1.0 +89.71,37.28,1.0 +158.25,72.76,1.0 +-36.05,26.67,1.0 +-58.63,-124.63,1.0 +-127.29,7.34,1.0 +-54.28,-191.08,1.0 +158.53,72.31,1.0 +115.97,82.77,1.0 +-152.47,82.11,1.0 +-172.15,4.26,1.0 +-50.63,-129.85,1.0 +-144.74,-10.69,1.0 +-113.03,108.14,1.0 +-18.17,191.7,1.0 +-141.08,-4.83,1.0 +19.87,199.35,1.0 +-89.66,-105.97,1.0 +-132.88,-106.13,1.0 +-102.63,-69.5,1.0 +90.59,75.79,1.0 +77.85,-101.28,1.0 +92.66,33.84,1.0 +118.8,-76.61,1.0 
+92.97,32.94,1.0 +142.41,27.19,1.0 +-101.45,-73.78,1.0 +1.37,168.87,1.0 +-136.53,-1.69,1.0 +137.48,24.59,1.0 +-171.98,54.64,1.0 +-34.35,-149.96,1.0 +-13.73,203.23,1.0 +-51.7,12.3,1.0 +-40.06,-194.36,1.0 +-126.64,22.01,1.0 +-53.25,12.01,1.0 +-40.58,-195.37,1.0 +81.91,46.8,1.0 +-182.31,5.6,1.0 +89.63,31.25,1.0 +-131.19,-0.89,1.0 +170.14,81.86,1.0 +-20.31,174.65,1.0 +89.2,90.67,1.0 +-96.22,109.16,1.0 +-135.38,-5.84,1.0 +88.9,81.25,1.0 +-123.97,-7.98,1.0 +-127.73,-71.18,1.0 +76.68,-104.13,1.0 +91.05,81.25,1.0 +-6.77,170.08,1.0 +-167.14,67.67,1.0 +-174.17,46.84,1.0 +-25.48,191.01,1.0 +-35.0,31.27,1.0 +120.01,-112.91,1.0 +-56.96,19.79,1.0 +-31.17,-151.32,1.0 +85.13,79.42,1.0 +-133.9,-12.5,1.0 +-88.45,97.67,1.0 +-94.7,-82.11,1.0 +76.29,47.32,1.0 +105.21,-114.65,1.0 +90.17,27.69,1.0 +-88.97,-174.7,1.0 +76.57,42.37,1.0 +76.75,39.09,1.0 +-120.21,-0.41,1.0 +38.95,191.37,1.0 +-16.61,169.63,1.0 +-83.75,105.92,1.0 +152.59,83.59,1.0 +-44.22,-129.13,1.0 +128.1,-99.92,1.0 +80.93,38.75,1.0 +128.39,-98.37,1.0 +73.97,-107.35,1.0 +124.94,20.74,1.0 +-93.43,118.77,1.0 +-14.66,210.52,1.0 +-103.78,125.49,1.0 +33.79,4.74,1.0 +143.81,16.29,1.0 +122.19,-117.22,1.0 +-141.88,-21.99,1.0 +178.17,89.47,1.0 +108.35,-53.2,1.0 +-69.43,-105.54,1.0 +-134.74,109.53,1.0 +-35.52,34.17,1.0 +-2.49,166.13,1.0 +147.86,14.69,1.0 +124.79,-114.03,1.0 +73.07,85.62,1.0 +-55.53,-117.48,1.0 +-17.27,-178.42,1.0 +142.13,14.04,1.0 +-85.3,109.81,1.0 +-19.5,-189.8,1.0 +178.36,89.71,1.0 +-118.15,28.28,1.0 +181.36,74.08,1.0 +-88.78,64.83,1.0 +-56.56,32.21,1.0 +-97.5,121.06,1.0 +-132.51,-20.91,1.0 +116.09,-119.39,1.0 +134.4,-103.69,1.0 +-81.92,-95.36,1.0 +-115.16,34.44,1.0 +185.54,91.77,1.0 +-146.13,-92.81,1.0 +-121.92,-53.86,1.0 +-88.75,120.73,1.0 +-113.72,124.99,1.0 +183.88,72.62,1.0 +-100.83,-55.66,1.0 +-165.27,-17.99,1.0 +-83.69,114.16,1.0 +-54.54,-114.85,1.0 +-101.56,-167.51,1.0 +134.13,-93.06,1.0 +-173.34,-8.23,1.0 +30.63,207.53,1.0 +-192.41,23.22,1.0 +119.19,91.51,1.0 +-170.19,-15.39,1.0 +-99.19,-49.44,1.0 
+-42.67,38.51,1.0 +126.82,-118.69,1.0 +-90.25,-53.48,1.0 +187.47,73.6,1.0 +185.77,71.49,1.0 +-33.08,-203.35,1.0 +-144.34,-24.69,1.0 +-85.63,126.18,1.0 +84.95,102.16,1.0 +-184.08,51.08,1.0 +-110.99,35.95,1.0 +-84.95,-57.46,1.0 +-113.06,-27.43,1.0 +-112.15,-47.72,1.0 +-11.14,214.12,1.0 +113.85,99.19,1.0 +-138.8,-71.62,1.0 +-78.9,86.08,1.0 +-106.19,-157.61,1.0 +196.55,94.58,1.0 +145.94,5.0,1.0 +199.02,92.95,1.0 +11.2,-77.94,1.0 +176.28,102.81,1.0 +-108.52,-1.67,1.0 +-89.59,-39.01,1.0 +-71.02,101.17,1.0 +201.85,96.03,1.0 +-195.77,18.03,1.0 +88.32,104.46,1.0 +-59.51,42.02,1.0 +-175.57,-10.95,1.0 +146.9,-3.72,1.0 +149.88,-2.27,1.0 +-107.99,31.03,1.0 +3.72,159.94,1.0 +69.86,104.73,1.0 +-45.46,45.89,1.0 +-151.74,-115.27,1.0 +-16.58,83.32,1.0 +165.44,4.82,1.0 +-68.45,-96.8,1.0 +-9.45,-193.64,1.0 +-90.37,54.49,1.0 +150.25,-95.28,1.0 +-98.38,-8.48,1.0 +58.98,33.97,1.0 +-17.37,80.34,1.0 +-131.35,129.83,1.0 +-3.55,63.23,1.0 +-152.69,-107.4,1.0 +189.2,114.16,1.0 +-126.22,-46.18,1.0 +-99.82,-20.51,1.0 +-199.58,15.43,1.0 +64.94,58.76,1.0 +-124.2,130.86,1.0 +-22.9,70.98,1.0 +101.46,-130.87,1.0 +-89.56,-22.93,1.0 +-59.79,96.14,1.0 +-25.44,69.28,1.0 +-7.63,101.82,1.0 +-18.33,218.82,1.0 +-10.05,57.69,1.0 +-90.2,-193.2,1.0 +48.24,197.6,1.0 +161.49,-23.1,1.0 +-211.32,199.59,1.0 +56.32,95.85,1.0 +73.55,117.9,1.0 +-157.22,-33.54,1.0 +152.9,-193.79,1.0 +-55.82,97.45,1.0 +82.37,-154.18,1.0 +-144.03,110.43,1.0 +165.13,-181.03,1.0 +-30.61,95.87,1.0 +169.61,-178.41,1.0 +139.89,-76.15,1.0 +-133.64,143.67,1.0 +50.57,-217.21,1.0 +83.75,-165.61,1.0 +167.3,159.8,1.0 +-71.62,-68.71,1.0 +-63.06,132.28,1.0 +-206.06,207.94,1.0 +21.83,34.6,1.0 +-90.55,174.5,1.0 +176.2,-20.95,1.0 +154.01,-200.93,1.0 +89.94,-175.25,1.0 +156.96,147.98,1.0 +11.74,53.49,1.0 +-70.23,75.82,1.0 +-72.92,52.3,1.0 +-153.83,199.29,1.0 +-107.26,-183.61,1.0 +-218.5,217.82,1.0 +42.42,173.1,1.0 +-26.56,230.0,1.0 +-225.45,213.66,1.0 +-208.42,44.23,1.0 +-50.46,139.03,1.0 +110.59,173.8,1.0 +206.69,-54.09,1.0 +134.68,168.44,1.0 
+-197.99,180.01,1.0 +165.08,182.24,1.0 +-173.08,188.31,1.0 +-205.48,-60.18,1.0 +38.82,40.86,1.0 +84.47,-179.68,1.0 +-135.44,160.1,1.0 +-46.29,139.92,1.0 +185.52,-13.35,1.0 +62.53,-224.14,1.0 +-10.58,-137.83,1.0 +-90.77,197.17,1.0 +180.35,-177.28,1.0 +107.82,223.36,1.0 +173.79,159.79,1.0 +-114.92,167.81,1.0 +47.25,74.7,1.0 +104.78,-140.51,1.0 +138.77,151.97,1.0 +68.95,-218.09,1.0 +183.49,-44.48,1.0 +46.87,-178.43,1.0 +50.55,102.89,1.0 +179.91,-43.13,1.0 +28.72,-218.23,1.0 +-85.01,176.34,1.0 +151.61,-207.9,1.0 +-151.0,-125.41,1.0 +-165.85,212.54,1.0 +-62.73,-26.17,1.0 +-30.01,-82.16,1.0 +-198.84,120.93,1.0 +-1.38,-107.19,1.0 +124.88,223.88,1.0 +11.02,-109.6,1.0 +219.38,44.14,1.0 +85.29,222.39,1.0 +-222.51,-3.68,1.0 +176.43,193.78,1.0 +185.85,217.29,1.0 +221.69,45.76,1.0 +-195.01,-80.14,1.0 +-222.27,41.0,1.0 +227.34,102.66,1.0 +-136.73,188.82,1.0 +5.76,119.62,1.0 +15.38,90.69,1.0 +-8.29,-215.48,1.0 +-215.86,135.65,1.0 +217.93,-94.74,1.0 +230.0,-117.35,1.0 +-61.5,-72.0,1.0 +215.97,-122.79,1.0 +-28.78,-67.4,1.0 +25.69,-203.08,1.0 +202.44,225.5,1.0 +-79.99,-226.8,1.0 +54.55,-152.15,1.0 +164.86,-97.89,1.0 +27.91,-162.19,1.0 +-89.83,-230.0,1.0 +-217.3,-49.78,1.0 +223.38,-92.1,1.0 +-207.69,-14.73,1.0 +221.29,-34.45,1.0 +46.87,-180.27,1.0 +84.93,166.13,1.0 +63.32,129.43,1.0 +-230.0,-217.98,1.0 +-142.46,-195.96,1.0 +-192.39,85.64,1.0 +-170.09,-66.53,1.0 +-140.8,-162.65,1.0 +-174.47,-209.6,1.0 +-227.04,-98.18,1.0 +-214.12,-127.14,1.0 +-53.37,214.91,1.0 +229.54,-147.83,1.0 +98.48,-210.49,1.0 +47.68,-126.85,1.0 +-122.78,-213.57,1.0 +97.43,196.05,1.0 +208.11,-195.95,1.0 +-228.87,69.29,1.0 +137.95,195.99,1.0 +169.87,-122.36,1.0 +-191.6,-37.74,1.0 +172.93,-66.66,1.0 +191.85,33.12,1.0 +-103.49,219.02,1.0 +188.1,-221.79,1.0 +-199.87,-184.58,1.0 diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h new file mode 100644 index 00000000..356233dd --- /dev/null +++ b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h @@ -0,0 +1,213 @@ +#ifndef 
CLUE_Algo_Alpaka_h +#define CLUE_Algo_Alpaka_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "CLUEAlpakaKernels.h" +#include "../DataFormats/Points.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../DataFormats/alpaka/TilesAlpaka.h" +#include "ConvolutionalKernel.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class CLUEAlgoAlpaka { + public: + CLUEAlgoAlpaka() = delete; + explicit CLUEAlgoAlpaka(float dc, float rhoc, float outlierDeltaFactor, int pPBin, Queue queue_) + : dc_{dc}, rhoc_{rhoc}, outlierDeltaFactor_{outlierDeltaFactor}, pointsPerTile_{pPBin} { + init_device(queue_); + } + + TilesAlpaka* m_tiles; + VecArray* m_seeds; + VecArray* m_followers; + + template + std::vector> make_clusters(Points& h_points, + PointsAlpaka& d_points, + const KernelType& kernel, + Queue queue_, + size_t block_size); + + private: + float dc_; + float rhoc_; + float outlierDeltaFactor_; + // average number of points found in a tile + int pointsPerTile_; + + /* domain_t m_domains; */ + + // Buffers + std::optional>> d_tiles; + std::optional>> + d_seeds; + std::optional< + cms::alpakatools::device_buffer[]>> + d_followers; + + // Private methods + void init_device(Queue queue_); + void setup(const Points& h_points, PointsAlpaka& d_points, Queue queue_, size_t block_size); + + // Construction of the tiles + void calculate_tile_size(TilesAlpaka& h_tiles, const Points& h_points); + }; + + // Private methods + template + void CLUEAlgoAlpaka::calculate_tile_size(TilesAlpaka& h_tiles, + const Points& h_points) { + for (int i{}; i != Ndim; ++i) { + float tileSize; + float dimMax{*std::max_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + float dimMin{*std::min_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + + VecArray temp; + temp.push_back_unsafe(dimMin); + temp.push_back_unsafe(dimMax); + 
h_tiles.min_max[i] = temp; + tileSize = (dimMax - dimMin) / h_tiles.nPerDim(); + + h_tiles.tile_size[i] = tileSize; + } + } + + template + void CLUEAlgoAlpaka::init_device(Queue queue_) { + d_tiles = cms::alpakatools::make_device_buffer>(queue_); + d_seeds = cms::alpakatools::make_device_buffer>(queue_); + d_followers = + cms::alpakatools::make_device_buffer[]>(queue_, + reserve); + + // Copy to the public pointers + m_seeds = (*d_seeds).data(); + m_followers = (*d_followers).data(); + } + + template + void CLUEAlgoAlpaka::setup(const Points& h_points, + PointsAlpaka& d_points, + Queue queue_, + size_t block_size) { + // Create temporary tiles object + TilesAlpaka temp; + calculate_tile_size(temp, h_points); + temp.resizeTiles(); + + alpaka::memcpy(queue_, *d_tiles, cms::alpakatools::make_host_view(temp)); + m_tiles = (*d_tiles).data(); + alpaka::memcpy( + queue_, d_points.coords, cms::alpakatools::make_host_view(h_points.m_coords.data(), h_points.n)); + alpaka::memcpy( + queue_, d_points.weight, cms::alpakatools::make_host_view(h_points.m_weight.data(), h_points.n)); + alpaka::memset(queue_, (*d_seeds), 0x00); + + // Define the working division + Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::enqueue( + queue_, + alpaka::createTaskKernel(working_div, KernelResetFollowers{}, m_followers, h_points.n)); + } + + // Public methods + template + template + std::vector> CLUEAlgoAlpaka::make_clusters(Points& h_points, + PointsAlpaka& d_points, + const KernelType& kernel, + Queue queue_, + size_t block_size) { + setup(h_points, d_points, queue_, block_size); + + const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); + auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); + alpaka::enqueue(queue_, + alpaka::createTaskKernel( + working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n)); + + 
alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelCalculateLocalDensity(), + m_tiles, + d_points.view(), + kernel, + /* m_domains.data(), */ + dc_, + h_points.n)); + alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelCalculateNearestHigher(), + m_tiles, + d_points.view(), + /* m_domains.data(), */ + outlierDeltaFactor_, + dc_, + h_points.n)); + alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div, + KernelFindClusters(), + m_seeds, + m_followers, + d_points.view(), + outlierDeltaFactor_, + dc_, + rhoc_, + h_points.n)); + + // We change the working division when assigning the clusters + const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, block_size); + auto working_div_seeds = cms::alpakatools::make_workdiv(grid_size_seeds, block_size); + alpaka::enqueue( + queue_, + alpaka::createTaskKernel( + working_div_seeds, KernelAssignClusters(), m_seeds, m_followers, d_points.view())); + + // Wait for all the operations in the queue to finish + alpaka::wait(queue_); + + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_rho.data(), h_points.n), + d_points.rho, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_delta.data(), h_points.n), + d_points.delta, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_nearestHigher.data(), h_points.n), + d_points.nearest_higher, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_clusterIndex.data(), h_points.n), + d_points.cluster_index, + static_cast(h_points.n)); + alpaka::memcpy(queue_, + cms::alpakatools::make_host_view(h_points.m_isSeed.data(), h_points.n), + d_points.is_seed, + static_cast(h_points.n)); + + // Wait for all the operations in the queue to finish + alpaka::wait(queue_); + + return {h_points.m_clusterIndex, h_points.m_isSeed}; + } +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif diff 
--git a/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h new file mode 100644 index 00000000..40c82271 --- /dev/null +++ b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h @@ -0,0 +1,310 @@ +#ifndef CLUE_Alpaka_Kernels_h +#define CLUE_Alpaka_Kernels_h + +#include +#include + +#include "../AlpakaCore/alpakaWorkDiv.h" +#include "../DataFormats/alpaka/PointsAlpaka.h" +#include "../DataFormats/alpaka/TilesAlpaka.h" +#include "../DataFormats/alpaka/AlpakaVecArray.h" +#include "ConvolutionalKernel.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + constexpr int32_t max_followers{100}; + constexpr int32_t max_seeds{100}; + constexpr int32_t reserve{1000000}; + + template + using PointsView = typename PointsAlpaka::PointsAlpakaView; + + struct KernelResetFollowers { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* d_followers, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid( + acc, n_points, [&](uint32_t i) { d_followers[i].reset(); }); + } + }; + + struct KernelFillTiles { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + PointsView* points, + TilesAlpaka* tiles, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid( + acc, n_points, [&](uint32_t i) { tiles->fill(acc, points->coords[i], i); }); + } + }; + + template + ALPAKA_FN_HOST_ACC void for_recursion(const TAcc& acc, + VecArray& base_vec, + const VecArray, Ndim>& search_box, + TilesAlpaka* tiles, + PointsView* dev_points, + const KernelType& kernel, + /* const VecArray, Ndim>& domains, */ + const VecArray& coords_i, + float* rho_i, + float dc, + uint32_t point_id) { + if constexpr (N_ == 0) { + int binId{tiles->getGlobalBinByBin(acc, base_vec)}; + // get the size of this bin + int binSize{static_cast((*tiles)[binId].size())}; + + // iterate inside this bin + for (int binIter{}; binIter < binSize; ++binIter) { + uint32_t j{(*tiles)[binId][binIter]}; + // query 
N_{dc_}(i) + + VecArray coords_j{dev_points->coords[j]}; + float dist_ij_sq{0.f}; + for (int dim{}; dim != Ndim; ++dim) { + dist_ij_sq += (coords_j[dim] - coords_i[dim]) * (coords_j[dim] - coords_i[dim]); + } + + if (dist_ij_sq <= dc * dc) { + *rho_i += kernel(acc, alpaka::math::sqrt(acc, dist_ij_sq), point_id, j); + } + + } // end of interate inside this bin + + return; + } else { + for (unsigned int i{search_box[search_box.capacity() - N_][0]}; + i <= search_box[search_box.capacity() - N_][1]; + ++i) { + base_vec[base_vec.capacity() - N_] = i; + for_recursion( + acc, base_vec, search_box, tiles, dev_points, kernel, coords_i, rho_i, dc, point_id); + } + } + } + + struct KernelCalculateLocalDensity { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TilesAlpaka* dev_tiles, + PointsView* dev_points, + const KernelType& kernel, + /* const VecArray, Ndim>& domains, */ + float dc, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + float rho_i{0.f}; + VecArray coords_i{dev_points->coords[i]}; + + // Get the extremes of the search box + VecArray, Ndim> searchbox_extremes; + for (int dim{}; dim != Ndim; ++dim) { + VecArray dim_extremes; + dim_extremes.push_back_unsafe(coords_i[dim] - dc); + dim_extremes.push_back_unsafe(coords_i[dim] + dc); + + searchbox_extremes.push_back_unsafe(dim_extremes); + } + + // Calculate the search box + VecArray, Ndim> search_box; + dev_tiles->searchBox(acc, searchbox_extremes, &search_box); + + VecArray base_vec; + for_recursion( + acc, base_vec, search_box, dev_tiles, dev_points, kernel, coords_i, &rho_i, dc, i); + + dev_points->rho[i] = rho_i; + }); + } + }; + + template + ALPAKA_FN_HOST_ACC void for_recursion_nearest_higher( + const TAcc& acc, + VecArray& base_vec, + const VecArray, Ndim>& s_box, + TilesAlpaka* tiles, + PointsView* dev_points, + /* const VecArray, Ndim>& domains, */ + const VecArray& coords_i, + float rho_i, + float* delta_i, + int* nh_i, + float dm_sq, 
+ uint32_t point_id) { + if constexpr (N_ == 0) { + int binId{tiles->getGlobalBinByBin(acc, base_vec)}; + // get the size of this bin + int binSize{(*tiles)[binId].size()}; + + // iterate inside this bin + for (int binIter{}; binIter < binSize; ++binIter) { + unsigned int j{(*tiles)[binId][binIter]}; + // query N'_{dm}(i) + float rho_j{dev_points->rho[j]}; + bool found_higher{(rho_j > rho_i)}; + // in the rare case where rho is the same, use detid + found_higher = found_higher || ((rho_j == rho_i) && (j > point_id)); + + // Calculate the distance between the two points + VecArray coords_j{dev_points->coords[j]}; + float dist_ij_sq{0.f}; + for (int dim{}; dim != Ndim; ++dim) { + dist_ij_sq += (coords_j[dim] - coords_i[dim]) * (coords_j[dim] - coords_i[dim]); + } + + if (found_higher && dist_ij_sq <= dm_sq) { + // find the nearest point within N'_{dm}(i) + if (dist_ij_sq < *delta_i) { + // update delta_i and nearestHigher_i + *delta_i = dist_ij_sq; + *nh_i = j; + } + } + } // end of interate inside this bin + + return; + } else { + for (unsigned int i{s_box[s_box.capacity() - N_][0]}; i <= s_box[s_box.capacity() - N_][1]; ++i) { + base_vec[base_vec.capacity() - N_] = i; + for_recursion_nearest_higher( + acc, base_vec, s_box, tiles, dev_points, coords_i, rho_i, delta_i, nh_i, dm_sq, point_id); + } + } + } + + struct KernelCalculateNearestHigher { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TilesAlpaka* dev_tiles, + PointsView* dev_points, + /* const VecArray, Ndim>& domains, */ + float outlier_delta_factor, + float dc, + uint32_t n_points) const { + float dm{outlier_delta_factor * dc}; + float dm_squared{dm * dm}; + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + float delta_i{std::numeric_limits::max()}; + int nh_i{-1}; + VecArray coords_i{dev_points->coords[i]}; + float rho_i{dev_points->rho[i]}; + + // Get the extremes of the search box + VecArray, Ndim> searchbox_extremes; + for (int dim{}; dim != Ndim; ++dim) { + 
VecArray dim_extremes; + dim_extremes.push_back_unsafe(coords_i[dim] - dc); + dim_extremes.push_back_unsafe(coords_i[dim] + dc); + + searchbox_extremes.push_back_unsafe(dim_extremes); + } + + // Calculate the search box + VecArray, Ndim> search_box; + dev_tiles->searchBox(acc, searchbox_extremes, &search_box); + + VecArray base_vec{}; + for_recursion_nearest_higher(acc, + base_vec, + search_box, + dev_tiles, + dev_points, + coords_i, + rho_i, + &delta_i, + &nh_i, + dm_squared, + i); + + dev_points->delta[i] = alpaka::math::sqrt(acc, delta_i); + dev_points->nearest_higher[i] = nh_i; + }); + } + }; + + template + struct KernelFindClusters { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* seeds, + VecArray* followers, + PointsView* dev_points, + float outlier_delta_factor, + float d_c, + float rho_c, + uint32_t n_points) const { + cms::alpakatools::for_each_element_in_grid(acc, n_points, [&](uint32_t i) { + // initialize cluster_index + dev_points->cluster_index[i] = -1; + + float delta_i{dev_points->delta[i]}; + float rho_i{dev_points->rho[i]}; + + // Determine whether the point is a seed or an outlier + bool is_seed{(delta_i > d_c) && (rho_i >= rho_c)}; + bool is_outlier{(delta_i > outlier_delta_factor * d_c) && (rho_i < rho_c)}; + + if (is_seed) { + dev_points->is_seed[i] = 1; + seeds[0].push_back(acc, i); + } else { + if (!is_outlier) { + followers[dev_points->nearest_higher[i]].push_back(acc, i); + } + dev_points->is_seed[i] = 0; + } + }); + } + }; + + template + struct KernelAssignClusters { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VecArray* seeds, + VecArray* followers, + PointsView* dev_points) const { + const auto& seeds_0{seeds[0]}; + const auto n_seeds{seeds_0.size()}; + cms::alpakatools::for_each_element_in_grid(acc, n_seeds, [&](uint32_t idx_cls) { + int local_stack[256] = {-1}; + int local_stack_size{}; + + int idx_this_seed{seeds_0[idx_cls]}; + dev_points->cluster_index[idx_this_seed] = idx_cls; + // 
push_back idThisSeed to localStack + local_stack[local_stack_size] = idx_this_seed; + ++local_stack_size; + // process all elements in localStack + while (local_stack_size > 0) { + // get last element of localStack + int idx_end_of_local_stack{local_stack[local_stack_size - 1]}; + int temp_cluster_index{dev_points->cluster_index[idx_end_of_local_stack]}; + // pop_back last element of localStack + local_stack[local_stack_size - 1] = -1; + --local_stack_size; + const auto& followers_ies{followers[idx_end_of_local_stack]}; + const auto followers_size{followers[idx_end_of_local_stack].size()}; + // loop over followers of last element of localStack + for (int j{}; j != followers_size; ++j) { + // pass id to follower + int follower{followers_ies[j]}; + dev_points->cluster_index[follower] = temp_cluster_index; + // push_back follower to localStack + local_stack[local_stack_size] = follower; + ++local_stack_size; + } + } + }); + } + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h new file mode 100644 index 00000000..2e6eb3c6 --- /dev/null +++ b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h @@ -0,0 +1,80 @@ +#ifndef convolutional_kernels_h +#define convolutional_kernels_h + +#include +#include +#include +#include +#include +#include + +#include + +class FlatKernel { +private: + float m_flat; + +public: + // Constructors + FlatKernel() = delete; + FlatKernel(float flat) : m_flat{flat} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return m_flat; + } + } +}; + +class GaussianKernel { +private: + float m_gaus_avg; + float m_gaus_std; + float m_gaus_amplitude; + +public: + // Constructors + GaussianKernel() = delete; + GaussianKernel(float gaus_avg, float gaus_std, float gaus_amplitude) + : m_gaus_avg{gaus_avg}, 
m_gaus_std{gaus_std}, m_gaus_amplitude{gaus_amplitude} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return (m_gaus_amplitude * + alpaka::math::exp( + acc, -(dist_ij - m_gaus_avg) * (dist_ij - m_gaus_avg) / (2 * m_gaus_std * m_gaus_std))); + } + } +}; + +class ExponentialKernel { +private: + float m_exp_avg; + float m_exp_amplitude; + +public: + // Constructors + ExponentialKernel() = delete; + ExponentialKernel(float exp_avg, float exp_amplitude) + : m_exp_avg{exp_avg}, m_exp_amplitude{exp_amplitude} {} + + // Overload call operator + template + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + if (point_id == j) { + return 1.f; + } else { + return (m_exp_amplitude * alpaka::math::exp(acc, -m_exp_avg * dist_ij)); + } + } +}; + +#endif diff --git a/CLUEstering/alpaka/CLUE/Run.h b/CLUEstering/alpaka/CLUE/Run.h new file mode 100644 index 00000000..cf1502fc --- /dev/null +++ b/CLUEstering/alpaka/CLUE/Run.h @@ -0,0 +1,552 @@ +#ifndef run_h +#define run_h + +#include +#include "CLUEAlgoAlpaka.h" +#include "ConvolutionalKernel.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka 
algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run1(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run2(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create 
the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run3(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, 
weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run4(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, 
weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run5(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run6(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, 
d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run7(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> 
run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run8(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run9(float dc, + float rhoc, + float outlier, + int 
pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const FlatKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const ExponentialKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + + std::vector> run10(float dc, + float rhoc, + float outlier, + int pPBin, + std::vector> const &coordinates, + std::vector const &weight, + const GaussianKernel &kernel, + Queue queue_, + size_t block_size) { + CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); + + // Create the host and device points + Points<2> h_points(coordinates, weight); + PointsAlpaka<2> d_points(queue_, weight.size()); + + return algo.make_clusters(h_points, d_points, kernel, queue_, block_size); + } + +}; // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/DataFormats/Points.h 
b/CLUEstering/alpaka/DataFormats/Points.h new file mode 100644 index 00000000..e522976c --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/Points.h @@ -0,0 +1,48 @@ +#ifndef points_h +#define points_h + +#include +#include +#include +#include +#include +#include +#include "alpaka/PointsAlpaka.h" +#include "alpaka/AlpakaVecArray.h" + +using cms::alpakatools::VecArray; + +template +struct Points { + Points() = default; + Points(const std::vector>& coords, const std::vector& weight) + : m_coords{coords}, m_weight{weight}, n{weight.size()} {} + Points(const std::vector>& coords, const std::vector& weight) + : m_weight{weight}, n{weight.size()} { + for (const auto& x : coords) { + VecArray temp_vecarray; + for (auto value : x) { + temp_vecarray.push_back_unsafe(value); + } + m_coords.push_back(temp_vecarray); + } + + m_rho.resize(n); + m_delta.resize(n); + m_nearestHigher.resize(n); + m_clusterIndex.resize(n); + m_isSeed.resize(n); + } + + std::vector> m_coords; + std::vector m_weight; + std::vector m_rho; + std::vector m_delta; + std::vector m_nearestHigher; + std::vector m_clusterIndex; + std::vector m_isSeed; + + size_t n; +}; + +#endif diff --git a/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h new file mode 100644 index 00000000..f2e3ed95 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h @@ -0,0 +1,100 @@ +#ifndef AlpakaVecArray_h +#define AlpakaVecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +namespace cms::alpakatools { + + template + struct VecArray { + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) 
T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { + auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + assert(("Too few elemets reserved")); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { + auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC inline T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int capacity() const { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + int 
m_size = 0; + + T m_data[maxSize]; + }; + +} // end namespace cms::alpakatools + +#endif // AlpakaVecArray_h diff --git a/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h new file mode 100644 index 00000000..9fe71ef4 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h @@ -0,0 +1,76 @@ +#ifndef Points_Alpaka_h +#define Points_Alpaka_h + +#include +#include + +#include "../../AlpakaCore/alpakaConfig.h" +#include "../../AlpakaCore/alpakaMemory.h" +#include "AlpakaVecArray.h" +#include "../Points.h" + +using cms::alpakatools::VecArray; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class PointsAlpaka { + public: + PointsAlpaka() = delete; + explicit PointsAlpaka(Queue stream, int n_points) + : coords{cms::alpakatools::make_device_buffer[]>(stream, n_points)}, + weight{cms::alpakatools::make_device_buffer(stream, n_points)}, + rho{cms::alpakatools::make_device_buffer(stream, n_points)}, + delta{cms::alpakatools::make_device_buffer(stream, n_points)}, + nearest_higher{cms::alpakatools::make_device_buffer(stream, n_points)}, + cluster_index{cms::alpakatools::make_device_buffer(stream, n_points)}, + is_seed{cms::alpakatools::make_device_buffer(stream, n_points)}, + view_dev{cms::alpakatools::make_device_buffer(stream)} { + auto view_host = cms::alpakatools::make_host_buffer(stream); + view_host->coords = coords.data(); + view_host->weight = weight.data(); + view_host->rho = rho.data(); + view_host->delta = delta.data(); + view_host->nearest_higher = nearest_higher.data(); + view_host->cluster_index = cluster_index.data(); + view_host->is_seed = is_seed.data(); + + // Copy memory inside the host view to device + alpaka::memcpy(stream, view_dev, view_host); + } + // Copy constructor/assignment operator + PointsAlpaka(const PointsAlpaka&) = delete; + PointsAlpaka& operator=(const PointsAlpaka&) = delete; + // Move constructor/assignment operator + PointsAlpaka(PointsAlpaka&&) = 
default; + PointsAlpaka& operator=(PointsAlpaka&&) = default; + // Destructor + ~PointsAlpaka() = default; + + cms::alpakatools::device_buffer[]> coords; + cms::alpakatools::device_buffer weight; + cms::alpakatools::device_buffer rho; + cms::alpakatools::device_buffer delta; + cms::alpakatools::device_buffer nearest_higher; + cms::alpakatools::device_buffer cluster_index; + cms::alpakatools::device_buffer is_seed; + + class PointsAlpakaView { + public: + VecArray* coords; + float* weight; + float* rho; + float* delta; + int* nearest_higher; + int* cluster_index; + int* is_seed; + }; + + PointsAlpakaView* view() { return view_dev.data(); } + + private: + cms::alpakatools::device_buffer view_dev; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h new file mode 100644 index 00000000..9f5812a1 --- /dev/null +++ b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h @@ -0,0 +1,111 @@ +#ifndef Tiles_Alpaka_h +#define Tiles_Alpaka_h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../AlpakaCore/alpakaConfig.h" +#include "../../AlpakaCore/alpakaMemory.h" +#include "AlpakaVecArray.h" + +using cms::alpakatools::VecArray; + +constexpr uint32_t max_tile_depth{1 << 10}; +constexpr uint32_t max_n_tiles{1 << 10}; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class TilesAlpaka { + public: + TilesAlpaka() : n_tiles{1000}, n_tiles_per_dim{static_cast(std::pow(1000, 1. 
/ Ndim))} {}; + + // Public member + VecArray, Ndim> min_max; + VecArray tile_size; + + // Public methods + void resizeTiles() { m_tiles.resize(n_tiles); } + + // getter + int nPerDim() const { return n_tiles_per_dim; } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getBin(const TAcc& acc, float coord_, int dim_) const { + int coord_Bin{(int)((coord_ - min_max[dim_][0]) / tile_size[dim_])}; + + // Address the cases of underflow and overflow and underflow + coord_Bin = alpaka::math::min(acc, coord_Bin, n_tiles_per_dim - 1); + coord_Bin = alpaka::math::max(acc, coord_Bin, 0); + + return coord_Bin; + } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBin(const TAcc& acc, + const VecArray& coords) const { + int globalBin{getBin(acc, coords[0], 0)}; + for (int i{1}; i != Ndim; ++i) { + globalBin += n_tiles_per_dim * getBin(acc, coords[i], i); + } + return globalBin; + } + + template + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBinByBin(const TAcc& acc, + const VecArray& Bins) const { + uint32_t globalBin{Bins[0]}; + for (int i{1}; i != Ndim; ++i) { + globalBin += n_tiles_per_dim * Bins[i]; + } + return globalBin; + } + + template + ALPAKA_FN_ACC inline constexpr void fill(const TAcc& acc, const VecArray& coords, int i) { + m_tiles[getGlobalBin(acc, coords)].push_back(acc, i); + } + + template + ALPAKA_FN_ACC inline void searchBox(const TAcc& acc, + const VecArray, Ndim>& sb_extremes, + VecArray, Ndim>* search_box) { + for (int dim{}; dim != Ndim; ++dim) { + VecArray dim_sb; + dim_sb.push_back_unsafe(getBin(acc, sb_extremes[dim][0], dim)); + dim_sb.push_back_unsafe(getBin(acc, sb_extremes[dim][1], dim)); + + search_box->push_back_unsafe(dim_sb); + } + } + + ALPAKA_FN_HOST_ACC inline constexpr auto size() { return n_tiles; } + + ALPAKA_FN_HOST_ACC inline constexpr void clear() { + for (int i{}; i < n_tiles; ++i) { + m_tiles[i].reset(); + } + } + + ALPAKA_FN_HOST_ACC inline constexpr VecArray& operator[](int globalBinId) { + return 
m_tiles[globalBinId]; + } + + private: + size_t n_tiles; + int n_tiles_per_dim; + VecArray, max_n_tiles> m_tiles; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/CLUEstering/include/Clustering.h b/CLUEstering/serial/include/Clustering.h similarity index 100% rename from CLUEstering/include/Clustering.h rename to CLUEstering/serial/include/Clustering.h diff --git a/CLUEstering/include/Kernels.h b/CLUEstering/serial/include/Kernels.h similarity index 100% rename from CLUEstering/include/Kernels.h rename to CLUEstering/serial/include/Kernels.h diff --git a/CLUEstering/include/Point.h b/CLUEstering/serial/include/Point.h similarity index 100% rename from CLUEstering/include/Point.h rename to CLUEstering/serial/include/Point.h diff --git a/CLUEstering/include/Tiles.h b/CLUEstering/serial/include/Tiles.h similarity index 100% rename from CLUEstering/include/Tiles.h rename to CLUEstering/serial/include/Tiles.h diff --git a/CLUEstering/include/deltaPhi.h b/CLUEstering/serial/include/deltaPhi.h similarity index 100% rename from CLUEstering/include/deltaPhi.h rename to CLUEstering/serial/include/deltaPhi.h diff --git a/CLUEstering/include/test/Makefile b/CLUEstering/serial/include/test/Makefile similarity index 100% rename from CLUEstering/include/test/Makefile rename to CLUEstering/serial/include/test/Makefile diff --git a/CLUEstering/include/test/doctest.h b/CLUEstering/serial/include/test/doctest.h similarity index 100% rename from CLUEstering/include/test/doctest.h rename to CLUEstering/serial/include/test/doctest.h diff --git a/CLUEstering/include/test/test_deltaphi.cc b/CLUEstering/serial/include/test/test_deltaphi.cc similarity index 100% rename from CLUEstering/include/test/test_deltaphi.cc rename to CLUEstering/serial/include/test/test_deltaphi.cc diff --git a/CLUEstering/include/test/test_distance.cc b/CLUEstering/serial/include/test/test_distance.cc similarity index 100% rename from CLUEstering/include/test/test_distance.cc rename to 
CLUEstering/serial/include/test/test_distance.cc diff --git a/CLUEstering/include/test/test_kernels.cc b/CLUEstering/serial/include/test/test_kernels.cc similarity index 100% rename from CLUEstering/include/test/test_kernels.cc rename to CLUEstering/serial/include/test/test_kernels.cc diff --git a/extern/alpaka b/extern/alpaka new file mode 160000 index 00000000..5573fc35 --- /dev/null +++ b/extern/alpaka @@ -0,0 +1 @@ +Subproject commit 5573fc351cc1de8d4ffe0db092ad1b3eb9eae177 diff --git a/extern/pybind11 b/extern/pybind11 new file mode 160000 index 00000000..104f921b --- /dev/null +++ b/extern/pybind11 @@ -0,0 +1 @@ +Subproject commit 104f921b0cd0fd0fc90718acd0b40975c5f7f703