[CUDADEV] Update to the code base from the Patatrack CMSSW release

Highlights: - remove the dependency on the CUB external - reduce caching allocator memory usage - update CUDA to version 11.0 and C++17 - update the CUDA code base from CMSSW 11.2.x - import more tests from the cuda directory
cms-patatrack · Nov 5, 2020 · 56003a9 · 56003a9
1 parent 5a19e8e
commit 56003a9
Show file tree

Hide file tree

Showing 99 changed files with 2,648 additions and 1,703 deletions.
diff --git a/Makefile b/Makefile
@@ -26,8 +26,12 @@ export LIB_DIR := $(BASE_DIR)/lib
 export TEST_DIR := $(BASE_DIR)/test
 
 # System external definitions
-CUDA_BASE := /usr/local/cuda-10.2
-ifneq ($(wildcard $(CUDA_BASE)),)
+CUDA_BASE := /usr/local/cuda-11.0
+ifeq ($(wildcard $(CUDA_BASE)),)
+# CUDA platform not found
+CUDA_BASE :=
+else
+# CUDA platform at $(CUDA_BASE)
 CUDA_LIBDIR := $(CUDA_BASE)/lib64
 USER_CUDAFLAGS :=
 export CUDA_DEPS := $(CUDA_BASE)/lib64/libcudart.so
@@ -38,15 +42,13 @@ export CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64 -lcudart -lcudadevrt
 export CUDA_NVCC := $(CUDA_BASE)/bin/nvcc
 define CUFLAGS_template
 $(2)NVCC_FLAGS := $$(foreach ARCH,$(1),-gencode arch=compute_$$(ARCH),code=sm_$$(ARCH)) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --cudart=shared
-$(2)NVCC_COMMON := -std=c++14 -O3 $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS) $(USER_CXXFLAGS)'
+$(2)NVCC_COMMON := -std=c++17 -O3 $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS) $(USER_CXXFLAGS)'
 $(2)CUDA_CUFLAGS := -dc $$($(2)NVCC_COMMON) $(USER_CUDAFLAGS)
 $(2)CUDA_DLINKFLAGS := -dlink $$($(2)NVCC_COMMON)
 endef
 $(eval $(call CUFLAGS_template,$(CUDA_ARCH),))
 export CUDA_CUFLAGS
 export CUDA_DLINKFLAGS
-else # no CUDA
-CUDA_BASE=
 endif
 
 # Input data definitions

diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.cc b/src/cuda/CUDACore/deviceAllocatorStatus.cc
@@ -0,0 +1,7 @@
+#include "CUDACore/deviceAllocatorStatus.h"
+
+#include "getCachingDeviceAllocator.h"
+
+namespace cms::cuda {
+  allocator::GpuCachedBytes deviceAllocatorStatus() { return allocator::getCachingDeviceAllocator().CacheStatus(); }
+}  // namespace cms::cuda
diff --git a/src/cuda/CUDACore/deviceAllocatorStatus.h b/src/cuda/CUDACore/deviceAllocatorStatus.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
+
+#include <map>
+
+namespace cms {
+  namespace cuda {
+    namespace allocator {
+      struct TotalBytes {
+        size_t free;
+        size_t live;
+        size_t liveRequested;  // CMS: monitor also requested amount
+        TotalBytes() { free = live = liveRequested = 0; }
+      };
+      /// Map type of device ordinals to the number of cached bytes cached by each device
+      using GpuCachedBytes = std::map<int, TotalBytes>;
+    }  // namespace allocator
+
+    allocator::GpuCachedBytes deviceAllocatorStatus();
+  }  // namespace cuda
+}  // namespace cms
+
+#endif
diff --git a/src/cudadev/CUDACore/AtomicPairCounter.h b/src/cudadev/CUDACore/AtomicPairCounter.h
@@ -5,48 +5,54 @@
 
 #include "CUDACore/cudaCompat.h"
 
-class AtomicPairCounter {
-public:
-  using c_type = unsigned long long int;
+namespace cms {
+  namespace cuda {
 
-  AtomicPairCounter() {}
-  AtomicPairCounter(c_type i) { counter.ac = i; }
+    class AtomicPairCounter {
+    public:
+      using c_type = unsigned long long int;
 
-  __device__ __host__ AtomicPairCounter& operator=(c_type i) {
-    counter.ac = i;
-    return *this;
-  }
+      AtomicPairCounter() {}
+      AtomicPairCounter(c_type i) { counter.ac = i; }
 
-  struct Counters {
-    uint32_t n;  // in a "One to Many" association is the number of "One"
-    uint32_t m;  // in a "One to Many" association is the total number of associations
-  };
+      __device__ __host__ AtomicPairCounter& operator=(c_type i) {
+        counter.ac = i;
+        return *this;
+      }
 
-  union Atomic2 {
-    Counters counters;
-    c_type ac;
-  };
+      struct Counters {
+        uint32_t n;  // in a "One to Many" association is the number of "One"
+        uint32_t m;  // in a "One to Many" association is the total number of associations
+      };
 
-  static constexpr c_type incr = 1UL << 32;
+      union Atomic2 {
+        Counters counters;
+        c_type ac;
+      };
 
-  __device__ __host__ Counters get() const { return counter.counters; }
+      static constexpr c_type incr = 1UL << 32;
 
-  // increment n by 1 and m by i.  return previous value
-  __host__ __device__ __forceinline__ Counters add(uint32_t i) {
-    c_type c = i;
-    c += incr;
-    Atomic2 ret;
+      __device__ __host__ Counters get() const { return counter.counters; }
+
+      // increment n by 1 and m by i.  return previous value
+      __host__ __device__ __forceinline__ Counters add(uint32_t i) {
+        c_type c = i;
+        c += incr;
+        Atomic2 ret;
 #ifdef __CUDA_ARCH__
-    ret.ac = atomicAdd(&counter.ac, c);
+        ret.ac = atomicAdd(&counter.ac, c);
 #else
-    ret.ac = counter.ac;
-    counter.ac += c;
+        ret.ac = counter.ac;
+        counter.ac += c;
 #endif
-    return ret.counters;
-  }
+        return ret.counters;
+      }
+
+    private:
+      Atomic2 counter;
+    };
 
-private:
-  Atomic2 counter;
-};
+  }  // namespace cuda
+}  // namespace cms
 
 #endif  // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h
diff --git a/src/cudadev/CUDACore/CUDAHostAllocator.h b/src/cudadev/CUDACore/CUDAHostAllocator.h