From 55dd2c99346baa3a14949a0f7e9c41865e434eda Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 6 Oct 2022 15:49:49 -0700 Subject: [PATCH] Merge internal changes from CTK 11.8 - Fix constness with annotated_ptr - Adds SM_90 to - Adds SM_90 to CMake and testing scripts --- .upstream-tests/test/CMakeLists.txt | 2 +- .../annotated_ptr/access_property.pass.cpp | 121 +++++ .../access_property_explicit.fail.cpp | 30 ++ .../test/cuda/annotated_ptr/annotated_ptr.h | 440 ------------------ .../cuda/annotated_ptr/annotated_ptr.pass.cpp | 174 ++++++- .../annotated_ptr_bench.pass.cpp | 164 +++++++ .../annotated_ptr_constructors.pass.cpp | 86 ++++ .../annotated_ptr/annotated_ptr_ctor.fail.cpp | 52 +++ .../annotated_ptr_host_constructor.pass.cpp | 28 ++ .../annotated_ptr_host_shared.pass.cpp | 6 +- .../annotated_ptr_neg_tests.pass.cpp | 20 +- .../annotated_ptr_shmem.pass.cpp | 56 +++ .../apply_access_property.pass.cpp | 49 ++ .../associate_access_property.pass.cpp | 54 +++ .../annotated_ptr/discard_memory.pass.cpp | 44 ++ .../cuda/annotated_ptr/memcpy_async.pass.cpp | 120 +++++ .../test/cuda/annotated_ptr/utils.h | 66 +++ .../utils/libcudacxx/test/config.py | 7 +- .../utils/nvidia/linux/perform_tests.bash | 2 +- include/cuda/annotated_ptr | 40 +- include/cuda/std/detail/__annotated_ptr | 29 +- include/nv/detail/__target_macros | 22 + include/nv/target | 12 +- 23 files changed, 1126 insertions(+), 498 deletions(-) create mode 100644 .upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp delete mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp create mode 100644 .upstream-tests/test/cuda/annotated_ptr/utils.h diff --git a/.upstream-tests/test/CMakeLists.txt b/.upstream-tests/test/CMakeLists.txt index 838b222748..cae71093b1 100644 --- a/.upstream-tests/test/CMakeLists.txt +++ b/.upstream-tests/test/CMakeLists.txt @@ -6,7 +6,7 @@ macro(pythonize_bool var) endif() endmacro() -set(LIBCUDACXX_HIGHEST_COMPUTE_ARCH 80) +set(LIBCUDACXX_HIGHEST_COMPUTE_ARCH 90) foreach (COMPUTE_ARCH ${LIBCUDACXX_COMPUTE_ARCHS}) set(_compute_message "${_compute_message} sm_${COMPUTE_ARCH}") diff --git a/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp new file mode 100644 index 0000000000..809598ef96 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp @@ -0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" + +__device__ __host__ __noinline__ +void test_access_property_interleave() { + const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000}; + const uint64_t INTERLEAVE_NORMAL_DEMOTE = uint64_t{0x16F0000000000000}; + const uint64_t INTERLEAVE_PERSISTING = uint64_t{0x14F0000000000000}; + const uint64_t INTERLEAVE_STREAMING = uint64_t{0x12F0000000000000}; + cuda::access_property ap(cuda::access_property::persisting{}); + cuda::access_property ap2; + + assert(INTERLEAVE_PERSISTING == static_cast(ap)); + assert(static_cast(ap2) == INTERLEAVE_NORMAL); + + ap = cuda::access_property(cuda::access_property::normal()); + assert(static_cast(ap) == INTERLEAVE_NORMAL_DEMOTE); + + ap = cuda::access_property(cuda::access_property::streaming()); + assert(static_cast(ap) == INTERLEAVE_STREAMING); + + ap = cuda::access_property(cuda::access_property::normal(), 2.0f); + assert(static_cast(ap) == INTERLEAVE_NORMAL_DEMOTE); +} + +__device__ __host__ __noinline__ +void test_access_property_block() { + //assuming ptr address is 0; + const size_t TOTAL_BYTES = 0xFFFFFFFF; + const size_t HIT_BYTES = 0xFFFFFFFF; + const size_t BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES = size_t{0x1DD00FE000000000}; + const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000}; + + cuda::access_property ap(0x0, HIT_BYTES, TOTAL_BYTES, cuda::access_property::persisting{}, cuda::access_property::streaming{}); + assert(static_cast(ap) == BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES); + + ap = cuda::access_property(0x0, 0xFFFFFFFF, 0xFFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{}); + assert(static_cast(ap) == INTERLEAVE_NORMAL); + + ap = cuda::access_property(0x0, 0xFFFFFFFFF, 0xFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{}); + assert(static_cast(ap) == INTERLEAVE_NORMAL); + + ap = cuda::access_property(0x0, 0, 0, cuda::access_property::persisting{}, cuda::access_property::streaming{}); + assert(static_cast(ap) == INTERLEAVE_NORMAL); + + for (size_t ptr = 1; ptr < size_t{0xFFFFFFFF}; ptr <<= 1) { + for (size_t hit = 1; hit < size_t{0xFFFFFFFF}; hit <<= 1) { + ap = cuda::access_property((void*)ptr, hit, hit, cuda::access_property::persisting{}, cuda::access_property::streaming{}); + DPRINTF("Block encoding PTR:%p, hit:%p, block encoding:%p\n", ptr, hit, static_cast(ap)); + } + } +} + +template +__host__ __device__ __noinline__ void test_global_implicit_property(T ap, cudaAccessProperty cp) { + // Test implicit conversions + cudaAccessProperty v = ap; + assert(cp == v); + + // Test default, copy constructor, and copy-assignent + cuda::access_property o(ap); + cuda::access_property d; + d = ap; + + // Test explicit conversion to i64 + std::uint64_t x = (std::uint64_t)o; + std::uint64_t y = (std::uint64_t)d; + assert(x == y); +} + +__host__ __device__ __noinline__ void test_global() { + cuda::access_property o(cuda::access_property::global{}); + std::uint64_t x = (std::uint64_t)o; + unused(x); +} + +__host__ __device__ __noinline__ void test_shared() { + (void)cuda::access_property::shared{}; +} + +static_assert(sizeof(cuda::access_property::shared) == 1, ""); +static_assert(sizeof(cuda::access_property::global) == 1, ""); +static_assert(sizeof(cuda::access_property::persisting) == 1, ""); +static_assert(sizeof(cuda::access_property::normal) == 1, ""); +static_assert(sizeof(cuda::access_property::streaming) == 1, ""); +static_assert(sizeof(cuda::access_property) == 8, ""); + +static_assert(alignof(cuda::access_property::shared) == 1, ""); +static_assert(alignof(cuda::access_property::global) == 1, ""); +static_assert(alignof(cuda::access_property::persisting) == 1, ""); +static_assert(alignof(cuda::access_property::normal) == 1, ""); +static_assert(alignof(cuda::access_property::streaming) == 1, ""); +static_assert(alignof(cuda::access_property) == 8, ""); + +int main(int argc, char ** argv) +{ + test_access_property_interleave(); + test_access_property_block(); + + test_global_implicit_property(cuda::access_property::normal{}, cudaAccessProperty::cudaAccessPropertyNormal); + test_global_implicit_property(cuda::access_property::streaming{}, cudaAccessProperty::cudaAccessPropertyStreaming); + test_global_implicit_property(cuda::access_property::persisting{}, cudaAccessProperty::cudaAccessPropertyPersisting); + + test_global(); + test_shared(); + + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp b/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp new file mode 100644 index 0000000000..0f18d671eb --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" + +__host__ __device__ __noinline__ void test_access_property_fail() { + cuda::access_property o = cuda::access_property::normal{}; + // Test implicit conversion fails + std::uint64_t x; + x = o; + unused(o); +} + +int main(int argc, char ** argv) +{ + test_access_property_fail(); + return 0; +} + diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h deleted file mode 100644 index 386a1a323a..0000000000 --- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h +++ /dev/null @@ -1,440 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: pre-sm-70 -// UNSUPPORTED: c++98, c++03 - -#include -#include - -#ifndef __CUDA_ARCH__ -#include -#endif - -#if defined(DEBUG) - #define DPRINTF(...) { printf(__VA_ARGS__); } -#else - #define DPRINTF(...) do {} while (false) -#endif - -#if defined(_LIBCUDACXX_COMPILER_MSVC) -#pragma warning(disable: 4505) -#endif - -template -__host__ __device__ constexpr bool unused(T...) {return true;} - -// ************************ Device code *************************************** -static __device__ -void shared_mem_test_dev() { - __shared__ int smem[42]; - smem[10] = 42; - assert(smem[10] == 42); - - cuda::annotated_ptr p{smem + 10}; - - assert(*p == 42); -} - -static __global__ -void shared_mem_test() { - shared_mem_test_dev(); -}; - -static __device__ -void annotated_ptr_timing_dev(int * in, int * out) { - cuda::access_property ap(cuda::access_property::persisting{}); - // Retrieve global id - int i = blockIdx.x * blockDim.x + threadIdx.x; - - cuda::annotated_ptr in_ann{in, ap}; - cuda::annotated_ptr out_ann{out, ap}; - - DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i); - DPRINTF("&out[i]:%p = &in_ann[i]:%p for i = %d\n", &out_ann[i], &in_ann[i], i); - - out_ann[i] = in_ann[i]; -}; - -static __global__ -void annotated_ptr_timing(int * in, int * out) { - annotated_ptr_timing_dev(in, out); -} - -static __device__ -void ptr_timing_dev(int * in, int * out) { - // Retrieve global id - int i = blockIdx.x * blockDim.x + threadIdx.x; - DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i); - out[i] = in[i]; -}; - -static __global__ -void ptr_timing(int * in, int * out) { - ptr_timing_dev(in, out); -}; - -// ************************ Host/device code *************************************** -__device__ __host__ -void assert_rt_wrap(cudaError_t code, const char *file, int line) { - if (code != cudaSuccess) { -#ifndef __CUDACC_RTC__ - printf("assert: %s %s %d\n", cudaGetErrorString(code), file, line); -#endif - assert(code == cudaSuccess); - } -} -#define assert_rt(ret) { assert_rt_wrap((ret), __FILE__, __LINE__); } - -__device__ __host__ __noinline__ -void test_access_property_interleave() { - (void)cuda::access_property::shared{}; - (void)cuda::access_property::global{}; - assert(cuda::access_property::persisting{} == cudaAccessPropertyPersisting); - assert(cuda::access_property::streaming{} == cudaAccessPropertyStreaming); - assert(cuda::access_property::normal{} == cudaAccessPropertyNormal); - - const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000}; - const uint64_t INTERLEAVE_NORMAL_DEMOTE = uint64_t{0x16F0000000000000}; - const uint64_t INTERLEAVE_PERSISTING = uint64_t{0x14F0000000000000}; - const uint64_t INTERLEAVE_STREAMING = uint64_t{0x12F0000000000000}; - cuda::access_property ap(cuda::access_property::persisting{}); - cuda::access_property ap2; - - assert(INTERLEAVE_PERSISTING == static_cast(ap)); - assert(static_cast(ap2) == INTERLEAVE_NORMAL); - - ap = cuda::access_property(cuda::access_property::normal()); - assert(static_cast(ap) == INTERLEAVE_NORMAL_DEMOTE); - - ap = cuda::access_property(cuda::access_property::streaming()); - assert(static_cast(ap) == INTERLEAVE_STREAMING); - - ap = cuda::access_property(cuda::access_property::normal(), 2.0f); - assert(static_cast(ap) == INTERLEAVE_NORMAL_DEMOTE); -} - -__device__ __host__ __noinline__ -void test_access_property_block() { - //assuming ptr address is 0; - const size_t TOTAL_BYTES = 0xFFFFFFFF; - const size_t HIT_BYTES = 0xFFFFFFFF; - const size_t BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES = size_t{0x1DD00FE000000000}; - const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000}; - - cuda::access_property ap(0x0, HIT_BYTES, TOTAL_BYTES, cuda::access_property::persisting{}, cuda::access_property::streaming{}); - assert(static_cast(ap) == BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES); - - ap = cuda::access_property(0x0, 0xFFFFFFFF, 0xFFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{}); - assert(static_cast(ap) == INTERLEAVE_NORMAL); - - ap = cuda::access_property(0x0, 0xFFFFFFFFF, 0xFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{}); - assert(static_cast(ap) == INTERLEAVE_NORMAL); - - ap = cuda::access_property(0x0, 0, 0, cuda::access_property::persisting{}, cuda::access_property::streaming{}); - assert(static_cast(ap) == INTERLEAVE_NORMAL); - - for (size_t ptr = 1; ptr < size_t{0xFFFFFFFF}; ptr <<= 1) { - for (size_t hit = 1; hit < size_t{0xFFFFFFFF}; hit <<= 1) { - ap = cuda::access_property((void*)ptr, hit, hit, cuda::access_property::persisting{}, cuda::access_property::streaming{}); - DPRINTF("Block encoding PTR:%p, hit:%p, block encoding:%p\n", ptr, hit, static_cast(ap)); - } - } -} - -__device__ __host__ __noinline__ -void test_access_property_functions() { - size_t ARR_SZ = 1 << 10; - int* arr0 = nullptr; - int* arr1 = nullptr; - cuda::access_property ap(cuda::access_property::persisting{}); - cuda::access_property a; - unused(a); - cuda::access_property as(cuda::access_property::streaming{}); - -#ifdef __CUDA_ARCH__ - arr0 = (int*)malloc(ARR_SZ * sizeof(int)); - arr1 = (int*)malloc(ARR_SZ * sizeof(int)); -#else - assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int))); - assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int))); - assert_rt(cudaDeviceSynchronize()); -#endif - - cuda::discard_memory(arr0, ARR_SZ); - arr0 = cuda::associate_access_property(arr0, ap); - arr1 = cuda::associate_access_property(arr1, as); - cuda::apply_access_property(arr0, ARR_SZ, cuda::access_property::persisting{}); - cuda::apply_access_property(arr1, ARR_SZ, cuda::access_property::normal{}); - -#ifdef __CUDA_ARCH__ - free(arr0); - free(arr1); -#else - assert_rt(cudaFree(arr0)); - assert_rt(cudaFree(arr1)); -#endif - -} - -__device__ __host__ __noinline__ -void test_annotated_ptr_basic() { - cuda::access_property ap(cuda::access_property::persisting{}); - static const size_t ARR_SZ = 1 << 10; - int* array0 = new int[ARR_SZ]; - int* array1 = new int[ARR_SZ]; - cuda::annotated_ptr array_anno_ptr{array0, ap}; - cuda::annotated_ptr array0_anno_ptr0{array0, ap}; - cuda::annotated_ptr array0_anno_ptr1 = array0_anno_ptr0; - cuda::annotated_ptr array0_anno_ptr2{array0_anno_ptr0}; - cuda::annotated_ptr array1_anno_ptr{array1, ap}; -#ifndef __CUDA_ARCH__ - cuda::annotated_ptr shared_ptr1; - cuda::annotated_ptr shared_ptr2; - - shared_ptr1 = shared_ptr2; - unused(shared_ptr1); - - //Check on host the arrays through annotated_ptr ops - std::array a1{3, 2, 1}; - cuda::annotated_ptr, cuda::access_property> anno_ptr{&a1, ap}; - assert(anno_ptr->at(0) == 3); -#endif - - //Fill the arrays - for (size_t i = 0; i < ARR_SZ; ++i) { - array0[i] = static_cast(i); - array1[i] = static_cast(ARR_SZ - i); - } - - assert((bool)array0_anno_ptr0 == true); - assert(array0_anno_ptr0.get() == array0); - - for (size_t i = 0; i < ARR_SZ; ++i) { - assert(array0_anno_ptr0[i] == static_cast(i)); - assert(array0_anno_ptr2[i] == static_cast(i)); - assert(&array0[i] == &array0_anno_ptr0[i]); - assert(&array0[i] == &array0_anno_ptr1[i]); - } - - for (size_t i = 0; i < ARR_SZ; ++i) { - assert(array1_anno_ptr[i] == array1[i]); - } - - delete[] array0; - delete[] array1; -} - -__device__ __host__ __noinline__ -void test_annotated_ptr_launch_kernel() { -#ifndef __CUDA_ARCH__ - static const size_t ARR_SZ = 1 << 22; - static const size_t THREAD_CNT = 128; - static const size_t BLOCK_CNT = ARR_SZ / THREAD_CNT; - const dim3 threads(THREAD_CNT, 1, 1), blocks(BLOCK_CNT, 1, 1); - cudaEvent_t start, stop; -#else - static const size_t ARR_SZ = 1 << 10; -#endif - int* arr0 = nullptr; - int* arr1 = nullptr; - float annotated_time = 0.f, pointer_time = 0.f; - -#ifdef __CUDA_ARCH__ - arr0 = (int*)malloc(ARR_SZ * sizeof(int)); - arr1 = (int*)malloc(ARR_SZ * sizeof(int)); -#else - assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int))); - assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int))); - assert_rt(cudaDeviceSynchronize()); -#endif - -#ifdef __CUDA_ARCH__ - shared_mem_test_dev(); -#else - shared_mem_test<<<1, 1, 0, 0>>>(); - assert_rt(cudaStreamSynchronize(0)); -#endif - - -#ifdef __CUDA_ARCH__ - ptr_timing_dev(arr0, arr1); -#else - ptr_timing<<>>(arr0, arr1); - assert_rt(cudaDeviceSynchronize()); -#endif - - for (size_t i = 0; i < ARR_SZ; ++i) { - arr0[i] = static_cast(i); - arr1[i] = 0; - } - -#ifdef __CUDA_ARCH__ - ptr_timing_dev(arr0, arr1); -#else - assert_rt(cudaDeviceSynchronize()); - assert_rt(cudaEventCreate(&start)); - assert_rt(cudaEventCreate(&stop)); - assert_rt(cudaEventRecord(start)); - ptr_timing<<>>(arr0, arr1); - assert_rt(cudaEventRecord(stop)); - assert_rt(cudaEventSynchronize(stop)); - assert_rt(cudaEventElapsedTime(&pointer_time, start, stop)); - assert_rt(cudaEventDestroy(start)); - assert_rt(cudaEventDestroy(stop)); - assert_rt(cudaDeviceSynchronize()); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i); - assert(arr1[i] == static_cast(i)); - } - - arr1[i] = 0; - } -#endif - -#ifdef __CUDA_ARCH__ - annotated_ptr_timing_dev(arr0, arr1); -#else - assert_rt(cudaDeviceSynchronize()); - annotated_ptr_timing<<>>(arr0, arr1); - assert_rt(cudaDeviceSynchronize()); -#endif - - for (size_t i = 0; i < ARR_SZ; ++i) { - arr0[i] = static_cast(i); - arr1[i] = 0; - } - -#ifdef __CUDA_ARCH__ - annotated_ptr_timing_dev(arr0, arr1); -#else - assert_rt(cudaDeviceSynchronize()); - assert_rt(cudaEventCreate(&start)); - assert_rt(cudaEventCreate(&stop)); - assert_rt(cudaEventRecord(start)); - annotated_ptr_timing<<>>(arr0, arr1); - assert_rt(cudaEventRecord(stop)); - assert_rt(cudaEventSynchronize(stop)); - assert_rt(cudaEventElapsedTime(&annotated_time, start, stop)); - assert_rt(cudaEventDestroy(start)); - assert_rt(cudaEventDestroy(stop)); - assert_rt(cudaDeviceSynchronize()); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i); - assert(arr1[i] == static_cast(i)); - } - - arr1[i] = 0; - } -#endif - -#ifdef __CUDA_ARCH__ - free(arr0); - free(arr1); -#else - assert_rt(cudaFree(arr0)); - assert_rt(cudaFree(arr1)); -#endif - printf("array(ms):%f, arrotated_ptr(ms):%f\n", - pointer_time, annotated_time); -} - -__device__ __host__ __noinline__ -void test_annotated_ptr_functions() { - size_t ARR_SZ = 1 << 10; - int* arr0 = nullptr; - int* arr1 = nullptr; - cuda::access_property ap(cuda::access_property::persisting{}); - cuda::barrier bar0, bar1, bar2, bar3; - init(&bar0, 1); - init(&bar1, 1); - init(&bar2, 1); - init(&bar3, 1); - -#ifdef __CUDA_ARCH__ - arr0 = (int*)malloc(ARR_SZ * sizeof(int)); - arr1 = (int*)malloc(ARR_SZ * sizeof(int)); - - auto group = cooperative_groups::this_thread_block(); -#else - assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int))); - assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int))); - assert_rt(cudaDeviceSynchronize()); -#endif - - cuda::annotated_ptr ann0{arr0, ap}; - cuda::annotated_ptr ann1{arr1, ap}; - - for (size_t i = 0; i < ARR_SZ; ++i) { - arr0[i] = static_cast(i); - arr1[i] = 0; - } - - cuda::memcpy_async(ann1, ann0, ARR_SZ * sizeof(int), bar0); - bar0.arrive_and_wait(); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); - assert(arr1[i] == static_cast(i)); - } - - arr1[i] = 0; - } - - cuda::memcpy_async(arr1, ann0, ARR_SZ * sizeof(int), bar1); - bar1.arrive_and_wait(); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); - assert(arr1[i] == static_cast(i)); - } - - arr1[i] = 0; - } - -#ifdef __CUDA_ARCH__ - cuda::memcpy_async(group, ann1, ann0, ARR_SZ * sizeof(int), bar2); - bar2.arrive_and_wait(); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); - assert(arr1[i] == i); - } - - arr1[i] = 0; - } - - cuda::memcpy_async(group, arr1, ann0, ARR_SZ * sizeof(int), bar3); - bar3.arrive_and_wait(); - - for (size_t i = 0; i < ARR_SZ; ++i) { - if (arr1[i] != (int)i) { - DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); - assert(arr1[i] == i); - } - - arr1[i] = 0; - } -#endif - -#ifdef __CUDA_ARCH__ - free(arr0); - free(arr1); -#else - assert_rt(cudaFree(arr0)); - assert_rt(cudaFree(arr1)); -#endif -} diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp index 11a9185ee5..2b871f77c6 100644 --- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp @@ -8,30 +8,166 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc // UNSUPPORTED: c++98, c++03 -#include "annotated_ptr.h" +#include "utils.h" -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), "annotated_ptr must be pointer size"); -static_assert(sizeof(cuda::annotated_ptr) == 2*sizeof(uintptr_t), "annotated_ptr must be 2 * pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); -static_assert(alignof(cuda::annotated_ptr) == alignof(int*), "annotated_ptr must align with int*"); -static_assert(alignof(cuda::annotated_ptr) == alignof(int*), "annotated_ptr must align with int*"); -static_assert(alignof(cuda::annotated_ptr) == alignof(int*), "annotated_ptr must align with int*"); +static_assert(sizeof(cuda::annotated_ptr) == 2*sizeof(uintptr_t), + "annotated_ptr must be 2 * pointer size"); + +// NOTE: we could make these smaller in the future (e.g. 32-bit) but that would be an ABI breaking change: +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); +static_assert(sizeof(cuda::annotated_ptr) == sizeof(uintptr_t), + "annotated_ptr must be pointer size"); + + +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); + +// NOTE: we could lower the alignment in the future but that would be an ABI breaking change: +static_assert(alignof(cuda::annotated_ptr) == alignof(int*), + "annotated_ptr must align with int*"); + +#define N 128 + +struct S { + int x; + __host__ __device__ S& operator=(int o) { + this->x = o; + return *this; + } +}; + +template +__device__ __host__ __noinline__ +void test_read_access(In i, T* r) { + assert(i); + assert(i - i == 0); + assert((bool)i); + const In o = i; + + //assert(i->x == 0); // FAILS with shmem + //assert(o->x == 0); // FAILS with shmem + for (int n = 0; n < N; ++n) { + assert(i[n].x == n); + assert(&i[n] == &i[n]); + assert(&i[n] == &r[n]); + assert(o[n].x == n); + assert(&o[n] == &o[n]); + assert(&o[n] == &r[n]); + } +} + +template +__device__ __host__ __noinline__ +void test_write_access(In i) { + assert(i); + assert((bool)i); + const In o = i; + + for (int n = 0; n < N; ++n) { + i[n].x = 2*n; + assert(i[n].x == 2*n); + assert(i[n].x == 2*n); + i[n].x = n; + + o[n].x = 2*n; + assert(o[n].x == 2*n); + assert(o[n].x == 2*n); + o[n].x = n; + } +} + +__device__ __host__ __noinline__ +void all_tests() { + S* arr = alloc(false); +#ifdef __CUDA_ARCH__ + S* sarr = alloc(true); +#endif + + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); +#ifdef __CUDA_ARCH__ + test_read_access(cuda::annotated_ptr(sarr), sarr); +#endif + + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); +#ifdef __CUDA_ARCH__ + test_read_access(cuda::annotated_ptr(sarr), sarr); +#endif + + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); +#ifdef __CUDA_ARCH__ + test_read_access(cuda::annotated_ptr(sarr), sarr); +#endif + + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); + test_read_access(cuda::annotated_ptr(arr), arr); +#ifdef __CUDA_ARCH__ + test_read_access(cuda::annotated_ptr(sarr), sarr); +#endif + + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); +#ifdef __CUDA_ARCH__ + test_write_access(cuda::annotated_ptr(sarr)); +#endif + + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); + test_write_access(cuda::annotated_ptr(arr)); +#ifdef __CUDA_ARCH__ + test_write_access(cuda::annotated_ptr(sarr)); +#endif +} int main(int argc, char ** argv) { - test_access_property_interleave(); - test_access_property_block(); - test_access_property_functions(); - test_annotated_ptr_basic(); - test_annotated_ptr_launch_kernel(); - test_annotated_ptr_functions(); - - return 0; + all_tests(); + return 0; } diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp new file mode 100644 index 0000000000..db630c61c4 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp @@ -0,0 +1,164 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" + +__device__ +void annotated_ptr_timing_dev(int* in, int* out) { + cuda::access_property ap(cuda::access_property::persisting{}); + // Retrieve global id + int i = blockIdx.x * blockDim.x + threadIdx.x; + + cuda::annotated_ptr in_ann{in, ap}; + cuda::annotated_ptr out_ann{out, ap}; + + DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i); + DPRINTF("&out[i]:%p = &in_ann[i]:%p for i = %d\n", &out_ann[i], &in_ann[i], i); + + out_ann[i] = in_ann[i]; +}; + +__global__ +void annotated_ptr_timing(int* in, int* out) { + annotated_ptr_timing_dev(in, out); +} + +__device__ +void ptr_timing_dev(int* in, int* out) { + // Retrieve global id + int i = blockIdx.x * blockDim.x + threadIdx.x; + DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i); + out[i] = in[i]; +}; + +__global__ +void ptr_timing(int* in, int* out) { + ptr_timing_dev(in, out); +}; + +__device__ __host__ __noinline__ +void bench() { +#ifndef __CUDA_ARCH__ + static const size_t ARR_SZ = 1 << 22; + static const size_t THREAD_CNT = 128; + static const size_t BLOCK_CNT = ARR_SZ / THREAD_CNT; + const dim3 threads(THREAD_CNT, 1, 1), blocks(BLOCK_CNT, 1, 1); + cudaEvent_t start, stop; +#else + static const size_t ARR_SZ = 1 << 10; +#endif + int* arr0 = nullptr; + int* arr1 = nullptr; + float annotated_time = 0.f, pointer_time = 0.f; + +#ifdef __CUDA_ARCH__ + arr0 = (int*)malloc(ARR_SZ * sizeof(int)); + arr1 = (int*)malloc(ARR_SZ * sizeof(int)); +#else + assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int))); + assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int))); + assert_rt(cudaDeviceSynchronize()); +#endif + +#ifdef __CUDA_ARCH__ + ptr_timing_dev(arr0, arr1); +#else + ptr_timing<<>>(arr0, arr1); + assert_rt(cudaDeviceSynchronize()); +#endif + + for (size_t i = 0; i < ARR_SZ; ++i) { + arr0[i] = static_cast(i); + arr1[i] = 0; + } + +#ifdef __CUDA_ARCH__ + ptr_timing_dev(arr0, arr1); +#else + assert_rt(cudaDeviceSynchronize()); + assert_rt(cudaEventCreate(&start)); + assert_rt(cudaEventCreate(&stop)); + assert_rt(cudaEventRecord(start)); + ptr_timing<<>>(arr0, arr1); + assert_rt(cudaEventRecord(stop)); + assert_rt(cudaEventSynchronize(stop)); + assert_rt(cudaEventElapsedTime(&pointer_time, start, stop)); + assert_rt(cudaEventDestroy(start)); + assert_rt(cudaEventDestroy(stop)); + assert_rt(cudaDeviceSynchronize()); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i); + assert(arr1[i] == static_cast(i)); + } + + arr1[i] = 0; + } +#endif + +#ifdef __CUDA_ARCH__ + annotated_ptr_timing_dev(arr0, arr1); +#else + assert_rt(cudaDeviceSynchronize()); + annotated_ptr_timing<<>>(arr0, arr1); + assert_rt(cudaDeviceSynchronize()); +#endif + + for (size_t i = 0; i < ARR_SZ; ++i) { + arr0[i] = static_cast(i); + arr1[i] = 0; + } + +#ifdef __CUDA_ARCH__ + annotated_ptr_timing_dev(arr0, arr1); +#else + assert_rt(cudaDeviceSynchronize()); + assert_rt(cudaEventCreate(&start)); + assert_rt(cudaEventCreate(&stop)); + assert_rt(cudaEventRecord(start)); + annotated_ptr_timing<<>>(arr0, arr1); + assert_rt(cudaEventRecord(stop)); + assert_rt(cudaEventSynchronize(stop)); + assert_rt(cudaEventElapsedTime(&annotated_time, start, stop)); + assert_rt(cudaEventDestroy(start)); + assert_rt(cudaEventDestroy(stop)); + assert_rt(cudaDeviceSynchronize()); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i); + assert(arr1[i] == static_cast(i)); + } + + arr1[i] = 0; + } +#endif + +#ifdef __CUDA_ARCH__ + free(arr0); + free(arr1); +#else + assert_rt(cudaFree(arr0)); + assert_rt(cudaFree(arr1)); +#endif + printf("array(ms):%f, arrotated_ptr(ms):%f\n", + pointer_time, annotated_time); +} + +int main(int argc, char ** argv) +{ + bench(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp new file mode 100644 index 0000000000..2a00bbb49a --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc + +#include "utils.h" + +template +__host__ __device__ __noinline__ +void test_ctor() { + // default ctor, cpy and cpy assignment + cuda::annotated_ptr def; + { + cuda::annotated_ptr temp; + temp = def; + unused(temp); + } + cuda::annotated_ptr other(def); + unused(other); + + // from ptr + T* rp = nullptr; + cuda::annotated_ptr a(rp); + assert(!a); + + // cpy ctor & asign to cv + cuda::annotated_ptr c(def); + cuda::annotated_ptr d(def); + cuda::annotated_ptr e(def); + c = def; + d = def; + e = def; + + // from c|v to c|v|cv + cuda::annotated_ptr f(c); + cuda::annotated_ptr g(d); + cuda::annotated_ptr h(e); + f = c; + g = d; + h = e; + unused(f, g, h); + + // to cv + cuda::annotated_ptr i(c); + cuda::annotated_ptr j(d); + i = c; + j = d; +} + +template +__host__ __device__ __noinline__ +void test_global_ctor() { + test_ctor(); + + // from ptr + prop + T* rp = nullptr; + P p; + cuda::annotated_ptr a(rp, p); + cuda::annotated_ptr b(rp, p); + cuda::annotated_ptr c(rp, p); + cuda::annotated_ptr d(rp, p); +} + +__host__ __device__ __noinline__ +void test_global_ctors() { + test_global_ctor(); + test_global_ctor(); + test_global_ctor(); + test_global_ctor(); + test_global_ctor(); + test_ctor(); +} + +int main(int argc, char ** argv) +{ + test_global_ctors(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp new file mode 100644 index 0000000000..76f218c844 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc + +#include "utils.h" + +template +__host__ __device__ __noinline__ +void test_ctor() { + // default ctor, cpy and cpy assignment + cuda::annotated_ptr def; + def = def; + cuda::annotated_ptr other(def); + + // from ptr + T* rp = nullptr; + cuda::annotated_ptr a(rp); + assert(!a); + + // cpy ctor & asign to cv + cuda::annotated_ptr c(def); + cuda::annotated_ptr d(def); + cuda::annotated_ptr e(def); + c = e; // FAIL + d = d; // FAIL +} + +template +__host__ __device__ __noinline__ +void test_global_ctor() { + test_ctor(); +} + +__host__ __device__ __noinline__ +void test_global_ctors() { + test_global_ctor(); +} + +int main(int argc, char ** argv) +{ + test_global_ctors(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp new file mode 100644 index 0000000000..ec304bc253 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// XFAIL: nvcc + +#include "utils.h" + +int main(int argc, char ** argv) +{ + + cuda::access_property ap(cuda::access_property::persisting{}); + int* array0 = new int[9]; + cuda::annotated_ptr array_anno_ptr{array0, ap}; + cuda::annotated_ptr shared_ptr; + + array_anno_ptr = shared_ptr; // fail to compile, as expected + + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp index 4be560147f..907d508d03 100644 --- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp @@ -12,18 +12,18 @@ // UNSUPPORTED: nvrtc // XFAIL: nvcc -#include "annotated_ptr.h" +#include "utils.h" int main(int argc, char ** argv) { -#ifndef __CUDA_ARCH__ + cuda::access_property ap(cuda::access_property::persisting{}); int* array0 = new int[9]; cuda::annotated_ptr array_anno_ptr{array0, ap}; cuda::annotated_ptr shared_ptr; array_anno_ptr = shared_ptr; // fail to compile, as expected -#endif + return 0; } diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp index 1d871e3e31..9f0aa9e1f0 100644 --- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp @@ -12,22 +12,16 @@ // UNSUPPORTED: nvrtc // XFAIL: nvcc -#include "annotated_ptr.h" +#include "utils.h" -__device__ __host__ __noinline__ -static void negative_test_access_property_functions() { - size_t ARR_SZ = 1 << 20; - int* arr1 = nullptr; - cuda::access_property ap(cuda::access_property::persisting{}); - - arr1 = (int*)malloc(ARR_SZ * sizeof(int)); - - //calling from host needs to fail and kill the app - __nv_associate_access_property(arr1, static_cast(ap)); +__device__ __host__ +static void fails_from_host() { + int a; + __nv_associate_access_property(&a, uint64_t{0}); } int main(int argc, char ** argv) { - negative_test_access_property_functions(); - + //calling from host needs to fail and kill the app + fails_from_host(); return 0; } diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp new file mode 100644 index 0000000000..91a123f4a5 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" + +template +__device__ __host__ __noinline__ +void shared_mem_test_dev() { + T* smem = alloc(true); + smem[10] = 42; + + cuda::annotated_ptr p{smem + 10}; + + assert(*p == 42); +} + +__device__ __host__ __noinline__ +void all_tests() { + shared_mem_test_dev(); + shared_mem_test_dev(); + shared_mem_test_dev(); + shared_mem_test_dev(); +} + +__global__ +void shared_mem_test() { + all_tests(); +}; + +// TODO: is this needed? +__device__ __host__ __noinline__ +void test_all() { +#ifdef __CUDA_ARCH__ + all_tests(); +#else + shared_mem_test<<<1, 1, 0, 0>>>(); + assert_rt(cudaStreamSynchronize(0)); +#endif +} + +int main(int argc, char ** argv) +{ + test_all(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp new file mode 100644 index 0000000000..8ddfe8872d --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" +#define ARR_SZ 128 + +template +__device__ __host__ __noinline__ +void test(P ap, bool shared = false) +{ + T* arr = alloc(shared); + + cuda::apply_access_property(arr, ARR_SZ, ap); + + for (int i = 0; i < ARR_SZ; ++i) { + assert(arr[i] == i); + } + + dealloc(arr, shared); +} + +__device__ __host__ __noinline__ +void test_all() +{ + test(cuda::access_property::normal{}); + test(cuda::access_property::persisting{}); +} + +__global__ void test_kernel() { + test_all(); +} + +int main(int argc, char ** argv) +{ + + test_all(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp new file mode 100644 index 0000000000..abf5196763 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" +#define ARR_SZ 128 + +template +__device__ __host__ __noinline__ +void test(P ap, bool shared = false) +{ + + T* arr = alloc(shared); + + arr = cuda::associate_access_property(arr, ap); + + for (int i = 0; i < ARR_SZ; ++i) { + assert(arr[i] == i); + } + + dealloc(arr, shared); +} + +__device__ __host__ __noinline__ +void test_all() +{ + test(cuda::access_property::normal{}); + test(cuda::access_property::persisting{}); + test(cuda::access_property::streaming{}); + test(cuda::access_property::global{}); + test(cuda::access_property{}); + test(cuda::access_property::shared{}, true); +} + +__global__ void test_kernel() { + test_all(); +} + +int main(int argc, char ** argv) +{ + + test_all(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp new file mode 100644 index 0000000000..5189e8c5a7 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" +#define ARR_SZ 128 + +template +__device__ __host__ __noinline__ +void test(bool shared = false) +{ + T* arr = alloc(shared); + + cuda::discard_memory(arr, ARR_SZ); + + dealloc(arr, shared); +} + +__device__ __host__ __noinline__ +void test_all() +{ + test(); +} + +__global__ void test_kernel() { + test_all(); +} + +int main(int argc, char ** argv) +{ + + test_all(); + return 0; +} diff --git a/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp new file mode 100644 index 0000000000..a41a3ba95d --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp @@ -0,0 +1,120 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: pre-sm-70 +// UNSUPPORTED: !nvcc +// UNSUPPORTED: nvrtc +// UNSUPPORTED: c++98, c++03 + +#include "utils.h" +#include + +// TODO: global-shared +// TODO: read const +__device__ __host__ __noinline__ +void test_memcpy_async() { + size_t ARR_SZ = 1 << 10; + int* arr0 = nullptr; + int* arr1 = nullptr; + cuda::access_property ap(cuda::access_property::persisting{}); + cuda::barrier bar0, bar1, bar2, bar3; + init(&bar0, 1); + init(&bar1, 1); + init(&bar2, 1); + init(&bar3, 1); + +#ifdef __CUDA_ARCH__ + arr0 = (int*)malloc(ARR_SZ * sizeof(int)); + arr1 = (int*)malloc(ARR_SZ * sizeof(int)); + + auto group = cooperative_groups::this_thread_block(); +#else + assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int))); + assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int))); + assert_rt(cudaDeviceSynchronize()); +#endif + + cuda::annotated_ptr ann0{arr0, ap}; + cuda::annotated_ptr ann1{arr1, ap}; + //cuda::annotated_ptr cann0{arr0, ap}; + + for (size_t i = 0; i < ARR_SZ; ++i) { + arr0[i] = static_cast(i); + arr1[i] = 0; + } + + cuda::memcpy_async(ann1, ann0, ARR_SZ * sizeof(int), bar0); + //cuda::memcpy_async(ann1, cann0, ARR_SZ * sizeof(int), bar0); + bar0.arrive_and_wait(); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); + assert(arr1[i] == static_cast(i)); + } + + arr1[i] = 0; + } + + cuda::memcpy_async(arr1, ann0, ARR_SZ * sizeof(int), bar1); + //cuda::memcpy_async(arr1, cann0, ARR_SZ * sizeof(int), bar1); + bar1.arrive_and_wait(); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); + assert(arr1[i] == static_cast(i)); + } + + arr1[i] = 0; + } + +#ifdef __CUDA_ARCH__ + cuda::memcpy_async(group, ann1, ann0, ARR_SZ * sizeof(int), bar2); + //cuda::memcpy_async(group, ann1, cann0, ARR_SZ * sizeof(int), bar2); + bar2.arrive_and_wait(); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); + assert(arr1[i] == i); + } + + arr1[i] = 0; + } + + cuda::memcpy_async(group, arr1, ann0, ARR_SZ * sizeof(int), bar3); + //cuda::memcpy_async(group, arr1, cann0, ARR_SZ * sizeof(int), bar3); + bar3.arrive_and_wait(); + + for (size_t i = 0; i < ARR_SZ; ++i) { + if (arr1[i] != (int)i) { + DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i); + assert(arr1[i] == i); + } + + arr1[i] = 0; + } +#endif + +#ifdef __CUDA_ARCH__ + free(arr0); + free(arr1); +#else + assert_rt(cudaFree(arr0)); + assert_rt(cudaFree(arr1)); +#endif +} + +int main(int argc, char ** argv) +{ + test_memcpy_async(); + return 0; +} + diff --git a/.upstream-tests/test/cuda/annotated_ptr/utils.h b/.upstream-tests/test/cuda/annotated_ptr/utils.h new file mode 100644 index 0000000000..a72e409041 --- /dev/null +++ b/.upstream-tests/test/cuda/annotated_ptr/utils.h @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(_LIBCUDACXX_COMPILER_MSVC) +#pragma warning(disable: 4505) +#endif + +#include + +#if defined(DEBUG) + #define DPRINTF(...) { printf(__VA_ARGS__); } +#else + #define DPRINTF(...) do {} while (false) +#endif + +__device__ __host__ +void assert_rt_wrap(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { +#ifndef __CUDACC_RTC__ + printf("assert: %s %s %d\n", cudaGetErrorString(code), file, line); +#endif + assert(code == cudaSuccess); + } +} +#define assert_rt(ret) { assert_rt_wrap((ret), __FILE__, __LINE__); } + +template +__host__ __device__ constexpr bool unused(T...) {return true;} + +template +__device__ __host__ __noinline__ +T* alloc(bool shared = false) { + T* arr = nullptr; + +#ifdef __CUDA_ARCH__ + if (!shared) { + arr = (T*)malloc(N * sizeof(T)); + } else { + __shared__ T data[N]; + arr = data; + } +#else + assert_rt(cudaMallocManaged((void**) &arr, N * sizeof(T))); +#endif + + for (int i = 0; i < N; ++i) { + arr[i] = i; + } + return arr; +} + +template +__device__ __host__ __noinline__ +void dealloc(T* arr, bool shared) { +#ifdef __CUDA_ARCH__ + if (!shared) free(arr); +#else + assert_rt(cudaFree(arr)); +#endif +} diff --git a/.upstream-tests/utils/libcudacxx/test/config.py b/.upstream-tests/utils/libcudacxx/test/config.py index 337a9c30bf..0a2b6c2fb5 100644 --- a/.upstream-tests/utils/libcudacxx/test/config.py +++ b/.upstream-tests/utils/libcudacxx/test/config.py @@ -577,17 +577,20 @@ def configure_compile_flags(self): pre_sm_60 = True pre_sm_70 = True pre_sm_80 = True + pre_sm_90 = True if compute_archs and self.cxx.type == 'nvcc': pre_sm_32 = False pre_sm_60 = False pre_sm_70 = False pre_sm_80 = False + pre_sm_90 = False compute_archs = [int(a) for a in sorted(shlex.split(compute_archs))] for arch in compute_archs: if arch < 32: pre_sm_32 = True if arch < 60: pre_sm_60 = True if arch < 70: pre_sm_70 = True if arch < 80: pre_sm_80 = True + if arch < 90: pre_sm_90 = True arch_flag = '-gencode=arch=compute_{0},code=sm_{0}'.format(arch) self.cxx.compile_flags += [arch_flag] enable_compute_future = self.get_lit_conf('enable_compute_future') @@ -602,6 +605,8 @@ def configure_compile_flags(self): self.config.available_features.add("pre-sm-70") if pre_sm_80: self.config.available_features.add("pre-sm-80") + if pre_sm_90: + self.config.available_features.add("pre-sm-90") def configure_default_compile_flags(self): nvcc_host_compiler = self.get_lit_conf('nvcc_host_compiler') @@ -614,7 +619,7 @@ def configure_default_compile_flags(self): std = self.get_lit_conf('std') if not std: # Choose the newest possible language dialect if none is given. - possible_stds = ['c++2a', 'c++17', 'c++1z', 'c++14', 'c++11', + possible_stds = ['c++20', 'c++2a', 'c++17', 'c++1z', 'c++14', 'c++11', 'c++03'] if self.cxx.type == 'gcc': maj_v, _, _ = self.cxx.version diff --git a/.upstream-tests/utils/nvidia/linux/perform_tests.bash b/.upstream-tests/utils/nvidia/linux/perform_tests.bash index c2bdb1913c..4b6529222b 100755 --- a/.upstream-tests/utils/nvidia/linux/perform_tests.bash +++ b/.upstream-tests/utils/nvidia/linux/perform_tests.bash @@ -71,7 +71,7 @@ function section_separator { LIBCXX_LOG=$(mktemp) LIBCUDACXX_LOG=$(mktemp) -KNOWN_COMPUTE_ARCHS="35 50 52 53 60 61 62 70 72 75 80 86 87" +KNOWN_COMPUTE_ARCHS="35 50 52 53 60 61 62 70 72 75 80 86 87 90" function report_and_exit { # If any of the lines searched for below aren't present in the log files, the diff --git a/include/cuda/annotated_ptr b/include/cuda/annotated_ptr index 42412f01c2..fcb6e20d71 100644 --- a/include/cuda/annotated_ptr +++ b/include/cuda/annotated_ptr @@ -127,6 +127,7 @@ _Tp* associate_access_property(_Tp* __ptr, _Property __prop) { std::is_same<_Property, access_property::persisting>::value || std::is_same<_Property, access_property::streaming>::value || std::is_same<_Property, access_property::normal>::value || + std::is_same<_Property, access_property::global>::value || std::is_same<_Property, access_property::shared>::value , "property is not convertible to cuda::access_property"); return __detail_ap::__associate(__ptr, __prop); @@ -207,14 +208,11 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { __host__ __device__ pointer __get(bool __skip_prop = false, difference_type __n = 0) const { #ifdef __CUDA_ARCH__ - if (__skip_prop == false) { - return static_cast(this->__apply_prop(static_cast(__repr + __n))); - } else { - return __repr + __n; + if (!__skip_prop) { + return static_cast(this->__apply_prop(const_cast(static_cast(__repr + __n)))); } -#else - return __repr + __n; #endif + return __repr + __n; } __host__ __device__ pointer __offset(difference_type __n, bool __skip_prop = false) const { return __get(__skip_prop, __n); @@ -252,9 +250,10 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { template __host__ __device__ annotated_ptr(pointer __p, _RuntimeProperty __prop) - : __detail_ap::__annotated_ptr_base<_RuntimeProperty>(static_cast(__prop)), __repr(__p) + : __detail_ap::__annotated_ptr_base<_Property>(static_cast(access_property(__prop))), __repr(__p) { - static_assert(std::is_same<_Property, access_property>::value, "This method requires Property=cuda::access_property"); + static_assert(std::is_same<_Property, access_property>::value, + "This method requires annotated_ptr"); static_assert(std::is_same<_RuntimeProperty, access_property::global>::value || std::is_same<_RuntimeProperty, access_property::normal>::value || std::is_same<_RuntimeProperty, access_property::streaming>::value || @@ -268,14 +267,7 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { } template - __host__ __device__ annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other) - : annotated_ptr<_TTp, _Prop>(__other.prop_), __repr(__other.__repr) - { - static_assert(std::is_assignable::value, "pointer must be assignable from other pointer"); - static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) || - std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space"); - // note: precondition "__other.__rep must be compatible with _Property" currently always holds - } + __host__ __device__ annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other); __host__ __device__ constexpr explicit operator bool() const noexcept { return __repr != nullptr; @@ -285,8 +277,24 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; return __is_shared ? __repr : &(*annotated_ptr(__repr)); } + + __host__ __device__ _Property __property() const noexcept { + return this->__get_property(); + } }; + +template +template +__host__ __device__ annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other) + : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()), __repr(__other.get()) +{ + static_assert(std::is_assignable::value, "pointer must be assignable from other pointer"); + static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) || + std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space"); + // note: precondition "__other.__rep must be compatible with _Property" currently always holds +} + template __host__ __device__ void memcpy_async(_Dst* __dst, diff --git a/include/cuda/std/detail/__annotated_ptr b/include/cuda/std/detail/__annotated_ptr index 605af94048..4c7ad40806 100644 --- a/include/cuda/std/detail/__annotated_ptr +++ b/include/cuda/std/detail/__annotated_ptr @@ -105,7 +105,9 @@ namespace __detail_ap { __host__ __device__ _Type* __associate(_Type* __ptr, _Property __prop) { #ifdef __CUDA_ARCH__ - return static_cast<_Type*>(__associate_descriptor(__associate_address_space(__ptr, __prop), __prop)); + return static_cast<_Type*>(__associate_descriptor( + __associate_address_space(const_cast(static_cast(__ptr)), __prop), + __prop)); #else return __ptr; #endif @@ -125,9 +127,13 @@ namespace __detail_ap { constexpr __annotated_ptr_base() noexcept = default; constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + __host__ __device__ constexpr __annotated_ptr_base(access_property::shared) noexcept {} inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, access_property::shared{}); } + __host__ __device__ constexpr access_property::shared __get_property() const noexcept { + return access_property::shared{}; + } }; template<> @@ -138,9 +144,13 @@ namespace __detail_ap { constexpr __annotated_ptr_base() noexcept = default; constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + __host__ __device__ constexpr __annotated_ptr_base(access_property::global) noexcept {} inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, access_property::global{}); } + __host__ __device__ constexpr access_property::global __get_property() const noexcept { + return access_property::global{}; + } }; template<> @@ -151,9 +161,13 @@ namespace __detail_ap { constexpr __annotated_ptr_base() noexcept = default; constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + __host__ __device__ constexpr __annotated_ptr_base(access_property::normal) noexcept {} inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, access_property::normal{}); } + __host__ __device__ constexpr access_property::normal __get_property() const noexcept { + return access_property::normal{}; + } }; template<> @@ -164,9 +178,13 @@ namespace __detail_ap { constexpr __annotated_ptr_base() noexcept = default; constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + __host__ __device__ constexpr __annotated_ptr_base(access_property::persisting) noexcept {} inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, access_property::persisting{}); } + __host__ __device__ constexpr access_property::persisting __get_property() const noexcept { + return access_property::persisting{}; + } }; template<> @@ -177,9 +195,13 @@ namespace __detail_ap { constexpr __annotated_ptr_base() noexcept = default; constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + __host__ __device__ constexpr __annotated_ptr_base(access_property::streaming) noexcept {} inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, access_property::streaming{}); } + __host__ __device__ constexpr access_property::streaming __get_property() const noexcept { + return access_property::streaming{}; + } }; template<> @@ -189,11 +211,16 @@ namespace __detail_ap { __host__ __device__ constexpr __annotated_ptr_base() noexcept : __prop(access_property()) {} __host__ __device__ constexpr __annotated_ptr_base(std::uint64_t __property) noexcept : __prop(__property) {} + __host__ __device__ constexpr __annotated_ptr_base(access_property __property) noexcept + : __annotated_ptr_base(static_cast(__property)) {} constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; inline __device__ void* __apply_prop(void* __p) const { return __associate(__p, __prop); } + __host__ __device__ access_property __get_property() const noexcept { + return reinterpret_cast(const_cast(__prop)); + } }; } // namespace __detail_ap diff --git a/include/nv/detail/__target_macros b/include/nv/detail/__target_macros index 11f213ae7e..9013291805 100644 --- a/include/nv/detail/__target_macros +++ b/include/nv/detail/__target_macros @@ -30,6 +30,7 @@ # define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80 # define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86 # define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87 +# define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 # define _NV_TARGET_ARCH_TO_SM_350 35 # define _NV_TARGET_ARCH_TO_SM_370 37 @@ -45,6 +46,7 @@ # define _NV_TARGET_ARCH_TO_SM_800 80 # define _NV_TARGET_ARCH_TO_SM_860 86 # define _NV_TARGET_ARCH_TO_SM_870 87 +# define _NV_TARGET_ARCH_TO_SM_900 90 #if defined(_NV_COMPILER_NVCXX) @@ -62,6 +64,7 @@ # define _NV_TARGET_VAL_SM_80 nv::target::sm_80 # define _NV_TARGET_VAL_SM_86 nv::target::sm_86 # define _NV_TARGET_VAL_SM_87 nv::target::sm_87 +# define _NV_TARGET_VAL_SM_90 nv::target::sm_90 # define _NV_TARGET___NV_IS_HOST nv::target::is_host # define _NV_TARGET___NV_IS_DEVICE nv::target::is_device @@ -94,6 +97,7 @@ # define _NV_TARGET_VAL_SM_80 800 # define _NV_TARGET_VAL_SM_86 860 # define _NV_TARGET_VAL_SM_87 870 +# define _NV_TARGET_VAL_SM_90 900 # if defined(__CUDA_ARCH__) # define _NV_TARGET_VAL __CUDA_ARCH__ @@ -138,6 +142,7 @@ # define _NV_TARGET_VAL_SM_80 800 # define _NV_TARGET_VAL_SM_86 860 # define _NV_TARGET_VAL_SM_87 870 +# define _NV_TARGET_VAL_SM_90 900 # define _NV_TARGET_VAL 0 @@ -165,6 +170,7 @@ #define _NV_TARGET___NV_PROVIDES_SM_80 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_80)) #define _NV_TARGET___NV_PROVIDES_SM_86 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_86)) #define _NV_TARGET___NV_PROVIDES_SM_87 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_87)) +#define _NV_TARGET___NV_PROVIDES_SM_90 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_90)) #define _NV_TARGET___NV_IS_EXACTLY_SM_35 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_35)) #define _NV_TARGET___NV_IS_EXACTLY_SM_37 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_37)) @@ -180,6 +186,7 @@ #define _NV_TARGET___NV_IS_EXACTLY_SM_80 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_80)) #define _NV_TARGET___NV_IS_EXACTLY_SM_86 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_86)) #define _NV_TARGET___NV_IS_EXACTLY_SM_87 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_87)) +#define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90)) #define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 #define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 @@ -195,6 +202,7 @@ #define NV_PROVIDES_SM_80 __NV_PROVIDES_SM_80 #define NV_PROVIDES_SM_86 __NV_PROVIDES_SM_86 #define NV_PROVIDES_SM_87 __NV_PROVIDES_SM_87 +#define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37 @@ -210,6 +218,7 @@ #define NV_IS_EXACTLY_SM_80 __NV_IS_EXACTLY_SM_80 #define NV_IS_EXACTLY_SM_86 __NV_IS_EXACTLY_SM_86 #define NV_IS_EXACTLY_SM_87 __NV_IS_EXACTLY_SM_87 +#define NV_IS_EXACTLY_SM_90 __NV_IS_EXACTLY_SM_90 #define NV_IS_HOST __NV_IS_HOST #define NV_IS_DEVICE __NV_IS_DEVICE @@ -315,6 +324,12 @@ # define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_87 0 # endif +# if (_NV_TARGET___NV_IS_EXACTLY_SM_90) +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_90 1 +# else +# define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_90 0 +# endif + # if (_NV_TARGET_IS_HOST) # define _NV_TARGET_BOOL___NV_IS_HOST 1 # define _NV_TARGET_BOOL___NV_IS_DEVICE 0 @@ -412,6 +427,12 @@ # define _NV_TARGET_BOOL___NV_PROVIDES_SM_87 0 # endif +# if (_NV_TARGET___NV_PROVIDES_SM_90) +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_90 1 +# else +# define _NV_TARGET_BOOL___NV_PROVIDES_SM_90 0 +# endif + # define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond # define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) @@ -452,6 +473,7 @@ # define _NV_TARGET_DISPATCH_HANDLE28(q, fn, ...) _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE26(__VA_ARGS__)) # define _NV_TARGET_DISPATCH_HANDLE30(q, fn, ...) _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE28(__VA_ARGS__)) # define _NV_TARGET_DISPATCH_HANDLE32(q, fn, ...) _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE30(__VA_ARGS__)) +# define _NV_TARGET_DISPATCH_HANDLE34(q, fn, ...) _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE32(__VA_ARGS__)) # define _NV_TARGET_DISPATCH(...) _NV_BLOCK_EXPAND(_NV_DISPATCH_N_ARY(_NV_TARGET_DISPATCH_HANDLE, __VA_ARGS__)) diff --git a/include/nv/target b/include/nv/target index 8622cd2605..9788b6ad36 100644 --- a/include/nv/target +++ b/include/nv/target @@ -67,12 +67,14 @@ namespace nv { constexpr base_int_t sm_80_bit = 1 << 12; constexpr base_int_t sm_86_bit = 1 << 13; constexpr base_int_t sm_87_bit = 1 << 14; + constexpr base_int_t sm_90_bit = 1 << 15; constexpr base_int_t all_devices = sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit | sm_75_bit | - sm_80_bit | sm_86_bit | sm_87_bit; + sm_80_bit | sm_86_bit | sm_87_bit | + sm_90_bit; // Store a set of targets as a set of bits struct _NV_BITSET_ATTRIBUTE target_description { @@ -88,6 +90,7 @@ namespace nv { sm_60 = 60, sm_61 = 61, sm_62 = 62, sm_70 = 70, sm_72 = 72, sm_75 = 75, sm_80 = 80, sm_86 = 86, sm_87 = 87, + sm_90 = 90, }; _NV_FUNCTION_ANNOTATION constexpr base_int_t toint(sm_selector a) { @@ -108,11 +111,13 @@ namespace nv { toint(a) == 75 ? sm_75_bit : toint(a) == 80 ? sm_80_bit : toint(a) == 86 ? sm_86_bit : - toint(a) == 87 ? sm_87_bit : 0; + toint(a) == 87 ? sm_87_bit : + toint(a) == 90 ? sm_90_bit : 0; } _NV_FUNCTION_ANNOTATION constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 87 ? sm_87_bit : + return toint(a) >= 90 ? sm_90_bit : + toint(a) >= 87 ? sm_87_bit : toint(a) >= 86 ? sm_86_bit : toint(a) >= 80 ? sm_80_bit : toint(a) >= 75 ? sm_75_bit : @@ -188,6 +193,7 @@ namespace nv { constexpr sm_selector sm_80 = sm_selector::sm_80; constexpr sm_selector sm_86 = sm_selector::sm_86; constexpr sm_selector sm_87 = sm_selector::sm_87; + constexpr sm_selector sm_90 = sm_selector::sm_90; using detail::is_exactly; using detail::provides;