From 55dd2c99346baa3a14949a0f7e9c41865e434eda Mon Sep 17 00:00:00 2001
From: Wesley Maxey <wesley.maxey@gmail.com>
Date: Thu, 6 Oct 2022 15:49:49 -0700
Subject: [PATCH] Merge internal changes from CTK 11.8

- Fix constness with annotated_ptr
- Adds SM_90 to <nv/target>
- Adds SM_90 to CMake and testing scripts
---
 .upstream-tests/test/CMakeLists.txt           |   2 +-
 .../annotated_ptr/access_property.pass.cpp    | 121 +++++
 .../access_property_explicit.fail.cpp         |  30 ++
 .../test/cuda/annotated_ptr/annotated_ptr.h   | 440 ------------------
 .../cuda/annotated_ptr/annotated_ptr.pass.cpp | 174 ++++++-
 .../annotated_ptr_bench.pass.cpp              | 164 +++++++
 .../annotated_ptr_constructors.pass.cpp       |  86 ++++
 .../annotated_ptr/annotated_ptr_ctor.fail.cpp |  52 +++
 .../annotated_ptr_host_constructor.pass.cpp   |  28 ++
 .../annotated_ptr_host_shared.pass.cpp        |   6 +-
 .../annotated_ptr_neg_tests.pass.cpp          |  20 +-
 .../annotated_ptr_shmem.pass.cpp              |  56 +++
 .../apply_access_property.pass.cpp            |  49 ++
 .../associate_access_property.pass.cpp        |  54 +++
 .../annotated_ptr/discard_memory.pass.cpp     |  44 ++
 .../cuda/annotated_ptr/memcpy_async.pass.cpp  | 120 +++++
 .../test/cuda/annotated_ptr/utils.h           |  66 +++
 .../utils/libcudacxx/test/config.py           |   7 +-
 .../utils/nvidia/linux/perform_tests.bash     |   2 +-
 include/cuda/annotated_ptr                    |  40 +-
 include/cuda/std/detail/__annotated_ptr       |  29 +-
 include/nv/detail/__target_macros             |  22 +
 include/nv/target                             |  12 +-
 23 files changed, 1126 insertions(+), 498 deletions(-)
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp
 delete mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp
 create mode 100644 .upstream-tests/test/cuda/annotated_ptr/utils.h

diff --git a/.upstream-tests/test/CMakeLists.txt b/.upstream-tests/test/CMakeLists.txt
index 838b222748..cae71093b1 100644
--- a/.upstream-tests/test/CMakeLists.txt
+++ b/.upstream-tests/test/CMakeLists.txt
@@ -6,7 +6,7 @@ macro(pythonize_bool var)
   endif()
 endmacro()
 
-set(LIBCUDACXX_HIGHEST_COMPUTE_ARCH 80)
+set(LIBCUDACXX_HIGHEST_COMPUTE_ARCH 90)
 
 foreach (COMPUTE_ARCH ${LIBCUDACXX_COMPUTE_ARCHS})
   set(_compute_message "${_compute_message} sm_${COMPUTE_ARCH}")
diff --git a/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp
new file mode 100644
index 0000000000..809598ef96
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/access_property.pass.cpp
@@ -0,0 +1,121 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+
+__device__ __host__ __noinline__
+void test_access_property_interleave() {
+    const uint64_t INTERLEAVE_NORMAL           = uint64_t{0x10F0000000000000};
+    const uint64_t INTERLEAVE_NORMAL_DEMOTE    = uint64_t{0x16F0000000000000};
+    const uint64_t INTERLEAVE_PERSISTING       = uint64_t{0x14F0000000000000};
+    const uint64_t INTERLEAVE_STREAMING        = uint64_t{0x12F0000000000000};
+    cuda::access_property ap(cuda::access_property::persisting{});
+    cuda::access_property ap2;
+
+    assert(INTERLEAVE_PERSISTING == static_cast<uint64_t>(ap));
+    assert(static_cast<uint64_t>(ap2) == INTERLEAVE_NORMAL);
+
+    ap = cuda::access_property(cuda::access_property::normal());
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL_DEMOTE);
+
+    ap = cuda::access_property(cuda::access_property::streaming());
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_STREAMING);
+
+    ap = cuda::access_property(cuda::access_property::normal(), 2.0f);
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL_DEMOTE);
+}
+
+__device__ __host__ __noinline__
+void test_access_property_block() {
+    //assuming ptr address is 0;
+    const size_t TOTAL_BYTES = 0xFFFFFFFF;
+    const size_t HIT_BYTES = 0xFFFFFFFF;
+    const size_t BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES = size_t{0x1DD00FE000000000};
+    const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000};
+
+    cuda::access_property ap(0x0, HIT_BYTES, TOTAL_BYTES, cuda::access_property::persisting{}, cuda::access_property::streaming{});
+    assert(static_cast<uint64_t>(ap) == BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES);
+
+    ap = cuda::access_property(0x0, 0xFFFFFFFF, 0xFFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{});
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
+
+    ap = cuda::access_property(0x0, 0xFFFFFFFFF, 0xFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{});
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
+
+    ap = cuda::access_property(0x0, 0, 0, cuda::access_property::persisting{}, cuda::access_property::streaming{});
+    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
+
+    for (size_t ptr = 1; ptr < size_t{0xFFFFFFFF}; ptr <<= 1) {
+        for (size_t hit = 1; hit < size_t{0xFFFFFFFF}; hit <<= 1) {
+            ap = cuda::access_property((void*)ptr, hit, hit, cuda::access_property::persisting{}, cuda::access_property::streaming{});
+            DPRINTF("Block encoding PTR:%p, hit:%p, block encoding:%p\n", ptr, hit, static_cast<uint64_t>(ap));
+        }
+    }
+}
+
+template<typename T>
+__host__ __device__ __noinline__ void test_global_implicit_property(T ap, cudaAccessProperty cp) {
+  // Test implicit conversions
+  cudaAccessProperty v = ap;
+  assert(cp == v);
+
+  // Test default, copy constructor, and copy-assignent
+  cuda::access_property o(ap);
+  cuda::access_property d;
+  d = ap;
+
+  // Test explicit conversion to i64
+  std::uint64_t x = (std::uint64_t)o;
+  std::uint64_t y = (std::uint64_t)d;
+  assert(x == y);
+}
+
+__host__ __device__ __noinline__ void test_global() {
+  cuda::access_property o(cuda::access_property::global{});
+  std::uint64_t x = (std::uint64_t)o;
+  unused(x);
+}
+
+__host__ __device__ __noinline__ void test_shared() {
+  (void)cuda::access_property::shared{};
+}
+
+static_assert(sizeof(cuda::access_property::shared) == 1, "");
+static_assert(sizeof(cuda::access_property::global) == 1, "");
+static_assert(sizeof(cuda::access_property::persisting) == 1, "");
+static_assert(sizeof(cuda::access_property::normal) == 1, "");
+static_assert(sizeof(cuda::access_property::streaming) == 1, "");
+static_assert(sizeof(cuda::access_property) == 8, "");
+
+static_assert(alignof(cuda::access_property::shared) == 1, "");
+static_assert(alignof(cuda::access_property::global) == 1, "");
+static_assert(alignof(cuda::access_property::persisting) == 1, "");
+static_assert(alignof(cuda::access_property::normal) == 1, "");
+static_assert(alignof(cuda::access_property::streaming) == 1, "");
+static_assert(alignof(cuda::access_property) == 8, "");
+
+int main(int argc, char ** argv)
+{
+  test_access_property_interleave();
+  test_access_property_block();
+
+  test_global_implicit_property(cuda::access_property::normal{}, cudaAccessProperty::cudaAccessPropertyNormal);
+  test_global_implicit_property(cuda::access_property::streaming{}, cudaAccessProperty::cudaAccessPropertyStreaming);
+  test_global_implicit_property(cuda::access_property::persisting{}, cudaAccessProperty::cudaAccessPropertyPersisting);
+
+  test_global();
+  test_shared();
+
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp b/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp
new file mode 100644
index 0000000000..0f18d671eb
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/access_property_explicit.fail.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+
+__host__ __device__ __noinline__ void test_access_property_fail() {
+  cuda::access_property o = cuda::access_property::normal{};
+  // Test implicit conversion fails
+  std::uint64_t x;
+  x = o;
+  unused(o);
+}
+
+int main(int argc, char ** argv)
+{
+  test_access_property_fail();
+  return 0;
+}
+
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h
deleted file mode 100644
index 386a1a323a..0000000000
--- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.h
+++ /dev/null
@@ -1,440 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++, the C++ Standard Library for your entire system,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: pre-sm-70
-// UNSUPPORTED: c++98, c++03
-
-#include <cuda/annotated_ptr>
-#include <cooperative_groups.h>
-
-#ifndef __CUDA_ARCH__
-#include <array>
-#endif
-
-#if defined(DEBUG)
-    #define DPRINTF(...) { printf(__VA_ARGS__); }
-#else
-    #define DPRINTF(...) do {} while (false)
-#endif
-
-#if defined(_LIBCUDACXX_COMPILER_MSVC)
-#pragma warning(disable: 4505)
-#endif
-
-template <typename ... T>
-__host__ __device__ constexpr bool unused(T...) {return true;}
-
-// ************************ Device code ***************************************
-static __device__
-void shared_mem_test_dev() {
-    __shared__ int smem[42];
-    smem[10] = 42;
-    assert(smem[10] == 42);
-
-    cuda::annotated_ptr<int, cuda::access_property::shared> p{smem + 10};
-
-    assert(*p == 42);
-}
-
-static __global__
-void shared_mem_test() {
-    shared_mem_test_dev();
-};
-
-static __device__
-void annotated_ptr_timing_dev(int * in, int * out) {
-    cuda::access_property ap(cuda::access_property::persisting{});
-    // Retrieve global id
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    cuda::annotated_ptr<int, cuda::access_property> in_ann{in, ap};
-    cuda::annotated_ptr<int, cuda::access_property> out_ann{out, ap};
-
-    DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i);
-    DPRINTF("&out[i]:%p = &in_ann[i]:%p for i = %d\n", &out_ann[i], &in_ann[i], i);
-
-    out_ann[i] = in_ann[i];
-};
-
-static __global__
-void annotated_ptr_timing(int * in, int * out) {
-    annotated_ptr_timing_dev(in, out);
-}
-
-static __device__
-void ptr_timing_dev(int * in, int * out) {
-    // Retrieve global id
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i);
-    out[i] = in[i];
-};
-
-static __global__
-void ptr_timing(int * in, int * out) {
-    ptr_timing_dev(in, out);
-};
-
-// ************************ Host/device code ***************************************
-__device__ __host__
-void assert_rt_wrap(cudaError_t code, const char *file, int line) {
-    if (code != cudaSuccess) {
-#ifndef __CUDACC_RTC__
-        printf("assert: %s %s %d\n", cudaGetErrorString(code), file, line);
-#endif
-        assert(code == cudaSuccess);
-    }
-}
-#define assert_rt(ret) { assert_rt_wrap((ret), __FILE__, __LINE__); }
-
-__device__ __host__ __noinline__
-void test_access_property_interleave() {
-    (void)cuda::access_property::shared{};
-    (void)cuda::access_property::global{};
-    assert(cuda::access_property::persisting{} == cudaAccessPropertyPersisting);
-    assert(cuda::access_property::streaming{} == cudaAccessPropertyStreaming);
-    assert(cuda::access_property::normal{} == cudaAccessPropertyNormal);
-
-    const uint64_t INTERLEAVE_NORMAL           = uint64_t{0x10F0000000000000};
-    const uint64_t INTERLEAVE_NORMAL_DEMOTE    = uint64_t{0x16F0000000000000};
-    const uint64_t INTERLEAVE_PERSISTING       = uint64_t{0x14F0000000000000};
-    const uint64_t INTERLEAVE_STREAMING        = uint64_t{0x12F0000000000000};
-    cuda::access_property ap(cuda::access_property::persisting{});
-    cuda::access_property ap2;
-
-    assert(INTERLEAVE_PERSISTING == static_cast<uint64_t>(ap));
-    assert(static_cast<uint64_t>(ap2) == INTERLEAVE_NORMAL);
-
-    ap = cuda::access_property(cuda::access_property::normal());
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL_DEMOTE);
-
-    ap = cuda::access_property(cuda::access_property::streaming());
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_STREAMING);
-
-    ap = cuda::access_property(cuda::access_property::normal(), 2.0f);
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL_DEMOTE);
-}
-
-__device__ __host__ __noinline__
-void test_access_property_block() {
-    //assuming ptr address is 0;
-    const size_t TOTAL_BYTES = 0xFFFFFFFF;
-    const size_t HIT_BYTES = 0xFFFFFFFF;
-    const size_t BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES = size_t{0x1DD00FE000000000};
-    const uint64_t INTERLEAVE_NORMAL = uint64_t{0x10F0000000000000};
-
-    cuda::access_property ap(0x0, HIT_BYTES, TOTAL_BYTES, cuda::access_property::persisting{}, cuda::access_property::streaming{});
-    assert(static_cast<uint64_t>(ap) == BLOCK_0ADDR_PERSISTHIT_STREAMISS_MAXBYTES);
-
-    ap = cuda::access_property(0x0, 0xFFFFFFFF, 0xFFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{});
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
-
-    ap = cuda::access_property(0x0, 0xFFFFFFFFF, 0xFFFFFFFF, cuda::access_property::persisting{}, cuda::access_property::streaming{});
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
-
-    ap = cuda::access_property(0x0, 0, 0, cuda::access_property::persisting{}, cuda::access_property::streaming{});
-    assert(static_cast<uint64_t>(ap) == INTERLEAVE_NORMAL);
-
-    for (size_t ptr = 1; ptr < size_t{0xFFFFFFFF}; ptr <<= 1) {
-        for (size_t hit = 1; hit < size_t{0xFFFFFFFF}; hit <<= 1) {
-            ap = cuda::access_property((void*)ptr, hit, hit, cuda::access_property::persisting{}, cuda::access_property::streaming{});
-            DPRINTF("Block encoding PTR:%p, hit:%p, block encoding:%p\n", ptr, hit, static_cast<uint64_t>(ap));
-        }
-    }
-}
-
-__device__ __host__ __noinline__
-void test_access_property_functions() {
-    size_t ARR_SZ = 1 << 10;
-    int* arr0 = nullptr;
-    int* arr1 = nullptr;
-    cuda::access_property ap(cuda::access_property::persisting{});
-    cuda::access_property a;
-    unused(a);
-    cuda::access_property as(cuda::access_property::streaming{});
-
-#ifdef __CUDA_ARCH__
-    arr0 = (int*)malloc(ARR_SZ * sizeof(int));
-    arr1 = (int*)malloc(ARR_SZ * sizeof(int));
-#else
-    assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int)));
-    assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int)));
-    assert_rt(cudaDeviceSynchronize());
-#endif
-
-    cuda::discard_memory(arr0, ARR_SZ);
-    arr0 = cuda::associate_access_property(arr0, ap);
-    arr1 = cuda::associate_access_property(arr1, as);
-    cuda::apply_access_property(arr0, ARR_SZ, cuda::access_property::persisting{});
-    cuda::apply_access_property(arr1, ARR_SZ, cuda::access_property::normal{});
-
-#ifdef __CUDA_ARCH__
-    free(arr0);
-    free(arr1);
-#else
-    assert_rt(cudaFree(arr0));
-    assert_rt(cudaFree(arr1));
-#endif
-
-}
-
-__device__ __host__ __noinline__
-void test_annotated_ptr_basic() {
-    cuda::access_property ap(cuda::access_property::persisting{});
-    static const size_t ARR_SZ = 1 << 10;
-    int* array0 = new int[ARR_SZ];
-    int* array1 = new int[ARR_SZ];
-    cuda::annotated_ptr<int, cuda::access_property> array_anno_ptr{array0, ap};
-    cuda::annotated_ptr<int, cuda::access_property> array0_anno_ptr0{array0, ap};
-    cuda::annotated_ptr<int, cuda::access_property> array0_anno_ptr1 = array0_anno_ptr0;
-    cuda::annotated_ptr<int, cuda::access_property> array0_anno_ptr2{array0_anno_ptr0};
-    cuda::annotated_ptr<int, cuda::access_property> array1_anno_ptr{array1, ap};
-#ifndef __CUDA_ARCH__
-    cuda::annotated_ptr<int, cuda::access_property::shared> shared_ptr1;
-    cuda::annotated_ptr<int, cuda::access_property::shared> shared_ptr2;
-
-    shared_ptr1 = shared_ptr2;
-    unused(shared_ptr1);
-
-    //Check on host the arrays through annotated_ptr ops
-    std::array<int, 3> a1{3, 2, 1};
-    cuda::annotated_ptr<std::array<int, 3>, cuda::access_property> anno_ptr{&a1, ap};
-    assert(anno_ptr->at(0) == 3);
-#endif
-
-    //Fill the arrays
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        array0[i] = static_cast<int>(i);
-        array1[i] = static_cast<int>(ARR_SZ - i);
-    }
-
-    assert((bool)array0_anno_ptr0 == true);
-    assert(array0_anno_ptr0.get() == array0);
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        assert(array0_anno_ptr0[i] == static_cast<int>(i));
-        assert(array0_anno_ptr2[i] == static_cast<int>(i));
-        assert(&array0[i] == &array0_anno_ptr0[i]);
-        assert(&array0[i] == &array0_anno_ptr1[i]);
-    }
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        assert(array1_anno_ptr[i] == array1[i]);
-    }
-
-    delete[] array0;
-    delete[] array1;
-}
-
-__device__ __host__ __noinline__
-void test_annotated_ptr_launch_kernel() {
-#ifndef __CUDA_ARCH__
-    static const size_t ARR_SZ          = 1 << 22;
-    static const size_t THREAD_CNT      = 128;
-    static const size_t BLOCK_CNT       = ARR_SZ / THREAD_CNT;
-    const dim3 threads(THREAD_CNT, 1, 1), blocks(BLOCK_CNT, 1, 1);
-    cudaEvent_t start, stop;
-#else
-    static const size_t ARR_SZ     = 1 << 10;
-#endif
-    int* arr0 = nullptr;
-    int* arr1 = nullptr;
-    float annotated_time = 0.f, pointer_time = 0.f;
-
-#ifdef __CUDA_ARCH__
-    arr0 = (int*)malloc(ARR_SZ * sizeof(int));
-    arr1 = (int*)malloc(ARR_SZ * sizeof(int));
-#else
-    assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int)));
-    assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int)));
-    assert_rt(cudaDeviceSynchronize());
-#endif
-
-#ifdef __CUDA_ARCH__
-    shared_mem_test_dev();
-#else
-    shared_mem_test<<<1, 1, 0, 0>>>();
-    assert_rt(cudaStreamSynchronize(0));
-#endif
-
-
-#ifdef __CUDA_ARCH__
-    ptr_timing_dev(arr0, arr1);
-#else
-    ptr_timing<<<blocks,threads>>>(arr0, arr1);
-    assert_rt(cudaDeviceSynchronize());
-#endif
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        arr0[i] = static_cast<int>(i);
-        arr1[i] = 0;
-    }
-
-#ifdef __CUDA_ARCH__
-    ptr_timing_dev(arr0, arr1);
-#else
-    assert_rt(cudaDeviceSynchronize());
-    assert_rt(cudaEventCreate(&start));
-    assert_rt(cudaEventCreate(&stop));
-    assert_rt(cudaEventRecord(start));
-    ptr_timing<<<blocks,threads>>>(arr0, arr1);
-    assert_rt(cudaEventRecord(stop));
-    assert_rt(cudaEventSynchronize(stop));
-    assert_rt(cudaEventElapsedTime(&pointer_time, start, stop));
-    assert_rt(cudaEventDestroy(start));
-    assert_rt(cudaEventDestroy(stop));
-    assert_rt(cudaDeviceSynchronize());
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i);
-            assert(arr1[i] == static_cast<int>(i));
-        }
-
-        arr1[i] = 0;
-    }
-#endif
-
-#ifdef __CUDA_ARCH__
-    annotated_ptr_timing_dev(arr0, arr1);
-#else
-    assert_rt(cudaDeviceSynchronize());
-    annotated_ptr_timing<<<blocks,threads>>>(arr0, arr1);
-    assert_rt(cudaDeviceSynchronize());
-#endif
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        arr0[i] = static_cast<int>(i);
-        arr1[i] = 0;
-    }
-
-#ifdef __CUDA_ARCH__
-    annotated_ptr_timing_dev(arr0, arr1);
-#else
-    assert_rt(cudaDeviceSynchronize());
-    assert_rt(cudaEventCreate(&start));
-    assert_rt(cudaEventCreate(&stop));
-    assert_rt(cudaEventRecord(start));
-    annotated_ptr_timing<<<blocks,threads>>>(arr0, arr1);
-    assert_rt(cudaEventRecord(stop));
-    assert_rt(cudaEventSynchronize(stop));
-    assert_rt(cudaEventElapsedTime(&annotated_time, start, stop));
-    assert_rt(cudaEventDestroy(start));
-    assert_rt(cudaEventDestroy(stop));
-    assert_rt(cudaDeviceSynchronize());
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i);
-            assert(arr1[i] == static_cast<int>(i));
-        }
-
-        arr1[i] = 0;
-    }
-#endif
-
-#ifdef __CUDA_ARCH__
-    free(arr0);
-    free(arr1);
-#else
-    assert_rt(cudaFree(arr0));
-    assert_rt(cudaFree(arr1));
-#endif
-    printf("array(ms):%f, arrotated_ptr(ms):%f\n",
-            pointer_time, annotated_time);
-}
-
-__device__ __host__ __noinline__
-void test_annotated_ptr_functions() {
-    size_t ARR_SZ = 1 << 10;
-    int* arr0 = nullptr;
-    int* arr1 = nullptr;
-    cuda::access_property ap(cuda::access_property::persisting{});
-    cuda::barrier<cuda::thread_scope_system> bar0, bar1, bar2, bar3;
-    init(&bar0, 1);
-    init(&bar1, 1);
-    init(&bar2, 1);
-    init(&bar3, 1);
-
-#ifdef __CUDA_ARCH__
-    arr0 = (int*)malloc(ARR_SZ * sizeof(int));
-    arr1 = (int*)malloc(ARR_SZ * sizeof(int));
-
-    auto group = cooperative_groups::this_thread_block();
-#else
-    assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int)));
-    assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int)));
-    assert_rt(cudaDeviceSynchronize());
-#endif
-
-    cuda::annotated_ptr<int, cuda::access_property> ann0{arr0, ap};
-    cuda::annotated_ptr<int, cuda::access_property> ann1{arr1, ap};
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        arr0[i] = static_cast<int>(i);
-        arr1[i] = 0;
-    }
-
-    cuda::memcpy_async(ann1, ann0, ARR_SZ * sizeof(int), bar0);
-    bar0.arrive_and_wait();
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
-            assert(arr1[i] == static_cast<int>(i));
-        }
-
-        arr1[i] = 0;
-    }
-
-    cuda::memcpy_async(arr1, ann0, ARR_SZ * sizeof(int), bar1);
-    bar1.arrive_and_wait();
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
-            assert(arr1[i] == static_cast<int>(i));
-        }
-
-        arr1[i] = 0;
-    }
-
-#ifdef __CUDA_ARCH__
-    cuda::memcpy_async(group, ann1, ann0, ARR_SZ * sizeof(int), bar2);
-    bar2.arrive_and_wait();
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
-            assert(arr1[i] == i);
-        }
-
-        arr1[i] = 0;
-    }
-
-    cuda::memcpy_async(group, arr1, ann0, ARR_SZ * sizeof(int), bar3);
-    bar3.arrive_and_wait();
-
-    for (size_t i = 0; i < ARR_SZ; ++i) {
-        if (arr1[i] != (int)i) {
-            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
-            assert(arr1[i] == i);
-        }
-
-        arr1[i] = 0;
-    }
-#endif
-
-#ifdef __CUDA_ARCH__
-    free(arr0);
-    free(arr1);
-#else
-    assert_rt(cudaFree(arr0));
-    assert_rt(cudaFree(arr1));
-#endif
-}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp
index 11a9185ee5..2b871f77c6 100644
--- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr.pass.cpp
@@ -8,30 +8,166 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
 // UNSUPPORTED: c++98, c++03
 
-#include "annotated_ptr.h"
+#include "utils.h"
 
-static_assert(sizeof(cuda::annotated_ptr<int, cuda::access_property::shared>) == sizeof(uintptr_t), "annotated_ptr<T, shared> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<char, cuda::access_property::shared>) == sizeof(uintptr_t), "annotated_ptr<T, shared> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::shared>) == sizeof(uintptr_t), "annotated_ptr<T, shared> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<int, cuda::access_property::global>) == sizeof(uintptr_t), "annotated_ptr<T, global> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<char, cuda::access_property::global>) == sizeof(uintptr_t), "annotated_ptr<T, global> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::global>) == sizeof(uintptr_t), "annotated_ptr<T, global> must be pointer size");
-static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property>) == 2*sizeof(uintptr_t), "annotated_ptr<T,access_property> must be 2 * pointer size");
+static_assert(sizeof(cuda::annotated_ptr<int, cuda::access_property::global>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<char, cuda::access_property::global>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::global>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::persisting>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::normal>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::streaming>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, global> must be pointer size");
 
-static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::shared>) == alignof(int*), "annotated_ptr must align with int*");
-static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::global>) == alignof(int*), "annotated_ptr must align with int*");
-static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property>) == alignof(int*), "annotated_ptr must align with int*");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property>) == 2*sizeof(uintptr_t),
+	      "annotated_ptr<T,access_property> must be 2 * pointer size");
+
+// NOTE: we could make these smaller in the future (e.g. 32-bit) but that would be an ABI breaking change:
+static_assert(sizeof(cuda::annotated_ptr<int, cuda::access_property::shared>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, shared> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<char, cuda::access_property::shared>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, shared> must be pointer size");
+static_assert(sizeof(cuda::annotated_ptr<uintptr_t, cuda::access_property::shared>) == sizeof(uintptr_t),
+	      "annotated_ptr<T, shared> must be pointer size");
+
+
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::global>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::persisting>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::normal>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::streaming>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+
+// NOTE: we could lower the alignment in the future but that would be an ABI breaking change:
+static_assert(alignof(cuda::annotated_ptr<int, cuda::access_property::shared>) == alignof(int*),
+	      "annotated_ptr must align with int*");
+
+#define N 128
+
+struct S {
+  int x;
+  __host__ __device__ S& operator=(int o) {
+    this->x = o;
+    return *this;
+  }
+};
+
+template <typename In, typename T>
+__device__ __host__ __noinline__
+void test_read_access(In i, T* r) {
+  assert(i);
+  assert(i - i == 0);
+  assert((bool)i);
+  const In o = i;
+
+  //assert(i->x == 0);  // FAILS with shmem
+  //assert(o->x == 0);  // FAILS with shmem
+  for (int n = 0; n < N; ++n) {
+    assert(i[n].x == n);
+    assert(&i[n] == &i[n]);
+    assert(&i[n] == &r[n]);
+    assert(o[n].x == n);
+    assert(&o[n] == &o[n]);
+    assert(&o[n] == &r[n]);
+  }
+}
+
+template <typename In>
+__device__ __host__ __noinline__
+void test_write_access(In i) {
+  assert(i);
+  assert((bool)i);
+  const In o = i;
+
+  for (int n = 0; n < N; ++n) {
+    i[n].x = 2*n;
+    assert(i[n].x == 2*n);
+    assert(i[n].x == 2*n);
+    i[n].x = n;
+
+    o[n].x = 2*n;
+    assert(o[n].x == 2*n);
+    assert(o[n].x == 2*n);
+    o[n].x = n;
+  }
+}
+
+__device__ __host__ __noinline__
+void all_tests() {
+  S* arr = alloc<S, N>(false);
+#ifdef __CUDA_ARCH__
+  S* sarr = alloc<S, N>(true);
+#endif
+
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property::normal>(arr), arr);
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property::streaming>(arr), arr);
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property::persisting>(arr), arr);
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property::global>(arr), arr);
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property>(arr), arr);
+#ifdef __CUDA_ARCH__
+  test_read_access(cuda::annotated_ptr<S, cuda::access_property::shared>(sarr), sarr);
+#endif
+
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property::normal>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property::streaming>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property::persisting>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property::global>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property>(arr), arr);
+#ifdef __CUDA_ARCH__
+  test_read_access(cuda::annotated_ptr<const S, cuda::access_property::shared>(sarr), sarr);
+#endif
+
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property::normal>(arr), arr);
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property::streaming>(arr), arr);
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property::persisting>(arr), arr);
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property::global>(arr), arr);
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property>(arr), arr);
+#ifdef __CUDA_ARCH__
+  test_read_access(cuda::annotated_ptr<volatile S, cuda::access_property::shared>(sarr), sarr);
+#endif
+
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property::normal>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property::streaming>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property::persisting>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property::global>(arr), arr);
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property>(arr), arr);
+#ifdef __CUDA_ARCH__
+  test_read_access(cuda::annotated_ptr<const volatile S, cuda::access_property::shared>(sarr), sarr);
+#endif
+
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property::normal>(arr));
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property::streaming>(arr));
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property::persisting>(arr));
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property::global>(arr));
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property>(arr));
+#ifdef __CUDA_ARCH__
+  test_write_access(cuda::annotated_ptr<S, cuda::access_property::shared>(sarr));
+#endif
+
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property::normal>(arr));
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property::streaming>(arr));
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property::persisting>(arr));
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property::global>(arr));
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property>(arr));
+#ifdef __CUDA_ARCH__
+  test_write_access(cuda::annotated_ptr<volatile S, cuda::access_property::shared>(sarr));
+#endif
+}
 
 int main(int argc, char ** argv)
 {
-    test_access_property_interleave();
-    test_access_property_block();
-    test_access_property_functions();
-    test_annotated_ptr_basic();
-    test_annotated_ptr_launch_kernel();
-    test_annotated_ptr_functions();
-
-    return 0;
+  all_tests();
+  return 0;
 }
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp
new file mode 100644
index 0000000000..db630c61c4
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_bench.pass.cpp
@@ -0,0 +1,164 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+
+__device__
+void annotated_ptr_timing_dev(int* in, int* out) {
+  cuda::access_property ap(cuda::access_property::persisting{});
+  // Retrieve global id
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  cuda::annotated_ptr<int, cuda::access_property> in_ann{in, ap};
+  cuda::annotated_ptr<int, cuda::access_property> out_ann{out, ap};
+
+  DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i);
+  DPRINTF("&out[i]:%p = &in_ann[i]:%p for i = %d\n", &out_ann[i], &in_ann[i], i);
+
+  out_ann[i] = in_ann[i];
+};
+
+__global__
+void annotated_ptr_timing(int* in, int* out) {
+  annotated_ptr_timing_dev(in, out);
+}
+
+__device__
+void ptr_timing_dev(int* in, int* out) {
+  // Retrieve global id
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  DPRINTF("&out[i]:%p = &in[i]:%p for i = %d\n", &out[i], &in[i], i);
+  out[i] = in[i];
+};
+
+__global__
+void ptr_timing(int* in, int* out) {
+  ptr_timing_dev(in, out);
+};
+
+__device__ __host__ __noinline__
+void bench() {
+#ifndef __CUDA_ARCH__
+   static const size_t ARR_SZ          = 1 << 22;
+   static const size_t THREAD_CNT      = 128;
+   static const size_t BLOCK_CNT       = ARR_SZ / THREAD_CNT;
+   const dim3 threads(THREAD_CNT, 1, 1), blocks(BLOCK_CNT, 1, 1);
+   cudaEvent_t start, stop;
+#else
+   static const size_t ARR_SZ     = 1 << 10;
+#endif
+   int* arr0 = nullptr;
+   int* arr1 = nullptr;
+   float annotated_time = 0.f, pointer_time = 0.f;
+
+#ifdef __CUDA_ARCH__
+   arr0 = (int*)malloc(ARR_SZ * sizeof(int));
+   arr1 = (int*)malloc(ARR_SZ * sizeof(int));
+#else
+   assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int)));
+   assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int)));
+   assert_rt(cudaDeviceSynchronize());
+#endif
+
+#ifdef __CUDA_ARCH__
+   ptr_timing_dev(arr0, arr1);
+#else
+   ptr_timing<<<blocks,threads>>>(arr0, arr1);
+   assert_rt(cudaDeviceSynchronize());
+#endif
+
+   for (size_t i = 0; i < ARR_SZ; ++i) {
+     arr0[i] = static_cast<int>(i);
+     arr1[i] = 0;
+   }
+
+#ifdef __CUDA_ARCH__
+   ptr_timing_dev(arr0, arr1);
+#else
+   assert_rt(cudaDeviceSynchronize());
+   assert_rt(cudaEventCreate(&start));
+   assert_rt(cudaEventCreate(&stop));
+   assert_rt(cudaEventRecord(start));
+   ptr_timing<<<blocks,threads>>>(arr0, arr1);
+   assert_rt(cudaEventRecord(stop));
+   assert_rt(cudaEventSynchronize(stop));
+   assert_rt(cudaEventElapsedTime(&pointer_time, start, stop));
+   assert_rt(cudaEventDestroy(start));
+   assert_rt(cudaEventDestroy(stop));
+   assert_rt(cudaDeviceSynchronize());
+
+   for (size_t i = 0; i < ARR_SZ; ++i) {
+     if (arr1[i] != (int)i) {
+       DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i);
+       assert(arr1[i] == static_cast<int>(i));
+     }
+
+     arr1[i] = 0;
+   }
+#endif
+
+#ifdef __CUDA_ARCH__
+   annotated_ptr_timing_dev(arr0, arr1);
+#else
+   assert_rt(cudaDeviceSynchronize());
+   annotated_ptr_timing<<<blocks,threads>>>(arr0, arr1);
+   assert_rt(cudaDeviceSynchronize());
+#endif
+
+   for (size_t i = 0; i < ARR_SZ; ++i) {
+     arr0[i] = static_cast<int>(i);
+     arr1[i] = 0;
+   }
+
+#ifdef __CUDA_ARCH__
+   annotated_ptr_timing_dev(arr0, arr1);
+#else
+   assert_rt(cudaDeviceSynchronize());
+   assert_rt(cudaEventCreate(&start));
+   assert_rt(cudaEventCreate(&stop));
+   assert_rt(cudaEventRecord(start));
+   annotated_ptr_timing<<<blocks,threads>>>(arr0, arr1);
+   assert_rt(cudaEventRecord(stop));
+   assert_rt(cudaEventSynchronize(stop));
+   assert_rt(cudaEventElapsedTime(&annotated_time, start, stop));
+   assert_rt(cudaEventDestroy(start));
+   assert_rt(cudaEventDestroy(stop));
+   assert_rt(cudaDeviceSynchronize());
+
+   for (size_t i = 0; i < ARR_SZ; ++i) {
+     if (arr1[i] != (int)i) {
+       DPRINTF("arr1[%d] == %d, should be:%d\n", i, arr1[i], i);
+       assert(arr1[i] == static_cast<int>(i));
+     }
+
+     arr1[i] = 0;
+   }
+#endif
+
+#ifdef __CUDA_ARCH__
+   free(arr0);
+   free(arr1);
+#else
+   assert_rt(cudaFree(arr0));
+   assert_rt(cudaFree(arr1));
+#endif
+   printf("array(ms):%f, arrotated_ptr(ms):%f\n",
+	  pointer_time, annotated_time);
+}
+
+int main(int argc, char ** argv)
+{
+  bench();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp
new file mode 100644
index 0000000000..2a00bbb49a
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_constructors.pass.cpp
@@ -0,0 +1,86 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+
+#include "utils.h"
+
+template<typename T, typename P>
+__host__ __device__ __noinline__
+void test_ctor() {
+  // default ctor, cpy and cpy assignment
+  cuda::annotated_ptr<T, P> def;
+  {
+    cuda::annotated_ptr<T, P> temp;
+    temp = def;
+    unused(temp);
+  }
+  cuda::annotated_ptr<T, P> other(def);
+  unused(other);
+
+  // from ptr
+  T* rp = nullptr;
+  cuda::annotated_ptr<T, P> a(rp);
+  assert(!a);
+
+  // cpy ctor & asign to cv
+  cuda::annotated_ptr<const T, P> c(def);
+  cuda::annotated_ptr<volatile T, P> d(def);
+  cuda::annotated_ptr<const volatile T, P> e(def);
+  c = def;
+  d = def;
+  e = def;
+
+  // from c|v to c|v|cv
+  cuda::annotated_ptr<const T, P> f(c);
+  cuda::annotated_ptr<volatile T, P> g(d);
+  cuda::annotated_ptr<const volatile T, P> h(e);
+  f = c;
+  g = d;
+  h = e;
+  unused(f, g, h);
+
+  // to cv
+  cuda::annotated_ptr<const volatile T, P> i(c);
+  cuda::annotated_ptr<const volatile T, P> j(d);
+  i = c;
+  j = d;
+}
+
+template<typename T, typename P>
+__host__ __device__ __noinline__
+void test_global_ctor() {
+  test_ctor<T, P>();
+
+  // from ptr + prop
+  T* rp = nullptr;
+  P p;
+  cuda::annotated_ptr<T, cuda::access_property> a(rp, p);
+  cuda::annotated_ptr<const T, cuda::access_property> b(rp, p);
+  cuda::annotated_ptr<volatile T, cuda::access_property> c(rp, p);
+  cuda::annotated_ptr<const volatile T, cuda::access_property> d(rp, p);
+}
+
+__host__ __device__ __noinline__
+void test_global_ctors() {
+  test_global_ctor<int, cuda::access_property::normal>();
+  test_global_ctor<int, cuda::access_property::streaming>();
+  test_global_ctor<int, cuda::access_property::persisting>();
+  test_global_ctor<int, cuda::access_property::global>();
+  test_global_ctor<int, cuda::access_property>();
+  test_ctor<int, cuda::access_property::shared>();
+}
+
+int main(int argc, char ** argv)
+{
+  test_global_ctors();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp
new file mode 100644
index 0000000000..76f218c844
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_ctor.fail.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+
+#include "utils.h"
+
+template<typename T, typename P>
+__host__ __device__ __noinline__
+void test_ctor() {
+  // default ctor, cpy and cpy assignment
+  cuda::annotated_ptr<T, P> def;
+  def = def;
+  cuda::annotated_ptr<T, P> other(def);
+
+  // from ptr
+  T* rp = nullptr;
+  cuda::annotated_ptr<T, P> a(rp);
+  assert(!a);
+
+  // cpy ctor & asign to cv
+  cuda::annotated_ptr<const T, P> c(def);
+  cuda::annotated_ptr<volatile T, P> d(def);
+  cuda::annotated_ptr<const volatile T, P> e(def);
+  c = e; // FAIL
+  d = d; // FAIL
+}
+
+template<typename T, typename P>
+__host__ __device__ __noinline__
+void test_global_ctor() {
+  test_ctor<T, P>();
+}
+
+__host__ __device__ __noinline__
+void test_global_ctors() {
+  test_global_ctor<int, cuda::access_property::normal>();
+}
+
+int main(int argc, char ** argv)
+{
+  test_global_ctors();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp
new file mode 100644
index 0000000000..ec304bc253
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_constructor.pass.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// XFAIL: nvcc
+
+#include "utils.h"
+
+int main(int argc, char ** argv)
+{
+
+    cuda::access_property ap(cuda::access_property::persisting{});
+    int* array0 = new int[9];
+    cuda::annotated_ptr<int, cuda::access_property> array_anno_ptr{array0, ap};
+    cuda::annotated_ptr<int, cuda::access_property::shared> shared_ptr;
+
+    array_anno_ptr = shared_ptr;  //  fail to compile, as expected
+
+    return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp
index 4be560147f..907d508d03 100644
--- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_host_shared.pass.cpp
@@ -12,18 +12,18 @@
 // UNSUPPORTED: nvrtc
 // XFAIL: nvcc
 
-#include "annotated_ptr.h"
+#include "utils.h"
 
 int main(int argc, char ** argv)
 {
-#ifndef __CUDA_ARCH__
+
     cuda::access_property ap(cuda::access_property::persisting{});
     int* array0 = new int[9];
     cuda::annotated_ptr<int, cuda::access_property> array_anno_ptr{array0, ap};
     cuda::annotated_ptr<int, cuda::access_property::shared> shared_ptr;
 
     array_anno_ptr = shared_ptr;  //  fail to compile, as expected
-#endif
+
 
     return 0;
 }
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp
index 1d871e3e31..9f0aa9e1f0 100644
--- a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_neg_tests.pass.cpp
@@ -12,22 +12,16 @@
 // UNSUPPORTED: nvrtc
 // XFAIL: nvcc
 
-#include "annotated_ptr.h"
+#include "utils.h"
 
-__device__ __host__ __noinline__
-static void negative_test_access_property_functions() {
-    size_t ARR_SZ = 1 << 20;
-    int* arr1 = nullptr;
-    cuda::access_property ap(cuda::access_property::persisting{});
-
-    arr1 = (int*)malloc(ARR_SZ * sizeof(int));
-
-    //calling from host needs to fail and kill the app
-    __nv_associate_access_property(arr1, static_cast<uint64_t>(ap));
+__device__ __host__
+static void fails_from_host() {
+    int a;
+    __nv_associate_access_property(&a, uint64_t{0});
 }
 
 int main(int argc, char ** argv) {
-    negative_test_access_property_functions();
-
+    //calling from host needs to fail and kill the app
+    fails_from_host();
     return 0;
 }
diff --git a/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp
new file mode 100644
index 0000000000..91a123f4a5
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/annotated_ptr_shmem.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+
+template <typename T, typename U>
+__device__ __host__ __noinline__
+void shared_mem_test_dev() {
+  T* smem = alloc<T, 128>(true);
+  smem[10] = 42;
+
+  cuda::annotated_ptr<U, cuda::access_property::shared> p{smem + 10};
+
+  assert(*p == 42);
+}
+
+__device__ __host__ __noinline__
+void all_tests() {
+  shared_mem_test_dev<int, int>();
+  shared_mem_test_dev<int, const int>();
+  shared_mem_test_dev<int, volatile int>();
+  shared_mem_test_dev<int, const volatile int>();
+}
+
+__global__
+void shared_mem_test() {
+  all_tests();
+};
+
+// TODO: is this needed?
+__device__ __host__ __noinline__
+void test_all() {
+#ifdef __CUDA_ARCH__
+  all_tests();
+#else
+  shared_mem_test<<<1, 1, 0, 0>>>();
+  assert_rt(cudaStreamSynchronize(0));
+#endif
+}
+
+int main(int argc, char ** argv)
+{
+  test_all();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp
new file mode 100644
index 0000000000..8ddfe8872d
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/apply_access_property.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+#define ARR_SZ 128
+
+template <typename T, typename P>
+__device__ __host__ __noinline__
+void test(P ap, bool shared = false)
+{
+  T* arr = alloc<T, ARR_SZ>(shared);
+
+  cuda::apply_access_property(arr, ARR_SZ, ap);
+
+  for (int i = 0; i < ARR_SZ; ++i) {
+    assert(arr[i] == i);
+  }
+
+  dealloc<T>(arr, shared);
+}
+
+__device__ __host__ __noinline__
+void test_all()
+{
+  test<int>(cuda::access_property::normal{});
+  test<int>(cuda::access_property::persisting{});
+}
+
+__global__ void test_kernel() {
+  test_all();
+}
+
+int main(int argc, char ** argv)
+{
+
+  test_all();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp
new file mode 100644
index 0000000000..abf5196763
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/associate_access_property.pass.cpp
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+#define ARR_SZ 128
+
+template <typename T, typename P>
+__device__ __host__ __noinline__
+void test(P ap, bool shared = false)
+{
+
+  T* arr = alloc<T, ARR_SZ>(shared);
+
+  arr = cuda::associate_access_property(arr, ap);
+
+  for (int i = 0; i < ARR_SZ; ++i) {
+    assert(arr[i] == i);
+  }
+
+  dealloc<T>(arr, shared);
+}
+
+__device__ __host__ __noinline__
+void test_all()
+{
+  test<int>(cuda::access_property::normal{});
+  test<int>(cuda::access_property::persisting{});
+  test<int>(cuda::access_property::streaming{});
+  test<int>(cuda::access_property::global{});
+  test<int>(cuda::access_property{});
+  test<int>(cuda::access_property::shared{}, true);
+}
+
+__global__ void test_kernel() {
+  test_all();
+}
+
+int main(int argc, char ** argv)
+{
+
+  test_all();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp
new file mode 100644
index 0000000000..5189e8c5a7
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/discard_memory.pass.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+#define ARR_SZ 128
+
+template <typename T>
+__device__ __host__ __noinline__
+void test(bool shared = false)
+{
+  T* arr = alloc<T, ARR_SZ>(shared);
+
+  cuda::discard_memory(arr, ARR_SZ);
+
+  dealloc<T>(arr, shared);
+}
+
+__device__ __host__ __noinline__
+void test_all()
+{
+  test<int>();
+}
+
+__global__ void test_kernel() {
+  test_all();
+}
+
+int main(int argc, char ** argv)
+{
+
+  test_all();
+  return 0;
+}
diff --git a/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp b/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp
new file mode 100644
index 0000000000..a41a3ba95d
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/memcpy_async.pass.cpp
@@ -0,0 +1,120 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: pre-sm-70
+// UNSUPPORTED: !nvcc
+// UNSUPPORTED: nvrtc
+// UNSUPPORTED: c++98, c++03
+
+#include "utils.h"
+#include <cooperative_groups.h>
+
+// TODO: global-shared
+// TODO: read  const
+__device__ __host__ __noinline__
+void test_memcpy_async() {
+    size_t ARR_SZ = 1 << 10;
+    int* arr0 = nullptr;
+    int* arr1 = nullptr;
+    cuda::access_property ap(cuda::access_property::persisting{});
+    cuda::barrier<cuda::thread_scope_system> bar0, bar1, bar2, bar3;
+    init(&bar0, 1);
+    init(&bar1, 1);
+    init(&bar2, 1);
+    init(&bar3, 1);
+
+#ifdef __CUDA_ARCH__
+    arr0 = (int*)malloc(ARR_SZ * sizeof(int));
+    arr1 = (int*)malloc(ARR_SZ * sizeof(int));
+
+    auto group = cooperative_groups::this_thread_block();
+#else
+    assert_rt(cudaMallocManaged((void**) &arr0, ARR_SZ * sizeof(int)));
+    assert_rt(cudaMallocManaged((void**) &arr1, ARR_SZ * sizeof(int)));
+    assert_rt(cudaDeviceSynchronize());
+#endif
+
+    cuda::annotated_ptr<int, cuda::access_property> ann0{arr0, ap};
+    cuda::annotated_ptr<int, cuda::access_property> ann1{arr1, ap};
+    //cuda::annotated_ptr<const int, cuda::access_property> cann0{arr0, ap};
+
+    for (size_t i = 0; i < ARR_SZ; ++i) {
+        arr0[i] = static_cast<int>(i);
+        arr1[i] = 0;
+    }
+
+    cuda::memcpy_async(ann1, ann0, ARR_SZ * sizeof(int), bar0);
+    //cuda::memcpy_async(ann1, cann0, ARR_SZ * sizeof(int), bar0);
+    bar0.arrive_and_wait();
+
+    for (size_t i = 0; i < ARR_SZ; ++i) {
+        if (arr1[i] != (int)i) {
+            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
+            assert(arr1[i] == static_cast<int>(i));
+        }
+
+        arr1[i] = 0;
+    }
+
+    cuda::memcpy_async(arr1, ann0, ARR_SZ * sizeof(int), bar1);
+    //cuda::memcpy_async(arr1, cann0, ARR_SZ * sizeof(int), bar1);
+    bar1.arrive_and_wait();
+
+    for (size_t i = 0; i < ARR_SZ; ++i) {
+        if (arr1[i] != (int)i) {
+            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
+            assert(arr1[i] == static_cast<int>(i));
+        }
+
+        arr1[i] = 0;
+    }
+
+#ifdef __CUDA_ARCH__
+    cuda::memcpy_async(group, ann1, ann0, ARR_SZ * sizeof(int), bar2);
+    //cuda::memcpy_async(group, ann1, cann0, ARR_SZ * sizeof(int), bar2);
+    bar2.arrive_and_wait();
+
+    for (size_t i = 0; i < ARR_SZ; ++i) {
+        if (arr1[i] != (int)i) {
+            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
+            assert(arr1[i] == i);
+        }
+
+        arr1[i] = 0;
+    }
+
+    cuda::memcpy_async(group, arr1, ann0, ARR_SZ * sizeof(int), bar3);
+    //cuda::memcpy_async(group, arr1, cann0, ARR_SZ * sizeof(int), bar3);
+    bar3.arrive_and_wait();
+
+    for (size_t i = 0; i < ARR_SZ; ++i) {
+        if (arr1[i] != (int)i) {
+            DPRINTF(stderr, "%p:&arr1[i] == %d, should be:%lu\n", &arr1[i], arr1[i], i);
+            assert(arr1[i] == i);
+        }
+
+        arr1[i] = 0;
+    }
+#endif
+
+#ifdef __CUDA_ARCH__
+    free(arr0);
+    free(arr1);
+#else
+    assert_rt(cudaFree(arr0));
+    assert_rt(cudaFree(arr1));
+#endif
+}
+
+int main(int argc, char ** argv)
+{
+  test_memcpy_async();
+  return 0;
+}
+
diff --git a/.upstream-tests/test/cuda/annotated_ptr/utils.h b/.upstream-tests/test/cuda/annotated_ptr/utils.h
new file mode 100644
index 0000000000..a72e409041
--- /dev/null
+++ b/.upstream-tests/test/cuda/annotated_ptr/utils.h
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(_LIBCUDACXX_COMPILER_MSVC)
+#pragma warning(disable: 4505)
+#endif
+
+#include <cuda/annotated_ptr>
+
+#if defined(DEBUG)
+    #define DPRINTF(...) { printf(__VA_ARGS__); }
+#else
+    #define DPRINTF(...) do {} while (false)
+#endif
+
+__device__ __host__
+void assert_rt_wrap(cudaError_t code, const char *file, int line) {
+    if (code != cudaSuccess) {
+#ifndef __CUDACC_RTC__
+        printf("assert: %s %s %d\n", cudaGetErrorString(code), file, line);
+#endif
+        assert(code == cudaSuccess);
+    }
+}
+#define assert_rt(ret) { assert_rt_wrap((ret), __FILE__, __LINE__); }
+
+template <typename ... T>
+__host__ __device__ constexpr bool unused(T...) {return true;}
+
+template<typename T, int N>
+__device__ __host__ __noinline__
+T* alloc(bool shared = false) {
+  T* arr = nullptr;
+
+#ifdef __CUDA_ARCH__
+  if (!shared) {
+    arr = (T*)malloc(N * sizeof(T));
+  } else {
+    __shared__ T data[N];
+    arr = data;
+  }
+#else
+  assert_rt(cudaMallocManaged((void**) &arr, N * sizeof(T)));
+#endif
+
+  for (int i = 0; i < N; ++i) {
+    arr[i] = i;
+  }
+  return arr;
+}
+
+template<typename T>
+__device__ __host__ __noinline__
+void dealloc(T* arr, bool shared) {
+#ifdef __CUDA_ARCH__
+  if (!shared) free(arr);
+#else
+    assert_rt(cudaFree(arr));
+#endif
+}
diff --git a/.upstream-tests/utils/libcudacxx/test/config.py b/.upstream-tests/utils/libcudacxx/test/config.py
index 337a9c30bf..0a2b6c2fb5 100644
--- a/.upstream-tests/utils/libcudacxx/test/config.py
+++ b/.upstream-tests/utils/libcudacxx/test/config.py
@@ -577,17 +577,20 @@ def configure_compile_flags(self):
         pre_sm_60 = True
         pre_sm_70 = True
         pre_sm_80 = True
+        pre_sm_90 = True
         if compute_archs and self.cxx.type == 'nvcc':
             pre_sm_32 = False
             pre_sm_60 = False
             pre_sm_70 = False
             pre_sm_80 = False
+            pre_sm_90 = False
             compute_archs = [int(a) for a in sorted(shlex.split(compute_archs))]
             for arch in compute_archs:
                 if arch < 32: pre_sm_32 = True
                 if arch < 60: pre_sm_60 = True
                 if arch < 70: pre_sm_70 = True
                 if arch < 80: pre_sm_80 = True
+                if arch < 90: pre_sm_90 = True
                 arch_flag = '-gencode=arch=compute_{0},code=sm_{0}'.format(arch)
                 self.cxx.compile_flags += [arch_flag]
             enable_compute_future = self.get_lit_conf('enable_compute_future')
@@ -602,6 +605,8 @@ def configure_compile_flags(self):
             self.config.available_features.add("pre-sm-70")
         if pre_sm_80:
             self.config.available_features.add("pre-sm-80")
+        if pre_sm_90:
+            self.config.available_features.add("pre-sm-90")
 
     def configure_default_compile_flags(self):
         nvcc_host_compiler = self.get_lit_conf('nvcc_host_compiler')
@@ -614,7 +619,7 @@ def configure_default_compile_flags(self):
         std = self.get_lit_conf('std')
         if not std:
             # Choose the newest possible language dialect if none is given.
-            possible_stds = ['c++2a', 'c++17', 'c++1z', 'c++14', 'c++11',
+            possible_stds = ['c++20', 'c++2a', 'c++17', 'c++1z', 'c++14', 'c++11',
                              'c++03']
             if self.cxx.type == 'gcc':
                 maj_v, _, _ = self.cxx.version
diff --git a/.upstream-tests/utils/nvidia/linux/perform_tests.bash b/.upstream-tests/utils/nvidia/linux/perform_tests.bash
index c2bdb1913c..4b6529222b 100755
--- a/.upstream-tests/utils/nvidia/linux/perform_tests.bash
+++ b/.upstream-tests/utils/nvidia/linux/perform_tests.bash
@@ -71,7 +71,7 @@ function section_separator {
 LIBCXX_LOG=$(mktemp)
 LIBCUDACXX_LOG=$(mktemp)
 
-KNOWN_COMPUTE_ARCHS="35 50 52 53 60 61 62 70 72 75 80 86 87"
+KNOWN_COMPUTE_ARCHS="35 50 52 53 60 61 62 70 72 75 80 86 87 90"
 
 function report_and_exit {
   # If any of the lines searched for below aren't present in the log files, the
diff --git a/include/cuda/annotated_ptr b/include/cuda/annotated_ptr
index 42412f01c2..fcb6e20d71 100644
--- a/include/cuda/annotated_ptr
+++ b/include/cuda/annotated_ptr
@@ -127,6 +127,7 @@ _Tp* associate_access_property(_Tp* __ptr, _Property __prop) {
     std::is_same<_Property, access_property::persisting>::value ||
     std::is_same<_Property, access_property::streaming>::value ||
     std::is_same<_Property, access_property::normal>::value ||
+    std::is_same<_Property, access_property::global>::value ||
     std::is_same<_Property, access_property::shared>::value
       , "property is not convertible to cuda::access_property");
   return __detail_ap::__associate(__ptr, __prop);
@@ -207,14 +208,11 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> {
 
     __host__ __device__ pointer __get(bool __skip_prop = false, difference_type __n = 0) const {
 #ifdef __CUDA_ARCH__
-      if (__skip_prop == false) {
-        return static_cast<pointer>(this->__apply_prop(static_cast<void*>(__repr + __n)));
-      } else {
-        return __repr + __n;
+      if (!__skip_prop) {
+        return static_cast<pointer>(this->__apply_prop(const_cast<void*>(static_cast<const volatile void*>(__repr + __n))));
       }
-#else
-      return __repr + __n;
 #endif
+      return __repr + __n;
     }
     __host__ __device__ pointer __offset(difference_type __n, bool __skip_prop = false) const {
       return __get(__skip_prop, __n);
@@ -252,9 +250,10 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> {
 
     template <typename _RuntimeProperty>
     __host__ __device__ annotated_ptr(pointer __p, _RuntimeProperty __prop)
-      : __detail_ap::__annotated_ptr_base<_RuntimeProperty>(static_cast<std::uint64_t>(__prop)), __repr(__p)
+      : __detail_ap::__annotated_ptr_base<_Property>(static_cast<std::uint64_t>(access_property(__prop))), __repr(__p)
     {
-      static_assert(std::is_same<_Property, access_property>::value, "This method requires Property=cuda::access_property");
+      static_assert(std::is_same<_Property, access_property>::value,
+		    "This method requires annotated_ptr<T, cuda::access_property>");
       static_assert(std::is_same<_RuntimeProperty, access_property::global>::value ||
                     std::is_same<_RuntimeProperty, access_property::normal>::value ||
                     std::is_same<_RuntimeProperty, access_property::streaming>::value ||
@@ -268,14 +267,7 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> {
     }
 
     template<class _TTp, class _Prop>
-      __host__ __device__ annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other)
-      : annotated_ptr<_TTp, _Prop>(__other.prop_), __repr(__other.__repr)
-      {
-      static_assert(std::is_assignable<pointer, _TTp*>::value, "pointer must be assignable from other pointer");
-      static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) ||
-                    std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space");
-      // note: precondition "__other.__rep must be compatible with _Property" currently always holds
-    }
+    __host__ __device__ annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other);
 
     __host__ __device__ constexpr explicit operator bool() const noexcept {
       return __repr != nullptr;
@@ -285,8 +277,24 @@ class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> {
       constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value;
       return __is_shared ? __repr : &(*annotated_ptr<value_type, access_property::global>(__repr));
     }
+
+    __host__ __device__ _Property __property() const noexcept {
+      return this->__get_property();
+    }
 };
 
+
+template<class _Tp, class _Property>
+template<class _TTp, class _Prop>
+__host__ __device__ annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other)
+  : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()), __repr(__other.get())
+{
+  static_assert(std::is_assignable<pointer&, _TTp*>::value, "pointer must be assignable from other pointer");
+  static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) ||
+		std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space");
+  // note: precondition "__other.__rep must be compatible with _Property" currently always holds
+}
+
 template<class _Dst, class _Src, class _SrcProperty, class _Shape, class _Sync>
 __host__ __device__
 void memcpy_async(_Dst* __dst,
diff --git a/include/cuda/std/detail/__annotated_ptr b/include/cuda/std/detail/__annotated_ptr
index 605af94048..4c7ad40806 100644
--- a/include/cuda/std/detail/__annotated_ptr
+++ b/include/cuda/std/detail/__annotated_ptr
@@ -105,7 +105,9 @@ namespace __detail_ap {
   __host__ __device__
   _Type* __associate(_Type* __ptr, _Property __prop) {
 #ifdef __CUDA_ARCH__
-    return static_cast<_Type*>(__associate_descriptor(__associate_address_space(__ptr, __prop), __prop));
+    return static_cast<_Type*>(__associate_descriptor(
+	    __associate_address_space(const_cast<void*>(static_cast<const void*>(__ptr)), __prop),
+	    __prop));
 #else
     return __ptr;
 #endif
@@ -125,9 +127,13 @@ namespace __detail_ap {
       constexpr __annotated_ptr_base() noexcept = default;
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
+      __host__ __device__ constexpr __annotated_ptr_base(access_property::shared) noexcept {}
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, access_property::shared{});
       }
+      __host__ __device__ constexpr access_property::shared __get_property() const noexcept {
+	return access_property::shared{};
+      }
   };
 
   template<>
@@ -138,9 +144,13 @@ namespace __detail_ap {
       constexpr __annotated_ptr_base() noexcept = default;
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
+      __host__ __device__ constexpr __annotated_ptr_base(access_property::global) noexcept {}
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, access_property::global{});
       }
+      __host__ __device__ constexpr access_property::global __get_property() const noexcept {
+	return access_property::global{};
+      }
   };
 
   template<>
@@ -151,9 +161,13 @@ namespace __detail_ap {
       constexpr __annotated_ptr_base() noexcept = default;
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
+      __host__ __device__ constexpr __annotated_ptr_base(access_property::normal) noexcept {}
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, access_property::normal{});
       }
+      __host__ __device__ constexpr access_property::normal __get_property() const noexcept {
+	return access_property::normal{};
+      }
   };
 
   template<>
@@ -164,9 +178,13 @@ namespace __detail_ap {
       constexpr __annotated_ptr_base() noexcept = default;
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
+      __host__ __device__ constexpr __annotated_ptr_base(access_property::persisting) noexcept {}
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, access_property::persisting{});
       }
+      __host__ __device__ constexpr access_property::persisting __get_property() const noexcept {
+	return access_property::persisting{};
+      }
   };
 
   template<>
@@ -177,9 +195,13 @@ namespace __detail_ap {
       constexpr __annotated_ptr_base() noexcept = default;
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
+      __host__ __device__ constexpr __annotated_ptr_base(access_property::streaming) noexcept {}
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, access_property::streaming{});
       }
+      __host__ __device__ constexpr access_property::streaming __get_property() const noexcept {
+	return access_property::streaming{};
+      }
   };
 
   template<>
@@ -189,11 +211,16 @@ namespace __detail_ap {
 
       __host__ __device__ constexpr __annotated_ptr_base() noexcept : __prop(access_property()) {}
       __host__ __device__ constexpr __annotated_ptr_base(std::uint64_t __property) noexcept : __prop(__property) {}
+      __host__ __device__ constexpr __annotated_ptr_base(access_property __property) noexcept
+        : __annotated_ptr_base(static_cast<std::uint64_t>(__property)) {}
       constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default;
       _LIBCUDACXX_CONSTEXPR_AFTER_CXX11 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default;
       inline __device__ void* __apply_prop(void* __p) const {
         return __associate(__p, __prop);
       }
+      __host__ __device__ access_property __get_property() const noexcept {
+	return reinterpret_cast<access_property&>(const_cast<std::uint64_t&>(__prop));
+      }
   };
 } // namespace __detail_ap
 
diff --git a/include/nv/detail/__target_macros b/include/nv/detail/__target_macros
index 11f213ae7e..9013291805 100644
--- a/include/nv/detail/__target_macros
+++ b/include/nv/detail/__target_macros
@@ -30,6 +30,7 @@
 #  define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80
 #  define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86
 #  define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87
+#  define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90
 
 #  define _NV_TARGET_ARCH_TO_SM_350 35
 #  define _NV_TARGET_ARCH_TO_SM_370 37
@@ -45,6 +46,7 @@
 #  define _NV_TARGET_ARCH_TO_SM_800 80
 #  define _NV_TARGET_ARCH_TO_SM_860 86
 #  define _NV_TARGET_ARCH_TO_SM_870 87
+#  define _NV_TARGET_ARCH_TO_SM_900 90
 
 #if defined(_NV_COMPILER_NVCXX)
 
@@ -62,6 +64,7 @@
 #  define _NV_TARGET_VAL_SM_80 nv::target::sm_80
 #  define _NV_TARGET_VAL_SM_86 nv::target::sm_86
 #  define _NV_TARGET_VAL_SM_87 nv::target::sm_87
+#  define _NV_TARGET_VAL_SM_90 nv::target::sm_90
 
 #  define _NV_TARGET___NV_IS_HOST nv::target::is_host
 #  define _NV_TARGET___NV_IS_DEVICE nv::target::is_device
@@ -94,6 +97,7 @@
 #  define _NV_TARGET_VAL_SM_80 800
 #  define _NV_TARGET_VAL_SM_86 860
 #  define _NV_TARGET_VAL_SM_87 870
+#  define _NV_TARGET_VAL_SM_90 900
 
 #  if defined(__CUDA_ARCH__)
 #    define _NV_TARGET_VAL __CUDA_ARCH__
@@ -138,6 +142,7 @@
 #  define _NV_TARGET_VAL_SM_80 800
 #  define _NV_TARGET_VAL_SM_86 860
 #  define _NV_TARGET_VAL_SM_87 870
+#  define _NV_TARGET_VAL_SM_90 900
 
 #  define _NV_TARGET_VAL 0
 
@@ -165,6 +170,7 @@
 #define _NV_TARGET___NV_PROVIDES_SM_80 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_80))
 #define _NV_TARGET___NV_PROVIDES_SM_86 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_86))
 #define _NV_TARGET___NV_PROVIDES_SM_87 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_87))
+#define _NV_TARGET___NV_PROVIDES_SM_90 (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_90))
 
 #define _NV_TARGET___NV_IS_EXACTLY_SM_35 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_35))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_37 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_37))
@@ -180,6 +186,7 @@
 #define _NV_TARGET___NV_IS_EXACTLY_SM_80 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_80))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_86 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_86))
 #define _NV_TARGET___NV_IS_EXACTLY_SM_87 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_87))
+#define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90))
 
 #define NV_PROVIDES_SM_35   __NV_PROVIDES_SM_35
 #define NV_PROVIDES_SM_37   __NV_PROVIDES_SM_37
@@ -195,6 +202,7 @@
 #define NV_PROVIDES_SM_80   __NV_PROVIDES_SM_80
 #define NV_PROVIDES_SM_86   __NV_PROVIDES_SM_86
 #define NV_PROVIDES_SM_87   __NV_PROVIDES_SM_87
+#define NV_PROVIDES_SM_90   __NV_PROVIDES_SM_90
 
 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35
 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37
@@ -210,6 +218,7 @@
 #define NV_IS_EXACTLY_SM_80 __NV_IS_EXACTLY_SM_80
 #define NV_IS_EXACTLY_SM_86 __NV_IS_EXACTLY_SM_86
 #define NV_IS_EXACTLY_SM_87 __NV_IS_EXACTLY_SM_87
+#define NV_IS_EXACTLY_SM_90 __NV_IS_EXACTLY_SM_90
 
 #define NV_IS_HOST         __NV_IS_HOST
 #define NV_IS_DEVICE       __NV_IS_DEVICE
@@ -315,6 +324,12 @@
 #    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_87 0
 #  endif
 
+#  if (_NV_TARGET___NV_IS_EXACTLY_SM_90)
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_90 1
+#  else
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_90 0
+#  endif
+
 #  if (_NV_TARGET_IS_HOST)
 #    define _NV_TARGET_BOOL___NV_IS_HOST   1
 #    define _NV_TARGET_BOOL___NV_IS_DEVICE 0
@@ -412,6 +427,12 @@
 #    define _NV_TARGET_BOOL___NV_PROVIDES_SM_87 0
 #  endif
 
+#  if (_NV_TARGET___NV_PROVIDES_SM_90)
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_90 1
+#  else
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_90 0
+#  endif
+
 #  define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond
 #  define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond))
 
@@ -452,6 +473,7 @@
 #  define _NV_TARGET_DISPATCH_HANDLE28(q, fn, ...)  _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE26(__VA_ARGS__))
 #  define _NV_TARGET_DISPATCH_HANDLE30(q, fn, ...)  _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE28(__VA_ARGS__))
 #  define _NV_TARGET_DISPATCH_HANDLE32(q, fn, ...)  _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE30(__VA_ARGS__))
+#  define _NV_TARGET_DISPATCH_HANDLE34(q, fn, ...)  _NV_TARGET_IF(q, fn, _NV_TARGET_DISPATCH_HANDLE32(__VA_ARGS__))
 
 #  define _NV_TARGET_DISPATCH(...) _NV_BLOCK_EXPAND(_NV_DISPATCH_N_ARY(_NV_TARGET_DISPATCH_HANDLE, __VA_ARGS__))
 
diff --git a/include/nv/target b/include/nv/target
index 8622cd2605..9788b6ad36 100644
--- a/include/nv/target
+++ b/include/nv/target
@@ -67,12 +67,14 @@ namespace nv {
       constexpr base_int_t sm_80_bit = 1 << 12;
       constexpr base_int_t sm_86_bit = 1 << 13;
       constexpr base_int_t sm_87_bit = 1 << 14;
+      constexpr base_int_t sm_90_bit = 1 << 15;
       constexpr base_int_t all_devices =
           sm_35_bit | sm_37_bit |
           sm_50_bit | sm_52_bit | sm_53_bit |
           sm_60_bit | sm_61_bit | sm_62_bit |
           sm_70_bit | sm_72_bit | sm_75_bit |
-          sm_80_bit | sm_86_bit | sm_87_bit;
+          sm_80_bit | sm_86_bit | sm_87_bit |
+          sm_90_bit;
 
       // Store a set of targets as a set of bits
       struct _NV_BITSET_ATTRIBUTE target_description {
@@ -88,6 +90,7 @@ namespace nv {
         sm_60 = 60, sm_61 = 61, sm_62 = 62,
         sm_70 = 70, sm_72 = 72, sm_75 = 75,
         sm_80 = 80, sm_86 = 86, sm_87 = 87,
+        sm_90 = 90,
       };
       _NV_FUNCTION_ANNOTATION
       constexpr base_int_t toint(sm_selector a) {
@@ -108,11 +111,13 @@ namespace nv {
                toint(a) == 75 ? sm_75_bit :
                toint(a) == 80 ? sm_80_bit :
                toint(a) == 86 ? sm_86_bit :
-               toint(a) == 87 ? sm_87_bit : 0;
+               toint(a) == 87 ? sm_87_bit :
+               toint(a) == 90 ? sm_90_bit : 0;
       }
       _NV_FUNCTION_ANNOTATION
       constexpr base_int_t bitrounddown(sm_selector a) {
-        return toint(a) >= 87 ? sm_87_bit :
+        return toint(a) >= 90 ? sm_90_bit :
+               toint(a) >= 87 ? sm_87_bit :
                toint(a) >= 86 ? sm_86_bit :
                toint(a) >= 80 ? sm_80_bit :
                toint(a) >= 75 ? sm_75_bit :
@@ -188,6 +193,7 @@ namespace nv {
     constexpr sm_selector sm_80 = sm_selector::sm_80;
     constexpr sm_selector sm_86 = sm_selector::sm_86;
     constexpr sm_selector sm_87 = sm_selector::sm_87;
+    constexpr sm_selector sm_90 = sm_selector::sm_90;
 
     using detail::is_exactly;
     using detail::provides;