diff --git a/.clang-format b/.clang-format
index 35ebdbfae..951a549d0 100644
--- a/.clang-format
+++ b/.clang-format
@@ -46,6 +46,7 @@ BraceWrapping:
   SplitEmptyFunction: false
   SplitEmptyRecord: false
   SplitEmptyNamespace: false
+BreakAfterAttributes: Leave
 BreakAfterJavaFieldAnnotations: false
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: WebKit
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 1403b8cc4..b4b7f3f02 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 90a3732f4..bcdaaf53d 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 8dc4f8be9..cf5679cf7 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 66d332573..2e2cb99e1 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c9a4cba83..334cae320 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -10,8 +10,14 @@ python/            @rapidsai/rmm-python-codeowners
 **/CMakeLists.txt  @rapidsai/rmm-cmake-codeowners
 **/cmake/          @rapidsai/rmm-cmake-codeowners
 
-#build/ops code owners
-.github/           @rapidsai/ops-codeowners
-ci/                @rapidsai/ops-codeowners
-conda/             @rapidsai/ops-codeowners
-dependencies.yaml  @rapidsai/ops-codeowners
+#CI code owners
+/.github/                @rapidsai/ci-codeowners
+/ci/                     @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainers/   @rapidsai/packaging-codeowners
+/conda/            @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh          @rapidsai/packaging-codeowners
+pyproject.toml     @rapidsai/packaging-codeowners
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index e4113d5c5..895cac94d 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -56,7 +56,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -66,22 +66,45 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build:
+  wheel-build-cpp:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      script: ci/build_wheel.sh
-  wheel-publish:
-    needs: wheel-build
+      script: ci/build_wheel_cpp.sh
+  wheel-build-python:
+    needs: wheel-build-cpp
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_python.sh
+  wheel-publish-cpp:
+    needs: wheel-build-cpp
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: rmm
+      package-type: cpp
+  wheel-publish-python:
+    needs: wheel-build-python
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: rmm
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d717b15fc..96e4503b7 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,67 +18,76 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
-      - wheel-build
+      - wheel-build-cpp
+      - wheel-build-python
       - wheel-tests
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build:
+  wheel-build-cpp:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
-      script: ci/build_wheel.sh
+      script: ci/build_wheel_cpp.sh
+  wheel-build-python:
+    needs: wheel-build-cpp
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_python.sh
   wheel-tests:
-    needs: wheel-build
+    needs: wheel-build-python
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: pull-request
       script: ci/test_wheel.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06
     with:
       arch: '["amd64"]'
       cuda: '["12.2"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0e7938310..747f9f544 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -32,7 +32,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.gitignore b/.gitignore
index f4993502b..2d0b150e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ dist/
 rmm.egg-info/
 python/build
 python/*/build
+python/rmm/docs/_build
 python/rmm/**/_lib/**/*.cpp
 !python/rmm/_lib/_torch_allocator.cpp
 python/rmm/**/_lib/**/*.h
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4df64d11a..d21fcebf8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,14 +14,14 @@ repos:
       rev: 5.12.0
       hooks:
           - id: isort
-            args: ["--settings-path=python/pyproject.toml"]
+            args: ["--settings-path=python/rmm/pyproject.toml"]
             files: python/.*
             types_or: [python, cython, pyi]
     - repo: https://github.com/ambv/black
       rev: 22.3.0
       hooks:
           - id: black
-            args: ["--config=python/pyproject.toml"]
+            args: ["--config=python/rmm/pyproject.toml"]
     - repo: https://github.com/MarcoGorelli/cython-lint
       rev: v0.15.0
       hooks:
@@ -81,6 +81,10 @@ repos:
       hooks:
         - id: ruff
           files: python/.*$
+    - repo: https://github.com/rapidsai/pre-commit-hooks
+      rev: v0.0.3
+      hooks:
+        - id: verify-copyright
 
 default_language_version:
     python: python3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d870908d..70f788615 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,55 @@
+# rmm 24.06.00 (5 Jun 2024)
+
+## 🚨 Breaking Changes
+
+- Refactor polymorphic allocator to use device_async_resource_ref ([#1555](https://github.com/rapidsai/rmm/pull/1555)) [@harrism](https://github.com/harrism)
+- Remove deprecated functionality ([#1537](https://github.com/rapidsai/rmm/pull/1537)) [@harrism](https://github.com/harrism)
+- Remove deprecated cuda_async_memory_resource constructor that takes thrust::optional parameters ([#1535](https://github.com/rapidsai/rmm/pull/1535)) [@harrism](https://github.com/harrism)
+- Remove deprecated supports_streams and get_mem_info methods. ([#1519](https://github.com/rapidsai/rmm/pull/1519)) [@harrism](https://github.com/harrism)
+
+## 🐛 Bug Fixes
+
+- rmm needs to link to nvtx3::nvtx3-cpp to support installed nvtx3 ([#1569](https://github.com/rapidsai/rmm/pull/1569)) [@robertmaynard](https://github.com/robertmaynard)
+- Make sure rmm wheel dependency on librmm is updated [skip ci] ([#1565](https://github.com/rapidsai/rmm/pull/1565)) [@raydouglass](https://github.com/raydouglass)
+- Don&#39;t ignore GCC-specific warning under Clang ([#1557](https://github.com/rapidsai/rmm/pull/1557)) [@aaronmondal](https://github.com/aaronmondal)
+- Add publish jobs for C++ wheels ([#1554](https://github.com/rapidsai/rmm/pull/1554)) [@vyasr](https://github.com/vyasr)
+- Explicitly use the current device resource in DeviceBuffer ([#1514](https://github.com/rapidsai/rmm/pull/1514)) [@wence-](https://github.com/wence-)
+
+## 📖 Documentation
+
+- Allow specifying mr in DeviceBuffer construction, and document ownership requirements in Python/C++ interfacing ([#1552](https://github.com/rapidsai/rmm/pull/1552)) [@wence-](https://github.com/wence-)
+- Fix Python install instruction ([#1547](https://github.com/rapidsai/rmm/pull/1547)) [@wence-](https://github.com/wence-)
+- Update multi-gpu discussion for device_buffer and device_vector dtors ([#1524](https://github.com/rapidsai/rmm/pull/1524)) [@wence-](https://github.com/wence-)
+- Fix ordering / heading levels in README.md and python example in guide.md ([#1513](https://github.com/rapidsai/rmm/pull/1513)) [@harrism](https://github.com/harrism)
+
+## 🚀 New Features
+
+- Add NVTX support and RMM_FUNC_RANGE() macro ([#1558](https://github.com/rapidsai/rmm/pull/1558)) [@harrism](https://github.com/harrism)
+- Always use a static gtest ([#1532](https://github.com/rapidsai/rmm/pull/1532)) [@robertmaynard](https://github.com/robertmaynard)
+- Build C++ wheel ([#1529](https://github.com/rapidsai/rmm/pull/1529)) [@vyasr](https://github.com/vyasr)
+- Remove deprecated supports_streams and get_mem_info methods. ([#1519](https://github.com/rapidsai/rmm/pull/1519)) [@harrism](https://github.com/harrism)
+
+## 🛠️ Improvements
+
+- update copyright dates ([#1564](https://github.com/rapidsai/rmm/pull/1564)) [@jameslamb](https://github.com/jameslamb)
+- Overhaul ops-codeowners ([#1561](https://github.com/rapidsai/rmm/pull/1561)) [@raydouglass](https://github.com/raydouglass)
+- Adding support for cupy.cuda.stream.ExternalStream ([#1559](https://github.com/rapidsai/rmm/pull/1559)) [@lilohuang](https://github.com/lilohuang)
+- Refactor polymorphic allocator to use device_async_resource_ref ([#1555](https://github.com/rapidsai/rmm/pull/1555)) [@harrism](https://github.com/harrism)
+- add RAPIDS copyright pre-commit hook ([#1553](https://github.com/rapidsai/rmm/pull/1553)) [@jameslamb](https://github.com/jameslamb)
+- Enable warnings as errors for Python tests ([#1551](https://github.com/rapidsai/rmm/pull/1551)) [@mroeschke](https://github.com/mroeschke)
+- Remove header existence tests. ([#1550](https://github.com/rapidsai/rmm/pull/1550)) [@bdice](https://github.com/bdice)
+- Only use functions in the limited API ([#1545](https://github.com/rapidsai/rmm/pull/1545)) [@vyasr](https://github.com/vyasr)
+- Migrate to `{{ stdlib(&quot;c&quot;) }}` ([#1543](https://github.com/rapidsai/rmm/pull/1543)) [@hcho3](https://github.com/hcho3)
+- Fix `cuda11.8` nvcc dependency ([#1542](https://github.com/rapidsai/rmm/pull/1542)) [@trxcllnt](https://github.com/trxcllnt)
+- add --rm and --name to devcontainer run args ([#1539](https://github.com/rapidsai/rmm/pull/1539)) [@trxcllnt](https://github.com/trxcllnt)
+- Remove deprecated functionality ([#1537](https://github.com/rapidsai/rmm/pull/1537)) [@harrism](https://github.com/harrism)
+- Remove deprecated cuda_async_memory_resource constructor that takes thrust::optional parameters ([#1535](https://github.com/rapidsai/rmm/pull/1535)) [@harrism](https://github.com/harrism)
+- Make thrust_allocator deallocate safe in multi-device setting ([#1533](https://github.com/rapidsai/rmm/pull/1533)) [@wence-](https://github.com/wence-)
+- Move rmm Python package to subdirectory ([#1526](https://github.com/rapidsai/rmm/pull/1526)) [@vyasr](https://github.com/vyasr)
+- Remove a file not being used ([#1521](https://github.com/rapidsai/rmm/pull/1521)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove unneeded `update-version.sh` update ([#1520](https://github.com/rapidsai/rmm/pull/1520)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Enable all tests for `arm` arch ([#1510](https://github.com/rapidsai/rmm/pull/1510)) [@galipremsagar](https://github.com/galipremsagar)
+
 # RMM 24.04.00 (10 Apr 2024)
 
 ## 🚨 Breaking Changes
@@ -40,7 +92,7 @@
 - Use `conda env create --yes` instead of `--force` ([#1509](https://github.com/rapidsai/rmm/pull/1509)) [@bdice](https://github.com/bdice)
 - Add upper bound to prevent usage of NumPy 2 ([#1501](https://github.com/rapidsai/rmm/pull/1501)) [@bdice](https://github.com/bdice)
 - Remove hard-coding of RAPIDS version where possible ([#1496](https://github.com/rapidsai/rmm/pull/1496)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
-- Requre NumPy 1.23+ ([#1488](https://github.com/rapidsai/rmm/pull/1488)) [@jakirkham](https://github.com/jakirkham)
+- Require NumPy 1.23+ ([#1488](https://github.com/rapidsai/rmm/pull/1488)) [@jakirkham](https://github.com/jakirkham)
 - Use `rmm::device_async_resource_ref` in multi_stream_allocation benchmark ([#1482](https://github.com/rapidsai/rmm/pull/1482)) [@miscco](https://github.com/miscco)
 - Update devcontainers to CUDA Toolkit 12.2 ([#1470](https://github.com/rapidsai/rmm/pull/1470)) [@trxcllnt](https://github.com/trxcllnt)
 - Add support for Python 3.11 ([#1469](https://github.com/rapidsai/rmm/pull/1469)) [@jameslamb](https://github.com/jameslamb)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56454d4b1..fa3358ca0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -38,6 +38,7 @@ rapids_cmake_build_type(Release)
 # ##################################################################################################
 # * build options ----------------------------------------------------------------------------------
 
+option(USE_NVTX "Build with NVTX support" ON)
 option(BUILD_TESTS "Configure CMake to build tests" ON)
 option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF)
 set(RMM_LOGGING_LEVEL
@@ -46,11 +47,12 @@ set(RMM_LOGGING_LEVEL
 set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR"
                                               "CRITICAL" "OFF")
 
+message(VERBOSE "RMM: Build with NVTX support: ${USE_NVTX}")
 # Set logging level. Must go before including gtests and benchmarks. Set the possible values of
-# build type for cmake-gui
+# build type for cmake-gui.
 message(STATUS "RMM: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'")
 
-# cudart can be statically linked or dynamically linked the python ecosystem wants dynamic linking
+# cudart can be linked statically or dynamically
 option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
 
 # ##################################################################################################
@@ -71,6 +73,7 @@ rapids_cpm_init()
 include(cmake/thirdparty/get_fmt.cmake)
 include(cmake/thirdparty/get_spdlog.cmake)
 include(cmake/thirdparty/get_cccl.cmake)
+include(cmake/thirdparty/get_nvtx.cmake)
 
 # ##################################################################################################
 # * library targets --------------------------------------------------------------------------------
@@ -93,9 +96,15 @@ target_link_libraries(rmm INTERFACE CCCL::CCCL)
 target_link_libraries(rmm INTERFACE fmt::fmt-header-only)
 target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only)
 target_link_libraries(rmm INTERFACE dl)
+target_link_libraries(rmm INTERFACE nvtx3::nvtx3-cpp)
 target_compile_features(rmm INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
 
+# Disable NVTX if necessary
+if(NOT USE_NVTX)
+  target_compile_definitions(rmm INTERFACE NVTX_DISABLE)
+endif()
+
 # ##################################################################################################
 # * tests and benchmarks ---------------------------------------------------------------------------
 
@@ -114,7 +123,8 @@ endif()
 # * add tests --------------------------------------------------------------------------------------
 
 if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
-  include(cmake/thirdparty/get_gtest.cmake)
+  include(${rapids-cmake-dir}/cpm/gtest.cmake)
+  rapids_cpm_gtest(BUILD_STATIC)
   include(CTest) # calls enable_testing()
 
   add_subdirectory(tests)
diff --git a/README.md b/README.md
index 9ec8cbf47..4032f161b 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ Python requirements:
 * `cuda-python`
 * `cython`
 
-For more details, see [pyproject.toml](python/pyproject.toml)
+For more details, see [pyproject.toml](python/rmm/pyproject.toml)
 
 
 ### Script to build RMM from source
@@ -127,7 +127,8 @@ $ make test
 
 - Build, install, and test the `rmm` python package, in the `python` folder:
 ```bash
-$ python -m pip install -e ./python
+# In the root rmm directory
+$ python -m pip install -e ./python/rmm
 $ pytest -v
 ```
 
@@ -207,38 +208,7 @@ alignment argument. All allocations are required to be aligned to at least 256B.
 `device_memory_resource` adds an additional `cuda_stream_view` argument to allow specifying the stream
 on which to perform the (de)allocation.
 
-## `cuda_stream_view` and `cuda_stream`
-
-`rmm::cuda_stream_view` is a simple non-owning wrapper around a CUDA `cudaStream_t`. This wrapper's
-purpose is to provide strong type safety for stream types. (`cudaStream_t` is an alias for a pointer,
-which can lead to ambiguity in APIs when it is assigned `0`.)  All RMM stream-ordered APIs take a
-`rmm::cuda_stream_view` argument.
-
-`rmm::cuda_stream` is a simple owning wrapper around a CUDA `cudaStream_t`. This class provides
-RAII semantics (constructor creates the CUDA stream, destructor destroys it). An `rmm::cuda_stream`
-can never represent the CUDA default stream or per-thread default stream; it only ever represents
-a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved.
-
-## `cuda_stream_pool`
-
-`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to
-create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the
-stream pool can be faster than creating the streams on the fly. The size of the pool is configurable.
-Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of
-`rmm::cuda_stream_view` that represent identical CUDA streams.
-
-### Thread Safety
-
-All current device memory resources are thread safe unless documented otherwise. More specifically,
-calls to memory resource `allocate()` and `deallocate()` methods are safe with respect to calls to
-either of these functions from other threads. They are _not_ thread safe with respect to
-construction and destruction of the memory resource object.
-
-Note that a class `thread_safe_resource_adapter` is provided which can be used to adapt a memory
-resource that is not thread safe to be thread safe (as described above). This adapter is not needed
-with any current RMM device memory resources.
-
-### Stream-ordered Memory Allocation
+## Stream-ordered Memory Allocation
 
 `rmm::mr::device_memory_resource` is a base class that provides stream-ordered memory allocation.
 This allows optimizations such as re-using memory deallocated on the same stream without the
@@ -270,16 +240,16 @@ For further information about stream-ordered memory allocation semantics, read
 Allocator](https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/)
 on the NVIDIA Developer Blog.
 
-### Available Resources
+## Available Device Resources
 
 RMM provides several `device_memory_resource` derived classes to satisfy various user requirements.
 For more detailed information about these resources, see their respective documentation.
 
-#### `cuda_memory_resource`
+### `cuda_memory_resource`
 
 Allocates and frees device memory using `cudaMalloc` and `cudaFree`.
 
-#### `managed_memory_resource`
+### `managed_memory_resource`
 
 Allocates and frees device memory using `cudaMallocManaged` and `cudaFree`.
 
@@ -287,22 +257,22 @@ Note that `managed_memory_resource` cannot be used with NVIDIA Virtual GPU Softw
 with virtual machines or hypervisors) because [NVIDIA CUDA Unified Memory is not supported by
 NVIDIA vGPU](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#cuda-open-cl-support-vgpu).
 
-#### `pool_memory_resource`
+### `pool_memory_resource`
 
 A coalescing, best-fit pool sub-allocator.
 
-#### `fixed_size_memory_resource`
+### `fixed_size_memory_resource`
 
 A memory resource that can only allocate a single fixed size. Average allocation and deallocation
 cost is constant.
 
-#### `binning_memory_resource`
+### `binning_memory_resource`
 
 Configurable to use multiple upstream memory resources for allocations that fall within different
 bin sizes. Often configured with multiple bins backed by `fixed_size_memory_resource`s and a single
 `pool_memory_resource` for allocations larger than the largest bin size.
 
-### Default Resources and Per-device Resources
+## Default Resources and Per-device Resources
 
 RMM users commonly need to configure a `device_memory_resource` object to use for all allocations
 where another resource has not explicitly been provided. A common example is configuring a
@@ -327,7 +297,7 @@ Accessing and modifying the default resource is done through two functions:
      `get_current_device_resource()`
    - For more explicit control, you can use `set_per_device_resource()`, which takes a device ID.
 
-#### Example
+### Example
 
 ```c++
 rmm::mr::cuda_memory_resource cuda_mr;
@@ -339,7 +309,7 @@ rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device re
 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); // Points to `pool_mr`
 ```
 
-#### Multiple Devices
+### Multiple Devices
 
 A `device_memory_resource` should only be used when the active CUDA device is the same device
 that was active when the `device_memory_resource` was created. Otherwise behavior is undefined.
@@ -367,36 +337,116 @@ for(int i = 0; i < N; ++i) {
 
 Note that the CUDA device that is current when creating a `device_memory_resource` must also be
 current any time that `device_memory_resource` is used to deallocate memory, including in a
-destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
-(incorrect) example that assumes the above example loop has been run to create a
-`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
-line of the error comment.
+destructor. The RAII class `rmm::device_buffer` and classes that use it as a backing store
+(`rmm::device_scalar` and `rmm::device_uvector`) handle this by storing the active device when the
+constructor is called, and then ensuring that the stored device is active whenever an allocation or
+deallocation is performed (including in the destructor). The user must therefore only ensure that
+the device active during _creation_ of an `rmm::device_buffer` matches the active device of the
+memory resource being used.
+
+Here is an incorrect example that creates a memory resource on device zero and then uses it to
+allocate a `device_buffer` on device one:
 
 ```c++
 {
   RMM_CUDA_TRY(cudaSetDevice(0));
-  rmm::device_buffer buf_a(16);
-
+  auto mr = rmm::mr::cuda_memory_resource{};
   {
     RMM_CUDA_TRY(cudaSetDevice(1));
-    rmm::device_buffer buf_b(16);
+    // Invalid, current device is 1, but MR is only valid for device 0
+    rmm::device_buffer buf(16, rmm::cuda_stream_default, &mr);
   }
+}
+```
 
-  // Error: when buf_a is destroyed, the current device must be 0, but it is 1
+A correct example creates the device buffer with device zero active. After that it is safe to switch
+devices and let the buffer go out of scope and destruct with a different device active. For example,
+this code is correct:
+
+```c++
+{
+  RMM_CUDA_TRY(cudaSetDevice(0));
+  auto mr = rmm::mr::cuda_memory_resource{};
+  rmm::device_buffer buf(16, rmm::cuda_stream_default, &mr);
+  RMM_CUDA_TRY(cudaSetDevice(1));
+  ...
+  // No need to switch back to device 0 before ~buf runs
+}
+```
+
+#### Use of `rmm::device_vector` with multiple devices
+
+`rmm:device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to
+allocate and deallocate memory using RMM. As such, the usual rules for usage of the backing memory
+resource apply: the active device must match the active device at resource construction time. To
+facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at
+construction time and ensures that device is active whenever it allocates or deallocates memory.
+Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`.
+One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them
+with a different active device.
+
+For example, recapitulating the previous example using `rmm::device_vector`:
+
+```c++
+{
+  RMM_CUDA_TRY(cudaSetDevice(0));
+  auto mr = rmm::mr::cuda_memory_resource{};
+  rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
+  RMM_CUDA_TRY(cudaSetDevice(1));
+  ...
+  // No need to switch back to device 0 before ~vec runs
 }
 ```
 
-### Allocators
+> [!NOTE]
+> Although allocation and deallocation in the `thrust_allocator` run with the correct active device,
+> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the
+> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to
+> initialize new elements: the user must arrange for this kernel launch to occur with the correct
+> device for the memory resource active.
+
+## `cuda_stream_view` and `cuda_stream`
+
+`rmm::cuda_stream_view` is a simple non-owning wrapper around a CUDA `cudaStream_t`. This wrapper's
+purpose is to provide strong type safety for stream types. (`cudaStream_t` is an alias for a pointer,
+which can lead to ambiguity in APIs when it is assigned `0`.)  All RMM stream-ordered APIs take a
+`rmm::cuda_stream_view` argument.
+
+`rmm::cuda_stream` is a simple owning wrapper around a CUDA `cudaStream_t`. This class provides
+RAII semantics (constructor creates the CUDA stream, destructor destroys it). An `rmm::cuda_stream`
+can never represent the CUDA default stream or per-thread default stream; it only ever represents
+a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved.
+
+## `cuda_stream_pool`
+
+`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to
+create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the
+stream pool can be faster than creating the streams on the fly. The size of the pool is configurable.
+Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of
+`rmm::cuda_stream_view` that represent identical CUDA streams.
+
+## Thread Safety
+
+All current device memory resources are thread safe unless documented otherwise. More specifically,
+calls to memory resource `allocate()` and `deallocate()` methods are safe with respect to calls to
+either of these functions from other threads. They are _not_ thread safe with respect to
+construction and destruction of the memory resource object.
+
+Note that a class `thread_safe_resource_adapter` is provided which can be used to adapt a memory
+resource that is not thread safe to be thread safe (as described above). This adapter is not needed
+with any current RMM device memory resources.
+
+## Allocators
 
 C++ interfaces commonly allow customizable memory allocation through an [`Allocator`](https://en.cppreference.com/w/cpp/named_req/Allocator) object.
 RMM provides several `Allocator` and `Allocator`-like classes.
 
-#### `polymorphic_allocator`
+### `polymorphic_allocator`
 
 A [stream-ordered](#stream-ordered-memory-allocation) allocator similar to [`std::pmr::polymorphic_allocator`](https://en.cppreference.com/w/cpp/memory/polymorphic_allocator).
 Unlike the standard C++ `Allocator` interface, the `allocate` and `deallocate` functions take a `cuda_stream_view` indicating the stream on which the (de)allocation occurs.
 
-#### `stream_allocator_adaptor`
+### `stream_allocator_adaptor`
 
 `stream_allocator_adaptor` can be used to adapt a stream-ordered allocator to present a standard `Allocator` interface to consumers that may not be designed to work with a stream-ordered interface.
 
@@ -415,7 +465,7 @@ auto p = adapted.allocate(100);
 adapted.deallocate(p,100);
 ```
 
-#### `thrust_allocator`
+### `thrust_allocator`
 
 `thrust_allocator` is a device memory allocator that uses the strongly typed `thrust::device_ptr`, making it usable with containers like `thrust::device_vector`.
 
@@ -497,13 +547,13 @@ Similar to `device_memory_resource`, it has two key functions for (de)allocation
 Unlike `device_memory_resource`, the `host_memory_resource` interface and behavior is identical to
 `std::pmr::memory_resource`.
 
-### Available Resources
+## Available Host Resources
 
-#### `new_delete_resource`
+### `new_delete_resource`
 
 Uses the global `operator new` and `operator delete` to allocate host memory.
 
-#### `pinned_memory_resource`
+### `pinned_memory_resource`
 
 Allocates "pinned" host memory using `cuda(Malloc/Free)Host`.
 
@@ -611,7 +661,7 @@ resources are detectable with Compute Sanitizer Memcheck.
 It may be possible in the future to add support for memory bounds checking with other memory
 resources using NVTX APIs.
 
-## Using RMM in Python Code
+# Using RMM in Python
 
 There are two ways to use RMM in Python code:
 
@@ -622,7 +672,7 @@ There are two ways to use RMM in Python code:
 RMM provides a `MemoryResource` abstraction to control _how_ device
 memory is allocated in both the above uses.
 
-### DeviceBuffers
+## DeviceBuffer
 
 A DeviceBuffer represents an **untyped, uninitialized device memory
 allocation**.  DeviceBuffers can be created by providing the
@@ -662,7 +712,7 @@ host:
 array([1., 2., 3.])
 ```
 
-### MemoryResource objects
+## MemoryResource objects
 
 `MemoryResource` objects are used to configure how device memory allocations are made by
 RMM.
@@ -805,3 +855,94 @@ Out[6]:
  'total_bytes': 16,
  'total_count': 1}
 ```
+
+## Taking ownership of C++ objects from Python.
+
+When interacting with a C++ library that uses RMM from Python, one
+must be careful when taking ownership of `rmm::device_buffer` objects
+on the Python side. The `rmm::device_buffer` does not contain an
+owning reference to the memory resource used for its allocation (only
+a `device_async_resource_ref`), and the allocating user is expected to
+keep this memory resource alive for at least the lifetime of the
+buffer. When taking ownership of such a buffer in Python, we have no
+way (in the general case) of ensuring that the memory resource will
+outlive the buffer we are now holding.
+
+To avoid any issues, we need two things:
+
+1. The C++ library we are interfacing with should accept a memory
+   resource that is used for allocations that are returned to the
+   user.
+2. When calling into the library from python, we should provide a
+   memory resource whose lifetime we control. This memory resource
+   should then be provided when we take ownership of any allocated
+   `rmm::device_buffer`s.
+
+For example, suppose we have a C++ function that allocates
+`device_buffer`s, which has a utility overload that defaults the
+memory resource to the current device resource:
+
+```c++
+std::unique_ptr<rmm::device_buffer> allocate(
+  std::size_t size,
+  rmm::mr::device_async_resource_ref mr = get_current_device_resource())
+{
+    return std::make_unique<rmm::device_buffer>(size, rmm::cuda_stream_default, mr);
+}
+```
+
+The Python `DeviceBuffer` class has a convenience Cython function,
+`c_from_unique_ptr` to construct a `DeviceBuffer` from a
+`unique_ptr<rmm::device_buffer>`, taking ownership of it. To do this
+safely, we must ensure that the allocation that was done on the C++
+side uses a memory resource we control. So:
+
+```cython
+# Bad, doesn't control lifetime
+buffer_bad = DeviceBuffer.c_from_unique_ptr(allocate(10))
+
+# Good, allocation happens with a memory resource we control
+# mr is a DeviceMemoryResource
+buffer_good = DeviceBuffer.c_from_unique_ptr(
+    allocate(10, mr.get_mr()),
+    mr=mr,
+)
+```
+
+Note two differences between the bad and good cases:
+
+1. In the good case we pass the memory resource to the allocation
+   function.
+2. In the good case, we pass _the same_ memory resource to the
+   `DeviceBuffer` constructor so that its lifetime is tied to the
+   lifetime of the buffer.
+
+### Potential pitfalls of relying on `get_current_device_resource`
+
+Functions in both the C++ and Python APIs that perform allocation
+typically default the memory resource argument to the value of
+`get_current_device_resource`. This is to simplify the interface for
+callers. When using a C++ library from Python, this defaulting is
+safe, _as long as_ it is only the Python process that ever calls
+`set_current_device_resource`.
+
+This is because the current device resource on the C++ side has a
+lifetime which is expected to be managed by the user. The resources
+set by `rmm::mr::set_current_device_resource` are stored in a static
+`std::map` whose keys are device ids and values are raw pointers to
+the memory resources. Consequently,
+`rmm::mr::get_current_device_resource` returns an object with no
+lifetime provenance. This is, for the reasons discussed above, not
+usable from Python. To handle this on the Python side, the
+Python-level `set_current_device_resource` sets the C++ resource _and_
+stores the Python object in a static global dictionary. The Python
+`get_current_device_resource` then _does not use_
+`rmm::mr::get_current_device_resource` and instead looks up the
+current device resource in this global dictionary.
+
+Hence, if the C++ library we are interfacing with calls
+`rmm::mr::set_current_device_resource`, the C++ and Python sides of
+the program can disagree on what `get_current_device_resource`
+returns. The only safe thing to do if using the simplified interfaces
+is therefore to ensure that `set_current_device_resource` is only ever
+called on the Python side.
diff --git a/VERSION b/VERSION
index 4a2fe8aa5..0bff6981a 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.04.00
+24.06.00
diff --git a/build.sh b/build.sh
index ec08fa402..70da14b9b 100755
--- a/build.sh
+++ b/build.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 # rmm build script
 
 # This script is used to build the component(s) in this repo from
 # source, and can be called with various options to customize the
-# build as needed (see the help output for details)
+# build as needed (see the help output for details).
 
 # Abort script on first error
 set -e
@@ -141,12 +141,6 @@ if hasArg --ptds; then
     PER_THREAD_DEFAULT_STREAM=ON
 fi
 
-# Append `-DFIND_RMM_CPP=ON` to CMAKE_ARGS unless a user specified the option.
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RMM_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RMM_CPP=ON"
-fi
-
 # If clean given, run it prior to any other steps
 if hasArg clean; then
     # If the dirs to clean are mounted dirs in a container, the
@@ -176,5 +170,5 @@ fi
 # Build and install the rmm Python package
 if (( NUMARGS == 0 )) || hasArg rmm; then
     echo "building and installing rmm..."
-    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" python -m pip install --no-build-isolation --no-deps ${REPODIR}/python
+    SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/rmm
 fi
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index ddd4b5014..dd4af8195 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -36,7 +36,7 @@ doxygen Doxyfile
 popd
 
 rapids-logger "Build Python docs"
-pushd python/docs
+pushd python/rmm/docs
 make dirhtml
 make text
 mkdir -p "${RAPIDS_DOCS_DIR}/rmm/"{html,txt}
diff --git a/ci/build_python.sh b/ci/build_python.sh
index b197b1ae1..394b3a453 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -14,7 +14,7 @@ export CMAKE_GENERATOR=Ninja
 rapids-print-env
 
 package_name="rmm"
-package_dir="python"
+package_dir="python/rmm"
 
 version=$(rapids-generate-version)
 commit=$(git rev-parse HEAD)
diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh
new file mode 100755
index 000000000..e61f6641c
--- /dev/null
+++ b/ci/build_wheel_cpp.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="librmm"
+package_dir="python/librmm"
+
+source rapids-configure-sccache
+source rapids-date-string
+
+version=$(rapids-generate-version)
+commit=$(git rev-parse HEAD)
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+# This is the version of the suffix with a preceding hyphen. It's used
+# everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+pyproject_file="${package_dir}/pyproject.toml"
+
+sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
+echo "${version}" > VERSION
+sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py"
+
+cd "${package_dir}"
+
+python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip install wheel
+python -m wheel tags --platform any dist/* --remove
+RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist
diff --git a/ci/build_wheel.sh b/ci/build_wheel_python.sh
similarity index 64%
rename from ci/build_wheel.sh
rename to ci/build_wheel_python.sh
index b2d953361..debe9b0fe 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel_python.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_name="rmm"
-package_dir="python"
+package_dir="python/rmm"
 
 source rapids-configure-sccache
 source rapids-date-string
@@ -14,26 +14,33 @@ commit=$(git rev-parse HEAD)
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# This is the version of the suffix with a preceding hyphen. It's used
+# This is the version of the suffix with a preceding hyphen. It is used
 # everywhere except in the final wheel name.
 PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
 
-# Patch project metadata files to include the CUDA version suffix and version override.
 pyproject_file="${package_dir}/pyproject.toml"
 
 sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
 echo "${version}" > VERSION
 sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py"
 
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+sed -r -i "s/librmm==(.*)\"/librmm${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
 if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
     sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
 fi
 
 cd "${package_dir}"
 
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/librmm_dist)
+
+python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check --find-links "${CPP_WHEELHOUSE}"
 
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist dist/*
 
-RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 2a6e550df..dfb0c17b0 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
 ########################
 # RMM Version Updater #
 ########################
@@ -39,10 +40,22 @@ echo "${NEXT_FULL_TAG}" > VERSION
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 # .devcontainer files
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
+done
+
+DEPENDENCIES=(
+  librmm
+)
+for DEP in "${DEPENDENCIES[@]}"; do
+  for FILE in dependencies.yaml conda/environments/*.yaml; do
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
+  done
+  for FILE in python/*/pyproject.toml; do
+    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" "${FILE}"
+  done
 done
diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh
index 73780fcc3..6bda50870 100755
--- a/ci/run_pytests.sh
+++ b/ci/run_pytests.sh
@@ -4,6 +4,6 @@
 set -euo pipefail
 
 # Support invoking run_pytests.sh outside the script directory
-cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/rmm/
 
 pytest --cache-clear -v "$@" .
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
index aa2705afe..50cb203c3 100755
--- a/ci/test_wheel.sh
+++ b/ci/test_wheel.sh
@@ -1,17 +1,13 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -eou pipefail
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+WHEELHOUSE="${PWD}/dist/"
+RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}"
 
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/rmm*.whl)[test]
+# echo to expand wildcard before adding '[extra]' requires for pip
+python -m pip install "rmm-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}"
 
-# Run smoke tests for aarch64 pull requests
-if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
-    python ./ci/wheel_smoke_test.py
-else
-    python -m pytest ./python/rmm/tests
-fi
+python -m pytest ./python/rmm/rmm/tests
diff --git a/ci/wheel_smoke_test.py b/ci/wheel_smoke_test.py
deleted file mode 100644
index 737c0b605..000000000
--- a/ci/wheel_smoke_test.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import rmm
-
-if __name__ == "__main__":
-    buf = rmm.DeviceBuffer(size=100)
-    assert buf.size == 100
diff --git a/cmake/thirdparty/get_gtest.cmake b/cmake/thirdparty/get_nvtx.cmake
similarity index 72%
rename from cmake/thirdparty/get_gtest.cmake
rename to cmake/thirdparty/get_nvtx.cmake
index 4d4daff44..90487dd22 100644
--- a/cmake/thirdparty/get_gtest.cmake
+++ b/cmake/thirdparty/get_nvtx.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -12,11 +12,12 @@
 # the License.
 # =============================================================================
 
-# Use CPM to find or clone gtest
-function(find_and_configure_gtest)
-  include(${rapids-cmake-dir}/cpm/gtest.cmake)
-  rapids_cpm_gtest()
+# Use CPM to find or clone NVTX3
+function(find_and_configure_nvtx3)
+
+  include(${rapids-cmake-dir}/cpm/nvtx3.cmake)
+  rapids_cpm_nvtx3(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports)
 
 endfunction()
 
-find_and_configure_gtest()
+find_and_configure_nvtx3()
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index c4bbcccd2..5806b8bd9 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,7 +9,6 @@ dependencies:
 - clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4
-- cuda-nvcc
 - cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
diff --git a/conda/recipes/librmm/conda_build_config.yaml b/conda/recipes/librmm/conda_build_config.yaml
index ed58ac507..1740bfe2f 100644
--- a/conda/recipes/librmm/conda_build_config.yaml
+++ b/conda/recipes/librmm/conda_build_config.yaml
@@ -16,11 +16,10 @@ cmake_version:
 fmt_version:
   - ">=10.1.1,<11"
 
-gtest_version:
-  - ">=1.13.0"
-
 spdlog_version:
   - ">=1.12.0,<1.13"
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+c_stdlib_version:
   - "2.17"
diff --git a/conda/recipes/librmm/meta.yaml b/conda/recipes/librmm/meta.yaml
index 60cd6ffae..53e16ebdc 100644
--- a/conda/recipes/librmm/meta.yaml
+++ b/conda/recipes/librmm/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
@@ -23,18 +23,16 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - cuda-version ={{ cuda_version }}
-    # We require spdlog and fmt (which was devendored from spdlog
+    # We require spdlog and fmt (which was de-vendored from spdlog
     # conda-forge packages in 1.11.0) so that the spdlog headers are not
     # pulled by CPM and installed as a part of the rmm packages. However,
     # building against librmm still requires these headers. They are also
     # added as a run requirement via the packages' run_exports.
     - fmt {{ fmt_version }}
     - spdlog {{ spdlog_version }}
-    - gtest {{ gtest_version }}
-    - gmock {{ gtest_version }}
 
 build:
   script_env:
@@ -83,38 +81,7 @@ outputs:
         - spdlog {{ spdlog_version }}
     test:
       commands:
-        - test -f $PREFIX/include/rmm/thrust_rmm_allocator.h
-        - test -f $PREFIX/include/rmm/logger.hpp
-        - test -f $PREFIX/include/rmm/cuda_stream.hpp
-        - test -f $PREFIX/include/rmm/cuda_stream_view.hpp
-        - test -f $PREFIX/include/rmm/cuda_stream_pool.hpp
-        - test -f $PREFIX/include/rmm/device_uvector.hpp
-        - test -f $PREFIX/include/rmm/device_scalar.hpp
-        - test -f $PREFIX/include/rmm/device_buffer.hpp
-        - test -f $PREFIX/include/rmm/detail/aligned.hpp
-        - test -f $PREFIX/include/rmm/detail/error.hpp
-        - test -f $PREFIX/include/rmm/detail/exec_check_disable.hpp
-        - test -f $PREFIX/include/rmm/mr/device/detail/arena.hpp
-        - test -f $PREFIX/include/rmm/mr/device/detail/free_list.hpp
-        - test -f $PREFIX/include/rmm/mr/device/detail/coalescing_free_list.hpp
-        - test -f $PREFIX/include/rmm/mr/device/detail/fixed_size_free_list.hpp
-        - test -f $PREFIX/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/arena_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/binning_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/cuda_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/device_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/fixed_size_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/limiting_resource_adaptor.hpp
-        - test -f $PREFIX/include/rmm/mr/device/logging_resource_adaptor.hpp
-        - test -f $PREFIX/include/rmm/mr/device/managed_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/owning_wrapper.hpp
-        - test -f $PREFIX/include/rmm/mr/device/per_device_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/pool_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/device/thread_safe_resource_adaptor.hpp
-        - test -f $PREFIX/include/rmm/mr/device/thrust_allocator_adaptor.hpp
-        - test -f $PREFIX/include/rmm/mr/host/host_memory_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/host/new_delete_resource.hpp
-        - test -f $PREFIX/include/rmm/mr/host/pinned_memory_resource.hpp
+        - test -d "${PREFIX}/include/rmm"
     about:
       home: https://rapids.ai/
       license: Apache-2.0
@@ -150,8 +117,6 @@ outputs:
         {% endif %}
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         - {{ pin_subpackage('librmm', exact=True) }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0
diff --git a/conda/recipes/rmm/conda_build_config.yaml b/conda/recipes/rmm/conda_build_config.yaml
index e28b98da7..e21871634 100644
--- a/conda/recipes/rmm/conda_build_config.yaml
+++ b/conda/recipes/rmm/conda_build_config.yaml
@@ -10,7 +10,9 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml
index 764c3f75b..fa8a82475 100644
--- a/conda/recipes/rmm/meta.yaml
+++ b/conda/recipes/rmm/meta.yaml
@@ -1,6 +1,6 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
-{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
+{% set version = environ['RAPIDS_PACKAGE_VERSION'].strip().lstrip('v') %}
 {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
 {% set cuda_major = cuda_version.split('.')[0] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -51,7 +51,7 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - cuda-version ={{ cuda_version }}
     {% if cuda_major == "11" %}
diff --git a/dependencies.yaml b/dependencies.yaml
index 548999f1b..eb2c4e4f2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -7,6 +7,7 @@ files:
       arch: [x86_64]
     includes:
       - build
+      - cython_build
       - checks
       - cuda
       - cuda_version
@@ -37,20 +38,31 @@ files:
       - cuda_version
       - docs
       - py_version
+  py_cpp_build:
+    output: pyproject
+    pyproject_dir: python/librmm
+    extras:
+      table: build-system
+    includes:
+      - build
   py_build:
     output: pyproject
+    pyproject_dir: python/rmm
     extras:
       table: build-system
     includes:
       - build
+      - cython_build
   py_run:
     output: pyproject
+    pyproject_dir: python/rmm
     extras:
       table: project
     includes:
       - run
   py_optional_test:
     output: pyproject
+    pyproject_dir: python/rmm
     extras:
       table: project.optional-dependencies
       key: test
@@ -65,7 +77,6 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4
-          - cython>=3.0.0
           - ninja
       - output_types: conda
         packages:
@@ -106,6 +117,12 @@ dependencies:
               cuda: "12.*"
             packages:
               - cuda-nvcc
+  cython_build:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cython>=3.0.0
+    specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
           - matrix:
@@ -115,6 +132,19 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+      - output_types: [requirements, pyproject]
+        matrices:
+            - matrix:
+                cuda: "12.*"
+              packages:
+                - librmm-cu12==24.6.*
+            - matrix:
+                cuda: "11.*"
+              packages:
+                - librmm-cu11==24.6.*
+            - matrix:
+              packages:
+                - librmm==24.6.*
   checks:
     common:
       - output_types: [conda, requirements]
@@ -239,7 +269,21 @@ dependencies:
         packages:
           - pytest
           - pytest-cov
+    # Needed for numba in tests
+    specific:
       - output_types: conda
-        packages:
-          # Needed for numba in tests
-          - cuda-nvcc
+        matrices:
+          - matrix:
+              arch: x86_64
+              cuda: "11.8"
+            packages:
+              - nvcc_linux-64=11.8
+          - matrix:
+              arch: aarch64
+              cuda: "11.8"
+            packages:
+              - nvcc_linux-aarch64=11.8
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - cuda-nvcc
diff --git a/include/rmm/detail/aligned.hpp b/include/rmm/detail/aligned.hpp
index eb31658e9..1206a1983 100644
--- a/include/rmm/detail/aligned.hpp
+++ b/include/rmm/detail/aligned.hpp
@@ -26,87 +26,6 @@
 
 namespace rmm::detail {
 
-/**
- * @brief Default alignment used for host memory allocated by RMM.
- *
- */
-[[deprecated("Use rmm::RMM_DEFAULT_HOST_ALIGNMENT instead.")]] static constexpr std::size_t
-  RMM_DEFAULT_HOST_ALIGNMENT{rmm::RMM_DEFAULT_HOST_ALIGNMENT};
-
-/**
- * @brief Default alignment used for CUDA memory allocation.
- *
- */
-[[deprecated("Use rmm::CUDA_ALLOCATION_ALIGNMENT instead.")]] static constexpr std::size_t
-  CUDA_ALLOCATION_ALIGNMENT{rmm::CUDA_ALLOCATION_ALIGNMENT};
-
-/**
- * @brief Returns whether or not `n` is a power of 2.
- *
- */
-[[deprecated("Use rmm::is_pow2 instead.")]] constexpr bool is_pow2(std::size_t value) noexcept
-{
-  return rmm::is_pow2(value);
-}
-
-/**
- * @brief Returns whether or not `alignment` is a valid memory alignment.
- *
- */
-[[deprecated("Use rmm::is_supported_alignment instead.")]] constexpr bool is_supported_alignment(
-  std::size_t alignment) noexcept
-{
-  return rmm::is_pow2(alignment);
-}
-
-/**
- * @brief Align up to nearest multiple of specified power of 2
- *
- * @param[in] value value to align
- * @param[in] alignment amount, in bytes, must be a power of 2
- *
- * @return Return the aligned value, as one would expect
- */
-[[deprecated("Use rmm::align_up instead.")]] constexpr std::size_t align_up(
-  std::size_t value, std::size_t alignment) noexcept
-{
-  return rmm::align_up(value, alignment);
-}
-
-/**
- * @brief Align down to the nearest multiple of specified power of 2
- *
- * @param[in] value value to align
- * @param[in] alignment amount, in bytes, must be a power of 2
- *
- * @return Return the aligned value, as one would expect
- */
-[[deprecated("Use rmm::align_down instead.")]] constexpr std::size_t align_down(
-  std::size_t value, std::size_t alignment) noexcept
-{
-  return rmm::align_down(value, alignment);
-}
-
-/**
- * @brief Checks whether a value is aligned to a multiple of a specified power of 2
- *
- * @param[in] value value to check for alignment
- * @param[in] alignment amount, in bytes, must be a power of 2
- *
- * @return true if aligned
- */
-[[deprecated("Use rmm::is_aligned instead.")]] constexpr bool is_aligned(
-  std::size_t value, std::size_t alignment) noexcept
-{
-  return rmm::is_aligned(value, alignment);
-}
-
-[[deprecated("Use rmm::is_pointer_aligned instead.")]] inline bool is_pointer_aligned(
-  void* ptr, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)
-{
-  return rmm::is_pointer_aligned(ptr, alignment);
-}
-
 /**
  * @brief Allocates sufficient host-accessible memory to satisfy the requested size `bytes` with
  * alignment `alignment` using the unary callable `alloc` to allocate memory.
diff --git a/include/rmm/detail/nvtx/ranges.hpp b/include/rmm/detail/nvtx/ranges.hpp
new file mode 100644
index 000000000..1791a8c3b
--- /dev/null
+++ b/include/rmm/detail/nvtx/ranges.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nvtx3/nvtx3.hpp>
+
+namespace rmm {
+/**
+ * @brief Tag type for librmm's NVTX domain.
+ */
+struct librmm_domain {
+  static constexpr char const* name{"librmm"};  ///< Name of the librmm domain
+};
+
+/**
+ * @brief Alias for an NVTX range in the librmm domain.
+ *
+ * Customizes an NVTX range with the given input.
+ *
+ * Example:
+ * ```
+ * void some_function(){
+ *    rmm::scoped_range rng{"custom_name"}; // Customizes range name
+ *    ...
+ * }
+ * ```
+ */
+using scoped_range = ::nvtx3::scoped_range_in<librmm_domain>;
+
+}  // namespace rmm
+
+/**
+ * @brief Convenience macro for generating an NVTX range in the `librmm` domain
+ * from the lifetime of a function.
+ *
+ * Uses the name of the immediately enclosing function returned by `__func__` to
+ * name the range.
+ *
+ * Example:
+ * ```
+ * void some_function(){
+ *    RMM_FUNC_RANGE();
+ *    ...
+ * }
+ * ```
+ */
+#define RMM_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(rmm::librmm_domain)
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index ac6b72076..a51e6b886 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -25,7 +25,6 @@
 
 #include <cuda/std/type_traits>
 #include <cuda_runtime_api.h>
-#include <thrust/optional.h>
 
 #include <cstddef>
 #include <limits>
@@ -68,40 +67,6 @@ class cuda_async_memory_resource final : public device_memory_resource {
     win32_kmt = 0x4  ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
   };
 
-  /**
-   * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool size
-   * and release threshold.
-   *
-   * If the pool size grows beyond the release threshold, unused memory held by the pool will be
-   * released at the next synchronization event.
-   *
-   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync`
-   *
-   * @param initial_pool_size Optional initial size in bytes of the pool. If no value is provided,
-   * initial pool size is half of the available GPU memory.
-   * @param release_threshold Optional release threshold size in bytes of the pool. If no value is
-   * provided, the release threshold is set to the total amount of memory on the current device.
-   * @param export_handle_type Optional `cudaMemAllocationHandleType` that allocations from this
-   * resource should support interprocess communication (IPC). Default is
-   * `cudaMemHandleTypeNone` for no IPC support.
-   */
-  // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-  template <class Optional,
-            cuda::std::enable_if_t<cuda::std::is_same_v<cuda::std::remove_cvref_t<Optional>,
-                                                        thrust::optional<std::size_t>>,
-                                   int> = 0>
-  [[deprecated("Use std::optional instead of thrust::optional.")]]  //
-  explicit cuda_async_memory_resource(
-    Optional initial_pool_size,
-    Optional release_threshold                                  = {},
-    thrust::optional<allocation_handle_type> export_handle_type = {})
-    : cuda_async_memory_resource(initial_pool_size.value_or(std::nullopt),
-                                 release_threshold.value_or(std::nullopt),
-                                 export_handle_type.value_or(std::nullopt))
-
-  {
-  }
-
   /**
    * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool size
    * and release threshold.
diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index c07730f70..9ca695b9d 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -23,7 +23,6 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <cuda_runtime_api.h>
-#include <thrust/optional.h>
 
 #include <cstddef>
 #include <limits>
diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp
index 24190f2b4..783dff2ed 100644
--- a/include/rmm/mr/device/device_memory_resource.hpp
+++ b/include/rmm/mr/device/device_memory_resource.hpp
@@ -17,6 +17,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/aligned.hpp>
+#include <rmm/detail/nvtx/ranges.hpp>
 
 #include <cuda/memory_resource>
 
@@ -117,6 +118,7 @@ class device_memory_resource {
    */
   void* allocate(std::size_t bytes, cuda_stream_view stream = cuda_stream_view{})
   {
+    RMM_FUNC_RANGE();
     return do_allocate(bytes, stream);
   }
 
@@ -138,6 +140,7 @@ class device_memory_resource {
    */
   void deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream = cuda_stream_view{})
   {
+    RMM_FUNC_RANGE();
     do_deallocate(ptr, bytes, stream);
   }
 
@@ -173,6 +176,7 @@ class device_memory_resource {
    */
   void* allocate(std::size_t bytes, std::size_t alignment)
   {
+    RMM_FUNC_RANGE();
     return do_allocate(rmm::align_up(bytes, alignment), cuda_stream_view{});
   }
 
@@ -191,6 +195,7 @@ class device_memory_resource {
    */
   void deallocate(void* ptr, std::size_t bytes, std::size_t alignment)
   {
+    RMM_FUNC_RANGE();
     do_deallocate(ptr, rmm::align_up(bytes, alignment), cuda_stream_view{});
   }
 
@@ -209,6 +214,7 @@ class device_memory_resource {
    */
   void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream)
   {
+    RMM_FUNC_RANGE();
     return do_allocate(rmm::align_up(bytes, alignment), stream);
   }
 
@@ -226,6 +232,7 @@ class device_memory_resource {
    */
   void* allocate_async(std::size_t bytes, cuda_stream_view stream)
   {
+    RMM_FUNC_RANGE();
     return do_allocate(bytes, stream);
   }
 
@@ -248,6 +255,7 @@ class device_memory_resource {
                         std::size_t alignment,
                         cuda_stream_view stream)
   {
+    RMM_FUNC_RANGE();
     do_deallocate(ptr, rmm::align_up(bytes, alignment), stream);
   }
 
@@ -266,6 +274,7 @@ class device_memory_resource {
    */
   void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream)
   {
+    RMM_FUNC_RANGE();
     do_deallocate(ptr, bytes, stream);
   }
 
@@ -293,52 +302,6 @@ class device_memory_resource {
     return !do_is_equal(other);
   }
 
-  /**
-   * @brief Query whether the resource supports use of non-null CUDA streams for
-   * allocation/deallocation.
-   *
-   * @deprecated Functionality removed in favor of cuda::mr::async_memory_resource.
-   *
-   * @returns bool true if the resource supports non-null CUDA streams.
-   */
-  [[deprecated("Functionality removed in favor of cuda::mr::async_memory_resource.")]]  //
-  [[nodiscard]] virtual bool
-  supports_streams() const noexcept
-  {
-    return false;
-  }
-
-  /**
-   * @brief Query whether the resource supports the get_mem_info API.
-   *
-   * @deprecated Use rmm::available_device_memory instead.
-   *
-   * @return bool true if the resource supports get_mem_info, false otherwise.
-   */
-  [[deprecated("Use rmm::available_device_memory instead.")]]  //
-  [[nodiscard]] virtual bool
-  supports_get_mem_info() const noexcept
-  {
-    return false;
-  };
-
-  /**
-   * @brief Queries the amount of free and total memory for the resource.
-   *
-   * @deprecated Use rmm::available_device_memory instead.
-   *
-   * @param stream the stream whose memory manager we want to retrieve
-   *
-   * @returns a pair containing the free memory in bytes in .first and total amount of memory in
-   * .second
-   */
-  [[deprecated("Use rmm::available_device_memory instead.")]]  //
-  [[nodiscard]] std::pair<std::size_t, std::size_t>
-  get_mem_info(cuda_stream_view stream) const
-  {
-    return {0, 0};
-  }
-
   /**
    * @brief Enables the `cuda::mr::device_accessible` property
    *
diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp
index e2fb4b0cf..0b63b4691 100644
--- a/include/rmm/mr/device/polymorphic_allocator.hpp
+++ b/include/rmm/mr/device/polymorphic_allocator.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -26,7 +25,11 @@
 #include <type_traits>
 
 namespace rmm::mr {
-
+/**
+ * @addtogroup device_memory_resources
+ * @{
+ * @file
+ */
 /**
  * @brief A stream ordered Allocator using a `rmm::mr::device_memory_resource` to satisfy
  * (de)allocations.
@@ -55,21 +58,21 @@ class polymorphic_allocator {
   /**
    * @brief Construct a `polymorphic_allocator` using the provided memory resource.
    *
-   * This constructor provides an implicit conversion from `memory_resource*`.
+   * This constructor provides an implicit conversion from `device_async_resource_ref`.
    *
-   * @param mr The `device_memory_resource` to use as the underlying resource.
+   * @param mr The upstream memory resource to use for allocation.
    */
-  polymorphic_allocator(device_memory_resource* mr) : mr_{mr} {}
+  polymorphic_allocator(device_async_resource_ref mr) : mr_{mr} {}
 
   /**
-   * @brief Construct a `polymorphic_allocator` using `other.resource()` as the underlying memory
-   * resource.
+   * @brief Construct a `polymorphic_allocator` using the underlying memory resource of `other`.
    *
-   * @param other The `polymorphic_resource` whose `resource()` will be used as the underlying
+   * @param other The `polymorphic_allocator` whose memory resource will be used as the underlying
    * resource of the new `polymorphic_allocator`.
    */
   template <typename U>
-  polymorphic_allocator(polymorphic_allocator<U> const& other) noexcept : mr_{other.resource()}
+  polymorphic_allocator(polymorphic_allocator<U> const& other) noexcept
+    : mr_{other.get_upstream_resource()}
   {
   }
 
@@ -82,14 +85,15 @@ class polymorphic_allocator {
    */
   value_type* allocate(std::size_t num, cuda_stream_view stream)
   {
-    return static_cast<value_type*>(resource()->allocate(num * sizeof(T), stream));
+    return static_cast<value_type*>(
+      get_upstream_resource().allocate_async(num * sizeof(T), stream));
   }
 
   /**
    * @brief Deallocates storage pointed to by `ptr`.
    *
-   * `ptr` must have been allocated from a `rmm::mr::device_memory_resource` `r` that compares equal
-   * to `*resource()` using `r.allocate(n * sizeof(T))`.
+   * `ptr` must have been allocated from a memory resource `r` that compares equal
+   * to `get_upstream_resource()` using `r.allocate(n * sizeof(T))`.
    *
    * @param ptr Pointer to memory to deallocate
    * @param num Number of objects originally allocated
@@ -97,7 +101,7 @@ class polymorphic_allocator {
    */
   void deallocate(value_type* ptr, std::size_t num, cuda_stream_view stream)
   {
-    resource()->deallocate(ptr, num * sizeof(T), stream);
+    get_upstream_resource().deallocate_async(ptr, num * sizeof(T), stream);
   }
 
   /**
@@ -108,24 +112,40 @@ class polymorphic_allocator {
     return mr_;
   }
 
-  /**
-   * @brief Returns pointer to the underlying `rmm::mr::device_memory_resource`.
-   *
-   * @return Pointer to the underlying resource.
-   */
-  [[nodiscard]] device_memory_resource* resource() const noexcept { return mr_; }
-
  private:
-  device_memory_resource* mr_{
+  rmm::device_async_resource_ref mr_{
     get_current_device_resource()};  ///< Underlying resource used for (de)allocation
 };
 
+/**
+ * @brief Compare two `polymorphic_allocator`s for equality.
+ *
+ * Two `polymorphic_allocator`s are equal if their underlying memory resources compare equal.
+ *
+ * @tparam T Type of the first allocator
+ * @tparam U Type of the second allocator
+ * @param lhs The first allocator to compare
+ * @param rhs The second allocator to compare
+ * @return true if the two allocators are equal, false otherwise
+ */
 template <typename T, typename U>
 bool operator==(polymorphic_allocator<T> const& lhs, polymorphic_allocator<U> const& rhs)
 {
-  return lhs.resource()->is_equal(*rhs.resource());
+  return lhs.get_upstream_resource() == rhs.get_upstream_resource();
 }
 
+/**
+ * @brief Compare two `polymorphic_allocator`s for inequality.
+ *
+ * Two `polymorphic_allocator`s are not equal if their underlying memory resources compare not
+ * equal.
+ *
+ * @tparam T Type of the first allocator
+ * @tparam U Type of the second allocator
+ * @param lhs The first allocator to compare
+ * @param rhs The second allocator to compare
+ * @return true if the two allocators are not equal, false otherwise
+ */
 template <typename T, typename U>
 bool operator!=(polymorphic_allocator<T> const& lhs, polymorphic_allocator<U> const& rhs)
 {
@@ -237,12 +257,34 @@ class stream_allocator_adaptor {
   cuda_stream_view stream_;  ///< Stream on which (de)allocations are performed
 };
 
+/**
+ * @brief Compare two `stream_allocator_adaptor`s for equality.
+ *
+ * Two `stream_allocator_adaptor`s are equal if their underlying allocators compare equal.
+ *
+ * @tparam A Type of the first allocator
+ * @tparam O Type of the second allocator
+ * @param lhs The first allocator to compare
+ * @param rhs The second allocator to compare
+ * @return true if the two allocators are equal, false otherwise
+ */
 template <typename A, typename O>
 bool operator==(stream_allocator_adaptor<A> const& lhs, stream_allocator_adaptor<O> const& rhs)
 {
   return lhs.underlying_allocator() == rhs.underlying_allocator();
 }
 
+/**
+ * @brief Compare two `stream_allocator_adaptor`s for inequality.
+ *
+ * Two `stream_allocator_adaptor`s are not equal if their underlying allocators compare not equal.
+ *
+ * @tparam A Type of the first allocator
+ * @tparam O Type of the second allocator
+ * @param lhs The first allocator to compare
+ * @param rhs The second allocator to compare
+ * @return true if the two allocators are not equal, false otherwise
+ */
 template <typename A, typename O>
 bool operator!=(stream_allocator_adaptor<A> const& lhs, stream_allocator_adaptor<O> const& rhs)
 {
@@ -264,5 +306,5 @@ auto make_stream_allocator_adaptor(Allocator const& allocator, cuda_stream_view
 {
   return stream_allocator_adaptor<Allocator>{allocator, stream};
 }
-
+/** @} */  // end of group
 }  // namespace rmm::mr
diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index 4cbdeef4a..a3a972904 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -75,18 +75,19 @@ struct maybe_remove_property<PoolResource,
                              Upstream,
                              Property,
                              cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {
-#ifdef __GNUC__  // GCC warns about compatibility issues with pre ISO C++ code
+#if defined(__GNUC__) && !defined(__clang__)  // GCC warns about compatibility
+                                              // issues with pre ISO C++ code
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wnon-template-friend"
-#endif  // __GNUC__
+#endif  // __GNUC__ and not __clang__
   /**
    * @brief Explicit removal of the friend function so we do not pretend to provide device
    * accessible memory
    */
   friend void get_property(const PoolResource&, Property) = delete;
-#ifdef __GNUC__
+#if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic pop
-#endif  // __GNUC__
+#endif  // __GNUC__ and not __clang__
 };
 }  // namespace detail
 
@@ -111,147 +112,6 @@ class pool_memory_resource final
   friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
                                                       detail::coalescing_free_list>;
 
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory
-   * pool using `upstream_mr`.
-   *
-   * @deprecated Use the constructor that takes an explicit initial pool size instead.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available memory from the upstream resource.
-   */
-  template <class Optional,
-            cuda::std::enable_if_t<cuda::std::is_same_v<cuda::std::remove_cvref_t<Optional>,
-                                                        thrust::optional<std::size_t>>,
-                                   int> = 0>
-  [[deprecated(
-    "Must specify initial_pool_size and use std::optional instead of thrust::optional.")]]  //
-  explicit pool_memory_resource(Upstream* upstream_mr,
-                                Optional initial_pool_size,
-                                Optional maximum_pool_size = thrust::nullopt)
-    : pool_memory_resource(
-        upstream_mr, initial_pool_size.value_or(0), maximum_pool_size.value_or(std::nullopt))
-  {
-  }
-
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory
-   * pool using `upstream_mr`.
-   *
-   * @deprecated Use the constructor that takes an explicit initial pool size instead.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available memory from the upstream resource.
-   */
-  [[deprecated("Must specify initial_pool_size")]]  //
-  explicit pool_memory_resource(Upstream* upstream_mr,
-                                std::optional<std::size_t> initial_pool_size = std::nullopt,
-                                std::optional<std::size_t> maximum_pool_size = std::nullopt)
-    : pool_memory_resource(upstream_mr, initial_pool_size.value_or(0), maximum_pool_size)
-  {
-  }
-
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
-   * `upstream_mr`.
-   *
-   * @deprecated Use the constructor that takes an explicit initial pool size instead.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available memory from the upstream resource.
-   */
-  template <class Optional,
-            cuda::std::enable_if_t<cuda::std::is_same_v<cuda::std::remove_cvref_t<Optional>,
-                                                        thrust::optional<std::size_t>>,
-                                   int> = 0>
-  [[deprecated(
-    "Must specify initial_pool_size and use std::optional instead of thrust::optional.")]]  //
-  explicit pool_memory_resource(Upstream& upstream_mr,
-                                Optional initial_pool_size,
-                                Optional maximum_pool_size = thrust::nullopt)
-    : pool_memory_resource(
-        upstream_mr, initial_pool_size.value_or(0), maximum_pool_size.value_or(std::nullopt))
-  {
-  }
-
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
-   * `upstream_mr`.
-   *
-   * @deprecated Use the constructor that takes an explicit initial pool size instead.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available memory from the upstream resource.
-   */
-  template <typename Upstream2                                               = Upstream,
-            cuda::std::enable_if_t<cuda::mr::async_resource<Upstream2>, int> = 0>
-  [[deprecated("Must specify initial_pool_size")]]  //
-  explicit pool_memory_resource(Upstream2& upstream_mr,
-                                std::optional<std::size_t> initial_pool_size = std::nullopt,
-                                std::optional<std::size_t> maximum_pool_size = std::nullopt)
-    : pool_memory_resource(upstream_mr, initial_pool_size.value_or(0), maximum_pool_size)
-  {
-  }
-
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
-   * `upstream_mr`.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of
-   * pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available from the upstream resource.
-   */
-  template <class Optional,
-            cuda::std::enable_if_t<cuda::std::is_same_v<cuda::std::remove_cvref_t<Optional>,
-                                                        thrust::optional<std::size_t>>,
-                                   int> = 0>
-  [[deprecated("Use std::optional instead of thrust::optional.")]]  //
-  explicit pool_memory_resource(Upstream* upstream_mr,
-                                std::size_t initial_pool_size,
-                                Optional maximum_pool_size)
-    : pool_memory_resource(upstream_mr, initial_pool_size, maximum_pool_size.value_or(std::nullopt))
-  {
-  }
-
   /**
    * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
    * `upstream_mr`.
@@ -283,35 +143,6 @@ class pool_memory_resource final
     initialize_pool(initial_pool_size, maximum_pool_size);
   }
 
-  /**
-   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
-   * `upstream_mr`.
-   *
-   * @throws rmm::logic_error if `upstream_mr == nullptr`
-   * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of
-   * pool_memory_resource::allocation_alignment bytes.
-   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
-   * multiple of pool_memory_resource::allocation_alignment bytes.
-   *
-   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
-   * @param initial_pool_size Minimum size, in bytes, of the initial pool.
-   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
-   * of the available memory from the upstream resource.
-   */
-  template <class Optional,
-            cuda::std::enable_if_t<cuda::std::is_same_v<cuda::std::remove_cvref_t<Optional>,
-                                                        thrust::optional<std::size_t>>,
-                                   int> = 0>
-  [[deprecated("Use std::optional instead of thrust::optional.")]]  //
-  explicit pool_memory_resource(Upstream& upstream_mr,
-                                std::size_t initial_pool_size,
-                                Optional maximum_pool_size)
-    : pool_memory_resource(cuda::std::addressof(upstream_mr),
-                           initial_pool_size,
-                           maximum_pool_size.value_or(std::nullopt))
-  {
-  }
-
   /**
    * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
    * `upstream_mr`.
diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
index 41973e04b..3bfd65996 100644
--- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp
+++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/detail/thrust_namespace.h>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -39,6 +40,9 @@ namespace rmm::mr {
  * allocate objects of a specific type `T`, but can be freely rebound to other
  * types.
  *
+ * The allocator records the current cuda device and may only be used with a backing
+ * `device_async_resource_ref` valid for the same device.
+ *
  * @tparam T The type of the objects that will be allocated by this allocator
  */
 template <typename T>
@@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
    */
   template <typename U>
   thrust_allocator(thrust_allocator<U> const& other)
-    : _mr(other.resource()), _stream{other.stream()}
+    : _mr(other.resource()), _stream{other.stream()}, _device{other._device}
   {
   }
 
@@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
    */
   pointer allocate(size_type num)
   {
+    cuda_set_device_raii dev{_device};
     return thrust::device_pointer_cast(
       static_cast<T*>(_mr.allocate_async(num * sizeof(T), _stream)));
   }
@@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
    */
   void deallocate(pointer ptr, size_type num)
   {
+    cuda_set_device_raii dev{_device};
     return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
   }
 
@@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
  private:
   cuda_stream_view _stream{};
   rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  cuda_device_id _device{get_current_cuda_device()};
 };
 /** @} */  // end of group
 }  // namespace rmm::mr
diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp
index ce870287c..d8bb311c8 100644
--- a/include/rmm/mr/host/host_memory_resource.hpp
+++ b/include/rmm/mr/host/host_memory_resource.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#include <rmm/detail/nvtx/ranges.hpp>
+
 #include <cuda/memory_resource>
 
 #include <cstddef>
@@ -76,6 +78,7 @@ class host_memory_resource {
    */
   void* allocate(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t))
   {
+    RMM_FUNC_RANGE();
     return do_allocate(bytes, alignment);
   }
 
@@ -94,6 +97,7 @@ class host_memory_resource {
    */
   void deallocate(void* ptr, std::size_t bytes, std::size_t alignment = alignof(std::max_align_t))
   {
+    RMM_FUNC_RANGE();
     do_deallocate(ptr, bytes, alignment);
   }
 
diff --git a/include/rmm/mr/pinned_host_memory_resource.hpp b/include/rmm/mr/pinned_host_memory_resource.hpp
index ee409dbac..6bca05376 100644
--- a/include/rmm/mr/pinned_host_memory_resource.hpp
+++ b/include/rmm/mr/pinned_host_memory_resource.hpp
@@ -18,6 +18,7 @@
 #include <rmm/aligned.hpp>
 #include <rmm/detail/aligned.hpp>
 #include <rmm/detail/error.hpp>
+#include <rmm/detail/nvtx/ranges.hpp>
 
 #include <cuda/memory_resource>
 #include <cuda/stream_ref>
@@ -63,6 +64,8 @@ class pinned_host_memory_resource {
   static void* allocate(std::size_t bytes,
                         [[maybe_unused]] std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
   {
+    RMM_FUNC_RANGE();
+
     // don't allocate anything if the user requested zero bytes
     if (0 == bytes) { return nullptr; }
 
@@ -84,6 +87,8 @@ class pinned_host_memory_resource {
                          std::size_t bytes,
                          std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
   {
+    RMM_FUNC_RANGE();
+
     rmm::detail::aligned_host_deallocate(
       ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); });
   }
@@ -104,6 +109,8 @@ class pinned_host_memory_resource {
    */
   static void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream)
   {
+    RMM_FUNC_RANGE();
+
     return allocate(bytes);
   }
 
@@ -126,6 +133,8 @@ class pinned_host_memory_resource {
                               std::size_t alignment,
                               [[maybe_unused]] cuda::stream_ref stream)
   {
+    RMM_FUNC_RANGE();
+
     return allocate(bytes, alignment);
   }
 
@@ -142,6 +151,8 @@ class pinned_host_memory_resource {
                                std::size_t bytes,
                                [[maybe_unused]] cuda::stream_ref stream) noexcept
   {
+    RMM_FUNC_RANGE();
+
     return deallocate(ptr, bytes);
   }
 
@@ -161,6 +172,8 @@ class pinned_host_memory_resource {
                                std::size_t alignment,
                                [[maybe_unused]] cuda::stream_ref stream) noexcept
   {
+    RMM_FUNC_RANGE();
+
     return deallocate(ptr, bytes, alignment);
   }
   // NOLINTEND(bugprone-easily-swappable-parameters)
diff --git a/include/rmm/thrust_rmm_allocator.h b/include/rmm/thrust_rmm_allocator.h
deleted file mode 100644
index ad71e107a..000000000
--- a/include/rmm/thrust_rmm_allocator.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/detail/thrust_namespace.h>
-#include <rmm/device_vector.hpp>
-#include <rmm/mr/device/thrust_allocator_adaptor.hpp>
-
-#include <thrust/execution_policy.h>
-
-namespace rmm {
-
-using par_t         = decltype(thrust::cuda::par(*(new rmm::mr::thrust_allocator<char>())));
-using deleter_t     = std::function<void(par_t*)>;
-using exec_policy_t = std::unique_ptr<par_t, deleter_t>;
-
-/**
- * @brief Returns a unique_ptr to a Thrust CUDA execution policy that uses RMM
- * for temporary memory allocation.
- *
- * @param stream The stream that the allocator will use
- *
- * @return A Thrust execution policy that will use RMM for temporary memory
- * allocation.
- */
-[[deprecated("Use new exec_policy in rmm/exec_policy.hpp")]] inline exec_policy_t exec_policy(
-  cudaStream_t stream = nullptr)
-{
-  // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
-  auto* alloc  = new rmm::mr::thrust_allocator<char>(cuda_stream_view{stream});
-  auto deleter = [alloc](par_t* pointer) {
-    delete alloc;    // NOLINT(cppcoreguidelines-owning-memory)
-    delete pointer;  // NOLINT(cppcoreguidelines-owning-memory)
-  };
-
-  exec_policy_t policy{new par_t(*alloc), deleter};
-  return policy;
-}
-
-}  // namespace rmm
diff --git a/python/LICENSE b/python/LICENSE
deleted file mode 120000
index ea5b60640..000000000
--- a/python/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../LICENSE
\ No newline at end of file
diff --git a/python/README.md b/python/README.md
deleted file mode 120000
index 32d46ee88..000000000
--- a/python/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../README.md
\ No newline at end of file
diff --git a/python/librmm/CMakeLists.txt b/python/librmm/CMakeLists.txt
new file mode 100644
index 000000000..a6035ac83
--- /dev/null
+++ b/python/librmm/CMakeLists.txt
@@ -0,0 +1,38 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+project(
+  librmm-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX)
+
+# Check if rmm is already available. If so, it's the user's responsibility to ensure that the CMake
+# package is also available at build time of the Python rmm package.
+find_package(rmm "${RAPIDS_VERSION}")
+
+if(rmm_FOUND)
+  return()
+endif()
+
+unset(rmm_FOUND)
+
+set(BUILD_TESTS OFF)
+set(BUILD_BENCHMARKS OFF)
+set(CUDA_STATIC_RUNTIME ON)
+
+add_subdirectory(../.. rmm-cpp)
diff --git a/python/librmm/LICENSE b/python/librmm/LICENSE
new file mode 120000
index 000000000..30cff7403
--- /dev/null
+++ b/python/librmm/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/librmm/README.md b/python/librmm/README.md
new file mode 120000
index 000000000..fe8400541
--- /dev/null
+++ b/python/librmm/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/librmm/librmm/VERSION b/python/librmm/librmm/VERSION
new file mode 120000
index 000000000..d62dc733e
--- /dev/null
+++ b/python/librmm/librmm/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/librmm/librmm/__init__.py b/python/librmm/librmm/__init__.py
new file mode 100644
index 000000000..b914ecdc3
--- /dev/null
+++ b/python/librmm/librmm/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from librmm._version import __git_commit__, __version__
diff --git a/python/librmm/librmm/_version.py b/python/librmm/librmm/_version.py
new file mode 100644
index 000000000..ea50101b2
--- /dev/null
+++ b/python/librmm/librmm/_version.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files("librmm").joinpath("VERSION").read_text().strip()
+)
+__git_commit__ = ""
diff --git a/python/librmm/pyproject.toml b/python/librmm/pyproject.toml
new file mode 100644
index 000000000..4b997be9a
--- /dev/null
+++ b/python/librmm/pyproject.toml
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+build-backend = "scikit_build_core.build"
+requires = [
+    "cmake>=3.26.4",
+    "ninja",
+    "scikit-build-core[pyproject]>=0.7.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "librmm"
+dynamic = ["version"]
+description = "rmm - RAPIDS Memory Manager"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: C++",
+    "Environment :: GPU :: NVIDIA CUDA",
+]
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/rmm"
+
+[project.entry-points."cmake.prefix"]
+librmm = "librmm"
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.minimum-version = "3.26.4"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["librmm"]
+wheel.install-dir = "librmm"
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "librmm/VERSION"
+regex = "(?P<value>.*)"
diff --git a/python/.coveragerc b/python/rmm/.coveragerc
similarity index 72%
rename from python/.coveragerc
rename to python/rmm/.coveragerc
index 6b49c4df6..00997a39b 100644
--- a/python/.coveragerc
+++ b/python/rmm/.coveragerc
@@ -2,3 +2,4 @@
 [run]
 include = rmm/*
 omit = rmm/tests/*
+disable_warnings=include-ignored
diff --git a/python/CMakeLists.txt b/python/rmm/CMakeLists.txt
similarity index 67%
rename from python/CMakeLists.txt
rename to python/rmm/CMakeLists.txt
index 87752ff24..6c2515102 100644
--- a/python/CMakeLists.txt
+++ b/python/rmm/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -14,34 +14,19 @@
 
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
-include(../rapids_config.cmake)
+include(../../rapids_config.cmake)
 
 project(
   rmm-python
   VERSION "${RAPIDS_VERSION}"
   LANGUAGES CXX)
 
-option(FIND_RMM_CPP "Search for existing RMM C++ installations before defaulting to local files"
-       OFF)
-
-# If the user requested it we attempt to find RMM.
-if(FIND_RMM_CPP)
-  find_package(rmm "${RAPIDS_VERSION}")
-else()
-  set(rmm_FOUND OFF)
-endif()
-
-if(NOT rmm_FOUND)
-  set(BUILD_TESTS OFF)
-  set(BUILD_BENCHMARKS OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-
-  add_subdirectory(../ rmm-cpp EXCLUDE_FROM_ALL)
-endif()
+find_package(rmm "${RAPIDS_VERSION}" REQUIRED)
 
 include(rapids-cython-core)
 rapids_cython_init()
 
+# pass through logging level to spdlog
 add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
 
 add_subdirectory(rmm/_cuda)
diff --git a/python/rmm/LICENSE b/python/rmm/LICENSE
new file mode 120000
index 000000000..30cff7403
--- /dev/null
+++ b/python/rmm/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/rmm/README.md b/python/rmm/README.md
new file mode 120000
index 000000000..fe8400541
--- /dev/null
+++ b/python/rmm/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/rmm/VERSION b/python/rmm/VERSION
deleted file mode 120000
index 558194c5a..000000000
--- a/python/rmm/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../VERSION
\ No newline at end of file
diff --git a/python/docs/Makefile b/python/rmm/docs/Makefile
similarity index 100%
rename from python/docs/Makefile
rename to python/rmm/docs/Makefile
diff --git a/python/docs/conf.py b/python/rmm/docs/conf.py
similarity index 94%
rename from python/docs/conf.py
rename to python/rmm/docs/conf.py
index d66e9d30c..dd8267e28 100644
--- a/python/docs/conf.py
+++ b/python/rmm/docs/conf.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -58,7 +60,7 @@
 ]
 
 # Breathe Configuration
-breathe_projects = {"librmm": "../../doxygen/xml"}
+breathe_projects = {"librmm": "../../../doxygen/xml"}
 breathe_default_project = "librmm"
 
 copybutton_prompt_text = ">>> "
@@ -89,6 +91,18 @@
 # This patterns also effect to html_static_path and html_extra_path
 exclude_patterns = []
 
+# List of warnings to suppress
+suppress_warnings = []
+
+# if the file deprecated.xml does not exist in the doxygen xml output,
+# breathe will fail to build the docs, so we conditionally add
+# "deprecated.rst" to the exclude_patterns list
+if not os.path.exists(
+    os.path.join(breathe_projects["librmm"], "deprecated.xml")
+):
+    exclude_patterns.append("librmm_docs/deprecated.rst")
+    suppress_warnings.append("toc.excluded")
+
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = "sphinx"
 
diff --git a/python/docs/cpp.rst b/python/rmm/docs/cpp.rst
similarity index 100%
rename from python/docs/cpp.rst
rename to python/rmm/docs/cpp.rst
diff --git a/python/docs/cpp_api.rst b/python/rmm/docs/cpp_api.rst
similarity index 100%
rename from python/docs/cpp_api.rst
rename to python/rmm/docs/cpp_api.rst
diff --git a/python/docs/guide.md b/python/rmm/docs/guide.md
similarity index 97%
rename from python/docs/guide.md
rename to python/rmm/docs/guide.md
index c06135ca8..968be8586 100644
--- a/python/docs/guide.md
+++ b/python/rmm/docs/guide.md
@@ -182,8 +182,8 @@ for memory allocations using their by configuring the current
 allocator.
 
 ```python
-from rmm.allocators.torch import rmm_torch_allocator
-import torch
+>>> from rmm.allocators.torch import rmm_torch_allocator
+>>> import torch
 
-torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+>>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
 ```
diff --git a/python/docs/index.rst b/python/rmm/docs/index.rst
similarity index 100%
rename from python/docs/index.rst
rename to python/rmm/docs/index.rst
diff --git a/python/docs/librmm_docs/cuda_device_management.rst b/python/rmm/docs/librmm_docs/cuda_device_management.rst
similarity index 100%
rename from python/docs/librmm_docs/cuda_device_management.rst
rename to python/rmm/docs/librmm_docs/cuda_device_management.rst
diff --git a/python/docs/librmm_docs/cuda_streams.rst b/python/rmm/docs/librmm_docs/cuda_streams.rst
similarity index 100%
rename from python/docs/librmm_docs/cuda_streams.rst
rename to python/rmm/docs/librmm_docs/cuda_streams.rst
diff --git a/python/docs/librmm_docs/data_containers.rst b/python/rmm/docs/librmm_docs/data_containers.rst
similarity index 100%
rename from python/docs/librmm_docs/data_containers.rst
rename to python/rmm/docs/librmm_docs/data_containers.rst
diff --git a/python/docs/librmm_docs/deprecated.rst b/python/rmm/docs/librmm_docs/deprecated.rst
similarity index 100%
rename from python/docs/librmm_docs/deprecated.rst
rename to python/rmm/docs/librmm_docs/deprecated.rst
diff --git a/python/docs/librmm_docs/errors.rst b/python/rmm/docs/librmm_docs/errors.rst
similarity index 100%
rename from python/docs/librmm_docs/errors.rst
rename to python/rmm/docs/librmm_docs/errors.rst
diff --git a/python/docs/librmm_docs/index.rst b/python/rmm/docs/librmm_docs/index.rst
similarity index 100%
rename from python/docs/librmm_docs/index.rst
rename to python/rmm/docs/librmm_docs/index.rst
diff --git a/python/docs/librmm_docs/logging.rst b/python/rmm/docs/librmm_docs/logging.rst
similarity index 100%
rename from python/docs/librmm_docs/logging.rst
rename to python/rmm/docs/librmm_docs/logging.rst
diff --git a/python/docs/librmm_docs/memory_resources.rst b/python/rmm/docs/librmm_docs/memory_resources.rst
similarity index 100%
rename from python/docs/librmm_docs/memory_resources.rst
rename to python/rmm/docs/librmm_docs/memory_resources.rst
diff --git a/python/docs/librmm_docs/thrust_integrations.rst b/python/rmm/docs/librmm_docs/thrust_integrations.rst
similarity index 100%
rename from python/docs/librmm_docs/thrust_integrations.rst
rename to python/rmm/docs/librmm_docs/thrust_integrations.rst
diff --git a/python/docs/librmm_docs/utilities.rst b/python/rmm/docs/librmm_docs/utilities.rst
similarity index 100%
rename from python/docs/librmm_docs/utilities.rst
rename to python/rmm/docs/librmm_docs/utilities.rst
diff --git a/python/docs/python.rst b/python/rmm/docs/python.rst
similarity index 100%
rename from python/docs/python.rst
rename to python/rmm/docs/python.rst
diff --git a/python/docs/python_api.rst b/python/rmm/docs/python_api.rst
similarity index 100%
rename from python/docs/python_api.rst
rename to python/rmm/docs/python_api.rst
diff --git a/python/pyproject.toml b/python/rmm/pyproject.toml
similarity index 86%
rename from python/pyproject.toml
rename to python/rmm/pyproject.toml
index 3f294d180..eb0d23380 100644
--- a/python/pyproject.toml
+++ b/python/rmm/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,9 +18,10 @@ requires = [
     "cmake>=3.26.4",
     "cuda-python>=11.7.1,<12.0a0",
     "cython>=3.0.0",
+    "librmm==24.6.*",
     "ninja",
     "scikit-build-core[pyproject]>=0.7.0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project]
 name = "rmm"
@@ -36,7 +37,7 @@ dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Topic :: Database",
@@ -52,7 +53,7 @@ classifiers = [
 test = [
     "pytest",
     "pytest-cov",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/rmm"
@@ -123,3 +124,9 @@ wheel.packages = ["rmm"]
 provider = "scikit_build_core.metadata.regex"
 input = "rmm/VERSION"
 regex = "(?P<value>.*)"
+
+[tool.pytest.ini_options]
+# treat warnings as errors
+filterwarnings = [
+    "error",
+]
diff --git a/python/rmm/rmm/VERSION b/python/rmm/rmm/VERSION
new file mode 120000
index 000000000..d62dc733e
--- /dev/null
+++ b/python/rmm/rmm/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/rmm/__init__.py b/python/rmm/rmm/__init__.py
similarity index 100%
rename from python/rmm/__init__.py
rename to python/rmm/rmm/__init__.py
diff --git a/python/rmm/_cuda/CMakeLists.txt b/python/rmm/rmm/_cuda/CMakeLists.txt
similarity index 100%
rename from python/rmm/_cuda/CMakeLists.txt
rename to python/rmm/rmm/_cuda/CMakeLists.txt
diff --git a/python/rmm/_cuda/__init__.pxd b/python/rmm/rmm/_cuda/__init__.pxd
similarity index 100%
rename from python/rmm/_cuda/__init__.pxd
rename to python/rmm/rmm/_cuda/__init__.pxd
diff --git a/python/rmm/_cuda/__init__.py b/python/rmm/rmm/_cuda/__init__.py
similarity index 100%
rename from python/rmm/_cuda/__init__.py
rename to python/rmm/rmm/_cuda/__init__.py
diff --git a/python/rmm/_cuda/gpu.py b/python/rmm/rmm/_cuda/gpu.py
similarity index 100%
rename from python/rmm/_cuda/gpu.py
rename to python/rmm/rmm/_cuda/gpu.py
diff --git a/python/rmm/_cuda/stream.pxd b/python/rmm/rmm/_cuda/stream.pxd
similarity index 100%
rename from python/rmm/_cuda/stream.pxd
rename to python/rmm/rmm/_cuda/stream.pxd
diff --git a/python/rmm/_cuda/stream.pyx b/python/rmm/rmm/_cuda/stream.pyx
similarity index 95%
rename from python/rmm/_cuda/stream.pyx
rename to python/rmm/rmm/_cuda/stream.pyx
index 4795cbb9f..4d5ff5232 100644
--- a/python/rmm/_cuda/stream.pyx
+++ b/python/rmm/rmm/_cuda/stream.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -100,7 +100,8 @@ cdef class Stream:
     def _init_from_cupy_stream(self, obj):
         try:
             import cupy
-            if isinstance(obj, cupy.cuda.stream.Stream):
+            if isinstance(obj, (cupy.cuda.stream.Stream,
+                                cupy.cuda.stream.ExternalStream)):
                 self._cuda_stream = <cudaStream_t><uintptr_t>(obj.ptr)
                 self._owner = obj
                 return
diff --git a/python/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/_lib/CMakeLists.txt
similarity index 100%
rename from python/rmm/_lib/CMakeLists.txt
rename to python/rmm/rmm/_lib/CMakeLists.txt
diff --git a/python/rmm/_lib/__init__.pxd b/python/rmm/rmm/_lib/__init__.pxd
similarity index 100%
rename from python/rmm/_lib/__init__.pxd
rename to python/rmm/rmm/_lib/__init__.pxd
diff --git a/python/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py
similarity index 100%
rename from python/rmm/_lib/__init__.py
rename to python/rmm/rmm/_lib/__init__.py
diff --git a/python/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp
similarity index 100%
rename from python/rmm/_lib/_torch_allocator.cpp
rename to python/rmm/rmm/_lib/_torch_allocator.cpp
diff --git a/python/rmm/_lib/cuda_stream.pxd b/python/rmm/rmm/_lib/cuda_stream.pxd
similarity index 100%
rename from python/rmm/_lib/cuda_stream.pxd
rename to python/rmm/rmm/_lib/cuda_stream.pxd
diff --git a/python/rmm/_lib/cuda_stream.pyx b/python/rmm/rmm/_lib/cuda_stream.pyx
similarity index 100%
rename from python/rmm/_lib/cuda_stream.pyx
rename to python/rmm/rmm/_lib/cuda_stream.pyx
diff --git a/python/rmm/_lib/cuda_stream_pool.pxd b/python/rmm/rmm/_lib/cuda_stream_pool.pxd
similarity index 100%
rename from python/rmm/_lib/cuda_stream_pool.pxd
rename to python/rmm/rmm/_lib/cuda_stream_pool.pxd
diff --git a/python/rmm/_lib/cuda_stream_view.pxd b/python/rmm/rmm/_lib/cuda_stream_view.pxd
similarity index 100%
rename from python/rmm/_lib/cuda_stream_view.pxd
rename to python/rmm/rmm/_lib/cuda_stream_view.pxd
diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd
similarity index 81%
rename from python/rmm/_lib/device_buffer.pxd
rename to python/rmm/rmm/_lib/device_buffer.pxd
index 3d5f29f9a..2ff1a7da9 100644
--- a/python/rmm/_lib/device_buffer.pxd
+++ b/python/rmm/rmm/_lib/device_buffer.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,17 +17,31 @@ from libcpp.memory cimport unique_ptr
 
 from rmm._cuda.stream cimport Stream
 from rmm._lib.cuda_stream_view cimport cuda_stream_view
-from rmm._lib.memory_resource cimport DeviceMemoryResource
+from rmm._lib.memory_resource cimport (
+    DeviceMemoryResource,
+    device_memory_resource,
+)
 
 
 cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
     cdef cppclass device_buffer:
         device_buffer()
-        device_buffer(size_t size, cuda_stream_view stream) except +
-        device_buffer(const void* source_data,
-                      size_t size, cuda_stream_view stream) except +
-        device_buffer(const device_buffer buf,
-                      cuda_stream_view stream) except +
+        device_buffer(
+            size_t size,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
+        device_buffer(
+            const void* source_data,
+            size_t size,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
+        device_buffer(
+            const device_buffer buf,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
         void reserve(size_t new_capacity, cuda_stream_view stream) except +
         void resize(size_t new_size, cuda_stream_view stream) except +
         void shrink_to_fit(cuda_stream_view stream) except +
@@ -51,7 +65,8 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=*
+        Stream stream=*,
+        DeviceMemoryResource mr=*,
     )
 
     @staticmethod
diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/_lib/device_buffer.pyx
similarity index 94%
rename from python/rmm/_lib/device_buffer.pyx
rename to python/rmm/rmm/_lib/device_buffer.pyx
index d248d01ab..9d2298d8b 100644
--- a/python/rmm/_lib/device_buffer.pyx
+++ b/python/rmm/rmm/_lib/device_buffer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 import numpy as np
 
 cimport cython
-from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_FromStringAndSize
+from cpython.bytes cimport PyBytes_FromStringAndSize
 from libc.stdint cimport uintptr_t
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -32,7 +32,11 @@ from cuda.ccudart cimport (
     cudaStream_t,
 )
 
-from rmm._lib.memory_resource cimport get_current_device_resource
+from rmm._lib.memory_resource cimport (
+    DeviceMemoryResource,
+    device_memory_resource,
+    get_current_device_resource,
+)
 
 
 # The DeviceMemoryResource attribute could be released prematurely
@@ -45,7 +49,8 @@ cdef class DeviceBuffer:
     def __cinit__(self, *,
                   uintptr_t ptr=0,
                   size_t size=0,
-                  Stream stream=DEFAULT_STREAM):
+                  Stream stream=DEFAULT_STREAM,
+                  DeviceMemoryResource mr=None):
         """Construct a ``DeviceBuffer`` with optional size and data pointer
 
         Parameters
@@ -62,6 +67,9 @@ cdef class DeviceBuffer:
             scope while the DeviceBuffer is in use. Destroying the
             underlying stream while the DeviceBuffer is in use will
             result in undefined behavior.
+        mr : optional
+           DeviceMemoryResource for the allocation, if not provided
+           defaults to the current device resource.
 
         Note
         ----
@@ -75,24 +83,23 @@ cdef class DeviceBuffer:
         >>> db = rmm.DeviceBuffer(size=5)
         """
         cdef const void* c_ptr
+        cdef device_memory_resource * mr_ptr
+        # Save a reference to the MR and stream used for allocation
+        self.mr = get_current_device_resource() if mr is None else mr
+        self.stream = stream
 
+        mr_ptr = self.mr.get_mr()
         with nogil:
             c_ptr = <const void*>ptr
 
-            if size == 0:
-                self.c_obj.reset(new device_buffer())
-            elif c_ptr == NULL:
-                self.c_obj.reset(new device_buffer(size, stream.view()))
+            if c_ptr == NULL or size == 0:
+                self.c_obj.reset(new device_buffer(size, stream.view(), mr_ptr))
             else:
-                self.c_obj.reset(new device_buffer(c_ptr, size, stream.view()))
+                self.c_obj.reset(new device_buffer(c_ptr, size, stream.view(), mr_ptr))
 
                 if stream.c_is_default():
                     stream.c_synchronize()
 
-        # Save a reference to the MR and stream used for allocation
-        self.mr = get_current_device_resource()
-        self.stream = stream
-
     def __len__(self):
         return self.size
 
@@ -160,13 +167,14 @@ cdef class DeviceBuffer:
     @staticmethod
     cdef DeviceBuffer c_from_unique_ptr(
         unique_ptr[device_buffer] ptr,
-        Stream stream=DEFAULT_STREAM
+        Stream stream=DEFAULT_STREAM,
+        DeviceMemoryResource mr=None,
     ):
         cdef DeviceBuffer buf = DeviceBuffer.__new__(DeviceBuffer)
         if stream.c_is_default():
             stream.c_synchronize()
         buf.c_obj = move(ptr)
-        buf.mr = get_current_device_resource()
+        buf.mr = get_current_device_resource() if mr is None else mr
         buf.stream = stream
         return buf
 
@@ -312,7 +320,7 @@ cdef class DeviceBuffer:
         cdef size_t s = dbp.size()
 
         cdef bytes b = PyBytes_FromStringAndSize(NULL, s)
-        cdef unsigned char* p = <unsigned char*>PyBytes_AS_STRING(b)
+        cdef unsigned char* p = b
         cdef unsigned char[::1] mv = (<unsigned char[:(s + 1):1]>p)[:s]
         self.copy_to_host(mv, stream)
 
diff --git a/python/rmm/_lib/device_uvector.pxd b/python/rmm/rmm/_lib/device_uvector.pxd
similarity index 100%
rename from python/rmm/_lib/device_uvector.pxd
rename to python/rmm/rmm/_lib/device_uvector.pxd
diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/rmm/_lib/lib.pxd
similarity index 100%
rename from python/rmm/_lib/lib.pxd
rename to python/rmm/rmm/_lib/lib.pxd
diff --git a/python/rmm/_lib/lib.pyx b/python/rmm/rmm/_lib/lib.pyx
similarity index 100%
rename from python/rmm/_lib/lib.pyx
rename to python/rmm/rmm/_lib/lib.pyx
diff --git a/python/rmm/_lib/logger.pyx b/python/rmm/rmm/_lib/logger.pyx
similarity index 100%
rename from python/rmm/_lib/logger.pyx
rename to python/rmm/rmm/_lib/logger.pyx
diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd
similarity index 97%
rename from python/rmm/_lib/memory_resource.pxd
rename to python/rmm/rmm/_lib/memory_resource.pxd
index 0770fb8ed..f9c2e91de 100644
--- a/python/rmm/_lib/memory_resource.pxd
+++ b/python/rmm/rmm/_lib/memory_resource.pxd
@@ -34,7 +34,7 @@ cdef extern from "rmm/mr/device/device_memory_resource.hpp" \
 
 cdef class DeviceMemoryResource:
     cdef shared_ptr[device_memory_resource] c_obj
-    cdef device_memory_resource* get_mr(self)
+    cdef device_memory_resource* get_mr(self) noexcept nogil
 
 cdef class UpstreamResourceAdaptor(DeviceMemoryResource):
     cdef readonly DeviceMemoryResource upstream_mr
diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
similarity index 99%
rename from python/rmm/_lib/memory_resource.pyx
rename to python/rmm/rmm/_lib/memory_resource.pyx
index 7458ca025..100d18b56 100644
--- a/python/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -218,7 +218,7 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \
 
 cdef class DeviceMemoryResource:
 
-    cdef device_memory_resource* get_mr(self):
+    cdef device_memory_resource* get_mr(self) noexcept nogil:
         """Get the underlying C++ memory resource object."""
         return self.c_obj.get()
 
diff --git a/python/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd
similarity index 100%
rename from python/rmm/_lib/per_device_resource.pxd
rename to python/rmm/rmm/_lib/per_device_resource.pxd
diff --git a/python/rmm/_lib/tests/__init__.py b/python/rmm/rmm/_lib/tests/__init__.py
similarity index 100%
rename from python/rmm/_lib/tests/__init__.py
rename to python/rmm/rmm/_lib/tests/__init__.py
diff --git a/python/rmm/_lib/tests/test_device_buffer.pyx b/python/rmm/rmm/_lib/tests/test_device_buffer.pyx
similarity index 100%
rename from python/rmm/_lib/tests/test_device_buffer.pyx
rename to python/rmm/rmm/_lib/tests/test_device_buffer.pyx
diff --git a/python/rmm/_version.py b/python/rmm/rmm/_version.py
similarity index 100%
rename from python/rmm/_version.py
rename to python/rmm/rmm/_version.py
diff --git a/python/rmm/allocators/__init__.py b/python/rmm/rmm/allocators/__init__.py
similarity index 100%
rename from python/rmm/allocators/__init__.py
rename to python/rmm/rmm/allocators/__init__.py
diff --git a/python/rmm/allocators/cupy.py b/python/rmm/rmm/allocators/cupy.py
similarity index 100%
rename from python/rmm/allocators/cupy.py
rename to python/rmm/rmm/allocators/cupy.py
diff --git a/python/rmm/allocators/numba.py b/python/rmm/rmm/allocators/numba.py
similarity index 100%
rename from python/rmm/allocators/numba.py
rename to python/rmm/rmm/allocators/numba.py
diff --git a/python/rmm/allocators/torch.py b/python/rmm/rmm/allocators/torch.py
similarity index 100%
rename from python/rmm/allocators/torch.py
rename to python/rmm/rmm/allocators/torch.py
diff --git a/python/rmm/mr.py b/python/rmm/rmm/mr.py
similarity index 100%
rename from python/rmm/mr.py
rename to python/rmm/rmm/mr.py
diff --git a/python/rmm/rmm.py b/python/rmm/rmm/rmm.py
similarity index 100%
rename from python/rmm/rmm.py
rename to python/rmm/rmm/rmm.py
diff --git a/python/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py
similarity index 100%
rename from python/rmm/tests/conftest.py
rename to python/rmm/rmm/tests/conftest.py
diff --git a/python/rmm/tests/test_cython.py b/python/rmm/rmm/tests/test_cython.py
similarity index 100%
rename from python/rmm/tests/test_cython.py
rename to python/rmm/rmm/tests/test_cython.py
diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
similarity index 97%
rename from python/rmm/tests/test_rmm.py
rename to python/rmm/rmm/tests/test_rmm.py
index 25ff9a7a6..c37fe0298 100644
--- a/python/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import functools
 import gc
 import os
 import pickle
@@ -498,6 +499,32 @@ def test_mr_devicebuffer_lifetime():
     del a
 
 
+def test_device_buffer_with_mr():
+    allocations = []
+    base = rmm.mr.CudaMemoryResource()
+    rmm.mr.set_current_device_resource(base)
+
+    def alloc_cb(size, stream, *, base):
+        allocations.append(size)
+        return base.allocate(size, stream)
+
+    def dealloc_cb(ptr, size, stream, *, base):
+        return base.deallocate(ptr, size, stream)
+
+    cb_mr = rmm.mr.CallbackMemoryResource(
+        functools.partial(alloc_cb, base=base),
+        functools.partial(dealloc_cb, base=base),
+    )
+    rmm.DeviceBuffer(size=10)
+    assert len(allocations) == 0
+    buf = rmm.DeviceBuffer(size=256, mr=cb_mr)
+    assert len(allocations) == 1
+    assert allocations[0] == 256
+    del cb_mr
+    gc.collect()
+    del buf
+
+
 def test_mr_upstream_lifetime():
     # Simple test to ensure upstream MRs are deallocated before downstream MR
     cuda_mr = rmm.mr.CudaMemoryResource()
diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/rmm/tests/test_rmm_pytorch.py
similarity index 100%
rename from python/rmm/tests/test_rmm_pytorch.py
rename to python/rmm/rmm/tests/test_rmm_pytorch.py
diff --git a/scripts/run-cmake-format.sh b/scripts/run-cmake-format.sh
index a7d9984b3..df7f22782 100755
--- a/scripts/run-cmake-format.sh
+++ b/scripts/run-cmake-format.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
 
 # This script is a wrapper for cmakelang that may be used with pre-commit. The
 # wrapping is necessary because RAPIDS libraries split configuration for
@@ -43,6 +44,7 @@ fi
 
 DEFAULT_FORMAT_FILE_LOCATIONS=(
   "${RMM_BUILD_DIR}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
+  "${RMM_BUILD_DIR:-build}/latest/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
 )
 
 if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then
diff --git a/tests/mr/device/polymorphic_allocator_tests.cpp b/tests/mr/device/polymorphic_allocator_tests.cpp
index 3b73d4a49..d433e010c 100644
--- a/tests/mr/device/polymorphic_allocator_tests.cpp
+++ b/tests/mr/device/polymorphic_allocator_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,14 +33,15 @@ struct allocator_test : public ::testing::Test {
 TEST_F(allocator_test, default_resource)
 {
   rmm::mr::polymorphic_allocator<int> allocator{};
-  EXPECT_EQ(allocator.resource(), rmm::mr::get_current_device_resource());
+  EXPECT_EQ(allocator.get_upstream_resource(),
+            rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
 }
 
 TEST_F(allocator_test, custom_resource)
 {
   rmm::mr::cuda_memory_resource mr;
   rmm::mr::polymorphic_allocator<int> allocator{&mr};
-  EXPECT_EQ(allocator.resource(), &mr);
+  EXPECT_EQ(allocator.get_upstream_resource(), rmm::device_async_resource_ref{mr});
 }
 
 void test_conversion(rmm::mr::polymorphic_allocator<int> /*unused*/) {}
@@ -48,7 +49,7 @@ void test_conversion(rmm::mr::polymorphic_allocator<int> /*unused*/) {}
 TEST_F(allocator_test, implicit_conversion)
 {
   rmm::mr::cuda_memory_resource mr;
-  test_conversion(&mr);
+  test_conversion(rmm::device_async_resource_ref{mr});
 }
 
 TEST_F(allocator_test, self_equality)
@@ -84,7 +85,7 @@ TEST_F(allocator_test, copy_ctor_same_type)
   rmm::mr::polymorphic_allocator<int> alloc0;
   rmm::mr::polymorphic_allocator<int> alloc1{alloc0};
   EXPECT_EQ(alloc0, alloc1);
-  EXPECT_EQ(alloc0.resource(), alloc1.resource());
+  EXPECT_EQ(alloc0.get_upstream_resource(), alloc1.get_upstream_resource());
 }
 
 TEST_F(allocator_test, copy_ctor_different_type)
@@ -92,7 +93,7 @@ TEST_F(allocator_test, copy_ctor_different_type)
   rmm::mr::polymorphic_allocator<int> alloc0;
   rmm::mr::polymorphic_allocator<double> alloc1{alloc0};
   EXPECT_EQ(alloc0, alloc1);
-  EXPECT_EQ(alloc0.resource(), alloc1.resource());
+  EXPECT_EQ(alloc0.get_upstream_resource(), alloc1.get_upstream_resource());
 }
 
 TEST_F(allocator_test, rebind)
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index b94d6b3e1..e855d1036 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -16,7 +16,9 @@
 
 #include "mr_ref_test.hpp"
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/error.hpp>
 #include <rmm/device_vector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/thrust_allocator_adaptor.hpp>
@@ -36,6 +38,7 @@ struct allocator_test : public mr_ref_test {};
 
 TEST_P(allocator_test, first)
 {
+  rmm::mr::set_current_device_resource(this->mr.get());
   auto const num_ints{100};
   rmm::device_vector<int> ints(num_ints, 1);
   EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end()));
@@ -43,12 +46,28 @@ TEST_P(allocator_test, first)
 
 TEST_P(allocator_test, defaults)
 {
+  rmm::mr::set_current_device_resource(this->mr.get());
   rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
   EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
   EXPECT_EQ(allocator.get_upstream_resource(),
             rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
 }
 
+TEST_P(allocator_test, multi_device)
+{
+  if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; }
+  cuda_set_device_raii with_device{rmm::get_current_cuda_device()};
+  rmm::cuda_stream stream{};
+  // make allocator on device-0
+  rmm::mr::thrust_allocator<int> allocator(stream.view(), this->ref);
+  auto const size{100};
+  EXPECT_NO_THROW([&]() {
+    auto vec = rmm::device_vector<int>(size, allocator);
+    // Destruct with device-1 active
+    RMM_CUDA_TRY(cudaSetDevice(1));
+  }());
+}
+
 INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
                         allocator_test,
                         ::testing::Values(mr_factory{"CUDA", &make_cuda},