diff --git a/.clang-format b/.clang-format index 35ebdbfae..951a549d0 100644 --- a/.clang-format +++ b/.clang-format @@ -46,6 +46,7 @@ BraceWrapping: SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterAttributes: Leave BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None BreakBeforeBraces: WebKit diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 1403b8cc4..b4b7f3f02 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 90a3732f4..bcdaaf53d 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 8dc4f8be9..cf5679cf7 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 66d332573..2e2cb99e1 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,12 +5,17 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c9a4cba83..334cae320 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,8 +10,14 @@ python/ @rapidsai/rmm-python-codeowners **/CMakeLists.txt @rapidsai/rmm-cmake-codeowners **/cmake/ @rapidsai/rmm-cmake-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -dependencies.yaml @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainers/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e4113d5c5..895cac94d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -66,22 +66,45 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" - wheel-build: + wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: + matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} - script: ci/build_wheel.sh - wheel-publish: - needs: wheel-build + script: ci/build_wheel_cpp.sh + wheel-build-python: + needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_python.sh + wheel-publish-cpp: + needs: wheel-build-cpp + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: rmm + package-type: cpp + wheel-publish-python: + needs: wheel-build-python + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: rmm + package-type: python diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d717b15fc..96e4503b7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,67 +18,76 @@ jobs: - conda-python-build - conda-python-tests - docs-build - - wheel-build + - wheel-build-cpp + - wheel-build-python - wheel-tests - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" - wheel-build: + wheel-build-cpp: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: + matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request - script: ci/build_wheel.sh + script: ci/build_wheel_cpp.sh + wheel-build-python: + needs: wheel-build-cpp + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + with: + build_type: pull-request + script: ci/build_wheel_python.sh wheel-tests: - needs: wheel-build + needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 with: arch: '["amd64"]' cuda: '["12.2"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0e7938310..747f9f544 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.gitignore b/.gitignore index f4993502b..2d0b150e1 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ dist/ rmm.egg-info/ python/build python/*/build +python/rmm/docs/_build python/rmm/**/_lib/**/*.cpp !python/rmm/_lib/_torch_allocator.cpp python/rmm/**/_lib/**/*.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4df64d11a..d21fcebf8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,14 +14,14 @@ repos: rev: 5.12.0 hooks: - id: isort - args: ["--settings-path=python/pyproject.toml"] + args: ["--settings-path=python/rmm/pyproject.toml"] files: python/.* types_or: [python, cython, pyi] - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - id: black - args: ["--config=python/pyproject.toml"] + args: ["--config=python/rmm/pyproject.toml"] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.15.0 hooks: @@ -81,6 +81,10 @@ repos: hooks: - id: ruff files: python/.*$ + - repo: https://github.com/rapidsai/pre-commit-hooks + rev: v0.0.3 + hooks: + - id: verify-copyright default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d870908d..70f788615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,55 @@ +# rmm 24.06.00 (5 Jun 2024) + +## 🚨 Breaking Changes + +- Refactor polymorphic allocator to use device_async_resource_ref ([#1555](https://github.com/rapidsai/rmm/pull/1555)) [@harrism](https://github.com/harrism) +- Remove deprecated functionality ([#1537](https://github.com/rapidsai/rmm/pull/1537)) [@harrism](https://github.com/harrism) +- Remove deprecated cuda_async_memory_resource constructor that takes thrust::optional parameters ([#1535](https://github.com/rapidsai/rmm/pull/1535)) [@harrism](https://github.com/harrism) +- Remove deprecated supports_streams and get_mem_info methods. ([#1519](https://github.com/rapidsai/rmm/pull/1519)) [@harrism](https://github.com/harrism) + +## 🐛 Bug Fixes + +- rmm needs to link to nvtx3::nvtx3-cpp to support installed nvtx3 ([#1569](https://github.com/rapidsai/rmm/pull/1569)) [@robertmaynard](https://github.com/robertmaynard) +- Make sure rmm wheel dependency on librmm is updated [skip ci] ([#1565](https://github.com/rapidsai/rmm/pull/1565)) [@raydouglass](https://github.com/raydouglass) +- Don't ignore GCC-specific warning under Clang ([#1557](https://github.com/rapidsai/rmm/pull/1557)) [@aaronmondal](https://github.com/aaronmondal) +- Add publish jobs for C++ wheels ([#1554](https://github.com/rapidsai/rmm/pull/1554)) [@vyasr](https://github.com/vyasr) +- Explicitly use the current device resource in DeviceBuffer ([#1514](https://github.com/rapidsai/rmm/pull/1514)) [@wence-](https://github.com/wence-) + +## 📖 Documentation + +- Allow specifying mr in DeviceBuffer construction, and document ownership requirements in Python/C++ interfacing ([#1552](https://github.com/rapidsai/rmm/pull/1552)) [@wence-](https://github.com/wence-) +- Fix Python install instruction ([#1547](https://github.com/rapidsai/rmm/pull/1547)) [@wence-](https://github.com/wence-) +- Update multi-gpu discussion for device_buffer and device_vector dtors ([#1524](https://github.com/rapidsai/rmm/pull/1524)) [@wence-](https://github.com/wence-) +- Fix ordering / heading levels in README.md and python example in guide.md ([#1513](https://github.com/rapidsai/rmm/pull/1513)) [@harrism](https://github.com/harrism) + +## 🚀 New Features + +- Add NVTX support and RMM_FUNC_RANGE() macro ([#1558](https://github.com/rapidsai/rmm/pull/1558)) [@harrism](https://github.com/harrism) +- Always use a static gtest ([#1532](https://github.com/rapidsai/rmm/pull/1532)) [@robertmaynard](https://github.com/robertmaynard) +- Build C++ wheel ([#1529](https://github.com/rapidsai/rmm/pull/1529)) [@vyasr](https://github.com/vyasr) +- Remove deprecated supports_streams and get_mem_info methods. ([#1519](https://github.com/rapidsai/rmm/pull/1519)) [@harrism](https://github.com/harrism) + +## 🛠️ Improvements + +- update copyright dates ([#1564](https://github.com/rapidsai/rmm/pull/1564)) [@jameslamb](https://github.com/jameslamb) +- Overhaul ops-codeowners ([#1561](https://github.com/rapidsai/rmm/pull/1561)) [@raydouglass](https://github.com/raydouglass) +- Adding support for cupy.cuda.stream.ExternalStream ([#1559](https://github.com/rapidsai/rmm/pull/1559)) [@lilohuang](https://github.com/lilohuang) +- Refactor polymorphic allocator to use device_async_resource_ref ([#1555](https://github.com/rapidsai/rmm/pull/1555)) [@harrism](https://github.com/harrism) +- add RAPIDS copyright pre-commit hook ([#1553](https://github.com/rapidsai/rmm/pull/1553)) [@jameslamb](https://github.com/jameslamb) +- Enable warnings as errors for Python tests ([#1551](https://github.com/rapidsai/rmm/pull/1551)) [@mroeschke](https://github.com/mroeschke) +- Remove header existence tests. ([#1550](https://github.com/rapidsai/rmm/pull/1550)) [@bdice](https://github.com/bdice) +- Only use functions in the limited API ([#1545](https://github.com/rapidsai/rmm/pull/1545)) [@vyasr](https://github.com/vyasr) +- Migrate to `{{ stdlib("c") }}` ([#1543](https://github.com/rapidsai/rmm/pull/1543)) [@hcho3](https://github.com/hcho3) +- Fix `cuda11.8` nvcc dependency ([#1542](https://github.com/rapidsai/rmm/pull/1542)) [@trxcllnt](https://github.com/trxcllnt) +- add --rm and --name to devcontainer run args ([#1539](https://github.com/rapidsai/rmm/pull/1539)) [@trxcllnt](https://github.com/trxcllnt) +- Remove deprecated functionality ([#1537](https://github.com/rapidsai/rmm/pull/1537)) [@harrism](https://github.com/harrism) +- Remove deprecated cuda_async_memory_resource constructor that takes thrust::optional parameters ([#1535](https://github.com/rapidsai/rmm/pull/1535)) [@harrism](https://github.com/harrism) +- Make thrust_allocator deallocate safe in multi-device setting ([#1533](https://github.com/rapidsai/rmm/pull/1533)) [@wence-](https://github.com/wence-) +- Move rmm Python package to subdirectory ([#1526](https://github.com/rapidsai/rmm/pull/1526)) [@vyasr](https://github.com/vyasr) +- Remove a file not being used ([#1521](https://github.com/rapidsai/rmm/pull/1521)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded `update-version.sh` update ([#1520](https://github.com/rapidsai/rmm/pull/1520)) [@AyodeAwe](https://github.com/AyodeAwe) +- Enable all tests for `arm` arch ([#1510](https://github.com/rapidsai/rmm/pull/1510)) [@galipremsagar](https://github.com/galipremsagar) + # RMM 24.04.00 (10 Apr 2024) ## 🚨 Breaking Changes @@ -40,7 +92,7 @@ - Use `conda env create --yes` instead of `--force` ([#1509](https://github.com/rapidsai/rmm/pull/1509)) [@bdice](https://github.com/bdice) - Add upper bound to prevent usage of NumPy 2 ([#1501](https://github.com/rapidsai/rmm/pull/1501)) [@bdice](https://github.com/bdice) - Remove hard-coding of RAPIDS version where possible ([#1496](https://github.com/rapidsai/rmm/pull/1496)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) -- Requre NumPy 1.23+ ([#1488](https://github.com/rapidsai/rmm/pull/1488)) [@jakirkham](https://github.com/jakirkham) +- Require NumPy 1.23+ ([#1488](https://github.com/rapidsai/rmm/pull/1488)) [@jakirkham](https://github.com/jakirkham) - Use `rmm::device_async_resource_ref` in multi_stream_allocation benchmark ([#1482](https://github.com/rapidsai/rmm/pull/1482)) [@miscco](https://github.com/miscco) - Update devcontainers to CUDA Toolkit 12.2 ([#1470](https://github.com/rapidsai/rmm/pull/1470)) [@trxcllnt](https://github.com/trxcllnt) - Add support for Python 3.11 ([#1469](https://github.com/rapidsai/rmm/pull/1469)) [@jameslamb](https://github.com/jameslamb) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56454d4b1..fa3358ca0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -38,6 +38,7 @@ rapids_cmake_build_type(Release) # ################################################################################################## # * build options ---------------------------------------------------------------------------------- +option(USE_NVTX "Build with NVTX support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) set(RMM_LOGGING_LEVEL @@ -46,11 +47,12 @@ set(RMM_LOGGING_LEVEL set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") +message(VERBOSE "RMM: Build with NVTX support: ${USE_NVTX}") # Set logging level. Must go before including gtests and benchmarks. Set the possible values of -# build type for cmake-gui +# build type for cmake-gui. message(STATUS "RMM: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'") -# cudart can be statically linked or dynamically linked the python ecosystem wants dynamic linking +# cudart can be linked statically or dynamically option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) # ################################################################################################## @@ -71,6 +73,7 @@ rapids_cpm_init() include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) include(cmake/thirdparty/get_cccl.cmake) +include(cmake/thirdparty/get_nvtx.cmake) # ################################################################################################## # * library targets -------------------------------------------------------------------------------- @@ -93,9 +96,15 @@ target_link_libraries(rmm INTERFACE CCCL::CCCL) target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) +target_link_libraries(rmm INTERFACE nvtx3::nvtx3-cpp) target_compile_features(rmm INTERFACE cxx_std_17 $) target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# Disable NVTX if necessary +if(NOT USE_NVTX) + target_compile_definitions(rmm INTERFACE NVTX_DISABLE) +endif() + # ################################################################################################## # * tests and benchmarks --------------------------------------------------------------------------- @@ -114,7 +123,8 @@ endif() # * add tests -------------------------------------------------------------------------------------- if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) - include(cmake/thirdparty/get_gtest.cmake) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) include(CTest) # calls enable_testing() add_subdirectory(tests) diff --git a/README.md b/README.md index 9ec8cbf47..4032f161b 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Python requirements: * `cuda-python` * `cython` -For more details, see [pyproject.toml](python/pyproject.toml) +For more details, see [pyproject.toml](python/rmm/pyproject.toml) ### Script to build RMM from source @@ -127,7 +127,8 @@ $ make test - Build, install, and test the `rmm` python package, in the `python` folder: ```bash -$ python -m pip install -e ./python +# In the root rmm directory +$ python -m pip install -e ./python/rmm $ pytest -v ``` @@ -207,38 +208,7 @@ alignment argument. All allocations are required to be aligned to at least 256B. `device_memory_resource` adds an additional `cuda_stream_view` argument to allow specifying the stream on which to perform the (de)allocation. -## `cuda_stream_view` and `cuda_stream` - -`rmm::cuda_stream_view` is a simple non-owning wrapper around a CUDA `cudaStream_t`. This wrapper's -purpose is to provide strong type safety for stream types. (`cudaStream_t` is an alias for a pointer, -which can lead to ambiguity in APIs when it is assigned `0`.) All RMM stream-ordered APIs take a -`rmm::cuda_stream_view` argument. - -`rmm::cuda_stream` is a simple owning wrapper around a CUDA `cudaStream_t`. This class provides -RAII semantics (constructor creates the CUDA stream, destructor destroys it). An `rmm::cuda_stream` -can never represent the CUDA default stream or per-thread default stream; it only ever represents -a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved. - -## `cuda_stream_pool` - -`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to -create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the -stream pool can be faster than creating the streams on the fly. The size of the pool is configurable. -Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of -`rmm::cuda_stream_view` that represent identical CUDA streams. - -### Thread Safety - -All current device memory resources are thread safe unless documented otherwise. More specifically, -calls to memory resource `allocate()` and `deallocate()` methods are safe with respect to calls to -either of these functions from other threads. They are _not_ thread safe with respect to -construction and destruction of the memory resource object. - -Note that a class `thread_safe_resource_adapter` is provided which can be used to adapt a memory -resource that is not thread safe to be thread safe (as described above). This adapter is not needed -with any current RMM device memory resources. - -### Stream-ordered Memory Allocation +## Stream-ordered Memory Allocation `rmm::mr::device_memory_resource` is a base class that provides stream-ordered memory allocation. This allows optimizations such as re-using memory deallocated on the same stream without the @@ -270,16 +240,16 @@ For further information about stream-ordered memory allocation semantics, read Allocator](https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/) on the NVIDIA Developer Blog. -### Available Resources +## Available Device Resources RMM provides several `device_memory_resource` derived classes to satisfy various user requirements. For more detailed information about these resources, see their respective documentation. -#### `cuda_memory_resource` +### `cuda_memory_resource` Allocates and frees device memory using `cudaMalloc` and `cudaFree`. -#### `managed_memory_resource` +### `managed_memory_resource` Allocates and frees device memory using `cudaMallocManaged` and `cudaFree`. @@ -287,22 +257,22 @@ Note that `managed_memory_resource` cannot be used with NVIDIA Virtual GPU Softw with virtual machines or hypervisors) because [NVIDIA CUDA Unified Memory is not supported by NVIDIA vGPU](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#cuda-open-cl-support-vgpu). -#### `pool_memory_resource` +### `pool_memory_resource` A coalescing, best-fit pool sub-allocator. -#### `fixed_size_memory_resource` +### `fixed_size_memory_resource` A memory resource that can only allocate a single fixed size. Average allocation and deallocation cost is constant. -#### `binning_memory_resource` +### `binning_memory_resource` Configurable to use multiple upstream memory resources for allocations that fall within different bin sizes. Often configured with multiple bins backed by `fixed_size_memory_resource`s and a single `pool_memory_resource` for allocations larger than the largest bin size. -### Default Resources and Per-device Resources +## Default Resources and Per-device Resources RMM users commonly need to configure a `device_memory_resource` object to use for all allocations where another resource has not explicitly been provided. A common example is configuring a @@ -327,7 +297,7 @@ Accessing and modifying the default resource is done through two functions: `get_current_device_resource()` - For more explicit control, you can use `set_per_device_resource()`, which takes a device ID. -#### Example +### Example ```c++ rmm::mr::cuda_memory_resource cuda_mr; @@ -339,7 +309,7 @@ rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device re rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); // Points to `pool_mr` ``` -#### Multiple Devices +### Multiple Devices A `device_memory_resource` should only be used when the active CUDA device is the same device that was active when the `device_memory_resource` was created. Otherwise behavior is undefined. @@ -367,36 +337,116 @@ for(int i = 0; i < N; ++i) { Note that the CUDA device that is current when creating a `device_memory_resource` must also be current any time that `device_memory_resource` is used to deallocate memory, including in a -destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an -(incorrect) example that assumes the above example loop has been run to create a -`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the -line of the error comment. +destructor. The RAII class `rmm::device_buffer` and classes that use it as a backing store +(`rmm::device_scalar` and `rmm::device_uvector`) handle this by storing the active device when the +constructor is called, and then ensuring that the stored device is active whenever an allocation or +deallocation is performed (including in the destructor). The user must therefore only ensure that +the device active during _creation_ of an `rmm::device_buffer` matches the active device of the +memory resource being used. + +Here is an incorrect example that creates a memory resource on device zero and then uses it to +allocate a `device_buffer` on device one: ```c++ { RMM_CUDA_TRY(cudaSetDevice(0)); - rmm::device_buffer buf_a(16); - + auto mr = rmm::mr::cuda_memory_resource{}; { RMM_CUDA_TRY(cudaSetDevice(1)); - rmm::device_buffer buf_b(16); + // Invalid, current device is 1, but MR is only valid for device 0 + rmm::device_buffer buf(16, rmm::cuda_stream_default, &mr); } +} +``` - // Error: when buf_a is destroyed, the current device must be 0, but it is 1 +A correct example creates the device buffer with device zero active. After that it is safe to switch +devices and let the buffer go out of scope and destruct with a different device active. For example, +this code is correct: + +```c++ +{ + RMM_CUDA_TRY(cudaSetDevice(0)); + auto mr = rmm::mr::cuda_memory_resource{}; + rmm::device_buffer buf(16, rmm::cuda_stream_default, &mr); + RMM_CUDA_TRY(cudaSetDevice(1)); + ... + // No need to switch back to device 0 before ~buf runs +} +``` + +#### Use of `rmm::device_vector` with multiple devices + +`rmm:device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to +allocate and deallocate memory using RMM. As such, the usual rules for usage of the backing memory +resource apply: the active device must match the active device at resource construction time. To +facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at +construction time and ensures that device is active whenever it allocates or deallocates memory. +Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`. +One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them +with a different active device. + +For example, recapitulating the previous example using `rmm::device_vector`: + +```c++ +{ + RMM_CUDA_TRY(cudaSetDevice(0)); + auto mr = rmm::mr::cuda_memory_resource{}; + rmm::device_vector vec(16, rmm::mr::thrust_allocator(rmm::cuda_stream_default, &mr)); + RMM_CUDA_TRY(cudaSetDevice(1)); + ... + // No need to switch back to device 0 before ~vec runs } ``` -### Allocators +> [!NOTE] +> Although allocation and deallocation in the `thrust_allocator` run with the correct active device, +> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the +> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to +> initialize new elements: the user must arrange for this kernel launch to occur with the correct +> device for the memory resource active. + +## `cuda_stream_view` and `cuda_stream` + +`rmm::cuda_stream_view` is a simple non-owning wrapper around a CUDA `cudaStream_t`. This wrapper's +purpose is to provide strong type safety for stream types. (`cudaStream_t` is an alias for a pointer, +which can lead to ambiguity in APIs when it is assigned `0`.) All RMM stream-ordered APIs take a +`rmm::cuda_stream_view` argument. + +`rmm::cuda_stream` is a simple owning wrapper around a CUDA `cudaStream_t`. This class provides +RAII semantics (constructor creates the CUDA stream, destructor destroys it). An `rmm::cuda_stream` +can never represent the CUDA default stream or per-thread default stream; it only ever represents +a single non-default stream. `rmm::cuda_stream` cannot be copied, but can be moved. + +## `cuda_stream_pool` + +`rmm::cuda_stream_pool` provides fast access to a pool of CUDA streams. This class can be used to +create a set of `cuda_stream` objects whose lifetime is equal to the `cuda_stream_pool`. Using the +stream pool can be faster than creating the streams on the fly. The size of the pool is configurable. +Depending on this size, multiple calls to `cuda_stream_pool::get_stream()` may return instances of +`rmm::cuda_stream_view` that represent identical CUDA streams. + +## Thread Safety + +All current device memory resources are thread safe unless documented otherwise. More specifically, +calls to memory resource `allocate()` and `deallocate()` methods are safe with respect to calls to +either of these functions from other threads. They are _not_ thread safe with respect to +construction and destruction of the memory resource object. + +Note that a class `thread_safe_resource_adapter` is provided which can be used to adapt a memory +resource that is not thread safe to be thread safe (as described above). This adapter is not needed +with any current RMM device memory resources. + +## Allocators C++ interfaces commonly allow customizable memory allocation through an [`Allocator`](https://en.cppreference.com/w/cpp/named_req/Allocator) object. RMM provides several `Allocator` and `Allocator`-like classes. -#### `polymorphic_allocator` +### `polymorphic_allocator` A [stream-ordered](#stream-ordered-memory-allocation) allocator similar to [`std::pmr::polymorphic_allocator`](https://en.cppreference.com/w/cpp/memory/polymorphic_allocator). Unlike the standard C++ `Allocator` interface, the `allocate` and `deallocate` functions take a `cuda_stream_view` indicating the stream on which the (de)allocation occurs. -#### `stream_allocator_adaptor` +### `stream_allocator_adaptor` `stream_allocator_adaptor` can be used to adapt a stream-ordered allocator to present a standard `Allocator` interface to consumers that may not be designed to work with a stream-ordered interface. @@ -415,7 +465,7 @@ auto p = adapted.allocate(100); adapted.deallocate(p,100); ``` -#### `thrust_allocator` +### `thrust_allocator` `thrust_allocator` is a device memory allocator that uses the strongly typed `thrust::device_ptr`, making it usable with containers like `thrust::device_vector`. @@ -497,13 +547,13 @@ Similar to `device_memory_resource`, it has two key functions for (de)allocation Unlike `device_memory_resource`, the `host_memory_resource` interface and behavior is identical to `std::pmr::memory_resource`. -### Available Resources +## Available Host Resources -#### `new_delete_resource` +### `new_delete_resource` Uses the global `operator new` and `operator delete` to allocate host memory. -#### `pinned_memory_resource` +### `pinned_memory_resource` Allocates "pinned" host memory using `cuda(Malloc/Free)Host`. @@ -611,7 +661,7 @@ resources are detectable with Compute Sanitizer Memcheck. It may be possible in the future to add support for memory bounds checking with other memory resources using NVTX APIs. -## Using RMM in Python Code +# Using RMM in Python There are two ways to use RMM in Python code: @@ -622,7 +672,7 @@ There are two ways to use RMM in Python code: RMM provides a `MemoryResource` abstraction to control _how_ device memory is allocated in both the above uses. -### DeviceBuffers +## DeviceBuffer A DeviceBuffer represents an **untyped, uninitialized device memory allocation**. DeviceBuffers can be created by providing the @@ -662,7 +712,7 @@ host: array([1., 2., 3.]) ``` -### MemoryResource objects +## MemoryResource objects `MemoryResource` objects are used to configure how device memory allocations are made by RMM. @@ -805,3 +855,94 @@ Out[6]: 'total_bytes': 16, 'total_count': 1} ``` + +## Taking ownership of C++ objects from Python. + +When interacting with a C++ library that uses RMM from Python, one +must be careful when taking ownership of `rmm::device_buffer` objects +on the Python side. The `rmm::device_buffer` does not contain an +owning reference to the memory resource used for its allocation (only +a `device_async_resource_ref`), and the allocating user is expected to +keep this memory resource alive for at least the lifetime of the +buffer. When taking ownership of such a buffer in Python, we have no +way (in the general case) of ensuring that the memory resource will +outlive the buffer we are now holding. + +To avoid any issues, we need two things: + +1. The C++ library we are interfacing with should accept a memory + resource that is used for allocations that are returned to the + user. +2. When calling into the library from python, we should provide a + memory resource whose lifetime we control. This memory resource + should then be provided when we take ownership of any allocated + `rmm::device_buffer`s. + +For example, suppose we have a C++ function that allocates +`device_buffer`s, which has a utility overload that defaults the +memory resource to the current device resource: + +```c++ +std::unique_ptr allocate( + std::size_t size, + rmm::mr::device_async_resource_ref mr = get_current_device_resource()) +{ + return std::make_unique(size, rmm::cuda_stream_default, mr); +} +``` + +The Python `DeviceBuffer` class has a convenience Cython function, +`c_from_unique_ptr` to construct a `DeviceBuffer` from a +`unique_ptr`, taking ownership of it. To do this +safely, we must ensure that the allocation that was done on the C++ +side uses a memory resource we control. So: + +```cython +# Bad, doesn't control lifetime +buffer_bad = DeviceBuffer.c_from_unique_ptr(allocate(10)) + +# Good, allocation happens with a memory resource we control +# mr is a DeviceMemoryResource +buffer_good = DeviceBuffer.c_from_unique_ptr( + allocate(10, mr.get_mr()), + mr=mr, +) +``` + +Note two differences between the bad and good cases: + +1. In the good case we pass the memory resource to the allocation + function. +2. In the good case, we pass _the same_ memory resource to the + `DeviceBuffer` constructor so that its lifetime is tied to the + lifetime of the buffer. + +### Potential pitfalls of relying on `get_current_device_resource` + +Functions in both the C++ and Python APIs that perform allocation +typically default the memory resource argument to the value of +`get_current_device_resource`. This is to simplify the interface for +callers. When using a C++ library from Python, this defaulting is +safe, _as long as_ it is only the Python process that ever calls +`set_current_device_resource`. + +This is because the current device resource on the C++ side has a +lifetime which is expected to be managed by the user. The resources +set by `rmm::mr::set_current_device_resource` are stored in a static +`std::map` whose keys are device ids and values are raw pointers to +the memory resources. Consequently, +`rmm::mr::get_current_device_resource` returns an object with no +lifetime provenance. This is, for the reasons discussed above, not +usable from Python. To handle this on the Python side, the +Python-level `set_current_device_resource` sets the C++ resource _and_ +stores the Python object in a static global dictionary. The Python +`get_current_device_resource` then _does not use_ +`rmm::mr::get_current_device_resource` and instead looks up the +current device resource in this global dictionary. + +Hence, if the C++ library we are interfacing with calls +`rmm::mr::set_current_device_resource`, the C++ and Python sides of +the program can disagree on what `get_current_device_resource` +returns. The only safe thing to do if using the simplified interfaces +is therefore to ensure that `set_current_device_resource` is only ever +called on the Python side. diff --git a/VERSION b/VERSION index 4a2fe8aa5..0bff6981a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.00 +24.06.00 diff --git a/build.sh b/build.sh index ec08fa402..70da14b9b 100755 --- a/build.sh +++ b/build.sh @@ -1,12 +1,12 @@ #!/bin/bash -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # rmm build script # This script is used to build the component(s) in this repo from # source, and can be called with various options to customize the -# build as needed (see the help output for details) +# build as needed (see the help output for details). # Abort script on first error set -e @@ -141,12 +141,6 @@ if hasArg --ptds; then PER_THREAD_DEFAULT_STREAM=ON fi -# Append `-DFIND_RMM_CPP=ON` to CMAKE_ARGS unless a user specified the option. -SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" -if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RMM_CPP"* ]]; then - SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RMM_CPP=ON" -fi - # If clean given, run it prior to any other steps if hasArg clean; then # If the dirs to clean are mounted dirs in a container, the @@ -176,5 +170,5 @@ fi # Build and install the rmm Python package if (( NUMARGS == 0 )) || hasArg rmm; then echo "building and installing rmm..." - SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" python -m pip install --no-build-isolation --no-deps ${REPODIR}/python + SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/rmm fi diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ddd4b5014..dd4af8195 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -36,7 +36,7 @@ doxygen Doxyfile popd rapids-logger "Build Python docs" -pushd python/docs +pushd python/rmm/docs make dirhtml make text mkdir -p "${RAPIDS_DOCS_DIR}/rmm/"{html,txt} diff --git a/ci/build_python.sh b/ci/build_python.sh index b197b1ae1..394b3a453 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -14,7 +14,7 @@ export CMAKE_GENERATOR=Ninja rapids-print-env package_name="rmm" -package_dir="python" +package_dir="python/rmm" version=$(rapids-generate-version) commit=$(git rev-parse HEAD) diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh new file mode 100755 index 000000000..e61f6641c --- /dev/null +++ b/ci/build_wheel_cpp.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_name="librmm" +package_dir="python/librmm" + +source rapids-configure-sccache +source rapids-date-string + +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" + +cd "${package_dir}" + +python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +python -m pip install wheel +python -m wheel tags --platform any dist/* --remove +RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel.sh b/ci/build_wheel_python.sh similarity index 64% rename from ci/build_wheel.sh rename to ci/build_wheel_python.sh index b2d953361..debe9b0fe 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel_python.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail package_name="rmm" -package_dir="python" +package_dir="python/rmm" source rapids-configure-sccache source rapids-date-string @@ -14,26 +14,33 @@ commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# This is the version of the suffix with a preceding hyphen. It's used +# This is the version of the suffix with a preceding hyphen. It is used # everywhere except in the final wheel name. PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" -# Patch project metadata files to include the CUDA version suffix and version override. pyproject_file="${package_dir}/pyproject.toml" sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} echo "${version}" > VERSION sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +sed -r -i "s/librmm==(.*)\"/librmm${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} fi cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/librmm_dist) + +python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check --find-links "${CPP_WHEELHOUSE}" mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2a6e550df..dfb0c17b0 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. ######################## # RMM Version Updater # ######################## @@ -39,10 +40,22 @@ echo "${NEXT_FULL_TAG}" > VERSION for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done -sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh # .devcontainer files find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" +done + +DEPENDENCIES=( + librmm +) +for DEP in "${DEPENDENCIES[@]}"; do + for FILE in dependencies.yaml conda/environments/*.yaml; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" + done + for FILE in python/*/pyproject.toml; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" "${FILE}" + done done diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh index 73780fcc3..6bda50870 100755 --- a/ci/run_pytests.sh +++ b/ci/run_pytests.sh @@ -4,6 +4,6 @@ set -euo pipefail # Support invoking run_pytests.sh outside the script directory -cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/ +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/rmm/ pytest --cache-clear -v "$@" . diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index aa2705afe..50cb203c3 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -1,17 +1,13 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +WHEELHOUSE="${PWD}/dist/" +RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/rmm*.whl)[test] +# echo to expand wildcard before adding '[extra]' requires for pip +python -m pip install "rmm-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}" -# Run smoke tests for aarch64 pull requests -if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then - python ./ci/wheel_smoke_test.py -else - python -m pytest ./python/rmm/tests -fi +python -m pytest ./python/rmm/rmm/tests diff --git a/ci/wheel_smoke_test.py b/ci/wheel_smoke_test.py deleted file mode 100644 index 737c0b605..000000000 --- a/ci/wheel_smoke_test.py +++ /dev/null @@ -1,5 +0,0 @@ -import rmm - -if __name__ == "__main__": - buf = rmm.DeviceBuffer(size=100) - assert buf.size == 100 diff --git a/cmake/thirdparty/get_gtest.cmake b/cmake/thirdparty/get_nvtx.cmake similarity index 72% rename from cmake/thirdparty/get_gtest.cmake rename to cmake/thirdparty/get_nvtx.cmake index 4d4daff44..90487dd22 100644 --- a/cmake/thirdparty/get_gtest.cmake +++ b/cmake/thirdparty/get_nvtx.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,11 +12,12 @@ # the License. # ============================================================================= -# Use CPM to find or clone gtest -function(find_and_configure_gtest) - include(${rapids-cmake-dir}/cpm/gtest.cmake) - rapids_cpm_gtest() +# Use CPM to find or clone NVTX3 +function(find_and_configure_nvtx3) + + include(${rapids-cmake-dir}/cpm/nvtx3.cmake) + rapids_cpm_nvtx3(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) endfunction() -find_and_configure_gtest() +find_and_configure_nvtx3() diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c4bbcccd2..5806b8bd9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,7 +9,6 @@ dependencies: - clang-tools==16.0.6 - clang==16.0.6 - cmake>=3.26.4 -- cuda-nvcc - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit diff --git a/conda/recipes/librmm/conda_build_config.yaml b/conda/recipes/librmm/conda_build_config.yaml index ed58ac507..1740bfe2f 100644 --- a/conda/recipes/librmm/conda_build_config.yaml +++ b/conda/recipes/librmm/conda_build_config.yaml @@ -16,11 +16,10 @@ cmake_version: fmt_version: - ">=10.1.1,<11" -gtest_version: - - ">=1.13.0" - spdlog_version: - ">=1.12.0,<1.13" -sysroot_version: +c_stdlib: + - sysroot +c_stdlib_version: - "2.17" diff --git a/conda/recipes/librmm/meta.yaml b/conda/recipes/librmm/meta.yaml index 60cd6ffae..53e16ebdc 100644 --- a/conda/recipes/librmm/meta.yaml +++ b/conda/recipes/librmm/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -23,18 +23,16 @@ requirements: - {{ compiler('cuda') }} {% endif %} - cuda-version ={{ cuda_version }} - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - cuda-version ={{ cuda_version }} - # We require spdlog and fmt (which was devendored from spdlog + # We require spdlog and fmt (which was de-vendored from spdlog # conda-forge packages in 1.11.0) so that the spdlog headers are not # pulled by CPM and installed as a part of the rmm packages. However, # building against librmm still requires these headers. They are also # added as a run requirement via the packages' run_exports. - fmt {{ fmt_version }} - spdlog {{ spdlog_version }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} build: script_env: @@ -83,38 +81,7 @@ outputs: - spdlog {{ spdlog_version }} test: commands: - - test -f $PREFIX/include/rmm/thrust_rmm_allocator.h - - test -f $PREFIX/include/rmm/logger.hpp - - test -f $PREFIX/include/rmm/cuda_stream.hpp - - test -f $PREFIX/include/rmm/cuda_stream_view.hpp - - test -f $PREFIX/include/rmm/cuda_stream_pool.hpp - - test -f $PREFIX/include/rmm/device_uvector.hpp - - test -f $PREFIX/include/rmm/device_scalar.hpp - - test -f $PREFIX/include/rmm/device_buffer.hpp - - test -f $PREFIX/include/rmm/detail/aligned.hpp - - test -f $PREFIX/include/rmm/detail/error.hpp - - test -f $PREFIX/include/rmm/detail/exec_check_disable.hpp - - test -f $PREFIX/include/rmm/mr/device/detail/arena.hpp - - test -f $PREFIX/include/rmm/mr/device/detail/free_list.hpp - - test -f $PREFIX/include/rmm/mr/device/detail/coalescing_free_list.hpp - - test -f $PREFIX/include/rmm/mr/device/detail/fixed_size_free_list.hpp - - test -f $PREFIX/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/arena_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/binning_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/cuda_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/device_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/fixed_size_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/limiting_resource_adaptor.hpp - - test -f $PREFIX/include/rmm/mr/device/logging_resource_adaptor.hpp - - test -f $PREFIX/include/rmm/mr/device/managed_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/owning_wrapper.hpp - - test -f $PREFIX/include/rmm/mr/device/per_device_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/pool_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/device/thread_safe_resource_adaptor.hpp - - test -f $PREFIX/include/rmm/mr/device/thrust_allocator_adaptor.hpp - - test -f $PREFIX/include/rmm/mr/host/host_memory_resource.hpp - - test -f $PREFIX/include/rmm/mr/host/new_delete_resource.hpp - - test -f $PREFIX/include/rmm/mr/host/pinned_memory_resource.hpp + - test -d "${PREFIX}/include/rmm" about: home: https://rapids.ai/ license: Apache-2.0 @@ -150,8 +117,6 @@ outputs: {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {{ pin_subpackage('librmm', exact=True) }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/conda/recipes/rmm/conda_build_config.yaml b/conda/recipes/rmm/conda_build_config.yaml index e28b98da7..e21871634 100644 --- a/conda/recipes/rmm/conda_build_config.yaml +++ b/conda/recipes/rmm/conda_build_config.yaml @@ -10,7 +10,9 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot +c_stdlib_version: - "2.17" cmake_version: diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index 764c3f75b..fa8a82475 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -1,6 +1,6 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].strip().lstrip('v') %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} {% set py_version = environ['CONDA_PY'] %} @@ -51,7 +51,7 @@ requirements: - {{ compiler('cuda') }} {% endif %} - cuda-version ={{ cuda_version }} - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} diff --git a/dependencies.yaml b/dependencies.yaml index 548999f1b..eb2c4e4f2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -7,6 +7,7 @@ files: arch: [x86_64] includes: - build + - cython_build - checks - cuda - cuda_version @@ -37,20 +38,31 @@ files: - cuda_version - docs - py_version + py_cpp_build: + output: pyproject + pyproject_dir: python/librmm + extras: + table: build-system + includes: + - build py_build: output: pyproject + pyproject_dir: python/rmm extras: table: build-system includes: - build + - cython_build py_run: output: pyproject + pyproject_dir: python/rmm extras: table: project includes: - run py_optional_test: output: pyproject + pyproject_dir: python/rmm extras: table: project.optional-dependencies key: test @@ -65,7 +77,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4 - - cython>=3.0.0 - ninja - output_types: conda packages: @@ -106,6 +117,12 @@ dependencies: cuda: "12.*" packages: - cuda-nvcc + cython_build: + common: + - output_types: [conda, requirements, pyproject] + packages: + - cython>=3.0.0 + specific: - output_types: [conda, requirements, pyproject] matrices: - matrix: @@ -115,6 +132,19 @@ dependencies: - matrix: # All CUDA 11 versions packages: - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - librmm-cu12==24.6.* + - matrix: + cuda: "11.*" + packages: + - librmm-cu11==24.6.* + - matrix: + packages: + - librmm==24.6.* checks: common: - output_types: [conda, requirements] @@ -239,7 +269,21 @@ dependencies: packages: - pytest - pytest-cov + # Needed for numba in tests + specific: - output_types: conda - packages: - # Needed for numba in tests - - cuda-nvcc + matrices: + - matrix: + arch: x86_64 + cuda: "11.8" + packages: + - nvcc_linux-64=11.8 + - matrix: + arch: aarch64 + cuda: "11.8" + packages: + - nvcc_linux-aarch64=11.8 + - matrix: + cuda: "12.*" + packages: + - cuda-nvcc diff --git a/include/rmm/detail/aligned.hpp b/include/rmm/detail/aligned.hpp index eb31658e9..1206a1983 100644 --- a/include/rmm/detail/aligned.hpp +++ b/include/rmm/detail/aligned.hpp @@ -26,87 +26,6 @@ namespace rmm::detail { -/** - * @brief Default alignment used for host memory allocated by RMM. - * - */ -[[deprecated("Use rmm::RMM_DEFAULT_HOST_ALIGNMENT instead.")]] static constexpr std::size_t - RMM_DEFAULT_HOST_ALIGNMENT{rmm::RMM_DEFAULT_HOST_ALIGNMENT}; - -/** - * @brief Default alignment used for CUDA memory allocation. - * - */ -[[deprecated("Use rmm::CUDA_ALLOCATION_ALIGNMENT instead.")]] static constexpr std::size_t - CUDA_ALLOCATION_ALIGNMENT{rmm::CUDA_ALLOCATION_ALIGNMENT}; - -/** - * @brief Returns whether or not `n` is a power of 2. - * - */ -[[deprecated("Use rmm::is_pow2 instead.")]] constexpr bool is_pow2(std::size_t value) noexcept -{ - return rmm::is_pow2(value); -} - -/** - * @brief Returns whether or not `alignment` is a valid memory alignment. - * - */ -[[deprecated("Use rmm::is_supported_alignment instead.")]] constexpr bool is_supported_alignment( - std::size_t alignment) noexcept -{ - return rmm::is_pow2(alignment); -} - -/** - * @brief Align up to nearest multiple of specified power of 2 - * - * @param[in] value value to align - * @param[in] alignment amount, in bytes, must be a power of 2 - * - * @return Return the aligned value, as one would expect - */ -[[deprecated("Use rmm::align_up instead.")]] constexpr std::size_t align_up( - std::size_t value, std::size_t alignment) noexcept -{ - return rmm::align_up(value, alignment); -} - -/** - * @brief Align down to the nearest multiple of specified power of 2 - * - * @param[in] value value to align - * @param[in] alignment amount, in bytes, must be a power of 2 - * - * @return Return the aligned value, as one would expect - */ -[[deprecated("Use rmm::align_down instead.")]] constexpr std::size_t align_down( - std::size_t value, std::size_t alignment) noexcept -{ - return rmm::align_down(value, alignment); -} - -/** - * @brief Checks whether a value is aligned to a multiple of a specified power of 2 - * - * @param[in] value value to check for alignment - * @param[in] alignment amount, in bytes, must be a power of 2 - * - * @return true if aligned - */ -[[deprecated("Use rmm::is_aligned instead.")]] constexpr bool is_aligned( - std::size_t value, std::size_t alignment) noexcept -{ - return rmm::is_aligned(value, alignment); -} - -[[deprecated("Use rmm::is_pointer_aligned instead.")]] inline bool is_pointer_aligned( - void* ptr, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) -{ - return rmm::is_pointer_aligned(ptr, alignment); -} - /** * @brief Allocates sufficient host-accessible memory to satisfy the requested size `bytes` with * alignment `alignment` using the unary callable `alloc` to allocate memory. diff --git a/include/rmm/detail/nvtx/ranges.hpp b/include/rmm/detail/nvtx/ranges.hpp new file mode 100644 index 000000000..1791a8c3b --- /dev/null +++ b/include/rmm/detail/nvtx/ranges.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace rmm { +/** + * @brief Tag type for librmm's NVTX domain. + */ +struct librmm_domain { + static constexpr char const* name{"librmm"}; ///< Name of the librmm domain +}; + +/** + * @brief Alias for an NVTX range in the librmm domain. + * + * Customizes an NVTX range with the given input. + * + * Example: + * ``` + * void some_function(){ + * rmm::scoped_range rng{"custom_name"}; // Customizes range name + * ... + * } + * ``` + */ +using scoped_range = ::nvtx3::scoped_range_in; + +} // namespace rmm + +/** + * @brief Convenience macro for generating an NVTX range in the `librmm` domain + * from the lifetime of a function. + * + * Uses the name of the immediately enclosing function returned by `__func__` to + * name the range. + * + * Example: + * ``` + * void some_function(){ + * RMM_FUNC_RANGE(); + * ... + * } + * ``` + */ +#define RMM_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(rmm::librmm_domain) diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index ac6b72076..a51e6b886 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -25,7 +25,6 @@ #include #include -#include #include #include @@ -68,40 +67,6 @@ class cuda_async_memory_resource final : public device_memory_resource { win32_kmt = 0x4 ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) }; - /** - * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool size - * and release threshold. - * - * If the pool size grows beyond the release threshold, unused memory held by the pool will be - * released at the next synchronization event. - * - * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync` - * - * @param initial_pool_size Optional initial size in bytes of the pool. If no value is provided, - * initial pool size is half of the available GPU memory. - * @param release_threshold Optional release threshold size in bytes of the pool. If no value is - * provided, the release threshold is set to the total amount of memory on the current device. - * @param export_handle_type Optional `cudaMemAllocationHandleType` that allocations from this - * resource should support interprocess communication (IPC). Default is - * `cudaMemHandleTypeNone` for no IPC support. - */ - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - template , - thrust::optional>, - int> = 0> - [[deprecated("Use std::optional instead of thrust::optional.")]] // - explicit cuda_async_memory_resource( - Optional initial_pool_size, - Optional release_threshold = {}, - thrust::optional export_handle_type = {}) - : cuda_async_memory_resource(initial_pool_size.value_or(std::nullopt), - release_threshold.value_or(std::nullopt), - export_handle_type.value_or(std::nullopt)) - - { - } - /** * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool size * and release threshold. diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index c07730f70..9ca695b9d 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 24190f2b4..783dff2ed 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -17,6 +17,7 @@ #include #include +#include #include @@ -117,6 +118,7 @@ class device_memory_resource { */ void* allocate(std::size_t bytes, cuda_stream_view stream = cuda_stream_view{}) { + RMM_FUNC_RANGE(); return do_allocate(bytes, stream); } @@ -138,6 +140,7 @@ class device_memory_resource { */ void deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream = cuda_stream_view{}) { + RMM_FUNC_RANGE(); do_deallocate(ptr, bytes, stream); } @@ -173,6 +176,7 @@ class device_memory_resource { */ void* allocate(std::size_t bytes, std::size_t alignment) { + RMM_FUNC_RANGE(); return do_allocate(rmm::align_up(bytes, alignment), cuda_stream_view{}); } @@ -191,6 +195,7 @@ class device_memory_resource { */ void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) { + RMM_FUNC_RANGE(); do_deallocate(ptr, rmm::align_up(bytes, alignment), cuda_stream_view{}); } @@ -209,6 +214,7 @@ class device_memory_resource { */ void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) { + RMM_FUNC_RANGE(); return do_allocate(rmm::align_up(bytes, alignment), stream); } @@ -226,6 +232,7 @@ class device_memory_resource { */ void* allocate_async(std::size_t bytes, cuda_stream_view stream) { + RMM_FUNC_RANGE(); return do_allocate(bytes, stream); } @@ -248,6 +255,7 @@ class device_memory_resource { std::size_t alignment, cuda_stream_view stream) { + RMM_FUNC_RANGE(); do_deallocate(ptr, rmm::align_up(bytes, alignment), stream); } @@ -266,6 +274,7 @@ class device_memory_resource { */ void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) { + RMM_FUNC_RANGE(); do_deallocate(ptr, bytes, stream); } @@ -293,52 +302,6 @@ class device_memory_resource { return !do_is_equal(other); } - /** - * @brief Query whether the resource supports use of non-null CUDA streams for - * allocation/deallocation. - * - * @deprecated Functionality removed in favor of cuda::mr::async_memory_resource. - * - * @returns bool true if the resource supports non-null CUDA streams. - */ - [[deprecated("Functionality removed in favor of cuda::mr::async_memory_resource.")]] // - [[nodiscard]] virtual bool - supports_streams() const noexcept - { - return false; - } - - /** - * @brief Query whether the resource supports the get_mem_info API. - * - * @deprecated Use rmm::available_device_memory instead. - * - * @return bool true if the resource supports get_mem_info, false otherwise. - */ - [[deprecated("Use rmm::available_device_memory instead.")]] // - [[nodiscard]] virtual bool - supports_get_mem_info() const noexcept - { - return false; - }; - - /** - * @brief Queries the amount of free and total memory for the resource. - * - * @deprecated Use rmm::available_device_memory instead. - * - * @param stream the stream whose memory manager we want to retrieve - * - * @returns a pair containing the free memory in bytes in .first and total amount of memory in - * .second - */ - [[deprecated("Use rmm::available_device_memory instead.")]] // - [[nodiscard]] std::pair - get_mem_info(cuda_stream_view stream) const - { - return {0, 0}; - } - /** * @brief Enables the `cuda::mr::device_accessible` property * diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp index e2fb4b0cf..0b63b4691 100644 --- a/include/rmm/mr/device/polymorphic_allocator.hpp +++ b/include/rmm/mr/device/polymorphic_allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include @@ -26,7 +25,11 @@ #include namespace rmm::mr { - +/** + * @addtogroup device_memory_resources + * @{ + * @file + */ /** * @brief A stream ordered Allocator using a `rmm::mr::device_memory_resource` to satisfy * (de)allocations. @@ -55,21 +58,21 @@ class polymorphic_allocator { /** * @brief Construct a `polymorphic_allocator` using the provided memory resource. * - * This constructor provides an implicit conversion from `memory_resource*`. + * This constructor provides an implicit conversion from `device_async_resource_ref`. * - * @param mr The `device_memory_resource` to use as the underlying resource. + * @param mr The upstream memory resource to use for allocation. */ - polymorphic_allocator(device_memory_resource* mr) : mr_{mr} {} + polymorphic_allocator(device_async_resource_ref mr) : mr_{mr} {} /** - * @brief Construct a `polymorphic_allocator` using `other.resource()` as the underlying memory - * resource. + * @brief Construct a `polymorphic_allocator` using the underlying memory resource of `other`. * - * @param other The `polymorphic_resource` whose `resource()` will be used as the underlying + * @param other The `polymorphic_allocator` whose memory resource will be used as the underlying * resource of the new `polymorphic_allocator`. */ template - polymorphic_allocator(polymorphic_allocator const& other) noexcept : mr_{other.resource()} + polymorphic_allocator(polymorphic_allocator const& other) noexcept + : mr_{other.get_upstream_resource()} { } @@ -82,14 +85,15 @@ class polymorphic_allocator { */ value_type* allocate(std::size_t num, cuda_stream_view stream) { - return static_cast(resource()->allocate(num * sizeof(T), stream)); + return static_cast( + get_upstream_resource().allocate_async(num * sizeof(T), stream)); } /** * @brief Deallocates storage pointed to by `ptr`. * - * `ptr` must have been allocated from a `rmm::mr::device_memory_resource` `r` that compares equal - * to `*resource()` using `r.allocate(n * sizeof(T))`. + * `ptr` must have been allocated from a memory resource `r` that compares equal + * to `get_upstream_resource()` using `r.allocate(n * sizeof(T))`. * * @param ptr Pointer to memory to deallocate * @param num Number of objects originally allocated @@ -97,7 +101,7 @@ class polymorphic_allocator { */ void deallocate(value_type* ptr, std::size_t num, cuda_stream_view stream) { - resource()->deallocate(ptr, num * sizeof(T), stream); + get_upstream_resource().deallocate_async(ptr, num * sizeof(T), stream); } /** @@ -108,24 +112,40 @@ class polymorphic_allocator { return mr_; } - /** - * @brief Returns pointer to the underlying `rmm::mr::device_memory_resource`. - * - * @return Pointer to the underlying resource. - */ - [[nodiscard]] device_memory_resource* resource() const noexcept { return mr_; } - private: - device_memory_resource* mr_{ + rmm::device_async_resource_ref mr_{ get_current_device_resource()}; ///< Underlying resource used for (de)allocation }; +/** + * @brief Compare two `polymorphic_allocator`s for equality. + * + * Two `polymorphic_allocator`s are equal if their underlying memory resources compare equal. + * + * @tparam T Type of the first allocator + * @tparam U Type of the second allocator + * @param lhs The first allocator to compare + * @param rhs The second allocator to compare + * @return true if the two allocators are equal, false otherwise + */ template bool operator==(polymorphic_allocator const& lhs, polymorphic_allocator const& rhs) { - return lhs.resource()->is_equal(*rhs.resource()); + return lhs.get_upstream_resource() == rhs.get_upstream_resource(); } +/** + * @brief Compare two `polymorphic_allocator`s for inequality. + * + * Two `polymorphic_allocator`s are not equal if their underlying memory resources compare not + * equal. + * + * @tparam T Type of the first allocator + * @tparam U Type of the second allocator + * @param lhs The first allocator to compare + * @param rhs The second allocator to compare + * @return true if the two allocators are not equal, false otherwise + */ template bool operator!=(polymorphic_allocator const& lhs, polymorphic_allocator const& rhs) { @@ -237,12 +257,34 @@ class stream_allocator_adaptor { cuda_stream_view stream_; ///< Stream on which (de)allocations are performed }; +/** + * @brief Compare two `stream_allocator_adaptor`s for equality. + * + * Two `stream_allocator_adaptor`s are equal if their underlying allocators compare equal. + * + * @tparam A Type of the first allocator + * @tparam O Type of the second allocator + * @param lhs The first allocator to compare + * @param rhs The second allocator to compare + * @return true if the two allocators are equal, false otherwise + */ template bool operator==(stream_allocator_adaptor const& lhs, stream_allocator_adaptor const& rhs) { return lhs.underlying_allocator() == rhs.underlying_allocator(); } +/** + * @brief Compare two `stream_allocator_adaptor`s for inequality. + * + * Two `stream_allocator_adaptor`s are not equal if their underlying allocators compare not equal. + * + * @tparam A Type of the first allocator + * @tparam O Type of the second allocator + * @param lhs The first allocator to compare + * @param rhs The second allocator to compare + * @return true if the two allocators are not equal, false otherwise + */ template bool operator!=(stream_allocator_adaptor const& lhs, stream_allocator_adaptor const& rhs) { @@ -264,5 +306,5 @@ auto make_stream_allocator_adaptor(Allocator const& allocator, cuda_stream_view { return stream_allocator_adaptor{allocator, stream}; } - +/** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 4cbdeef4a..a3a972904 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -75,18 +75,19 @@ struct maybe_remove_property>> { -#ifdef __GNUC__ // GCC warns about compatibility issues with pre ISO C++ code +#if defined(__GNUC__) && !defined(__clang__) // GCC warns about compatibility + // issues with pre ISO C++ code #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnon-template-friend" -#endif // __GNUC__ +#endif // __GNUC__ and not __clang__ /** * @brief Explicit removal of the friend function so we do not pretend to provide device * accessible memory */ friend void get_property(const PoolResource&, Property) = delete; -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop -#endif // __GNUC__ +#endif // __GNUC__ and not __clang__ }; } // namespace detail @@ -111,147 +112,6 @@ class pool_memory_resource final friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory - * pool using `upstream_mr`. - * - * @deprecated Use the constructor that takes an explicit initial pool size instead. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available memory from the upstream resource. - */ - template , - thrust::optional>, - int> = 0> - [[deprecated( - "Must specify initial_pool_size and use std::optional instead of thrust::optional.")]] // - explicit pool_memory_resource(Upstream* upstream_mr, - Optional initial_pool_size, - Optional maximum_pool_size = thrust::nullopt) - : pool_memory_resource( - upstream_mr, initial_pool_size.value_or(0), maximum_pool_size.value_or(std::nullopt)) - { - } - - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory - * pool using `upstream_mr`. - * - * @deprecated Use the constructor that takes an explicit initial pool size instead. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available memory from the upstream resource. - */ - [[deprecated("Must specify initial_pool_size")]] // - explicit pool_memory_resource(Upstream* upstream_mr, - std::optional initial_pool_size = std::nullopt, - std::optional maximum_pool_size = std::nullopt) - : pool_memory_resource(upstream_mr, initial_pool_size.value_or(0), maximum_pool_size) - { - } - - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using - * `upstream_mr`. - * - * @deprecated Use the constructor that takes an explicit initial pool size instead. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available memory from the upstream resource. - */ - template , - thrust::optional>, - int> = 0> - [[deprecated( - "Must specify initial_pool_size and use std::optional instead of thrust::optional.")]] // - explicit pool_memory_resource(Upstream& upstream_mr, - Optional initial_pool_size, - Optional maximum_pool_size = thrust::nullopt) - : pool_memory_resource( - upstream_mr, initial_pool_size.value_or(0), maximum_pool_size.value_or(std::nullopt)) - { - } - - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using - * `upstream_mr`. - * - * @deprecated Use the constructor that takes an explicit initial pool size instead. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to zero. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available memory from the upstream resource. - */ - template , int> = 0> - [[deprecated("Must specify initial_pool_size")]] // - explicit pool_memory_resource(Upstream2& upstream_mr, - std::optional initial_pool_size = std::nullopt, - std::optional maximum_pool_size = std::nullopt) - : pool_memory_resource(upstream_mr, initial_pool_size.value_or(0), maximum_pool_size) - { - } - - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using - * `upstream_mr`. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of - * pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available from the upstream resource. - */ - template , - thrust::optional>, - int> = 0> - [[deprecated("Use std::optional instead of thrust::optional.")]] // - explicit pool_memory_resource(Upstream* upstream_mr, - std::size_t initial_pool_size, - Optional maximum_pool_size) - : pool_memory_resource(upstream_mr, initial_pool_size, maximum_pool_size.value_or(std::nullopt)) - { - } - /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. @@ -283,35 +143,6 @@ class pool_memory_resource final initialize_pool(initial_pool_size, maximum_pool_size); } - /** - * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using - * `upstream_mr`. - * - * @throws rmm::logic_error if `upstream_mr == nullptr` - * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of - * pool_memory_resource::allocation_alignment bytes. - * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a - * multiple of pool_memory_resource::allocation_alignment bytes. - * - * @param upstream_mr The memory_resource from which to allocate blocks for the pool. - * @param initial_pool_size Minimum size, in bytes, of the initial pool. - * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all - * of the available memory from the upstream resource. - */ - template , - thrust::optional>, - int> = 0> - [[deprecated("Use std::optional instead of thrust::optional.")]] // - explicit pool_memory_resource(Upstream& upstream_mr, - std::size_t initial_pool_size, - Optional maximum_pool_size) - : pool_memory_resource(cuda::std::addressof(upstream_mr), - initial_pool_size, - maximum_pool_size.value_or(std::nullopt)) - { - } - /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 41973e04b..3bfd65996 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -39,6 +40,9 @@ namespace rmm::mr { * allocate objects of a specific type `T`, but can be freely rebound to other * types. * + * The allocator records the current cuda device and may only be used with a backing + * `device_async_resource_ref` valid for the same device. + * * @tparam T The type of the objects that will be allocated by this allocator */ template @@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ template thrust_allocator(thrust_allocator const& other) - : _mr(other.resource()), _stream{other.stream()} + : _mr(other.resource()), _stream{other.stream()}, _device{other._device} { } @@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ pointer allocate(size_type num) { + cuda_set_device_raii dev{_device}; return thrust::device_pointer_cast( static_cast(_mr.allocate_async(num * sizeof(T), _stream))); } @@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { */ void deallocate(pointer ptr, size_type num) { + cuda_set_device_raii dev{_device}; return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream); } @@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { private: cuda_stream_view _stream{}; rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; + cuda_device_id _device{get_current_cuda_device()}; }; /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index ce870287c..d8bb311c8 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -76,6 +78,7 @@ class host_memory_resource { */ void* allocate(std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)) { + RMM_FUNC_RANGE(); return do_allocate(bytes, alignment); } @@ -94,6 +97,7 @@ class host_memory_resource { */ void deallocate(void* ptr, std::size_t bytes, std::size_t alignment = alignof(std::max_align_t)) { + RMM_FUNC_RANGE(); do_deallocate(ptr, bytes, alignment); } diff --git a/include/rmm/mr/pinned_host_memory_resource.hpp b/include/rmm/mr/pinned_host_memory_resource.hpp index ee409dbac..6bca05376 100644 --- a/include/rmm/mr/pinned_host_memory_resource.hpp +++ b/include/rmm/mr/pinned_host_memory_resource.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,8 @@ class pinned_host_memory_resource { static void* allocate(std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { + RMM_FUNC_RANGE(); + // don't allocate anything if the user requested zero bytes if (0 == bytes) { return nullptr; } @@ -84,6 +87,8 @@ class pinned_host_memory_resource { std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept { + RMM_FUNC_RANGE(); + rmm::detail::aligned_host_deallocate( ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } @@ -104,6 +109,8 @@ class pinned_host_memory_resource { */ static void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) { + RMM_FUNC_RANGE(); + return allocate(bytes); } @@ -126,6 +133,8 @@ class pinned_host_memory_resource { std::size_t alignment, [[maybe_unused]] cuda::stream_ref stream) { + RMM_FUNC_RANGE(); + return allocate(bytes, alignment); } @@ -142,6 +151,8 @@ class pinned_host_memory_resource { std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) noexcept { + RMM_FUNC_RANGE(); + return deallocate(ptr, bytes); } @@ -161,6 +172,8 @@ class pinned_host_memory_resource { std::size_t alignment, [[maybe_unused]] cuda::stream_ref stream) noexcept { + RMM_FUNC_RANGE(); + return deallocate(ptr, bytes, alignment); } // NOLINTEND(bugprone-easily-swappable-parameters) diff --git a/include/rmm/thrust_rmm_allocator.h b/include/rmm/thrust_rmm_allocator.h deleted file mode 100644 index ad71e107a..000000000 --- a/include/rmm/thrust_rmm_allocator.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include - -namespace rmm { - -using par_t = decltype(thrust::cuda::par(*(new rmm::mr::thrust_allocator()))); -using deleter_t = std::function; -using exec_policy_t = std::unique_ptr; - -/** - * @brief Returns a unique_ptr to a Thrust CUDA execution policy that uses RMM - * for temporary memory allocation. - * - * @param stream The stream that the allocator will use - * - * @return A Thrust execution policy that will use RMM for temporary memory - * allocation. - */ -[[deprecated("Use new exec_policy in rmm/exec_policy.hpp")]] inline exec_policy_t exec_policy( - cudaStream_t stream = nullptr) -{ - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - auto* alloc = new rmm::mr::thrust_allocator(cuda_stream_view{stream}); - auto deleter = [alloc](par_t* pointer) { - delete alloc; // NOLINT(cppcoreguidelines-owning-memory) - delete pointer; // NOLINT(cppcoreguidelines-owning-memory) - }; - - exec_policy_t policy{new par_t(*alloc), deleter}; - return policy; -} - -} // namespace rmm diff --git a/python/LICENSE b/python/LICENSE deleted file mode 120000 index ea5b60640..000000000 --- a/python/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE \ No newline at end of file diff --git a/python/README.md b/python/README.md deleted file mode 120000 index 32d46ee88..000000000 --- a/python/README.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file diff --git a/python/librmm/CMakeLists.txt b/python/librmm/CMakeLists.txt new file mode 100644 index 000000000..a6035ac83 --- /dev/null +++ b/python/librmm/CMakeLists.txt @@ -0,0 +1,38 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +include(../../rapids_config.cmake) + +project( + librmm-python + VERSION "${RAPIDS_VERSION}" + LANGUAGES CXX) + +# Check if rmm is already available. If so, it's the user's responsibility to ensure that the CMake +# package is also available at build time of the Python rmm package. +find_package(rmm "${RAPIDS_VERSION}") + +if(rmm_FOUND) + return() +endif() + +unset(rmm_FOUND) + +set(BUILD_TESTS OFF) +set(BUILD_BENCHMARKS OFF) +set(CUDA_STATIC_RUNTIME ON) + +add_subdirectory(../.. rmm-cpp) diff --git a/python/librmm/LICENSE b/python/librmm/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/python/librmm/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/librmm/README.md b/python/librmm/README.md new file mode 120000 index 000000000..fe8400541 --- /dev/null +++ b/python/librmm/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/librmm/librmm/VERSION b/python/librmm/librmm/VERSION new file mode 120000 index 000000000..d62dc733e --- /dev/null +++ b/python/librmm/librmm/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/librmm/librmm/__init__.py b/python/librmm/librmm/__init__.py new file mode 100644 index 000000000..b914ecdc3 --- /dev/null +++ b/python/librmm/librmm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from librmm._version import __git_commit__, __version__ diff --git a/python/librmm/librmm/_version.py b/python/librmm/librmm/_version.py new file mode 100644 index 000000000..ea50101b2 --- /dev/null +++ b/python/librmm/librmm/_version.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("librmm").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/python/librmm/pyproject.toml b/python/librmm/pyproject.toml new file mode 100644 index 000000000..4b997be9a --- /dev/null +++ b/python/librmm/pyproject.toml @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[build-system] +build-backend = "scikit_build_core.build" +requires = [ + "cmake>=3.26.4", + "ninja", + "scikit-build-core[pyproject]>=0.7.0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project] +name = "librmm" +dynamic = ["version"] +description = "rmm - RAPIDS Memory Manager" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, +] +license = { text = "Apache 2.0" } +classifiers = [ + "Intended Audience :: Developers", + "Topic :: Database", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: C++", + "Environment :: GPU :: NVIDIA CUDA", +] + +[project.urls] +Homepage = "https://github.com/rapidsai/rmm" + +[project.entry-points."cmake.prefix"] +librmm = "librmm" + +[tool.scikit-build] +build-dir = "build/{wheel_tag}" +cmake.build-type = "Release" +cmake.minimum-version = "3.26.4" +ninja.make-fallback = true +sdist.reproducible = true +wheel.packages = ["librmm"] +wheel.install-dir = "librmm" +wheel.py-api = "py3" + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.regex" +input = "librmm/VERSION" +regex = "(?P.*)" diff --git a/python/.coveragerc b/python/rmm/.coveragerc similarity index 72% rename from python/.coveragerc rename to python/rmm/.coveragerc index 6b49c4df6..00997a39b 100644 --- a/python/.coveragerc +++ b/python/rmm/.coveragerc @@ -2,3 +2,4 @@ [run] include = rmm/* omit = rmm/tests/* +disable_warnings=include-ignored diff --git a/python/CMakeLists.txt b/python/rmm/CMakeLists.txt similarity index 67% rename from python/CMakeLists.txt rename to python/rmm/CMakeLists.txt index 87752ff24..6c2515102 100644 --- a/python/CMakeLists.txt +++ b/python/rmm/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -14,34 +14,19 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -include(../rapids_config.cmake) +include(../../rapids_config.cmake) project( rmm-python VERSION "${RAPIDS_VERSION}" LANGUAGES CXX) -option(FIND_RMM_CPP "Search for existing RMM C++ installations before defaulting to local files" - OFF) - -# If the user requested it we attempt to find RMM. -if(FIND_RMM_CPP) - find_package(rmm "${RAPIDS_VERSION}") -else() - set(rmm_FOUND OFF) -endif() - -if(NOT rmm_FOUND) - set(BUILD_TESTS OFF) - set(BUILD_BENCHMARKS OFF) - set(CUDA_STATIC_RUNTIME ON) - - add_subdirectory(../ rmm-cpp EXCLUDE_FROM_ALL) -endif() +find_package(rmm "${RAPIDS_VERSION}" REQUIRED) include(rapids-cython-core) rapids_cython_init() +# pass through logging level to spdlog add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") add_subdirectory(rmm/_cuda) diff --git a/python/rmm/LICENSE b/python/rmm/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/python/rmm/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/rmm/README.md b/python/rmm/README.md new file mode 120000 index 000000000..fe8400541 --- /dev/null +++ b/python/rmm/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/rmm/VERSION b/python/rmm/VERSION deleted file mode 120000 index 558194c5a..000000000 --- a/python/rmm/VERSION +++ /dev/null @@ -1 +0,0 @@ -../../VERSION \ No newline at end of file diff --git a/python/docs/Makefile b/python/rmm/docs/Makefile similarity index 100% rename from python/docs/Makefile rename to python/rmm/docs/Makefile diff --git a/python/docs/conf.py b/python/rmm/docs/conf.py similarity index 94% rename from python/docs/conf.py rename to python/rmm/docs/conf.py index d66e9d30c..dd8267e28 100644 --- a/python/docs/conf.py +++ b/python/rmm/docs/conf.py @@ -1,3 +1,5 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -58,7 +60,7 @@ ] # Breathe Configuration -breathe_projects = {"librmm": "../../doxygen/xml"} +breathe_projects = {"librmm": "../../../doxygen/xml"} breathe_default_project = "librmm" copybutton_prompt_text = ">>> " @@ -89,6 +91,18 @@ # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] +# List of warnings to suppress +suppress_warnings = [] + +# if the file deprecated.xml does not exist in the doxygen xml output, +# breathe will fail to build the docs, so we conditionally add +# "deprecated.rst" to the exclude_patterns list +if not os.path.exists( + os.path.join(breathe_projects["librmm"], "deprecated.xml") +): + exclude_patterns.append("librmm_docs/deprecated.rst") + suppress_warnings.append("toc.excluded") + # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" diff --git a/python/docs/cpp.rst b/python/rmm/docs/cpp.rst similarity index 100% rename from python/docs/cpp.rst rename to python/rmm/docs/cpp.rst diff --git a/python/docs/cpp_api.rst b/python/rmm/docs/cpp_api.rst similarity index 100% rename from python/docs/cpp_api.rst rename to python/rmm/docs/cpp_api.rst diff --git a/python/docs/guide.md b/python/rmm/docs/guide.md similarity index 97% rename from python/docs/guide.md rename to python/rmm/docs/guide.md index c06135ca8..968be8586 100644 --- a/python/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -182,8 +182,8 @@ for memory allocations using their by configuring the current allocator. ```python -from rmm.allocators.torch import rmm_torch_allocator -import torch +>>> from rmm.allocators.torch import rmm_torch_allocator +>>> import torch -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +>>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator) ``` diff --git a/python/docs/index.rst b/python/rmm/docs/index.rst similarity index 100% rename from python/docs/index.rst rename to python/rmm/docs/index.rst diff --git a/python/docs/librmm_docs/cuda_device_management.rst b/python/rmm/docs/librmm_docs/cuda_device_management.rst similarity index 100% rename from python/docs/librmm_docs/cuda_device_management.rst rename to python/rmm/docs/librmm_docs/cuda_device_management.rst diff --git a/python/docs/librmm_docs/cuda_streams.rst b/python/rmm/docs/librmm_docs/cuda_streams.rst similarity index 100% rename from python/docs/librmm_docs/cuda_streams.rst rename to python/rmm/docs/librmm_docs/cuda_streams.rst diff --git a/python/docs/librmm_docs/data_containers.rst b/python/rmm/docs/librmm_docs/data_containers.rst similarity index 100% rename from python/docs/librmm_docs/data_containers.rst rename to python/rmm/docs/librmm_docs/data_containers.rst diff --git a/python/docs/librmm_docs/deprecated.rst b/python/rmm/docs/librmm_docs/deprecated.rst similarity index 100% rename from python/docs/librmm_docs/deprecated.rst rename to python/rmm/docs/librmm_docs/deprecated.rst diff --git a/python/docs/librmm_docs/errors.rst b/python/rmm/docs/librmm_docs/errors.rst similarity index 100% rename from python/docs/librmm_docs/errors.rst rename to python/rmm/docs/librmm_docs/errors.rst diff --git a/python/docs/librmm_docs/index.rst b/python/rmm/docs/librmm_docs/index.rst similarity index 100% rename from python/docs/librmm_docs/index.rst rename to python/rmm/docs/librmm_docs/index.rst diff --git a/python/docs/librmm_docs/logging.rst b/python/rmm/docs/librmm_docs/logging.rst similarity index 100% rename from python/docs/librmm_docs/logging.rst rename to python/rmm/docs/librmm_docs/logging.rst diff --git a/python/docs/librmm_docs/memory_resources.rst b/python/rmm/docs/librmm_docs/memory_resources.rst similarity index 100% rename from python/docs/librmm_docs/memory_resources.rst rename to python/rmm/docs/librmm_docs/memory_resources.rst diff --git a/python/docs/librmm_docs/thrust_integrations.rst b/python/rmm/docs/librmm_docs/thrust_integrations.rst similarity index 100% rename from python/docs/librmm_docs/thrust_integrations.rst rename to python/rmm/docs/librmm_docs/thrust_integrations.rst diff --git a/python/docs/librmm_docs/utilities.rst b/python/rmm/docs/librmm_docs/utilities.rst similarity index 100% rename from python/docs/librmm_docs/utilities.rst rename to python/rmm/docs/librmm_docs/utilities.rst diff --git a/python/docs/python.rst b/python/rmm/docs/python.rst similarity index 100% rename from python/docs/python.rst rename to python/rmm/docs/python.rst diff --git a/python/docs/python_api.rst b/python/rmm/docs/python_api.rst similarity index 100% rename from python/docs/python_api.rst rename to python/rmm/docs/python_api.rst diff --git a/python/pyproject.toml b/python/rmm/pyproject.toml similarity index 86% rename from python/pyproject.toml rename to python/rmm/pyproject.toml index 3f294d180..eb0d23380 100644 --- a/python/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,9 +18,10 @@ requires = [ "cmake>=3.26.4", "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", + "librmm==24.6.*", "ninja", "scikit-build-core[pyproject]>=0.7.0", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`. +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "rmm" @@ -36,7 +37,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`. +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Topic :: Database", @@ -52,7 +53,7 @@ classifiers = [ test = [ "pytest", "pytest-cov", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`. +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/rmm" @@ -123,3 +124,9 @@ wheel.packages = ["rmm"] provider = "scikit_build_core.metadata.regex" input = "rmm/VERSION" regex = "(?P.*)" + +[tool.pytest.ini_options] +# treat warnings as errors +filterwarnings = [ + "error", +] diff --git a/python/rmm/rmm/VERSION b/python/rmm/rmm/VERSION new file mode 120000 index 000000000..d62dc733e --- /dev/null +++ b/python/rmm/rmm/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/rmm/__init__.py b/python/rmm/rmm/__init__.py similarity index 100% rename from python/rmm/__init__.py rename to python/rmm/rmm/__init__.py diff --git a/python/rmm/_cuda/CMakeLists.txt b/python/rmm/rmm/_cuda/CMakeLists.txt similarity index 100% rename from python/rmm/_cuda/CMakeLists.txt rename to python/rmm/rmm/_cuda/CMakeLists.txt diff --git a/python/rmm/_cuda/__init__.pxd b/python/rmm/rmm/_cuda/__init__.pxd similarity index 100% rename from python/rmm/_cuda/__init__.pxd rename to python/rmm/rmm/_cuda/__init__.pxd diff --git a/python/rmm/_cuda/__init__.py b/python/rmm/rmm/_cuda/__init__.py similarity index 100% rename from python/rmm/_cuda/__init__.py rename to python/rmm/rmm/_cuda/__init__.py diff --git a/python/rmm/_cuda/gpu.py b/python/rmm/rmm/_cuda/gpu.py similarity index 100% rename from python/rmm/_cuda/gpu.py rename to python/rmm/rmm/_cuda/gpu.py diff --git a/python/rmm/_cuda/stream.pxd b/python/rmm/rmm/_cuda/stream.pxd similarity index 100% rename from python/rmm/_cuda/stream.pxd rename to python/rmm/rmm/_cuda/stream.pxd diff --git a/python/rmm/_cuda/stream.pyx b/python/rmm/rmm/_cuda/stream.pyx similarity index 95% rename from python/rmm/_cuda/stream.pyx rename to python/rmm/rmm/_cuda/stream.pyx index 4795cbb9f..4d5ff5232 100644 --- a/python/rmm/_cuda/stream.pyx +++ b/python/rmm/rmm/_cuda/stream.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -100,7 +100,8 @@ cdef class Stream: def _init_from_cupy_stream(self, obj): try: import cupy - if isinstance(obj, cupy.cuda.stream.Stream): + if isinstance(obj, (cupy.cuda.stream.Stream, + cupy.cuda.stream.ExternalStream)): self._cuda_stream = (obj.ptr) self._owner = obj return diff --git a/python/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/_lib/CMakeLists.txt similarity index 100% rename from python/rmm/_lib/CMakeLists.txt rename to python/rmm/rmm/_lib/CMakeLists.txt diff --git a/python/rmm/_lib/__init__.pxd b/python/rmm/rmm/_lib/__init__.pxd similarity index 100% rename from python/rmm/_lib/__init__.pxd rename to python/rmm/rmm/_lib/__init__.pxd diff --git a/python/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py similarity index 100% rename from python/rmm/_lib/__init__.py rename to python/rmm/rmm/_lib/__init__.py diff --git a/python/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp similarity index 100% rename from python/rmm/_lib/_torch_allocator.cpp rename to python/rmm/rmm/_lib/_torch_allocator.cpp diff --git a/python/rmm/_lib/cuda_stream.pxd b/python/rmm/rmm/_lib/cuda_stream.pxd similarity index 100% rename from python/rmm/_lib/cuda_stream.pxd rename to python/rmm/rmm/_lib/cuda_stream.pxd diff --git a/python/rmm/_lib/cuda_stream.pyx b/python/rmm/rmm/_lib/cuda_stream.pyx similarity index 100% rename from python/rmm/_lib/cuda_stream.pyx rename to python/rmm/rmm/_lib/cuda_stream.pyx diff --git a/python/rmm/_lib/cuda_stream_pool.pxd b/python/rmm/rmm/_lib/cuda_stream_pool.pxd similarity index 100% rename from python/rmm/_lib/cuda_stream_pool.pxd rename to python/rmm/rmm/_lib/cuda_stream_pool.pxd diff --git a/python/rmm/_lib/cuda_stream_view.pxd b/python/rmm/rmm/_lib/cuda_stream_view.pxd similarity index 100% rename from python/rmm/_lib/cuda_stream_view.pxd rename to python/rmm/rmm/_lib/cuda_stream_view.pxd diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd similarity index 81% rename from python/rmm/_lib/device_buffer.pxd rename to python/rmm/rmm/_lib/device_buffer.pxd index 3d5f29f9a..2ff1a7da9 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/rmm/_lib/device_buffer.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,17 +17,31 @@ from libcpp.memory cimport unique_ptr from rmm._cuda.stream cimport Stream from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm._lib.memory_resource cimport ( + DeviceMemoryResource, + device_memory_resource, +) cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: cdef cppclass device_buffer: device_buffer() - device_buffer(size_t size, cuda_stream_view stream) except + - device_buffer(const void* source_data, - size_t size, cuda_stream_view stream) except + - device_buffer(const device_buffer buf, - cuda_stream_view stream) except + + device_buffer( + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const void* source_data, + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const device_buffer buf, + cuda_stream_view stream, + device_memory_resource * + ) except + void reserve(size_t new_capacity, cuda_stream_view stream) except + void resize(size_t new_size, cuda_stream_view stream) except + void shrink_to_fit(cuda_stream_view stream) except + @@ -51,7 +65,8 @@ cdef class DeviceBuffer: @staticmethod cdef DeviceBuffer c_from_unique_ptr( unique_ptr[device_buffer] ptr, - Stream stream=* + Stream stream=*, + DeviceMemoryResource mr=*, ) @staticmethod diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/_lib/device_buffer.pyx similarity index 94% rename from python/rmm/_lib/device_buffer.pyx rename to python/rmm/rmm/_lib/device_buffer.pyx index d248d01ab..9d2298d8b 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/rmm/_lib/device_buffer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ import numpy as np cimport cython -from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_FromStringAndSize +from cpython.bytes cimport PyBytes_FromStringAndSize from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -32,7 +32,11 @@ from cuda.ccudart cimport ( cudaStream_t, ) -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm._lib.memory_resource cimport ( + DeviceMemoryResource, + device_memory_resource, + get_current_device_resource, +) # The DeviceMemoryResource attribute could be released prematurely @@ -45,7 +49,8 @@ cdef class DeviceBuffer: def __cinit__(self, *, uintptr_t ptr=0, size_t size=0, - Stream stream=DEFAULT_STREAM): + Stream stream=DEFAULT_STREAM, + DeviceMemoryResource mr=None): """Construct a ``DeviceBuffer`` with optional size and data pointer Parameters @@ -62,6 +67,9 @@ cdef class DeviceBuffer: scope while the DeviceBuffer is in use. Destroying the underlying stream while the DeviceBuffer is in use will result in undefined behavior. + mr : optional + DeviceMemoryResource for the allocation, if not provided + defaults to the current device resource. Note ---- @@ -75,24 +83,23 @@ cdef class DeviceBuffer: >>> db = rmm.DeviceBuffer(size=5) """ cdef const void* c_ptr + cdef device_memory_resource * mr_ptr + # Save a reference to the MR and stream used for allocation + self.mr = get_current_device_resource() if mr is None else mr + self.stream = stream + mr_ptr = self.mr.get_mr() with nogil: c_ptr = ptr - if size == 0: - self.c_obj.reset(new device_buffer()) - elif c_ptr == NULL: - self.c_obj.reset(new device_buffer(size, stream.view())) + if c_ptr == NULL or size == 0: + self.c_obj.reset(new device_buffer(size, stream.view(), mr_ptr)) else: - self.c_obj.reset(new device_buffer(c_ptr, size, stream.view())) + self.c_obj.reset(new device_buffer(c_ptr, size, stream.view(), mr_ptr)) if stream.c_is_default(): stream.c_synchronize() - # Save a reference to the MR and stream used for allocation - self.mr = get_current_device_resource() - self.stream = stream - def __len__(self): return self.size @@ -160,13 +167,14 @@ cdef class DeviceBuffer: @staticmethod cdef DeviceBuffer c_from_unique_ptr( unique_ptr[device_buffer] ptr, - Stream stream=DEFAULT_STREAM + Stream stream=DEFAULT_STREAM, + DeviceMemoryResource mr=None, ): cdef DeviceBuffer buf = DeviceBuffer.__new__(DeviceBuffer) if stream.c_is_default(): stream.c_synchronize() buf.c_obj = move(ptr) - buf.mr = get_current_device_resource() + buf.mr = get_current_device_resource() if mr is None else mr buf.stream = stream return buf @@ -312,7 +320,7 @@ cdef class DeviceBuffer: cdef size_t s = dbp.size() cdef bytes b = PyBytes_FromStringAndSize(NULL, s) - cdef unsigned char* p = PyBytes_AS_STRING(b) + cdef unsigned char* p = b cdef unsigned char[::1] mv = (p)[:s] self.copy_to_host(mv, stream) diff --git a/python/rmm/_lib/device_uvector.pxd b/python/rmm/rmm/_lib/device_uvector.pxd similarity index 100% rename from python/rmm/_lib/device_uvector.pxd rename to python/rmm/rmm/_lib/device_uvector.pxd diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/rmm/_lib/lib.pxd similarity index 100% rename from python/rmm/_lib/lib.pxd rename to python/rmm/rmm/_lib/lib.pxd diff --git a/python/rmm/_lib/lib.pyx b/python/rmm/rmm/_lib/lib.pyx similarity index 100% rename from python/rmm/_lib/lib.pyx rename to python/rmm/rmm/_lib/lib.pyx diff --git a/python/rmm/_lib/logger.pyx b/python/rmm/rmm/_lib/logger.pyx similarity index 100% rename from python/rmm/_lib/logger.pyx rename to python/rmm/rmm/_lib/logger.pyx diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd similarity index 97% rename from python/rmm/_lib/memory_resource.pxd rename to python/rmm/rmm/_lib/memory_resource.pxd index 0770fb8ed..f9c2e91de 100644 --- a/python/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -34,7 +34,7 @@ cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ cdef class DeviceMemoryResource: cdef shared_ptr[device_memory_resource] c_obj - cdef device_memory_resource* get_mr(self) + cdef device_memory_resource* get_mr(self) noexcept nogil cdef class UpstreamResourceAdaptor(DeviceMemoryResource): cdef readonly DeviceMemoryResource upstream_mr diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx similarity index 99% rename from python/rmm/_lib/memory_resource.pyx rename to python/rmm/rmm/_lib/memory_resource.pyx index 7458ca025..100d18b56 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/_lib/memory_resource.pyx @@ -218,7 +218,7 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ cdef class DeviceMemoryResource: - cdef device_memory_resource* get_mr(self): + cdef device_memory_resource* get_mr(self) noexcept nogil: """Get the underlying C++ memory resource object.""" return self.c_obj.get() diff --git a/python/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd similarity index 100% rename from python/rmm/_lib/per_device_resource.pxd rename to python/rmm/rmm/_lib/per_device_resource.pxd diff --git a/python/rmm/_lib/tests/__init__.py b/python/rmm/rmm/_lib/tests/__init__.py similarity index 100% rename from python/rmm/_lib/tests/__init__.py rename to python/rmm/rmm/_lib/tests/__init__.py diff --git a/python/rmm/_lib/tests/test_device_buffer.pyx b/python/rmm/rmm/_lib/tests/test_device_buffer.pyx similarity index 100% rename from python/rmm/_lib/tests/test_device_buffer.pyx rename to python/rmm/rmm/_lib/tests/test_device_buffer.pyx diff --git a/python/rmm/_version.py b/python/rmm/rmm/_version.py similarity index 100% rename from python/rmm/_version.py rename to python/rmm/rmm/_version.py diff --git a/python/rmm/allocators/__init__.py b/python/rmm/rmm/allocators/__init__.py similarity index 100% rename from python/rmm/allocators/__init__.py rename to python/rmm/rmm/allocators/__init__.py diff --git a/python/rmm/allocators/cupy.py b/python/rmm/rmm/allocators/cupy.py similarity index 100% rename from python/rmm/allocators/cupy.py rename to python/rmm/rmm/allocators/cupy.py diff --git a/python/rmm/allocators/numba.py b/python/rmm/rmm/allocators/numba.py similarity index 100% rename from python/rmm/allocators/numba.py rename to python/rmm/rmm/allocators/numba.py diff --git a/python/rmm/allocators/torch.py b/python/rmm/rmm/allocators/torch.py similarity index 100% rename from python/rmm/allocators/torch.py rename to python/rmm/rmm/allocators/torch.py diff --git a/python/rmm/mr.py b/python/rmm/rmm/mr.py similarity index 100% rename from python/rmm/mr.py rename to python/rmm/rmm/mr.py diff --git a/python/rmm/rmm.py b/python/rmm/rmm/rmm.py similarity index 100% rename from python/rmm/rmm.py rename to python/rmm/rmm/rmm.py diff --git a/python/rmm/tests/conftest.py b/python/rmm/rmm/tests/conftest.py similarity index 100% rename from python/rmm/tests/conftest.py rename to python/rmm/rmm/tests/conftest.py diff --git a/python/rmm/tests/test_cython.py b/python/rmm/rmm/tests/test_cython.py similarity index 100% rename from python/rmm/tests/test_cython.py rename to python/rmm/rmm/tests/test_cython.py diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py similarity index 97% rename from python/rmm/tests/test_rmm.py rename to python/rmm/rmm/tests/test_rmm.py index 25ff9a7a6..c37fe0298 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. import copy +import functools import gc import os import pickle @@ -498,6 +499,32 @@ def test_mr_devicebuffer_lifetime(): del a +def test_device_buffer_with_mr(): + allocations = [] + base = rmm.mr.CudaMemoryResource() + rmm.mr.set_current_device_resource(base) + + def alloc_cb(size, stream, *, base): + allocations.append(size) + return base.allocate(size, stream) + + def dealloc_cb(ptr, size, stream, *, base): + return base.deallocate(ptr, size, stream) + + cb_mr = rmm.mr.CallbackMemoryResource( + functools.partial(alloc_cb, base=base), + functools.partial(dealloc_cb, base=base), + ) + rmm.DeviceBuffer(size=10) + assert len(allocations) == 0 + buf = rmm.DeviceBuffer(size=256, mr=cb_mr) + assert len(allocations) == 1 + assert allocations[0] == 256 + del cb_mr + gc.collect() + del buf + + def test_mr_upstream_lifetime(): # Simple test to ensure upstream MRs are deallocated before downstream MR cuda_mr = rmm.mr.CudaMemoryResource() diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/rmm/tests/test_rmm_pytorch.py similarity index 100% rename from python/rmm/tests/test_rmm_pytorch.py rename to python/rmm/rmm/tests/test_rmm_pytorch.py diff --git a/scripts/run-cmake-format.sh b/scripts/run-cmake-format.sh index a7d9984b3..df7f22782 100755 --- a/scripts/run-cmake-format.sh +++ b/scripts/run-cmake-format.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. # This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for @@ -43,6 +44,7 @@ fi DEFAULT_FORMAT_FILE_LOCATIONS=( "${RMM_BUILD_DIR}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${RMM_BUILD_DIR:-build}/latest/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" ) if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then diff --git a/tests/mr/device/polymorphic_allocator_tests.cpp b/tests/mr/device/polymorphic_allocator_tests.cpp index 3b73d4a49..d433e010c 100644 --- a/tests/mr/device/polymorphic_allocator_tests.cpp +++ b/tests/mr/device/polymorphic_allocator_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,14 +33,15 @@ struct allocator_test : public ::testing::Test { TEST_F(allocator_test, default_resource) { rmm::mr::polymorphic_allocator allocator{}; - EXPECT_EQ(allocator.resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(allocator.get_upstream_resource(), + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); } TEST_F(allocator_test, custom_resource) { rmm::mr::cuda_memory_resource mr; rmm::mr::polymorphic_allocator allocator{&mr}; - EXPECT_EQ(allocator.resource(), &mr); + EXPECT_EQ(allocator.get_upstream_resource(), rmm::device_async_resource_ref{mr}); } void test_conversion(rmm::mr::polymorphic_allocator /*unused*/) {} @@ -48,7 +49,7 @@ void test_conversion(rmm::mr::polymorphic_allocator /*unused*/) {} TEST_F(allocator_test, implicit_conversion) { rmm::mr::cuda_memory_resource mr; - test_conversion(&mr); + test_conversion(rmm::device_async_resource_ref{mr}); } TEST_F(allocator_test, self_equality) @@ -84,7 +85,7 @@ TEST_F(allocator_test, copy_ctor_same_type) rmm::mr::polymorphic_allocator alloc0; rmm::mr::polymorphic_allocator alloc1{alloc0}; EXPECT_EQ(alloc0, alloc1); - EXPECT_EQ(alloc0.resource(), alloc1.resource()); + EXPECT_EQ(alloc0.get_upstream_resource(), alloc1.get_upstream_resource()); } TEST_F(allocator_test, copy_ctor_different_type) @@ -92,7 +93,7 @@ TEST_F(allocator_test, copy_ctor_different_type) rmm::mr::polymorphic_allocator alloc0; rmm::mr::polymorphic_allocator alloc1{alloc0}; EXPECT_EQ(alloc0, alloc1); - EXPECT_EQ(alloc0.resource(), alloc1.resource()); + EXPECT_EQ(alloc0.get_upstream_resource(), alloc1.get_upstream_resource()); } TEST_F(allocator_test, rebind) diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index b94d6b3e1..e855d1036 100644 --- a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -16,7 +16,9 @@ #include "mr_ref_test.hpp" +#include #include +#include #include #include #include @@ -36,6 +38,7 @@ struct allocator_test : public mr_ref_test {}; TEST_P(allocator_test, first) { + rmm::mr::set_current_device_resource(this->mr.get()); auto const num_ints{100}; rmm::device_vector ints(num_ints, 1); EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end())); @@ -43,12 +46,28 @@ TEST_P(allocator_test, first) TEST_P(allocator_test, defaults) { + rmm::mr::set_current_device_resource(this->mr.get()); rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); EXPECT_EQ(allocator.get_upstream_resource(), rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); } +TEST_P(allocator_test, multi_device) +{ + if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; } + cuda_set_device_raii with_device{rmm::get_current_cuda_device()}; + rmm::cuda_stream stream{}; + // make allocator on device-0 + rmm::mr::thrust_allocator allocator(stream.view(), this->ref); + auto const size{100}; + EXPECT_NO_THROW([&]() { + auto vec = rmm::device_vector(size, allocator); + // Destruct with device-1 active + RMM_CUDA_TRY(cudaSetDevice(1)); + }()); +} + INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, allocator_test, ::testing::Values(mr_factory{"CUDA", &make_cuda},