RMM: cuda_async_memory_resource.hpp Source File

 /*

  * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.

  * SPDX-License-Identifier: Apache-2.0

  */

 #pragma once


 #include <rmm/cuda_device.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/runtime_capabilities.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/mr/cuda_async_view_memory_resource.hpp>

 #include <rmm/mr/device_memory_resource.hpp>


 #include <cuda/std/type_traits>

 #include <cuda_runtime_api.h>


 #include <cstddef>

 #include <cstdint>

 #include <optional>


 namespace RMM_NAMESPACE {

 namespace mr {

 class cuda_async_memory_resource final : public device_memory_resource {

  public:

   enum class allocation_handle_type : std::int32_t {

     none = cudaMemHandleTypeNone,

     posix_file_descriptor =

       cudaMemHandleTypePosixFileDescriptor,

     win32 =

       cudaMemHandleTypeWin32,

     win32_kmt = cudaMemHandleTypeWin32Kmt,

     fabric = 0x8

   };


   enum class mempool_usage : unsigned short {

     hw_decompress = 0x2,

   };


   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)

   cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size             = {},

                              std::optional<std::size_t> release_threshold             = {},

                              std::optional<allocation_handle_type> export_handle_type = {})

   {

     // Check if cudaMallocAsync Memory pool supported

     RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),

                 "cudaMallocAsync not supported with this CUDA driver/runtime version");


     // Construct explicit pool

     cudaMemPoolProps pool_props{};

     pool_props.allocType   = cudaMemAllocationTypePinned;

     pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(

       export_handle_type.value_or(allocation_handle_type::none));


 #if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_HWDECOMPRESS_CUDA_DRIVER_VERSION

     // Enable hardware decompression if supported (requires CUDA 12.8 driver or higher)

     if (rmm::detail::hwdecompress::is_supported()) {

       pool_props.usage = static_cast<unsigned short>(mempool_usage::hw_decompress);

     }

 #endif


     RMM_EXPECTS(rmm::detail::export_handle_type::is_supported(pool_props.handleTypes),

                 "Requested IPC memory handle type not supported");

     pool_props.location.type = cudaMemLocationTypeDevice;

     pool_props.location.id   = rmm::get_current_cuda_device().value();

     cudaMemPool_t cuda_pool_handle{};

     RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));

     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};


     auto const [free, total] = rmm::available_device_memory();


     // Need an l-value to take address to pass to cudaMemPoolSetAttribute

     uint64_t threshold = release_threshold.value_or(total);

     RMM_CUDA_TRY(

       cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));


     // Allocate and immediately deallocate the initial_pool_size to prime the pool with the

     // specified size (only if initial_pool_size is provided)

     if (initial_pool_size.has_value()) {

       auto const pool_size = initial_pool_size.value();

       auto* ptr            = do_allocate(pool_size, cuda_stream_default);

       do_deallocate(ptr, pool_size, cuda_stream_default);

     }

   }


   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }


   ~cuda_async_memory_resource() override

   {

     RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));

   }

   cuda_async_memory_resource(cuda_async_memory_resource const&)            = delete;

   cuda_async_memory_resource(cuda_async_memory_resource&&)                 = delete;

   cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;

   cuda_async_memory_resource& operator=(cuda_async_memory_resource&&)      = delete;


  private:

   cuda_async_view_memory_resource pool_{};


   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override

   {

     void* ptr{nullptr};

     ptr = pool_.allocate(stream, bytes);

     return ptr;

   }


   void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override

   {

     pool_.deallocate(stream, ptr, bytes);

   }


   [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override

   {

     auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);

     return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());

   }

 };

   // end of group

 }  // namespace mr

 }  // namespace RMM_NAMESPACE

rmm::cuda_stream_view
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:28

rmm::mr::cuda_async_memory_resource
device_memory_resource derived class that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocation.
Definition: cuda_async_memory_resource.hpp:35

rmm::mr::cuda_async_memory_resource::allocation_handle_type
allocation_handle_type
Flags for specifying memory allocation handle types.
Definition: cuda_async_memory_resource.hpp:49

rmm::mr::cuda_async_memory_resource::cuda_async_memory_resource
cuda_async_memory_resource(std::optional< std::size_t > initial_pool_size={}, std::optional< std::size_t > release_threshold={}, std::optional< allocation_handle_type > export_handle_type={})
Constructs a cuda_async_memory_resource with the optionally specified initial pool size and release threshold.
Definition: cuda_async_memory_resource.hpp:96

rmm::mr::cuda_async_memory_resource::mempool_usage
mempool_usage
Flags for specifying memory pool usage.
Definition: cuda_async_memory_resource.hpp:72

rmm::mr::cuda_async_memory_resource::pool_handle
cudaMemPool_t pool_handle() const noexcept
Returns the underlying native handle to the CUDA pool.
Definition: cuda_async_memory_resource.hpp:146

rmm::mr::device_memory_resource
Base class for all librmm device memory allocation.
Definition: device_memory_resource.hpp:83

cuda_async_view_memory_resource.hpp

cuda_device.hpp

cuda_stream_view.hpp

device_memory_resource.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.

rmm::get_current_cuda_device
cuda_device_id get_current_cuda_device()
Returns a cuda_device_id for the current device.

rmm::cuda_stream_default
static constexpr cuda_stream_view cuda_stream_default
Static cuda_stream_view of the default stream (stream 0), for convenience.
Definition: cuda_stream_view.hpp:111

rmm::cuda_device_id::value
constexpr value_type value() const noexcept
The wrapped integer value.
Definition: cuda_device.hpp:43