cuda_async_memory_resource.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include <rmm/cuda_device.hpp>
9 #include <rmm/detail/error.hpp>
10 #include <rmm/detail/export.hpp>
11 #include <rmm/detail/runtime_capabilities.hpp>
14 
15 #include <cuda/std/type_traits>
16 #include <cuda_runtime_api.h>
17 
18 #include <cstddef>
19 #include <cstdint>
20 #include <optional>
21 
22 namespace RMM_NAMESPACE {
23 namespace mr {
35  public:
48  enum class allocation_handle_type : std::int32_t {
49  none = cudaMemHandleTypeNone,
50  posix_file_descriptor =
51  cudaMemHandleTypePosixFileDescriptor,
53  win32 =
54  cudaMemHandleTypeWin32,
55  win32_kmt = cudaMemHandleTypeWin32Kmt,
57  fabric = 0x8
58  };
59 
/**
 * @brief Usage flags for the CUDA memory pool.
 *
 * The underlying type is `unsigned short` because the constructor stores the flag in
 * `cudaMemPoolProps::usage` through a `static_cast<unsigned short>`.
 */
enum class mempool_usage : unsigned short {
  /// Set by the constructor when `rmm::detail::hwdecompress::is_supported()` reports true.
  hw_decompress = 0x2
};
75 
94  // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
95  cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size = {},
96  std::optional<std::size_t> release_threshold = {},
97  std::optional<allocation_handle_type> export_handle_type = {})
98  {
99  // Check if cudaMallocAsync Memory pool supported
100  RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
101  "cudaMallocAsync not supported with this CUDA driver/runtime version");
102 
103  // Construct explicit pool
104  cudaMemPoolProps pool_props{};
105  pool_props.allocType = cudaMemAllocationTypePinned;
106  pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(
107  export_handle_type.value_or(allocation_handle_type::none));
108 
109 #if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_HWDECOMPRESS_CUDA_DRIVER_VERSION
110  // Enable hardware decompression if supported (requires CUDA 12.8 driver or higher)
111  if (rmm::detail::hwdecompress::is_supported()) {
112  pool_props.usage = static_cast<unsigned short>(mempool_usage::hw_decompress);
113  }
114 #endif
115 
116  RMM_EXPECTS(rmm::detail::export_handle_type::is_supported(pool_props.handleTypes),
117  "Requested IPC memory handle type not supported");
118  pool_props.location.type = cudaMemLocationTypeDevice;
119  pool_props.location.id = rmm::get_current_cuda_device().value();
120  cudaMemPool_t cuda_pool_handle{};
121  RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
122  pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
123 
124  auto const [free, total] = rmm::available_device_memory();
125 
126  // Need an l-value to take address to pass to cudaMemPoolSetAttribute
127  uint64_t threshold = release_threshold.value_or(total);
128  RMM_CUDA_TRY(
129  cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));
130 
131  // Allocate and immediately deallocate the initial_pool_size to prime the pool with the
132  // specified size (only if initial_pool_size is provided)
133  if (initial_pool_size.has_value()) {
134  auto const pool_size = initial_pool_size.value();
135  auto* ptr = do_allocate(pool_size, cuda_stream_default);
136  do_deallocate(ptr, pool_size, cuda_stream_default);
137  }
138  }
139 
145  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
146 
147  ~cuda_async_memory_resource() override
148  {
149  RMM_ASSERT_CUDA_SUCCESS_SAFE_SHUTDOWN(cudaMemPoolDestroy(pool_handle()));
150  }
151  cuda_async_memory_resource(cuda_async_memory_resource const&) = delete;
152  cuda_async_memory_resource(cuda_async_memory_resource&&) = delete;
153  cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
154  cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;
155 
156  private:
157  cuda_async_view_memory_resource pool_{};
158 
168  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
169  {
170  void* ptr{nullptr};
171  ptr = pool_.allocate(stream, bytes);
172  return ptr;
173  }
174 
183  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
184  {
185  pool_.deallocate(stream, ptr, bytes);
186  }
187 
195  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
196  {
197  auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);
198  return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
199  }
200 };
201  // end of group
203 } // namespace mr
204 } // namespace RMM_NAMESPACE
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:28
device_memory_resource derived class that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocation.
Definition: cuda_async_memory_resource.hpp:34
allocation_handle_type
Flags for specifying memory allocation handle types.
Definition: cuda_async_memory_resource.hpp:48
cuda_async_memory_resource(std::optional< std::size_t > initial_pool_size={}, std::optional< std::size_t > release_threshold={}, std::optional< allocation_handle_type > export_handle_type={})
Constructs a cuda_async_memory_resource with the optionally specified initial pool size and release threshold.
Definition: cuda_async_memory_resource.hpp:95
mempool_usage
Flags for specifying memory pool usage.
Definition: cuda_async_memory_resource.hpp:71
cudaMemPool_t pool_handle() const noexcept
Returns the underlying native handle to the CUDA pool.
Definition: cuda_async_memory_resource.hpp:145
Base class for all librmm device memory allocation.
Definition: device_memory_resource.hpp:82
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.
cuda_device_id get_current_cuda_device()
Returns a cuda_device_id for the current device.
static constexpr cuda_stream_view cuda_stream_default
Static cuda_stream_view of the default stream (stream 0), for convenience.
Definition: cuda_stream_view.hpp:111
constexpr value_type value() const noexcept
The wrapped integer value.
Definition: cuda_device.hpp:43