cuda_async_memory_resource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <rmm/cuda_device.hpp>
19 #include <rmm/cuda_stream_view.hpp>
20 #include <rmm/detail/error.hpp>
21 #include <rmm/detail/export.hpp>
22 #include <rmm/detail/runtime_async_alloc.hpp>
23 #include <rmm/detail/thrust_namespace.h>
26 
27 #include <cuda/std/type_traits>
28 #include <cuda_runtime_api.h>
29 
30 #include <cstddef>
31 #include <cstdint>
32 #include <optional>
33 
34 namespace RMM_NAMESPACE {
35 namespace mr {
47  public:
/**
 * @brief Flags for specifying memory allocation handle types.
 *
 * The constructor converts a value of this type to `cudaMemAllocationHandleType` with a plain
 * `static_cast`, so the numeric values below are what is handed to CUDA.
 *
 * NOTE(review): the enumerators appear to mirror CUDA's `cudaMemHandleType*` constants one to
 * one; confirm against the CUDA headers before adding or changing a value.
 */
enum class allocation_handle_type : std::int32_t {
  none                  = 0,       // no export handle type; the constructor's default
  posix_file_descriptor = 1 << 0,  // 0x1
  win32                 = 1 << 1,  // 0x2
  win32_kmt             = 1 << 2,  // 0x4
  fabric                = 1 << 3   // 0x8
};
67 
/**
 * @brief Flags for specifying memory pool usage.
 *
 * The constructor writes `hw_decompress` into `cudaMemPoolProps::usage` when hardware
 * decompression is reported as supported.
 */
enum class mempool_usage : unsigned short {
  hw_decompress = 1 << 1  // 0x2
};
78 
96  // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
97  cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size = {},
98  std::optional<std::size_t> release_threshold = {},
99  std::optional<allocation_handle_type> export_handle_type = {})
100  {
101  // Check if cudaMallocAsync Memory pool supported
102  RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
103  "cudaMallocAsync not supported with this CUDA driver/runtime version");
104 
105  // Construct explicit pool
106  cudaMemPoolProps pool_props{};
107  pool_props.allocType = cudaMemAllocationTypePinned;
108  pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(
109  export_handle_type.value_or(allocation_handle_type::none));
110 
111 #if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_HWDECOMPRESS_CUDA_DRIVER_VERSION
112  // Enable hardware decompression if supported (requires CUDA 12.8 driver or higher)
113  if (rmm::detail::runtime_async_alloc::is_hwdecompress_supported()) {
114  pool_props.usage = static_cast<unsigned short>(mempool_usage::hw_decompress);
115  }
116 #endif
117 
118  RMM_EXPECTS(
119  rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
120  "Requested IPC memory handle type not supported");
121  pool_props.location.type = cudaMemLocationTypeDevice;
122  pool_props.location.id = rmm::get_current_cuda_device().value();
123  cudaMemPool_t cuda_pool_handle{};
124  RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
125  pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
126 
127  // CUDA drivers before 11.5 have known incompatibilities with the async allocator.
128  // We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5.
129  // See https://github.com/NVIDIA/spark-rapids/issues/4710.
130  int driver_version{};
131  RMM_CUDA_TRY(cudaDriverGetVersion(&driver_version));
132  constexpr auto min_async_driver_version{11050};
133  if (driver_version < min_async_driver_version) {
134  int disabled{0};
135  RMM_CUDA_TRY(
136  cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
137  }
138 
139  auto const [free, total] = rmm::available_device_memory();
140 
141  // Need an l-value to take address to pass to cudaMemPoolSetAttribute
142  uint64_t threshold = release_threshold.value_or(total);
143  RMM_CUDA_TRY(
144  cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));
145 
146  // Allocate and immediately deallocate the initial_pool_size to prime the pool with the
147  // specified size
148  auto const pool_size = initial_pool_size.value_or(free / 2);
149  auto* ptr = do_allocate(pool_size, cuda_stream_default);
150  do_deallocate(ptr, pool_size, cuda_stream_default);
151  }
152 
158  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
159 
160  ~cuda_async_memory_resource() override
161  {
162  RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));
163  }
164  cuda_async_memory_resource(cuda_async_memory_resource const&) = delete;
165  cuda_async_memory_resource(cuda_async_memory_resource&&) = delete;
166  cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
167  cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;
168 
169  private:
170  cuda_async_view_memory_resource pool_{};  ///< View over the pool created in the constructor; allocate/deallocate are forwarded to it
171 
181  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
182  {
183  void* ptr{nullptr};
184  ptr = pool_.allocate(bytes, stream);
185  return ptr;
186  }
187 
196  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
197  {
198  pool_.deallocate(ptr, bytes, stream);
199  }
200 
208  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
209  {
210  auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);
211  return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
212  }
213 };
214  // end of group
216 } // namespace mr
217 } // namespace RMM_NAMESPACE
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:39
device_memory_resource derived class that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocation.
Definition: cuda_async_memory_resource.hpp:46
allocation_handle_type
Flags for specifying memory allocation handle types.
Definition: cuda_async_memory_resource.hpp:59
cuda_async_memory_resource(std::optional< std::size_t > initial_pool_size={}, std::optional< std::size_t > release_threshold={}, std::optional< allocation_handle_type > export_handle_type={})
Constructs a cuda_async_memory_resource with the optionally specified initial pool size and release threshold.
Definition: cuda_async_memory_resource.hpp:97
mempool_usage
Flags for specifying memory pool usage.
Definition: cuda_async_memory_resource.hpp:74
cudaMemPool_t pool_handle() const noexcept
Returns the underlying native handle to the CUDA pool.
Definition: cuda_async_memory_resource.hpp:158
Base class for all librmm device memory allocation.
Definition: device_memory_resource.hpp:92
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.
cuda_device_id get_current_cuda_device()
Returns a cuda_device_id for the current device.
static constexpr cuda_stream_view cuda_stream_default
Static cuda_stream_view of the default stream (stream 0), for convenience.
Definition: cuda_stream_view.hpp:122
constexpr value_type value() const noexcept
The wrapped integer value.
Definition: cuda_device.hpp:54