arena_memory_resource.hpp
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include <rmm/aligned.hpp>
8 #include <rmm/detail/error.hpp>
9 #include <rmm/detail/export.hpp>
10 #include <rmm/detail/format.hpp>
11 #include <rmm/detail/logging_assert.hpp>
12 #include <rmm/logger.hpp>
13 #include <rmm/mr/device/detail/arena.hpp>
14 #include <rmm/mr/device/device_memory_resource.hpp>
15 #include <rmm/resource_ref.hpp>
16 
17 #include <cuda_runtime_api.h>
18 
19 #include <cstddef>
20 #include <map>
21 #include <shared_mutex>
22 #include <thread>
23 
24 namespace RMM_NAMESPACE {
25 namespace mr {
71 template <typename Upstream>
72 class arena_memory_resource final : public device_memory_resource {
73  public:
82  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
83  std::optional<std::size_t> arena_size = std::nullopt,
84  bool dump_log_on_failure = false)
85  : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
86  {
87  if (dump_log_on_failure_) {
88  logger_ =
89  std::make_shared<rapids_logger::logger>("arena_memory_dump", "rmm_arena_memory_dump.log");
90  // Set the level to `debug` for more detailed output.
91  logger_->set_level(rapids_logger::level_enum::info);
92  }
93  }
94 
105  explicit arena_memory_resource(Upstream* upstream_mr,
106  std::optional<std::size_t> arena_size = std::nullopt,
107  bool dump_log_on_failure = false)
108  : arena_memory_resource{
109  to_device_async_resource_ref_checked(upstream_mr), arena_size, dump_log_on_failure}
110  {
111  }
112 
113  ~arena_memory_resource() override = default;
114 
115  // Disable copy (and move) semantics.
116  arena_memory_resource(arena_memory_resource const&) = delete;
117  arena_memory_resource& operator=(arena_memory_resource const&) = delete;
118  arena_memory_resource(arena_memory_resource&&) noexcept = delete;
119  arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;
120 
121  private:
122  using global_arena = rmm::mr::detail::arena::global_arena;
123  using arena = rmm::mr::detail::arena::arena;
124 
136  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
137  {
138  if (bytes <= 0) { return nullptr; }
139 #ifdef RMM_ARENA_USE_SIZE_CLASSES
140  bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
141 #else
142  bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
143 #endif
144  auto& arena = get_arena(stream);
145 
146  {
147  std::shared_lock lock(mtx_);
148  void* pointer = arena.allocate_sync(bytes);
149  if (pointer != nullptr) { return pointer; }
150  }
151 
152  {
153  std::unique_lock lock(mtx_);
154  defragment();
155  void* pointer = arena.allocate_sync(bytes);
156  if (pointer == nullptr) {
157  if (dump_log_on_failure_) { dump_memory_log(bytes); }
158  auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
159  rmm::detail::format_bytes(bytes) + "): No room in arena.";
160  RMM_FAIL(msg.c_str(), rmm::out_of_memory);
161  }
162  return pointer;
163  }
164  }
165 
169  void defragment()
170  {
171  RMM_CUDA_TRY(cudaDeviceSynchronize());
172  for (auto& thread_arena : thread_arenas_) {
173  thread_arena.second->clean();
174  }
175  for (auto& stream_arena : stream_arenas_) {
176  stream_arena.second.clean();
177  }
178  }
179 
188  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
189  {
190  if (ptr == nullptr || bytes <= 0) { return; }
191 #ifdef RMM_ARENA_USE_SIZE_CLASSES
192  bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
193 #else
194  bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
195 #endif
196  auto& arena = get_arena(stream);
197 
198  {
199  std::shared_lock lock(mtx_);
200  // If the memory being freed does not belong to the arena, the following will return false.
201  if (arena.deallocate(stream, ptr, bytes)) { return; }
202  }
203 
204  {
205  // Since we are returning this memory to another stream, we need to make sure the current
206  // stream is caught up.
207  stream.synchronize_no_throw();
208 
209  std::unique_lock lock(mtx_);
210  deallocate_from_other_arena(stream, ptr, bytes);
211  }
212  }
213 
222  void deallocate_from_other_arena(cuda_stream_view stream, void* ptr, std::size_t bytes)
223  {
224  if (use_per_thread_arena(stream)) {
225  for (auto const& thread_arena : thread_arenas_) {
226  if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
227  }
228  } else {
229  for (auto& stream_arena : stream_arenas_) {
230  if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
231  }
232  }
233 
234  if (!global_arena_.deallocate_sync(ptr, bytes)) {
235  // It's possible to use per thread default streams along with another pool of streams.
236  // This means that it's possible for an allocation to move from a thread or stream arena
237  // back into the global arena during a defragmentation and then move down into another arena
238  // type. For instance, thread arena -> global arena -> stream arena. If this happens and
239  // there was an allocation from it while it was a thread arena, we now have to check to
240  // see if the allocation is part of a stream arena, and vice versa.
241  // Only do this in exceptional cases to not affect performance and have to check all
242  // arenas all the time.
243  if (use_per_thread_arena(stream)) {
244  for (auto& stream_arena : stream_arenas_) {
245  if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
246  }
247  } else {
248  for (auto const& thread_arena : thread_arenas_) {
249  if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
250  }
251  }
252  RMM_FAIL("allocation not found");
253  }
254  }
255 
262  arena& get_arena(cuda_stream_view stream)
263  {
264  if (use_per_thread_arena(stream)) { return get_thread_arena(); }
265  return get_stream_arena(stream);
266  }
267 
273  arena& get_thread_arena()
274  {
275  auto const thread_id = std::this_thread::get_id();
276  {
277  std::shared_lock lock(map_mtx_);
278  auto const iter = thread_arenas_.find(thread_id);
279  if (iter != thread_arenas_.end()) { return *iter->second; }
280  }
281  {
282  std::unique_lock lock(map_mtx_);
283  auto thread_arena = std::make_shared<arena>(global_arena_);
284  thread_arenas_.emplace(thread_id, thread_arena);
285  thread_local detail::arena::arena_cleaner cleaner{thread_arena};
286  return *thread_arena;
287  }
288  }
289 
295  arena& get_stream_arena(cuda_stream_view stream)
296  {
297  RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
298  {
299  std::shared_lock lock(map_mtx_);
300  auto const iter = stream_arenas_.find(stream.value());
301  if (iter != stream_arenas_.end()) { return iter->second; }
302  }
303  {
304  std::unique_lock lock(map_mtx_);
305  stream_arenas_.emplace(stream.value(), global_arena_);
306  return stream_arenas_.at(stream.value());
307  }
308  }
309 
315  void dump_memory_log(size_t bytes)
316  {
317  logger_->info("**************************************************");
318  logger_->info("Ran out of memory trying to allocate %s.", rmm::detail::format_bytes(bytes));
319  logger_->info("**************************************************");
320  logger_->info("Global arena:");
321  global_arena_.dump_memory_log(logger_);
322  logger_->flush();
323  }
324 
331  static bool use_per_thread_arena(cuda_stream_view stream)
332  {
333  return stream.is_per_thread_default();
334  }
335 
337  global_arena global_arena_;
340  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
343  std::map<cudaStream_t, arena> stream_arenas_;
345  bool dump_log_on_failure_{};
347  std::shared_ptr<rapids_logger::logger> logger_{};
349  mutable std::shared_mutex map_mtx_;
351  mutable std::shared_mutex mtx_;
352 };
353 /** @} */ // end of group
355 } // namespace mr
356 } // namespace RMM_NAMESPACE
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:28
A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
Definition: arena_memory_resource.hpp:72
arena_memory_resource(Upstream *upstream_mr, std::optional< std::size_t > arena_size=std::nullopt, bool dump_log_on_failure=false)
Construct an arena_memory_resource.
Definition: arena_memory_resource.hpp:105
arena_memory_resource(device_async_resource_ref upstream_mr, std::optional< std::size_t > arena_size=std::nullopt, bool dump_log_on_failure=false)
Construct an arena_memory_resource.
Definition: arena_memory_resource.hpp:82
Base class for all librmm device memory allocation.
Definition: device_memory_resource.hpp:83
Exception thrown when RMM runs out of memory.
Definition: error.hpp:76
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)
Convert pointer to memory resource into device_async_resource_ref, checking for nullptr
Definition: resource_ref.hpp:72
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
Alias for a cuda::mr::async_resource_ref with the property cuda::mr::device_accessible.
Definition: resource_ref.hpp:32
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
Default alignment used for CUDA memory allocation.
Definition: aligned.hpp:31
std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
Align up to nearest multiple of specified power of 2.