arena_memory_resource.hpp
/*
 * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <map>
#include <shared_mutex>
#include <thread>

namespace RMM_NAMESPACE {
namespace mr {
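/**
 * @addtogroup device_memory_resources
 * @{
 * @file
 */

/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 *
 * Allocation (`do_allocate()`) and deallocation (`do_deallocate()`) are thread-safe, and the
 * resource is compatible with CUDA per-thread default streams.
 *
 * GPU memory is divided into a global arena, per-thread arenas for the per-thread default stream,
 * and per-stream arenas for other streams. Each arena obtains memory from the global arena in
 * large chunks (superblocks) and suballocates from them, which keeps lock contention low and
 * limits fragmentation.
 *
 * @tparam Upstream Memory resource to use for allocating memory for the global arena.
 */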
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
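  /**
   * @brief Construct an `arena_memory_resource`.
   *
   * @param upstream_mr The memory resource from which to allocate memory for the global arena.
   * @param arena_size Size in bytes of the global arena; when not specified, defaults to half of
   * the available device memory.
   * @param dump_log_on_failure If true, dump a memory log when an allocation fails.
   */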
  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ =
        std::make_shared<rapids_logger::logger>("arena_memory_dump", "rmm_arena_memory_dump.log");
      // Set the level to `debug` for more detailed output.
      logger_->set_level(rapids_logger::level_enum::info);
    }
  }

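  /**
   * @brief Construct an `arena_memory_resource` from a pointer to an upstream resource.
   *
   * @throws rmm::logic_error if `upstream_mr` is null.
   */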
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : arena_memory_resource{
        to_device_async_resource_ref_checked(upstream_mr), arena_size, dump_log_on_failure}
  {
  }

  ~arena_memory_resource() override = default;

  // Disable copy (and move) semantics.
  arena_memory_resource(arena_memory_resource const&)            = delete;
  arena_memory_resource& operator=(arena_memory_resource const&) = delete;
  arena_memory_resource(arena_memory_resource&&) noexcept        = delete;
  arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;

 private:
  using global_arena = rmm::mr::detail::arena::global_arena;
  using arena        = rmm::mr::detail::arena::arena;

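  /**
   * @brief Allocate memory of size at least `bytes` on `stream`.
   *
   * The request is rounded up to `CUDA_ALLOCATION_ALIGNMENT` (or to a size class when
   * `RMM_ARENA_USE_SIZE_CLASSES` is defined). The fast path takes a shared lock and allocates
   * from this thread's or stream's arena; if that fails, a unique lock is taken, all arenas are
   * defragmented, and the allocation is retried.
   *
   * @throws rmm::out_of_memory if the retry also fails.
   */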
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
                         rmm::detail::format_bytes(bytes) + "): No room in arena.";
        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
      }
      return pointer;
    }
  }

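  /**
   * @brief Defragment memory: synchronize the device, then have every per-thread and per-stream
   * arena return its unused memory to the global arena.
   */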
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }

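  /**
   * @brief Deallocate memory pointed to by `ptr` of size `bytes` on `stream`.
   *
   * The size is rounded up exactly as in `do_allocate` so the lookup matches the original
   * allocation. If the memory does not belong to this thread's or stream's arena, the stream is
   * synchronized and the other arenas (and finally the global arena) are searched.
   */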
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to the arena, the following will return false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }

    {
      // Since we are returning this memory to another stream, we need to make sure the current
      // stream is caught up.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }

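  /**
   * @brief Deallocate memory that was allocated from an arena other than the one associated with
   * `stream`, falling back to the global arena and then to the remaining arenas.
   *
   * @throws rmm::logic_error if the allocation is not found in any arena.
   */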
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) {
      // It's possible to use per-thread default streams alongside another pool of streams.
      // During defragmentation, memory can move from a thread or stream arena back into the
      // global arena and then down into the other kind of arena (for instance, thread arena ->
      // global arena -> stream arena). An allocation made while that memory belonged to a thread
      // arena may therefore now live in a stream arena, and vice versa, so we have to check the
      // other kind of arena as well. Only do this in exceptional cases so we don't pay the cost
      // of checking every arena on every deallocation.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }

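  /**
   * @brief Get the arena associated with the current thread (when `stream` is the per-thread
   * default stream) or with `stream` otherwise.
   */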
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }

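  /**
   * @brief Get the arena associated with the current thread, creating it on first use.
   */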
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner cleaner{thread_arena};
      return *thread_arena;
    }
  }

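  /**
   * @brief Get the arena associated with `stream`, creating it on first use. Not used for the
   * per-thread default stream, which is served by a per-thread arena instead.
   */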
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }

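  /**
   * @brief Write a detailed dump of the global arena's state to the log after failing to
   * allocate `bytes` bytes.
   */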
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate %s.", rmm::detail::format_bytes(bytes));
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
    logger_->flush();
  }

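  /**
   * @brief Whether `stream` is the CUDA per-thread default stream and should therefore use a
   * per-thread arena rather than a per-stream arena.
   */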
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }

  /// The global arena to allocate superblocks from.
  global_arena global_arena_;
  /// Arenas for per-thread allocations.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
  /// Arenas for per-stream allocations.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to the log on allocation failure.
  bool dump_log_on_failure_{};
  /// The logger for the memory dump.
  std::shared_ptr<rapids_logger::logger> logger_{};
  /// Mutex for read/write access to the arena maps.
  mutable std::shared_mutex map_mtx_;
  /// Mutex for shared and unique locks on the arenas.
  mutable std::shared_mutex mtx_;
};
/** @} */  // end of group

}  // namespace mr
}  // namespace RMM_NAMESPACE
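A minimal usage sketch (illustrative, not part of the header above): it layers an arena_memory_resource over a cuda_memory_resource, optionally installs it as the current device resource, and allocates on the default stream. The 1 GiB arena size is an arbitrary example value; passing std::nullopt would use the default arena size instead.

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstddef>

int main()
{
  rmm::mr::cuda_memory_resource upstream;  // upstream device memory resource

  // Carve a 1 GiB global arena out of the upstream resource.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> arena_mr{
    &upstream, std::size_t{1} << 30, /*dump_log_on_failure=*/false};

  // Optionally route all device allocations on the current device through the arena.
  rmm::mr::set_current_device_resource(&arena_mr);

  // Allocate and deallocate directly on the default stream.
  auto stream = rmm::cuda_stream_view{};
  void* ptr   = arena_mr.allocate(4096, stream);
  arena_mr.deallocate(ptr, 4096, stream);
  return 0;
}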