arena_memory_resource.hpp

/*
 * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

#include <spdlog/common.h>
#include <spdlog/sinks/basic_file_sink.h>

#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>

namespace RMM_NAMESPACE {
namespace mr {

/// A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /**
   * @brief Construct an `arena_memory_resource`.
   *
   * @param upstream_mr The memory resource from which to allocate blocks for the global arena.
   * @param arena_size Size in bytes of the global arena; a default size is used if not specified.
   * @param dump_log_on_failure If true, dump memory information to a log file when an allocation
   * fails.
   */
  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ =
        std::make_shared<spdlog::logger>("arena_memory_dump",
                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
                                           "rmm_arena_memory_dump.log", true /*truncate file*/));
      // Set the level to `debug` for more detailed output.
      logger_->set_level(spdlog::level::info);
    }
  }

  /**
   * @brief Construct an `arena_memory_resource` from a pointer to an upstream resource.
   *
   * @param upstream_mr Pointer to the memory resource from which to allocate blocks for the
   * global arena; must not be null.
   * @param arena_size Size in bytes of the global arena; a default size is used if not specified.
   * @param dump_log_on_failure If true, dump memory information to a log file when an allocation
   * fails.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure = false)
    : global_arena_{to_device_async_resource_ref_checked(upstream_mr), arena_size},
      dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ =
        std::make_shared<spdlog::logger>("arena_memory_dump",
                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
                                           "rmm_arena_memory_dump.log", true /*truncate file*/));
      // Set the level to `debug` for more detailed output.
      logger_->set_level(spdlog::level::info);
    }
  }

  ~arena_memory_resource() override = default;

  // Disable copy (and move) semantics.
  arena_memory_resource(arena_memory_resource const&) = delete;
  arena_memory_resource& operator=(arena_memory_resource const&) = delete;
  arena_memory_resource(arena_memory_resource&&) noexcept = delete;
  arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;

 private:
  using global_arena = rmm::mr::detail::arena::global_arena;
  using arena        = rmm::mr::detail::arena::arena;

  /**
   * @brief Allocates memory of size at least `bytes`.
   *
   * @throws rmm::out_of_memory if no more memory is available for the requested size.
   *
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this allocation with.
   * @return Pointer to the newly allocated memory.
   */
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
      }
      return pointer;
    }
  }

  /// Defragment memory by synchronizing the device and cleaning up all thread and stream arenas.
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }

  /**
   * @brief Deallocate memory pointed to by `ptr`.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation.
   * @param stream Stream on which to perform the deallocation.
   */
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to the arena, the following will return false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }

    {
      // Since we are returning this memory to another stream, we need to make sure the current
      // stream is caught up.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }

  /**
   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation.
   * @param stream Stream on which to perform the deallocation.
   */
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) {
      // It's possible to use per-thread default streams along with another pool of streams.
      // This means an allocation can move from a thread or stream arena back into the global
      // arena during a defragmentation, and then move down into another arena type. For
      // instance, thread arena -> global arena -> stream arena. If this happens and there was
      // an allocation from it while it was a thread arena, we now have to check whether the
      // allocation is part of a stream arena, and vice versa.
      // Only do this in exceptional cases, to avoid the performance cost of checking all arenas
      // all the time.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }

  /// Get the arena associated with the current thread or the given stream.
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }

  /// Get the arena associated with the current thread, creating it if necessary.
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner cleaner{thread_arena};
      return *thread_arena;
    }
  }

  /// Get the arena associated with the given (non-default) stream, creating it if necessary.
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }

  /// Dump memory information to the log after a failed allocation of `bytes` bytes.
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info(rmm::detail::formatted_log("Ran out of memory trying to allocate %s.",
                                             rmm::detail::format_bytes(bytes)));
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
    logger_->flush();
  }

  /// Whether a per-thread arena should be used for the given stream (i.e. the stream is a
  /// per-thread default stream).
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }

  /// The global arena to allocate superblocks from.
  global_arena global_arena_;
  /// Arenas for per-thread default streams, one per thread.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
  /// Arenas for explicitly created streams.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to the log on allocation failure.
  bool dump_log_on_failure_{};
  /// The logger used for the memory dump.
  std::shared_ptr<spdlog::logger> logger_{};
  /// Mutex protecting the arena maps.
  mutable std::shared_mutex map_mtx_;
  /// Mutex synchronizing allocation, deallocation, and defragmentation.
  mutable std::shared_mutex mtx_;
};
// end of group

}  // namespace mr
}  // namespace RMM_NAMESPACE
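
Example usage (not part of the header): a minimal sketch showing how this resource might be constructed over rmm::mr::cuda_memory_resource as the upstream resource and used to allocate on a stream. The names `upstream`, `mr`, `stream`, `ptr`, and `size` are illustrative only, and the snippet assumes a CUDA-capable device.

#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <cstddef>

int main()
{
  // Upstream resource from which the arena obtains its memory.
  rmm::mr::cuda_memory_resource upstream;

  // Arena resource using the default arena size and no memory-dump log on failure.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  // Allocate and free 1 MiB on a non-default stream.
  rmm::cuda_stream stream;
  std::size_t const size = std::size_t{1} << 20;
  void* ptr = mr.allocate(size, stream.view());
  mr.deallocate(ptr, size, stream.view());

  stream.synchronize();
  return 0;
}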