arena_memory_resource.hpp
/*
 * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <rmm/detail/error.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <spdlog/common.h>
#include <spdlog/fmt/ostr.h>

#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>

namespace rmm::mr {

/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 *
 * Allocations are served from per-thread arenas (when the per-thread default stream is used) or
 * per-stream arenas, all of which draw memory from a global arena backed by the upstream resource.
 *
 * @tparam Upstream Memory resource to use for allocating memory for the global arena.
 */
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /**
   * @brief Construct an `arena_memory_resource`.
   *
   * @param upstream_mr The memory resource from which to allocate memory for the global arena.
   * @param arena_size Optional size in bytes of the global arena; if unset, a default size is
   * chosen by the global arena.
   * @param dump_log_on_failure If true, dump memory log when running out of memory.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
      // Set the level to `debug` for more detailed output.
      logger_->set_level(spdlog::level::info);
    }
  }

  ~arena_memory_resource() override = default;

  // Disable copy (and move) semantics.
  arena_memory_resource(arena_memory_resource const&)            = delete;
  arena_memory_resource& operator=(arena_memory_resource const&) = delete;
  arena_memory_resource(arena_memory_resource&&) noexcept        = delete;
  arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;

  /**
   * @brief Queries whether the resource supports use of non-null CUDA streams for
   * allocation/deallocation.
   *
   * @returns true.
   */
  bool supports_streams() const noexcept override { return true; }

  /**
   * @brief Query whether the resource supports the get_mem_info API.
   *
   * @returns false.
   */
  bool supports_get_mem_info() const noexcept override { return false; }

 private:
  using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
  using arena        = rmm::mr::detail::arena::arena<Upstream>;

  /**
   * @brief Allocates memory of size at least `bytes`.
   *
   * @throws `rmm::out_of_memory` if no more memory is available for the requested size.
   *
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this allocation with.
   * @return void* Pointer to the newly allocated memory.
   */
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      // Fast path: allocate from this thread's or stream's arena under a shared lock.
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      // Slow path: take an exclusive lock, defragment all arenas, and retry once.
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
      }
      return pointer;
    }
  }

  /**
   * @brief Defragment memory by returning free memory in all arenas to the global arena.
   */
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }

  /**
   * @brief Deallocate memory pointed to by `ptr`, of size `bytes`.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this deallocation with.
   */
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to the arena, the following will return false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }

    {
      // Since we are returning this memory to another stream, we need to make sure the current
      // stream is caught up.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }

  /**
   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this deallocation with.
   */
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); }
  }

  /**
   * @brief Get the arena associated with the current thread or the given stream.
   *
   * @param stream The stream associated with the arena.
   * @return arena& The arena associated with the current thread or the given stream.
   */
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }

  /**
   * @brief Get the arena associated with the current thread.
   *
   * @return arena& The arena associated with the current thread.
   */
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      // Fast path: an arena for this thread already exists.
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      // Create a new arena for this thread; the thread-local cleaner cleans it up when the
      // thread exits.
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
      return *thread_arena;
    }
  }

  /**
   * @brief Get the arena associated with the given stream.
   *
   * @param stream The stream associated with the arena.
   * @return arena& The arena associated with the given stream.
   */
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }

  /**
   * @brief Get free and available memory for the memory resource.
   *
   * This API is not supported by this resource, so the result is always `{0, 0}`.
   *
   * @param stream The stream whose memory manager to return the mem info for.
   * @return std::pair containing free and total memory sizes.
   */
  std::pair<std::size_t, std::size_t> do_get_mem_info(cuda_stream_view stream) const override
  {
    return std::make_pair(0, 0);
  }

  /**
   * @brief Dump memory to log.
   *
   * @param bytes The size of the allocation that failed.
   */
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate {}.", rmm::detail::bytes{bytes});
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
    logger_->flush();
  }

  /**
   * @brief Should a per-thread arena be used given the CUDA stream.
   *
   * @param stream The stream to check.
   * @return true if a per-thread arena should be used, false otherwise.
   */
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }

  /// The global arena from which arenas draw their memory.
  global_arena global_arena_;
  /// Arenas for per-thread allocations.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
  /// Arenas for per-stream allocations.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to log on allocation failure.
  bool dump_log_on_failure_{};
  /// The logger for the memory dump.
  std::shared_ptr<spdlog::logger> logger_{};
  /// Mutex for creating and looking up per-thread and per-stream arenas.
  mutable std::shared_mutex map_mtx_;
  /// Mutex taken shared for allocation/deallocation and exclusive for defragmentation.
  mutable std::shared_mutex mtx_;
};

}  // namespace rmm::mr
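
A minimal usage sketch, not part of the header above: it assumes the upstream is `rmm::mr::cuda_memory_resource` and relies on other RMM public headers (`per_device_resource.hpp`, `device_buffer.hpp`, `cuda_stream_view.hpp`); sizes and options are arbitrary.

// Usage sketch for arena_memory_resource (assumes standard RMM public headers).
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Upstream resource that the global arena draws from.
  rmm::mr::cuda_memory_resource cuda_mr;

  // Arena resource with the default arena size; pass an explicit size or
  // dump_log_on_failure = true if desired.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> arena_mr{&cuda_mr};

  // Route subsequent RMM allocations on this device through the arena resource.
  rmm::mr::set_current_device_resource(&arena_mr);

  // Containers can allocate through the arena resource directly.
  rmm::device_buffer buf{1024, rmm::cuda_stream_default, &arena_mr};

  // Direct allocation/deallocation through the device_memory_resource interface also works;
  // the per-thread default stream would select a per-thread arena instead of a per-stream one.
  void* ptr = arena_mr.allocate(2048, rmm::cuda_stream_default);
  arena_mr.deallocate(ptr, 2048, rmm::cuda_stream_default);
  return 0;
}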