#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>
namespace RMM_NAMESPACE {
namespace mr {
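/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 *
 * Allocations are served from per-thread or per-stream arenas, which draw superblocks from a
 * shared global arena backed by the upstream memory resource.
 *
 * @tparam Upstream Memory resource to use for allocating the global arena.
 */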
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /// Construct an arena_memory_resource whose global arena allocates from `upstream_mr`.
  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = std::make_shared<rapids_logger::logger>("arena_memory_dump",
                                                        "rmm_arena_memory_dump.log");
      logger_->set_level(rapids_logger::level_enum::info);
    }
  }
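  /// Construct an arena_memory_resource from an upstream resource pointer, which must not be null.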
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : arena_memory_resource{
        to_device_async_resource_ref_checked(upstream_mr), arena_size, dump_log_on_failure}
  {}
 private:
  using global_arena = rmm::mr::detail::arena::global_arena;
  using arena        = rmm::mr::detail::arena::arena;
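  /**
   * @brief Allocate memory of size at least `bytes`.
   *
   * Tries the thread or stream arena under a shared lock first; on failure, takes a unique lock,
   * defragments, and retries before reporting out-of-memory.
   */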
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate_sync(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate_sync(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
                         rmm::detail::format_bytes(bytes) + "): No room in arena.";
        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
      }
      return pointer;
    }
  }
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }
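  /**
   * @brief Deallocate memory pointed to by `ptr`.
   *
   * Fast path: free back into the owning arena under a shared lock. Otherwise synchronize the
   * stream and hand the block to whichever arena owns it under a unique lock.
   */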
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // Returns false if the memory was not allocated from this arena.
      if (arena.deallocate(stream, ptr, bytes)) { return; }
    }

    {
      // The block may belong to another arena; catch the current stream up before handing it over.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(stream, ptr, bytes);
    }
  }
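  /// Deallocate memory that was allocated from an arena other than the current one, falling back
  /// to the global arena and, finally, the other kind of arena before failing.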
  void deallocate_from_other_arena(cuda_stream_view stream, void* ptr, std::size_t bytes)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate_sync(ptr, bytes)) {
      // Per-thread default streams may be used alongside a pool of other streams, so also search
      // the kind of arena that was not tried above.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }
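  /// Get (or lazily create) the arena associated with the current thread.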
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner cleaner{thread_arena};
      return *thread_arena;
    }
  }
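  /// Get (or lazily create) the arena associated with the given non-default stream.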
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }
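  /// Write the global arena's memory usage to the log; called when an allocation fails and
  /// `dump_log_on_failure` is set.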
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate %s.", rmm::detail::format_bytes(bytes));
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
  }
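  /// Should the given stream be served from a per-thread arena?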
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }
  global_arena global_arena_;  ///< The global arena shared by all thread/stream arenas.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;  ///< Per-thread arenas.
  std::map<cudaStream_t, arena> stream_arenas_;  ///< Per-stream arenas.
  bool dump_log_on_failure_{};  ///< Dump a memory log when an allocation fails.
  std::shared_ptr<rapids_logger::logger> logger_{};  ///< Logger used for the memory dump.
  mutable std::shared_mutex map_mtx_;  ///< Mutex guarding the arena maps.
  mutable std::shared_mutex mtx_;  ///< Mutex for allocation and deallocation.
};

}  // namespace mr
}  // namespace RMM_NAMESPACE
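// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of this header): builds an
// arena_memory_resource on top of a cuda_memory_resource and performs one
// stream-ordered allocation. The 1 MiB size and variable names are arbitrary
// choices for the example; omitting arena_size lets the resource pick its
// default global arena size.
// ---------------------------------------------------------------------------
// #include <rmm/cuda_stream_view.hpp>
// #include <rmm/mr/device/arena_memory_resource.hpp>
// #include <rmm/mr/device/cuda_memory_resource.hpp>
//
// int main()
// {
//   rmm::mr::cuda_memory_resource upstream{};  // upstream backing the global arena
//   rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};
//
//   auto stream               = rmm::cuda_stream_default;
//   constexpr std::size_t len = std::size_t{1} << 20;  // 1 MiB
//   void* ptr                 = mr.allocate(len, stream);
//   mr.deallocate(ptr, len, stream);
//   return 0;
// }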