#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>

#include <cuda_runtime_api.h>

#include <spdlog/common.h>

#include <shared_mutex>

namespace RMM_NAMESPACE {
namespace mr {

/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 */
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /**
   * @brief Construct an `arena_memory_resource`.
   */
  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ =
        std::make_shared<spdlog::logger>("arena_memory_dump",
                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
                                           "rmm_arena_memory_dump.log", true /*truncate*/));
      logger_->set_level(spdlog::level::info);
    }
  }
  /**
   * @brief Construct an `arena_memory_resource` from a raw upstream resource pointer.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{to_device_async_resource_ref_checked(upstream_mr), arena_size},
      dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ =
        std::make_shared<spdlog::logger>("arena_memory_dump",
                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
                                           "rmm_arena_memory_dump.log", true /*truncate*/));
      logger_->set_level(spdlog::level::info);
    }
  }
 private:
  using global_arena = rmm::mr::detail::arena::global_arena;
  using arena        = rmm::mr::detail::arena::arena;
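  /**
   * @brief Allocate memory of size at least `bytes`.
   *
   * Fast path: look up the arena for `stream` and try to allocate from it under a shared lock.
   * Slow path: take a unique lock, return free superblocks to the global arena, and retry; if
   * that still fails, optionally dump the memory log and throw `rmm::out_of_memory`.
   */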
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    // Round up to the nearest size class.
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
                         rmm::detail::format_bytes(bytes) + "): ";
        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
      }
      return pointer;
    }
  }
  /// Defragment memory by returning all free superblocks to the global arena.
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }
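  /**
   * @brief Deallocate memory pointed to by `ptr`.
   *
   * First try the arena associated with `stream` under a shared lock. If the allocation did not
   * originate there, synchronize the stream and search the other arenas under a unique lock.
   */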
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to this arena, the following returns false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }

    {
      // Since the memory is returned to another arena, make sure the current stream is caught up.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }
  /// Deallocate memory pointed to by `ptr` that was allocated in a different arena.
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) {
      // Per-thread default streams can be used alongside another pool of streams, so an
      // allocation made with one kind of stream may be freed with the other. Check the opposite
      // set of arenas before giving up.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner cleaner{thread_arena};
      return *thread_arena;
    }
  }
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info(rmm::detail::formatted_log("Ran out of memory trying to allocate %s.",
                                             rmm::detail::format_bytes(bytes)));
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
  }
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }
  global_arena global_arena_;  ///< The global arena to allocate superblocks from.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;  ///< Arenas for per-thread allocations.
  std::map<cudaStream_t, arena> stream_arenas_;  ///< Arenas for non-default streams.
  bool dump_log_on_failure_{};                   ///< If true, dump the memory log on allocation failure.
  std::shared_ptr<spdlog::logger> logger_{};     ///< The logger for the memory dump.
  mutable std::shared_mutex map_mtx_;            ///< Mutex for the arena maps.
  mutable std::shared_mutex mtx_;                ///< Mutex for arena operations.
};

}  // namespace mr
}  // namespace RMM_NAMESPACE
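// ---------------------------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the header above). Shows one way to build an
// arena resource on top of rmm::mr::cuda_memory_resource and allocate through it; the 1 GiB
// initial arena size and the 256-byte allocation are arbitrary example values.
//
//   #include <rmm/cuda_stream_view.hpp>
//   #include <rmm/mr/device/arena_memory_resource.hpp>
//   #include <rmm/mr/device/cuda_memory_resource.hpp>
//
//   void example()
//   {
//     rmm::mr::cuda_memory_resource upstream{};
//     // Pass std::nullopt as arena_size to use the default arena size instead of 1 GiB.
//     rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{
//       &upstream, std::size_t{1} << 30, /*dump_log_on_failure=*/false};
//
//     auto stream = rmm::cuda_stream_default;
//     void* ptr   = mr.allocate(256, stream);
//     mr.deallocate(ptr, 256, stream);
//   }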