#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime_api.h>

#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>
namespace RMM_NAMESPACE {
namespace mr {
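/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 *
 * Allocations are served from per-thread or per-stream arenas, which draw superblocks from a
 * shared global arena backed by the upstream memory resource.
 *
 * @tparam Upstream Memory resource to use for allocating the global arena.
 */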
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /// Construct an arena_memory_resource whose global arena allocates from `upstream_mr`.
  explicit arena_memory_resource(device_async_resource_ref upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = std::make_shared<rapids_logger::logger>("arena_memory_dump",
                                                        "rmm_arena_memory_dump.log");
      logger_->set_level(rapids_logger::level_enum::info);
    }
  }
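  /// Construct an arena_memory_resource from an upstream resource pointer, which must not be null.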
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : arena_memory_resource{
        to_device_async_resource_ref_checked(upstream_mr), arena_size, dump_log_on_failure}
  {}
 private:
  using global_arena = rmm::mr::detail::arena::global_arena;
  using arena        = rmm::mr::detail::arena::arena;
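  /**
   * @brief Allocate memory of size at least `bytes`.
   *
   * Tries the thread or stream arena under a shared lock first; on failure, takes a unique lock,
   * defragments, and retries before reporting out-of-memory.
   */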
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate_sync(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate_sync(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
                         rmm::detail::format_bytes(bytes) + "): No room in arena.";
        RMM_FAIL(msg.c_str(), rmm::out_of_memory);
      }
      return pointer;
    }
  }
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }
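  /**
   * @brief Deallocate memory pointed to by `ptr`.
   *
   * Fast path: free back into the owning arena under a shared lock. Otherwise synchronize the
   * stream and hand the block to whichever arena owns it under a unique lock.
   */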
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::align_up(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // Returns false if the memory was not allocated from this arena.
      if (arena.deallocate(stream, ptr, bytes)) { return; }
    }

    {
      // The block may belong to another arena; catch the current stream up before handing it over.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(stream, ptr, bytes);
    }
  }
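  /// Deallocate memory that was allocated from an arena other than the current one, falling back
  /// to the global arena and, finally, the other kind of arena before failing.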
  void deallocate_from_other_arena(cuda_stream_view stream, void* ptr, std::size_t bytes)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate_sync(ptr, bytes)) {
      // Per-thread default streams may be used alongside a pool of other streams, so also search
      // the kind of arena that was not tried above.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }
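  /// Get (or lazily create) the arena associated with the current thread.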
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner cleaner{thread_arena};
      return *thread_arena;
    }
  }
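  /// Get (or lazily create) the arena associated with the given non-default stream.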
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }
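  /// Write the global arena's memory usage to the log; called when an allocation fails and
  /// `dump_log_on_failure` is set.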
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate %s.", rmm::detail::format_bytes(bytes));
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
  }
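  /// Should the given stream be served from a per-thread arena?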
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }
  global_arena global_arena_;  ///< The global arena shared by all thread/stream arenas.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;  ///< Per-thread arenas.
  std::map<cudaStream_t, arena> stream_arenas_;  ///< Per-stream arenas.
  bool dump_log_on_failure_{};  ///< Dump a memory log when an allocation fails.
  std::shared_ptr<rapids_logger::logger> logger_{};  ///< Logger used for the memory dump.
  mutable std::shared_mutex map_mtx_;  ///< Mutex guarding the arena maps.
  mutable std::shared_mutex mtx_;  ///< Mutex for allocation and deallocation.
};

}  // namespace mr
}  // namespace RMM_NAMESPACE
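// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of this header): builds an
// arena_memory_resource on top of a cuda_memory_resource and performs one
// stream-ordered allocation. The 1 MiB size and variable names are arbitrary
// choices for the example; omitting arena_size lets the resource pick its
// default global arena size.
// ---------------------------------------------------------------------------
// #include <rmm/cuda_stream_view.hpp>
// #include <rmm/mr/device/arena_memory_resource.hpp>
// #include <rmm/mr/device/cuda_memory_resource.hpp>
//
// int main()
// {
//   rmm::mr::cuda_memory_resource upstream{};  // upstream backing the global arena
//   rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};
//
//   auto stream               = rmm::cuda_stream_default;
//   constexpr std::size_t len = std::size_t{1} << 20;  // 1 MiB
//   void* ptr                 = mr.allocate(len, stream);
//   mr.deallocate(ptr, len, stream);
//   return 0;
// }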