librmm/25.04/pool__memory__resource_8hpp_source

 /*

  * Copyright (c) 2020-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <rmm/aligned.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/format.hpp>

 #include <rmm/detail/logging_assert.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/logger.hpp>

 #include <rmm/mr/device/detail/coalescing_free_list.hpp>

 #include <rmm/mr/device/detail/stream_ordered_memory_resource.hpp>

 #include <rmm/mr/device/device_memory_resource.hpp>

 #include <rmm/mr/device/per_device_resource.hpp>

 #include <rmm/resource_ref.hpp>


 #include <cuda/std/type_traits>

 #include <cuda_runtime_api.h>

 #include <thrust/iterator/counting_iterator.h>

 #include <thrust/iterator/transform_iterator.h>


 #include <algorithm>

 #include <cstddef>

 #include <mutex>

 #include <optional>

 #include <set>


 namespace RMM_NAMESPACE {

 namespace mr {

 namespace detail {

 /**
  * @brief A helper class to remove a property (here used for `device_accessible`) from a pool
  * resource.
  *
  * The primary template is an empty struct, so deriving from it has no effect. It is selected
  * whenever the partial specialization below is disabled, i.e. when `Upstream` has `Property`.
  *
  * @tparam PoolResource The pool resource type that derives from this helper
  * @tparam Upstream The upstream resource type whose properties are inspected
  * @tparam Property The property to remove when `Upstream` does not provide it
  */
 template <class PoolResource, class Upstream, class Property, class = void>

 struct maybe_remove_property {};


 /**
  * @brief Specialization selected when `cuda::has_property<Upstream, Property>` is false.
  *
  * Declares a deleted friend `get_property(const PoolResource&, Property)` overload for the pool
  * resource type.
  */
 template <class PoolResource, class Upstream, class Property>

 struct maybe_remove_property<PoolResource,

                               Upstream,

                               Property,

                               cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {

 #if defined(__GNUC__) && !defined(__clang__)  // GCC warns about compatibility

                                               // issues with pre ISO C++ code

 #pragma GCC diagnostic push

 #pragma GCC diagnostic ignored "-Wnon-template-friend"

 #endif  // __GNUC__ and not __clang__

   // Deleted on purpose: this overload exists only to remove `Property` from `PoolResource`.
   friend void get_property(const PoolResource&, Property) = delete;

 #if defined(__GNUC__) && !defined(__clang__)

 #pragma GCC diagnostic pop

 #endif  // __GNUC__ and not __clang__

 };

 }  // namespace detail


 /**
  * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from an
  * upstream memory_resource.
  *
  * The class derives from three bases:
  *  - `detail::maybe_remove_property<..., cuda::mr::device_accessible>`, which removes the
  *    `device_accessible` property when `Upstream` does not have it;
  *  - `detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
  *    detail::coalescing_free_list>`, which receives this class as its first template argument
  *    and is befriended below so it can reach the protected members;
  *  - `cuda::forward_property<pool_memory_resource<Upstream>, Upstream>`.
  *
  * @tparam Upstream Type of the upstream memory resource the pool is allocated from
  */
 template <typename Upstream>

 class pool_memory_resource final

   : public detail::

       maybe_remove_property<pool_memory_resource<Upstream>, Upstream, cuda::mr::device_accessible>,

     public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

                                                   detail::coalescing_free_list>,

     public cuda::forward_property<pool_memory_resource<Upstream>, Upstream> {

  public:

   // The stream-ordered base needs access to the protected members declared further down.
   friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

                                                       detail::coalescing_free_list>;


   /**
    * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
    * `upstream_mr`.
    *
    * Both sizes are validated with `RMM_EXPECTS` before the pool is initialized: each must be a
    * multiple of `rmm::CUDA_ALLOCATION_ALIGNMENT`.
    *
    * @param upstream_mr The memory resource from which to allocate blocks for the pool
    * @param initial_pool_size Size, in bytes, of the initial pool
    * @param maximum_pool_size Optional upper bound, in bytes, that the pool may grow to
    */
   explicit pool_memory_resource(device_async_resource_ref upstream_mr,
                                 std::size_t initial_pool_size,
                                 std::optional<std::size_t> maximum_pool_size = std::nullopt)
     : upstream_mr_{upstream_mr}
   {
     constexpr auto required_alignment = rmm::CUDA_ALLOCATION_ALIGNMENT;
     // An unset maximum is checked as 0, which always passes the alignment test.
     auto const maximum_to_check = maximum_pool_size.value_or(0);

     RMM_EXPECTS(rmm::is_aligned(initial_pool_size, required_alignment),
                 "Error, Initial pool size required to be a multiple of 256 bytes");
     RMM_EXPECTS(rmm::is_aligned(maximum_to_check, required_alignment),
                 "Error, Maximum pool size required to be a multiple of 256 bytes");

     initialize_pool(initial_pool_size, maximum_pool_size);
   }


   /**
    * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
    * `upstream_mr`.
    *
    * The pointer is converted with `to_device_async_resource_ref_checked`, which checks for
    * `nullptr`. Size validation and pool initialization are then performed by the constructor
    * taking a `device_async_resource_ref`, which applies the same checks in the same order.
    *
    * @param upstream_mr Pointer to the memory resource from which to allocate blocks for the pool
    * @param initial_pool_size Size, in bytes, of the initial pool
    * @param maximum_pool_size Optional upper bound, in bytes, that the pool may grow to
    */
   explicit pool_memory_resource(Upstream* upstream_mr,
                                 std::size_t initial_pool_size,
                                 std::optional<std::size_t> maximum_pool_size = std::nullopt)
     : pool_memory_resource(
         to_device_async_resource_ref_checked(upstream_mr), initial_pool_size, maximum_pool_size)
   {
   }


   /**
    * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
    * `upstream_mr`.
    *
    * Only participates in overload resolution when `Upstream2` satisfies
    * `cuda::mr::async_resource`. The address of `upstream_mr` is forwarded to the pointer-taking
    * constructor.
    *
    * @tparam Upstream2 Type of the upstream resource; defaults to `Upstream`
    * @param upstream_mr The memory resource from which to allocate blocks for the pool
    * @param initial_pool_size Size, in bytes, of the initial pool
    * @param maximum_pool_size Optional upper bound, in bytes, that the pool may grow to
    */
   template <typename Upstream2 = Upstream,
             cuda::std::enable_if_t<cuda::mr::async_resource<Upstream2>, int> = 0>
   explicit pool_memory_resource(Upstream2& upstream_mr,
                                 std::size_t initial_pool_size,
                                 std::optional<std::size_t> maximum_pool_size = std::nullopt)
     : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
   {
   }


   /**
    * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using the
    * upstream resource.
    */
   ~pool_memory_resource() override
   {
     release();
   }


   // The pool cannot be default-constructed, copied or moved: all five operations are deleted.
   pool_memory_resource()                                       = delete;

   pool_memory_resource(pool_memory_resource const&)            = delete;

   pool_memory_resource(pool_memory_resource&&)                 = delete;

   pool_memory_resource& operator=(pool_memory_resource const&) = delete;

   pool_memory_resource& operator=(pool_memory_resource&&)      = delete;


   /**
    * @brief Returns the `rmm::device_async_resource_ref` to the upstream resource.
    *
    * @return The upstream resource reference stored at construction
    */
   [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept
   {
     return this->upstream_mr_;
   }


   [[nodiscard]] std::size_t pool_size() const noexcept { return current_pool_size_; }


  protected:

   /// The free list implementation.
   using free_list  = detail::coalescing_free_list;

   /// The type of block returned by the free list.
   using block_type = free_list::block_type;

   /// Result type of `allocate_from_block`, brought in from the stream-ordered base class.
   using typename detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

                                                         detail::coalescing_free_list>::split_block;

   /// Type of lock used to synchronize access.
   using lock_guard = std::lock_guard<std::mutex>;


   /**
    * @brief Get the maximum size of allocations supported by this memory resource.
    *
    * This resource imposes no limit of its own, so the largest `std::size_t` value is reported.
    *
    * @return `std::numeric_limits<std::size_t>::max()`
    */
   [[nodiscard]] std::size_t get_maximum_allocation_size() const
   {
     using size_limits = std::numeric_limits<std::size_t>;
     return size_limits::max();
   }


   /**
    * @brief Try to expand the pool by allocating a block of at least `min_size` bytes from
    * upstream.
    *
    * Starting at `try_size`, each failed upstream allocation halves the attempted size (never
    * going below `min_size`). A failure at exactly `min_size` is reported as an error carrying
    * the upstream exception message. If `try_size` is already smaller than `min_size`, no
    * allocation is attempted and a "Not enough room to grow" error is reported instead.
    *
    * @param try_size The first size, in bytes, to request from upstream
    * @param min_size The smallest acceptable block size, in bytes
    * @param stream The stream on which the upstream allocation is made
    * @return The block obtained from upstream
    */
   block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
   {
     // Logs the failure, then raises `rmm::out_of_memory` through `RMM_FAIL`.
     auto const fail_with = [&](const char* reason) {
       RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
                     rmm::detail::format_stream(stream),
                     min_size,
                     reason);
       std::string what{"Maximum pool size exceeded (failed to allocate "};
       what += rmm::detail::format_bytes(min_size);
       what += std::string("): ");
       what += reason;
       RMM_FAIL(what.c_str(), rmm::out_of_memory);
     };

     // Back off by halving until an allocation succeeds or the minimum size itself fails.
     for (auto attempt = try_size; attempt >= min_size;
          attempt      = std::max(min_size, attempt / 2)) {
       try {
         auto block = block_from_upstream(attempt, stream);
         current_pool_size_ += block.size();
         return block;
       } catch (std::exception const& error) {
         // Larger attempts fail silently; only the final (minimum) attempt is fatal.
         if (attempt == min_size) { fail_with(error.what()); }
       }
     }

     // Reached only when the very first attempt was already below the requested minimum.
     auto const size_limit = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());
     std::string summary{"Not enough room to grow, current/max/try size = "};
     summary += rmm::detail::format_bytes(pool_size());
     summary += ", ";
     summary += rmm::detail::format_bytes(size_limit);
     summary += ", ";
     summary += rmm::detail::format_bytes(min_size);
     fail_with(summary.c_str());
     return {};
   }


   /**
    * @brief Allocate initial memory for the pool.
    *
    * Records `maximum_size` as the pool's upper bound, checks that `initial_size` does not exceed
    * it, and, when `initial_size` is non-zero, obtains one block of exactly that size from
    * upstream on `cuda_stream_legacy` and inserts it into the free list.
    *
    * @param initial_size The optional initial size for the pool, in bytes
    * @param maximum_size The optional maximum size for the pool, in bytes
    */
   void initialize_pool(std::size_t initial_size, std::optional<std::size_t> maximum_size)
   {
     // Start from an empty pool; try_to_expand accounts for any block it obtains.
     current_pool_size_ = 0;
     maximum_pool_size_ = maximum_size;

     auto const size_limit = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());
     RMM_EXPECTS(initial_size <= size_limit, "Initial pool size exceeds the maximum pool size!");

     if (initial_size == 0) { return; }

     auto const first_block = try_to_expand(initial_size, initial_size, cuda_stream_legacy);
     this->insert_block(first_block, cuda_stream_legacy);
   }


   /**
    * @brief Allocate space from upstream to supply the suballocation pool and return a
    * sufficiently sized block.
    *
    * The first size attempted comes from `size_to_grow`: with a maximum pool size set it aims for
    * half of the space still available under the limit, otherwise it aims to double the pool.
    * `try_to_expand` then backs off by halving on failure until it either succeeds or would drop
    * below the requested size.
    *
    * @param size The minimum size, in bytes, of the block to obtain
    * @param blocks The free list (not used by this implementation)
    * @param stream The stream on which the upstream allocation is made
    * @return A block of at least `size` bytes
    */
   block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
   {
     auto const first_attempt = size_to_grow(size);
     return try_to_expand(first_attempt, size, stream);
   }


   /**
    * @brief Given a minimum size, computes an appropriate size to grow the pool.
    *
    * Without a maximum pool size, the pool at least doubles: the result is the larger of `size`
    * and the current pool size.
    *
    * With a maximum pool size, the result is the larger of the aligned request and half of the
    * space still available under the limit, or 0 when the aligned request does not fit. Three
    * rules keep the configured maximum a hard bound:
    *  - a pool that has already reached (or passed) the limit yields 0 instead of letting the
    *    unsigned subtraction wrap around;
    *  - the remaining space is never rounded up, so a request is only accepted when it really
    *    fits under the limit, even if the current pool size is not alignment-sized;
    *  - the "half of remaining" step is rounded down to the allocation alignment, so growth
    *    computed here does not make the pool size unaligned.
    *
    * @param size The minimum number of bytes the pool must grow by
    * @return The number of bytes to try to allocate from upstream, or 0 if `size` cannot fit
    */
   [[nodiscard]] std::size_t size_to_grow(std::size_t size) const
   {
     if (!maximum_pool_size_.has_value()) { return std::max(size, pool_size()); }

     constexpr std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT;

     auto const size_limit   = maximum_pool_size_.value();
     auto const current_size = pool_size();
     // Guard the subtraction below: std::size_t arithmetic would otherwise wrap to a huge value.
     if (current_size >= size_limit) { return 0; }

     auto const remaining    = size_limit - current_size;
     auto const aligned_size = rmm::align_up(size, alignment);
     if (aligned_size > remaining) { return 0; }

     // Aim for half of what is left, rounded down so the resulting block stays aligned.
     auto const half_remaining         = remaining / 2;
     auto const aligned_half_remaining = half_remaining - (half_remaining % alignment);
     return std::max(aligned_size, aligned_half_remaining);
   }


   /**
    * @brief Allocate a block from upstream to expand the suballocation pool.
    *
    * A request for zero bytes returns a default-constructed block without contacting upstream.
    * Otherwise the new allocation is recorded in `upstream_blocks_` as a head block.
    *
    * @param size The size, in bytes, to allocate from the upstream resource
    * @param stream The stream on which the upstream allocation is made
    * @return The block recorded for the new upstream allocation
    */
   block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
   {
     RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);

     if (size != 0) {
       auto* const base = static_cast<char*>(get_upstream_resource().allocate_async(size, stream));
       auto const inserted = upstream_blocks_.emplace(base, size, true);
       return *inserted.first;
     }

     return {};
   }


   /**
    * @brief Splits `block` if necessary to return a pointer to memory of `size` bytes.
    *
    * The allocated part starts at the beginning of `block` and keeps its head flag. Whatever is
    * left over becomes the remainder, which is never a head block; when nothing is left over the
    * remainder is a default-constructed block.
    *
    * @param block The block to allocate from
    * @param size The size in bytes of the requested allocation
    * @return The allocated block together with the remainder
    */
   split_block allocate_from_block(block_type const& block, std::size_t size)
   {
     block_type const allocated{block.pointer(), size, block.is_head()};
 #ifdef RMM_POOL_TRACK_ALLOCATIONS
     allocated_blocks_.insert(allocated);
 #endif

     // The request consumes the block entirely: there is no remainder to hand back.
     if (!(block.size() > size)) {
       block_type const no_remainder{};
       return {allocated, no_remainder};
     }

     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
     block_type const remainder{block.pointer() + size, block.size() - size, false};
     return {allocated, remainder};
   }


   /**
    * @brief Finds, frees and returns the block associated with pointer `ptr`.
    *
    * With `RMM_POOL_TRACK_ALLOCATIONS` defined, the block is looked up in (and erased from)
    * `allocated_blocks_`, and its recorded size is checked against `size` rounded up to
    * `rmm::CUDA_ALLOCATION_ALIGNMENT`. A null `ptr` yields a default-constructed block.
    *
    * Without tracking, the block is rebuilt from `ptr` and `size`; it is flagged as a head block
    * when `ptr` matches an entry in `upstream_blocks_`.
    *
    * @param ptr The pointer to the memory to free
    * @param size The size of the memory to free, in bytes
    * @return The (now freed) block associated with `ptr`
    */
   block_type free_block(void* ptr, std::size_t size) noexcept
   {
 #ifdef RMM_POOL_TRACK_ALLOCATIONS
     if (ptr == nullptr) { return block_type{}; }
     auto const iter = allocated_blocks_.find(static_cast<char*>(ptr));
     RMM_LOGGING_ASSERT(iter != allocated_blocks_.end());

     auto block = *iter;
     // Use the same alignment constant as the rest of this class; the unqualified
     // `allocation_alignment` used here previously is not declared in this file.
     RMM_LOGGING_ASSERT(block.size() == rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT));
     allocated_blocks_.erase(iter);

     return block;
 #else
     auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));
     return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};
 #endif
   }


   void release()

   {

     lock_guard lock(this->get_mutex());


     for (auto block : upstream_blocks_) {

       get_upstream_resource().deallocate(block.pointer(), block.size());

     }

     upstream_blocks_.clear();

 #ifdef RMM_POOL_TRACK_ALLOCATIONS

     allocated_blocks_.clear();

 #endif


     current_pool_size_ = 0;

   }


 #ifdef RMM_DEBUG_PRINT
   /**
    * @brief Print debugging information about the pool to `std::cout`.
    *
    * Runs under the mutex returned by `get_mutex()`. Reports free/total device memory, every
    * upstream block with the upstream total, the tracked allocations (only when
    * `RMM_POOL_TRACK_ALLOCATIONS` is defined), and finally the free blocks.
    */
   void print()
   {
     lock_guard guard(this->get_mutex());

     auto const device_memory = rmm::available_device_memory();
     std::cout << "GPU free memory: " << device_memory.first << " total: " << device_memory.second
               << "\n";

     std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";
     std::size_t upstream_bytes{0};
     for (auto upstream_block : upstream_blocks_) {
       upstream_block.print();
       upstream_bytes += upstream_block.size();
     }
     std::cout << "total upstream: " << upstream_bytes << " B\n";

 #ifdef RMM_POOL_TRACK_ALLOCATIONS
     std::cout << "allocated_blocks: " << allocated_blocks_.size() << "\n";
     for (auto allocated_block : allocated_blocks_) {
       allocated_block.print();
     }
 #endif

     this->print_free_blocks();
   }
 #endif


   /**
    * @brief Get the largest available block size and total free size in the specified free list.
    *
    * This is intended only for debugging.
    *
    * @param blocks The free list from which to return the summary
    * @return A pair of the largest block size and the total size of all blocks, both in bytes;
    * `{0, 0}` for an empty list
    */
   std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)
   {
     std::size_t largest_block{0};
     std::size_t total_free{0};
     for (auto iter = blocks.cbegin(); iter != blocks.cend(); ++iter) {
       auto const& block = *iter;
       total_free += block.size();
       largest_block = std::max(largest_block, block.size());
     }
     return {largest_block, total_free};
   }


  private:

   // The "heap" to allocate the pool from

   device_async_resource_ref upstream_mr_;

   // Total bytes of all blocks obtained from upstream; updated by try_to_expand, reset by release.
   std::size_t current_pool_size_{};

   // Optional upper bound, in bytes, on the pool size; empty means no limit is configured.
   std::optional<std::size_t> maximum_pool_size_{};


 #ifdef RMM_POOL_TRACK_ALLOCATIONS

   // Blocks currently handed out to callers; only maintained when allocation tracking is enabled.
   std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;

 #endif


   // blocks allocated from upstream

   std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;

 };  // class pool_memory_resource

   // end of group

 }  // namespace mr

 }  // namespace RMM_NAMESPACE

aligned.hpp

rmm::cuda_stream_view
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:39

rmm::mr::pool_memory_resource
A coalescing best-fit suballocator which uses a pool of memory allocated from an upstream memory_resource.
Definition: pool_memory_resource.hpp:105

rmm::mr::pool_memory_resource::free_block
block_type free_block(void *ptr, std::size_t size) noexcept
Finds, frees and returns the block associated with pointer ptr.
Definition: pool_memory_resource.hpp:401

rmm::mr::pool_memory_resource::initialize_pool
void initialize_pool(std::size_t initial_size, std::optional< std::size_t > maximum_size)
Allocate initial memory for the pool.
Definition: pool_memory_resource.hpp:293

rmm::mr::pool_memory_resource::allocate_from_block
split_block allocate_from_block(block_type const &block, std::size_t size)
Splits block if necessary to return a pointer to memory of size bytes.
Definition: pool_memory_resource.hpp:379

rmm::mr::pool_memory_resource::get_upstream_resource
device_async_resource_ref get_upstream_resource() const noexcept
rmm::device_async_resource_ref to the upstream resource
Definition: pool_memory_resource.hpp:204

rmm::mr::pool_memory_resource::size_to_grow
std::size_t size_to_grow(std::size_t size) const
Given a minimum size, computes an appropriate size to grow the pool.
Definition: pool_memory_resource.hpp:339

rmm::mr::pool_memory_resource::block_type
free_list::block_type block_type
The type of block returned by the free list.
Definition: pool_memory_resource.hpp:220

rmm::mr::pool_memory_resource::free_list_summary
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
Get the largest available block size and total free size in the specified free list.
Definition: pool_memory_resource.hpp:479

rmm::mr::pool_memory_resource::get_maximum_allocation_size
std::size_t get_maximum_allocation_size() const
Get the maximum size of allocations supported by this memory resource.
Definition: pool_memory_resource.hpp:233

rmm::mr::pool_memory_resource::expand_pool
block_type expand_pool(std::size_t size, free_list &blocks, cuda_stream_view stream)
Allocate space from upstream to supply the suballocation pool and return a sufficiently sized block.
Definition: pool_memory_resource.hpp:317

rmm::mr::pool_memory_resource::pool_memory_resource
pool_memory_resource(Upstream2 &upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:182

rmm::mr::pool_memory_resource::release
void release()
Free all memory allocated from the upstream memory_resource.
Definition: pool_memory_resource.hpp:423

rmm::mr::pool_memory_resource::try_to_expand
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
Try to expand the pool by allocating a block of at least min_size bytes from upstream.
Definition: pool_memory_resource.hpp:253

rmm::mr::pool_memory_resource::lock_guard
std::lock_guard< std::mutex > lock_guard
Type of lock used to synchronize access.
Definition: pool_memory_resource.hpp:223

rmm::mr::pool_memory_resource::pool_size
std::size_t pool_size() const noexcept
Computes the size of the current pool.
Definition: pool_memory_resource.hpp:216

rmm::mr::pool_memory_resource::~pool_memory_resource
~pool_memory_resource() override
Destroy the pool_memory_resource and deallocate all memory it allocated using the upstream resource.
Definition: pool_memory_resource.hpp:193

rmm::mr::pool_memory_resource::free_list
detail::coalescing_free_list free_list
The free list implementation.
Definition: pool_memory_resource.hpp:219

rmm::mr::pool_memory_resource::pool_memory_resource
pool_memory_resource(Upstream *upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:152

rmm::mr::pool_memory_resource::block_from_upstream
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
Allocate a block from upstream to expand the suballocation pool.
Definition: pool_memory_resource.hpp:359

rmm::out_of_memory
Exception thrown when RMM runs out of memory.
Definition: error.hpp:87

cuda_stream_view.hpp

device_memory_resource.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.
Definition: cuda_device.hpp:123

rmm::cuda_stream_legacy
static const cuda_stream_view cuda_stream_legacy
Static cuda_stream_view of cudaStreamLegacy, for convenience.
Definition: cuda_stream_view.hpp:131

rmm::device_async_resource_ref
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
Alias for a cuda::mr::async_resource_ref with the property cuda::mr::device_accessible.
Definition: resource_ref.hpp:41

rmm::to_device_async_resource_ref_checked
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)
Convert pointer to memory resource into device_async_resource_ref, checking for nullptr
Definition: resource_ref.hpp:79

rmm::CUDA_ALLOCATION_ALIGNMENT
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
Default alignment used for CUDA memory allocation.
Definition: aligned.hpp:43

rmm::is_aligned
constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
Checks whether a value is aligned to a multiple of a specified power of 2.
Definition: aligned.hpp:105

rmm::align_up
constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
Align up to nearest multiple of specified power of 2.
Definition: aligned.hpp:77

per_device_resource.hpp
Management of per-device device_memory_resources.

resource_ref.hpp

rmm::mr::detail::maybe_remove_property
A helper class to remove the device_accessible property.
Definition: pool_memory_resource.hpp:63