RMM: fixed_size_memory_resource.hpp Source File

 /*

  * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.

  * SPDX-License-Identifier: Apache-2.0

  */

 #pragma once


 #include <rmm/aligned.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/logging_assert.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/mr/detail/fixed_size_free_list.hpp>

 #include <rmm/mr/detail/stream_ordered_memory_resource.hpp>

 #include <rmm/resource_ref.hpp>


 #include <cuda_runtime_api.h>

 #include <thrust/iterator/counting_iterator.h>

 #include <thrust/iterator/transform_iterator.h>


 #include <algorithm>

 #include <cstddef>

 #include <utility>

 #include <vector>


 namespace RMM_NAMESPACE {

 namespace mr {

 /**
  * @brief Memory resource that hands out blocks of one fixed size.
  *
  * Memory is obtained from an upstream resource in chunks of
  * `block_size * blocks_to_preallocate` bytes. Each chunk is carved into equally sized blocks that
  * are kept in a `detail::fixed_size_free_list`. The stream-ordered bookkeeping is inherited from
  * `detail::stream_ordered_memory_resource`, which is parameterised on this class.
  *
  * @tparam Upstream Type of the upstream resource accepted by the pointer-taking constructor.
  */
 template <typename Upstream>
 class fixed_size_memory_resource
   : public detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                   detail::fixed_size_free_list> {
  public:
   // Grants the base class access to the protected members declared below.
   friend class detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                       detail::fixed_size_free_list>;

   /// Default size in bytes of each block handed out by this resource (1 MiB).
   static constexpr std::size_t default_block_size = 1 << 20;

   /// Default number of blocks carved out of every chunk requested from upstream.
   static constexpr std::size_t default_blocks_to_preallocate = 128;

   /**
    * @brief Construct the resource and eagerly allocate the first chunk from upstream.
    *
    * The initial chunk is allocated on `cuda_stream_legacy` and its blocks are inserted into the
    * free list for that stream.
    *
    * @param upstream_mr Reference to the resource that chunks are allocated from.
    * @param block_size Requested block size in bytes; rounded up to a multiple of
    * `CUDA_ALLOCATION_ALIGNMENT`.
    * @param blocks_to_preallocate Number of blocks in each chunk requested from upstream.
    */
   explicit fixed_size_memory_resource(
     device_async_resource_ref upstream_mr,
     // NOLINTNEXTLINE bugprone-easily-swappable-parameters
     std::size_t block_size            = default_block_size,
     std::size_t blocks_to_preallocate = default_blocks_to_preallocate)
     : upstream_mr_{upstream_mr},
       block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)},
       upstream_chunk_size_{block_size_ * blocks_to_preallocate}
   {
     // blocks_from_upstream() returns a prvalue, so it binds directly to the callee's parameter;
     // wrapping it in std::move would add nothing.
     this->insert_blocks(blocks_from_upstream(cuda_stream_legacy), cuda_stream_legacy);
   }

   /**
    * @brief Construct the resource from a pointer to the upstream resource.
    *
    * The pointer is converted with `to_device_async_resource_ref_checked` and construction is then
    * delegated to the reference-taking constructor, so both overloads behave identically.
    * NOTE(review): the conversion helper presumably rejects a null pointer -- confirm against its
    * definition.
    *
    * @param upstream_mr Pointer to the resource that chunks are allocated from.
    * @param block_size Requested block size in bytes; rounded up to a multiple of
    * `CUDA_ALLOCATION_ALIGNMENT`.
    * @param blocks_to_preallocate Number of blocks in each chunk requested from upstream.
    */
   explicit fixed_size_memory_resource(
     Upstream* upstream_mr,
     // NOLINTNEXTLINE bugprone-easily-swappable-parameters
     std::size_t block_size            = default_block_size,
     std::size_t blocks_to_preallocate = default_blocks_to_preallocate)
     : fixed_size_memory_resource(
         to_device_async_resource_ref_checked(upstream_mr), block_size, blocks_to_preallocate)
   {
   }

   /**
    * @brief Destroy the resource, returning every chunk obtained from upstream.
    */
   ~fixed_size_memory_resource() override { release(); }

   fixed_size_memory_resource()                                             = delete;
   fixed_size_memory_resource(fixed_size_memory_resource const&)            = delete;
   fixed_size_memory_resource(fixed_size_memory_resource&&)                 = delete;
   fixed_size_memory_resource& operator=(fixed_size_memory_resource const&) = delete;
   fixed_size_memory_resource& operator=(fixed_size_memory_resource&&)      = delete;

   /**
    * @brief Get a reference to the upstream resource.
    *
    * @return The `device_async_resource_ref` this resource allocates its chunks from.
    */
   [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept
   {
     return upstream_mr_;
   }

   /**
    * @brief Get the size of the blocks allocated by this resource.
    *
    * @return Block size in bytes, after rounding up to `CUDA_ALLOCATION_ALIGNMENT`.
    */
   [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }

  protected:
   /// The free list type.
   using free_list = detail::fixed_size_free_list;
   /// The type of block managed by the free list.
   using block_type = free_list::block_type;
   using typename detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                         detail::fixed_size_free_list>::split_block;
   /// Type of lock used to synchronize access.
   using lock_guard = std::lock_guard<std::mutex>;

   /**
    * @brief Get the largest allocation this resource can satisfy.
    *
    * @return The fixed block size in bytes.
    */
   [[nodiscard]] std::size_t get_maximum_allocation_size() const { return get_block_size(); }

   /**
    * @brief Grow the pool by one upstream chunk and take a block from it.
    *
    * @param size Size in bytes passed on to `free_list::get_block`.
    * @param blocks Free list that receives the newly created blocks.
    * @param stream Stream on which the upstream allocation is made.
    * @return The block returned by `blocks.get_block(size)` after the insertion.
    */
   block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
   {
     blocks.insert(blocks_from_upstream(stream));
     return blocks.get_block(size);
   }

   /**
    * @brief Allocate one chunk from upstream and carve it into fixed-size blocks.
    *
    * The chunk is recorded in `upstream_blocks_` so that `release()` can return it later.
    *
    * @param stream Stream on which the upstream allocation is made.
    * @return A free list holding `upstream_chunk_size_ / block_size_` blocks. The list is empty when
    * the block size is zero.
    */
   free_list blocks_from_upstream(cuda_stream_view stream)
   {
     void* const chunk = get_upstream_resource().allocate(stream, upstream_chunk_size_);
     upstream_blocks_.push_back(block_type{chunk});

     // A zero block size (block_size == 0 stays 0 after align_up) would otherwise divide by zero.
     auto const num_blocks =
       (block_size_ == 0) ? std::size_t{0} : upstream_chunk_size_ / block_size_;

     // The index type matches the std::size_t counting iterator below, so nothing is narrowed.
     // Capturing by value keeps the generator independent of `this`.
     auto block_gen = [base = static_cast<char*>(chunk),
                       block_size = block_size_](std::size_t index) {
       // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
       return block_type{base + index * block_size};
     };
     auto first =
       thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), block_gen);
     return free_list(first, first + num_blocks);
   }

   /**
    * @brief Hand out a block from the free list without splitting it.
    *
    * @param block The block taken from the free list.
    * @param size Requested size in bytes; unused because every block has the same size.
    * @return A `split_block` holding `block` and a null remainder block.
    */
   split_block allocate_from_block(block_type const& block, [[maybe_unused]] std::size_t size)
   {
     return {block, block_type{nullptr}};
   }

   /**
    * @brief Wrap a pointer that is being deallocated in a block.
    *
    * Nothing is looked up here: the function only builds a block around `ptr`.
    *
    * @param ptr Pointer being returned to the pool.
    * @param size Size in bytes of the allocation; only inspected by the logging assertion.
    * @return A block wrapping `ptr`.
    */
   block_type free_block(void* ptr, [[maybe_unused]] std::size_t size) noexcept
   {
     // Deallocating a fixed-size block just inserts it in the free list, which is
     // handled by the parent class
     RMM_LOGGING_ASSERT(align_up(size, CUDA_ALLOCATION_ALIGNMENT) <= block_size_);
     return block_type{ptr};
   }

   /**
    * @brief Return every chunk obtained from upstream and forget about them.
    *
    * Holds the mutex from `get_mutex()` while the chunks are deallocated with `deallocate_sync`.
    */
   void release()
   {
     lock_guard lock(this->get_mutex());

     for (auto chunk : upstream_blocks_) {
       get_upstream_resource().deallocate_sync(chunk.pointer(), upstream_chunk_size_);
     }
     upstream_blocks_.clear();
   }

 #ifdef RMM_DEBUG_PRINT
   /**
    * @brief Print device memory usage, the upstream chunks and the free blocks to `std::cout`.
    */
   void print()
   {
     lock_guard lock(this->get_mutex());

     auto const [free, total] = rmm::available_device_memory();
     std::cout << "GPU free memory: " << free << " total: " << total << "\n";

     std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";
     std::size_t upstream_total{0};

     for (auto chunk : upstream_blocks_) {
       chunk.print();
       upstream_total += upstream_chunk_size_;
     }
     std::cout << "total upstream: " << upstream_total << " B\n";

     this->print_free_blocks();
   }
 #endif

   /**
    * @brief Summarise a free list as (largest available block size, total free bytes).
    *
    * @param blocks The free list to summarise.
    * @return `{0, 0}` for an empty list, otherwise `{block_size_, blocks.size() * block_size_}`.
    */
   std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)
   {
     return blocks.is_empty() ? std::make_pair(std::size_t{0}, std::size_t{0})
                              : std::make_pair(block_size_, blocks.size() * block_size_);
   }

  private:
   device_async_resource_ref upstream_mr_;  // The resource from which to allocate new blocks

   std::size_t block_size_;           // size of blocks this MR allocates
   std::size_t upstream_chunk_size_;  // size of chunks allocated from heap MR

   // blocks allocated from heap: so they can be easily freed
   std::vector<block_type> upstream_blocks_;
 };

   // end of group

 }  // namespace mr

 }  // namespace RMM_NAMESPACE

aligned.hpp

rmm::cuda_stream_view
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:28

rmm::mr::fixed_size_memory_resource
A device_memory_resource which allocates memory blocks of a single fixed size.
Definition: fixed_size_memory_resource.hpp:42

rmm::mr::fixed_size_memory_resource::free_list_summary
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
Get the largest available block size and total free size in the specified free list.
Definition: fixed_size_memory_resource.hpp:257

rmm::mr::fixed_size_memory_resource::free_list
detail::fixed_size_free_list free_list
The free list type.
Definition: fixed_size_memory_resource.hpp:129

rmm::mr::fixed_size_memory_resource::free_block
block_type free_block(void *ptr, std::size_t size) noexcept
Finds, frees and returns the block associated with pointer.
Definition: fixed_size_memory_resource.hpp:206

rmm::mr::fixed_size_memory_resource::get_block_size
std::size_t get_block_size() const noexcept
Get the size of blocks allocated by this memory resource.
Definition: fixed_size_memory_resource.hpp:126

rmm::mr::fixed_size_memory_resource::get_maximum_allocation_size
std::size_t get_maximum_allocation_size() const
Get the (fixed) size of allocations supported by this memory resource.
Definition: fixed_size_memory_resource.hpp:141

rmm::mr::fixed_size_memory_resource::block_type
free_list::block_type block_type
The type of block managed by the free list.
Definition: fixed_size_memory_resource.hpp:130

rmm::mr::fixed_size_memory_resource::get_upstream_resource
device_async_resource_ref get_upstream_resource() const noexcept
Returns a device_async_resource_ref to the upstream resource.
Definition: fixed_size_memory_resource.hpp:116

rmm::mr::fixed_size_memory_resource::expand_pool
block_type expand_pool(std::size_t size, free_list &blocks, cuda_stream_view stream)
Allocate a block from upstream to supply the suballocation pool.
Definition: fixed_size_memory_resource.hpp:154

rmm::mr::fixed_size_memory_resource::blocks_from_upstream
free_list blocks_from_upstream(cuda_stream_view stream)
Allocate blocks from upstream to expand the suballocation pool.
Definition: fixed_size_memory_resource.hpp:166

rmm::mr::fixed_size_memory_resource::lock_guard
std::lock_guard< std::mutex > lock_guard
Type of lock used to synchronize access.
Definition: fixed_size_memory_resource.hpp:133

rmm::mr::fixed_size_memory_resource::release
void release()
Free all memory allocated using the upstream resource.
Definition: fixed_size_memory_resource.hpp:218

rmm::mr::fixed_size_memory_resource::allocate_from_block
split_block allocate_from_block(block_type const &block, std::size_t size)
Splits block if necessary to return a pointer to memory of size bytes.
Definition: fixed_size_memory_resource.hpp:193

cuda_stream_view.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.

rmm::device_async_resource_ref
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
Alias for a cuda::mr::async_resource_ref with the property cuda::mr::device_accessible.
Definition: resource_ref.hpp:32

rmm::CUDA_ALLOCATION_ALIGNMENT
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
Default alignment used for CUDA memory allocation.
Definition: aligned.hpp:25

rmm::align_up
std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
Align up to nearest multiple of specified power of 2.

resource_ref.hpp