pool_memory_resource.hpp
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include <rmm/aligned.hpp>
9 #include <rmm/detail/error.hpp>
10 #include <rmm/detail/export.hpp>
11 #include <rmm/detail/format.hpp>
12 #include <rmm/detail/logging_assert.hpp>
13 #include <rmm/detail/thrust_namespace.h>
14 #include <rmm/logger.hpp>
15 #include <rmm/mr/detail/coalescing_free_list.hpp>
16 #include <rmm/mr/detail/stream_ordered_memory_resource.hpp>
19 #include <rmm/resource_ref.hpp>
20 
21 #include <cuda/std/type_traits>
22 #include <cuda_runtime_api.h>
23 #include <thrust/iterator/counting_iterator.h>
24 #include <thrust/iterator/transform_iterator.h>
25 
26 #include <algorithm>
27 #include <cstddef>
28 #include <mutex>
29 #include <optional>
30 #include <set>
31 
32 namespace RMM_NAMESPACE {
33 namespace mr {
39 namespace detail {
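// When the Upstream resource lacks a property (e.g. cuda::mr::device_accessible), the
// specialization below declares a deleted hidden-friend get_property overload so the
// derived pool resource does not advertise that property either; otherwise the empty
// primary template applies and property forwarding is left untouched.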
51 template <class PoolResource, class Upstream, class Property, class = void>
52 struct maybe_remove_property {};
53 
57 template <class PoolResource, class Upstream, class Property>
58 struct maybe_remove_property<PoolResource,
59  Upstream,
60  Property,
61  cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {
62 #if defined(__GNUC__) && !defined(__clang__) // GCC warns about compatibility
63  // issues with pre ISO C++ code
64 #pragma GCC diagnostic push
65 #pragma GCC diagnostic ignored "-Wnon-template-friend"
66 #endif // __GNUC__ and not __clang__
71  friend void get_property(const PoolResource&, Property) = delete;
72 #if defined(__GNUC__) && !defined(__clang__)
73 #pragma GCC diagnostic pop
74 #endif // __GNUC__ and not __clang__
75 };
76 } // namespace detail
77 
88 template <typename Upstream>
89 class pool_memory_resource final
90  : public detail::
91  maybe_remove_property<pool_memory_resource<Upstream>, Upstream, cuda::mr::device_accessible>,
92  public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
93  detail::coalescing_free_list>,
94  public cuda::forward_property<pool_memory_resource<Upstream>, Upstream> {
95  public:
96  friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
97  detail::coalescing_free_list>;
98 
113  explicit pool_memory_resource(device_async_resource_ref upstream_mr,
114  std::size_t initial_pool_size,
115  std::optional<std::size_t> maximum_pool_size = std::nullopt)
116  : upstream_mr_{upstream_mr}
117  {
118  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),
119  "Error, Initial pool size required to be a multiple of 256 bytes");
120  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),
121  "Error, Maximum pool size required to be a multiple of 256 bytes");
122 
123  initialize_pool(initial_pool_size, maximum_pool_size);
124  }
125 
141  explicit pool_memory_resource(Upstream* upstream_mr,
142  std::size_t initial_pool_size,
143  std::optional<std::size_t> maximum_pool_size = std::nullopt)
144  : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}
145  {
146  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),
147  "Error, Initial pool size required to be a multiple of 256 bytes");
148  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),
149  "Error, Maximum pool size required to be a multiple of 256 bytes");
150 
151  initialize_pool(initial_pool_size, maximum_pool_size);
152  }
153 
169  template <typename Upstream2 = Upstream>
170  explicit pool_memory_resource(Upstream2& upstream_mr,
171  std::size_t initial_pool_size,
172  std::optional<std::size_t> maximum_pool_size = std::nullopt)
173  : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
174  {
175  }
176 
181  ~pool_memory_resource() override { release(); }
182 
183  pool_memory_resource() = delete;
184  pool_memory_resource(pool_memory_resource const&) = delete;
185  pool_memory_resource(pool_memory_resource&&) = delete;
186  pool_memory_resource& operator=(pool_memory_resource const&) = delete;
187  pool_memory_resource& operator=(pool_memory_resource&&) = delete;
188 
192  [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept
193  {
194  return upstream_mr_;
195  }
196 
204  [[nodiscard]] std::size_t pool_size() const noexcept { return current_pool_size_; }
205 
206  protected:
207  using free_list = detail::coalescing_free_list;
208  using block_type = free_list::block_type;
209  using typename detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
210  detail::coalescing_free_list>::split_block;
211  using lock_guard = std::lock_guard<std::mutex>;
212 
221  [[nodiscard]] std::size_t get_maximum_allocation_size() const
222  {
223  return std::numeric_limits<std::size_t>::max();
224  }
225 
241  block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
242  {
243  auto report_error = [&](const char* reason) {
244  RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
245  rmm::detail::format_stream(stream),
246  min_size,
247  reason);
248  auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
249  rmm::detail::format_bytes(min_size) + std::string("): ") + reason;
250  RMM_FAIL(msg.c_str(), rmm::out_of_memory);
251  };
252 
253  while (try_size >= min_size) {
254  try {
255  auto block = block_from_upstream(try_size, stream);
256  current_pool_size_ += block.size();
257  return block;
258  } catch (std::exception const& e) {
259  if (try_size == min_size) { report_error(e.what()); }
260  }
261  try_size = std::max(min_size, try_size / 2);
262  }
263 
264  auto const max_size = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());
265  auto const msg = std::string("Not enough room to grow, current/max/try size = ") +
266  rmm::detail::format_bytes(pool_size()) + ", " +
267  rmm::detail::format_bytes(max_size) + ", " +
268  rmm::detail::format_bytes(min_size);
269  report_error(msg.c_str());
270  return {};
271  }
272 
281  void initialize_pool(std::size_t initial_size, std::optional<std::size_t> maximum_size)
282  {
283  current_pool_size_ = 0; // try_to_expand will set this if it succeeds
284  maximum_pool_size_ = maximum_size;
285 
286  RMM_EXPECTS(
287  initial_size <= maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max()),
288  "Initial pool size exceeds the maximum pool size!");
289 
290  if (initial_size > 0) {
291  auto const block = try_to_expand(initial_size, initial_size, cuda_stream_legacy);
292  this->insert_block(block, cuda_stream_legacy);
293  }
294  }
295 
305  block_type expand_pool(std::size_t size,
306  [[maybe_unused]] free_list& blocks,
307  cuda_stream_view stream)
308  {
309  // Strategy: If maximum_pool_size_ is set, then grow geometrically, e.g. by halfway to the
310  // limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
311  // time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
312  // until either success or the attempt is less than the requested size.
313 
314  return try_to_expand(size_to_grow(size), size, stream);
315  }
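 // Illustration of the back-off in try_to_expand(): for a 1 MiB request when
 // size_to_grow() returns 1 GiB, upstream allocation is attempted at 1 GiB, then
 // 512 MiB, 256 MiB, ... down to 1 MiB; rmm::out_of_memory is thrown only if the
 // final, minimum-size attempt also fails.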
316 
329  [[nodiscard]] std::size_t size_to_grow(std::size_t size) const
330  {
331  if (maximum_pool_size_.has_value()) {
332  auto const unaligned_remaining = maximum_pool_size_.value() - pool_size();
333  auto const remaining = rmm::align_up(unaligned_remaining, rmm::CUDA_ALLOCATION_ALIGNMENT);
334  auto const aligned_size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
335  return (aligned_size <= remaining) ? std::max(aligned_size, remaining / 2) : 0;
336  }
337  return std::max(size, pool_size());
338  };
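 // Worked example: with maximum_pool_size_ = 4 GiB and pool_size() = 1 GiB, a 1 MiB
 // request grows the pool by max(1 MiB, 3 GiB / 2) = 1.5 GiB (half the remaining
 // headroom); with no maximum set, the pool size is doubled instead
 // (max(size, pool_size())).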
339 
348  block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
349  {
350  RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);
351 
352  if (size == 0) { return {}; }
353 
354  void* ptr = get_upstream_resource().allocate(stream, size);
355  return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
356  }
357 
368  split_block allocate_from_block(block_type const& block, std::size_t size)
369  {
370  block_type const alloc{block.pointer(), size, block.is_head()};
371 #ifdef RMM_POOL_TRACK_ALLOCATIONS
372  allocated_blocks_.insert(alloc);
373 #endif
374 
375  auto rest = (block.size() > size)
376  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
377  ? block_type{block.pointer() + size, block.size() - size, false}
378  : block_type{};
379  return {alloc, rest};
380  }
381 
390  block_type free_block(void* ptr, std::size_t size) noexcept
391  {
392 #ifdef RMM_POOL_TRACK_ALLOCATIONS
393  if (ptr == nullptr) return block_type{};
394  auto const iter = allocated_blocks_.find(static_cast<char*>(ptr));
395  RMM_LOGGING_ASSERT(iter != allocated_blocks_.end());
396 
397  auto block = *iter;
398  RMM_LOGGING_ASSERT(block.size() == rmm::align_up(size, allocation_alignment));
399  allocated_blocks_.erase(iter);
400 
401  return block;
402 #else
403  auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));
404  return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};
405 #endif
406  }
407 
412  void release()
413  {
414  lock_guard lock(this->get_mutex());
415 
416  for (auto block : upstream_blocks_) {
417  get_upstream_resource().deallocate_sync(block.pointer(), block.size());
418  }
419  upstream_blocks_.clear();
420 #ifdef RMM_POOL_TRACK_ALLOCATIONS
421  allocated_blocks_.clear();
422 #endif
423 
424  current_pool_size_ = 0;
425  }
426 
427 #ifdef RMM_DEBUG_PRINT
434  void print()
435  {
436  lock_guard lock(this->get_mutex());
437 
438  auto const [free, total] = rmm::available_device_memory();
439  std::cout << "GPU free memory: " << free << " total: " << total << "\n";
440 
441  std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";
442  std::size_t upstream_total{0};
443 
444  for (auto blocks : upstream_blocks_) {
445  blocks.print();
446  upstream_total += blocks.size();
447  }
448  std::cout << "total upstream: " << upstream_total << " B\n";
449 
450 #ifdef RMM_POOL_TRACK_ALLOCATIONS
451  std::cout << "allocated_blocks: " << allocated_blocks_.size() << "\n";
452  for (auto block : allocated_blocks_)
453  block.print();
454 #endif
455 
456  this->print_free_blocks();
457  }
458 #endif
459 
468  std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)
469  {
470  std::size_t largest{};
471  std::size_t total{};
472  std::for_each(blocks.cbegin(), blocks.cend(), [&largest, &total](auto const& block) {
473  total += block.size();
474  largest = std::max(largest, block.size());
475  });
476  return {largest, total};
477  }
478 
479  private:
480  // The "heap" to allocate the pool from
481  device_async_resource_ref upstream_mr_;
482  std::size_t current_pool_size_{};
483  std::optional<std::size_t> maximum_pool_size_{};
484 
485 #ifdef RMM_POOL_TRACK_ALLOCATIONS
486  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;
487 #endif
488 
489  // blocks allocated from upstream
490  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;
491 };
492 /** @} */ // end of group
494 } // namespace mr
495 } // namespace RMM_NAMESPACE
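
Usage sketch (not part of this header): the snippet below builds a pool_memory_resource over a rmm::mr::cuda_memory_resource upstream and allocates from it through rmm::device_buffer. The 1 GiB initial size and 4 GiB maximum are arbitrary example values; as the constructors above require, both are multiples of 256 bytes (rmm::CUDA_ALLOCATION_ALIGNMENT). The include paths and the device_buffer constructor taking (size, stream, memory resource) are assumed from the wider RMM API and may differ between releases.

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};  // cudaMalloc/cudaFree-backed upstream

  // 1 GiB initial pool, capped at 4 GiB; the pool grows on demand between these bounds.
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{
    &upstream, std::size_t{1} << 30, std::size_t{4} << 30};

  // Suballocate 1 MiB from the pool on the default stream; the memory is returned to the
  // pool's free list (not to upstream) when buf is destroyed.
  rmm::device_buffer buf{std::size_t{1} << 20, rmm::cuda_stream_view{}, pool};

  return 0;
}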
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:28
A coalescing best-fit suballocator which uses a pool of memory allocated from an upstream memory_resource.
Definition: pool_memory_resource.hpp:94
pool_memory_resource(Upstream2 &upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:170
block_type free_block(void *ptr, std::size_t size) noexcept
Finds, frees and returns the block associated with pointer ptr.
Definition: pool_memory_resource.hpp:390
void initialize_pool(std::size_t initial_size, std::optional< std::size_t > maximum_size)
Allocate initial memory for the pool.
Definition: pool_memory_resource.hpp:281
split_block allocate_from_block(block_type const &block, std::size_t size)
Splits block if necessary to return a pointer to memory of size bytes.
Definition: pool_memory_resource.hpp:368
device_async_resource_ref get_upstream_resource() const noexcept
rmm::device_async_resource_ref to the upstream resource
Definition: pool_memory_resource.hpp:192
std::size_t size_to_grow(std::size_t size) const
Given a minimum size, computes an appropriate size to grow the pool.
Definition: pool_memory_resource.hpp:329
block_type expand_pool(std::size_t size, [[maybe_unused]] free_list &blocks, cuda_stream_view stream)
Allocate space from upstream to supply the suballocation pool and return a sufficiently sized block.
Definition: pool_memory_resource.hpp:305
free_list::block_type block_type
The type of block returned by the free list.
Definition: pool_memory_resource.hpp:208
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
Get the largest available block size and total free size in the specified free list.
Definition: pool_memory_resource.hpp:468
std::size_t get_maximum_allocation_size() const
Get the maximum size of allocations supported by this memory resource.
Definition: pool_memory_resource.hpp:221
void release()
Free all memory allocated from the upstream memory_resource.
Definition: pool_memory_resource.hpp:412
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
Try to expand the pool by allocating a block of at least min_size bytes from upstream.
Definition: pool_memory_resource.hpp:241
std::lock_guard< std::mutex > lock_guard
Type of lock used to synchronize access.
Definition: pool_memory_resource.hpp:211
std::size_t pool_size() const noexcept
Computes the size of the current pool.
Definition: pool_memory_resource.hpp:204
~pool_memory_resource() override
Destroy the pool_memory_resource and deallocate all memory it allocated using the upstream resource.
Definition: pool_memory_resource.hpp:181
detail::coalescing_free_list free_list
The free list implementation.
Definition: pool_memory_resource.hpp:207
pool_memory_resource(Upstream *upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:141
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
Allocate a block from upstream to expand the suballocation pool.
Definition: pool_memory_resource.hpp:348
Exception thrown when RMM runs out of memory.
Definition: error.hpp:76
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.
static const cuda_stream_view cuda_stream_legacy
Static cuda_stream_view of cudaStreamLegacy, for convenience.
Definition: cuda_stream_view.hpp:116
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)
Convert pointer to memory resource into device_async_resource_ref, checking for nullptr
Definition: resource_ref.hpp:72
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
Alias for a cuda::mr::async_resource_ref with the property cuda::mr::device_accessible.
Definition: resource_ref.hpp:32
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
Default alignment used for CUDA memory allocation.
Definition: aligned.hpp:25
bool is_aligned(std::size_t value, std::size_t alignment) noexcept
Checks whether a value is aligned to a multiple of a specified power of 2.
std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
Align up to nearest multiple of specified power of 2.
Management of per-device device_memory_resources.
A helper class to remove the device_accessible property.
Definition: pool_memory_resource.hpp:52