pool_memory_resource.hpp
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <rmm/aligned.hpp>
19 #include <rmm/cuda_stream_view.hpp>
20 #include <rmm/detail/error.hpp>
21 #include <rmm/detail/export.hpp>
22 #include <rmm/detail/format.hpp>
23 #include <rmm/detail/logging_assert.hpp>
24 #include <rmm/detail/thrust_namespace.h>
25 #include <rmm/logger.hpp>
26 #include <rmm/mr/device/detail/coalescing_free_list.hpp>
27 #include <rmm/mr/device/detail/stream_ordered_memory_resource.hpp>
30 #include <rmm/resource_ref.hpp>
31 
32 #include <cuda/std/type_traits>
33 #include <cuda_runtime_api.h>
34 #include <thrust/iterator/counting_iterator.h>
35 #include <thrust/iterator/transform_iterator.h>
36 #include <thrust/optional.h>
37 
38 #include <algorithm>
39 #include <cstddef>
40 #include <iostream>
41 #include <map>
42 #include <mutex>
43 #include <numeric>
44 #include <optional>
45 #include <set>
46 #include <thread>
47 #include <unordered_map>
48 #include <vector>
49 
50 namespace RMM_NAMESPACE {
51 namespace mr {
57 namespace detail {
69 template <class PoolResource, class Upstream, class Property, class = void>
70 struct maybe_remove_property {};
71 
75 template <class PoolResource, class Upstream, class Property>
76 struct maybe_remove_property<PoolResource,
77  Upstream,
78  Property,
79  cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {
80 #if defined(__GNUC__) && !defined(__clang__) // GCC warns about compatibility
81  // issues with pre ISO C++ code
82 #pragma GCC diagnostic push
83 #pragma GCC diagnostic ignored "-Wnon-template-friend"
84 #endif // __GNUC__ and not __clang__
89  friend void get_property(const PoolResource&, Property) = delete;
90 #if defined(__GNUC__) && !defined(__clang__)
91 #pragma GCC diagnostic pop
92 #endif // __GNUC__ and not __clang__
93 };
94 } // namespace detail
95 
106 template <typename Upstream>
107 class pool_memory_resource final
108  : public detail::
109  maybe_remove_property<pool_memory_resource<Upstream>, Upstream, cuda::mr::device_accessible>,
110  public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
111  detail::coalescing_free_list>,
112  public cuda::forward_property<pool_memory_resource<Upstream>, Upstream> {
113  public:
114  friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
115  detail::coalescing_free_list>;
116 
131  explicit pool_memory_resource(device_async_resource_ref upstream_mr,
132  std::size_t initial_pool_size,
133  std::optional<std::size_t> maximum_pool_size = std::nullopt)
134  : upstream_mr_{upstream_mr}
135  {
136  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),
137  "Error, Initial pool size required to be a multiple of 256 bytes");
138  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),
139  "Error, Maximum pool size required to be a multiple of 256 bytes");
140 
141  initialize_pool(initial_pool_size, maximum_pool_size);
142  }
143 
159  explicit pool_memory_resource(Upstream* upstream_mr,
160  std::size_t initial_pool_size,
161  std::optional<std::size_t> maximum_pool_size = std::nullopt)
162  : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}
163  {
164  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),
165  "Error, Initial pool size required to be a multiple of 256 bytes");
166  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),
167  "Error, Maximum pool size required to be a multiple of 256 bytes");
168 
169  initialize_pool(initial_pool_size, maximum_pool_size);
170  }
171 
187  template <typename Upstream2 = Upstream,
188  cuda::std::enable_if_t<cuda::mr::async_resource<Upstream2>, int> = 0>
189  explicit pool_memory_resource(Upstream2& upstream_mr,
190  std::size_t initial_pool_size,
191  std::optional<std::size_t> maximum_pool_size = std::nullopt)
192  : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
193  {
194  }
195 
200  ~pool_memory_resource() override { release(); }
201 
202  pool_memory_resource() = delete;
203  pool_memory_resource(pool_memory_resource const&) = delete;
204  pool_memory_resource(pool_memory_resource&&) = delete;
205  pool_memory_resource& operator=(pool_memory_resource const&) = delete;
206  pool_memory_resource& operator=(pool_memory_resource&&) = delete;
207 
211  [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept
212  {
213  return upstream_mr_;
214  }
215 
223  [[nodiscard]] std::size_t pool_size() const noexcept { return current_pool_size_; }
224 
225  protected:
226  using free_list = detail::coalescing_free_list;
227  using block_type = free_list::block_type;
228  using typename detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,
229  detail::coalescing_free_list>::split_block;
230  using lock_guard = std::lock_guard<std::mutex>;
231 
240  [[nodiscard]] std::size_t get_maximum_allocation_size() const
241  {
242  return std::numeric_limits<std::size_t>::max();
243  }
244 
260  block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
261  {
262  while (try_size >= min_size) {
263  auto block = block_from_upstream(try_size, stream);
264  if (block.has_value()) {
265  current_pool_size_ += block.value().size();
266  return block.value();
267  }
268  if (try_size == min_size) {
269  break; // only try `size` once
270  }
271  try_size = std::max(min_size, try_size / 2);
272  }
273  RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]",
274  rmm::detail::format_stream(stream),
275  min_size);
276  RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
277  }
278 
287  void initialize_pool(std::size_t initial_size, std::optional<std::size_t> maximum_size)
288  {
289  current_pool_size_ = 0; // try_to_expand will set this if it succeeds
290  maximum_pool_size_ = maximum_size;
291 
292  RMM_EXPECTS(
293  initial_size <= maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max()),
294  "Initial pool size exceeds the maximum pool size!");
295 
296  if (initial_size > 0) {
297  auto const block = try_to_expand(initial_size, initial_size, cuda_stream_legacy);
298  this->insert_block(block, cuda_stream_legacy);
299  }
300  }
301 
311  block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
312  {
313  // Strategy: If maximum_pool_size_ is set, then grow geometrically, e.g. by halfway to the
314  // limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
315  // time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
316  // until either success or the attempt is less than the requested size.
317  return try_to_expand(size_to_grow(size), size, stream);
318  }
319 
332  [[nodiscard]] std::size_t size_to_grow(std::size_t size) const
333  {
334  if (maximum_pool_size_.has_value()) {
335  auto const unaligned_remaining = maximum_pool_size_.value() - pool_size();
336  using rmm::align_up;
337  auto const remaining = align_up(unaligned_remaining, rmm::CUDA_ALLOCATION_ALIGNMENT);
338  auto const aligned_size = align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
339  return (aligned_size <= remaining) ? std::max(aligned_size, remaining / 2) : 0;
340  }
341  return std::max(size, pool_size());
342  };
343 
351  std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
352  {
353  RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);
354 
355  if (size == 0) { return {}; }
356 
357  try {
358  void* ptr = get_upstream_resource().allocate_async(size, stream);
359  return std::optional<block_type>{
360  *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
361  } catch (std::exception const& e) {
362  return std::nullopt;
363  }
364  }
365 
376  split_block allocate_from_block(block_type const& block, std::size_t size)
377  {
378  block_type const alloc{block.pointer(), size, block.is_head()};
379 #ifdef RMM_POOL_TRACK_ALLOCATIONS
380  allocated_blocks_.insert(alloc);
381 #endif
382 
383  auto rest = (block.size() > size)
384  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
385  ? block_type{block.pointer() + size, block.size() - size, false}
386  : block_type{};
387  return {alloc, rest};
388  }
389 
398  block_type free_block(void* ptr, std::size_t size) noexcept
399  {
400 #ifdef RMM_POOL_TRACK_ALLOCATIONS
401  if (ptr == nullptr) return block_type{};
402  auto const iter = allocated_blocks_.find(static_cast<char*>(ptr));
403  RMM_LOGGING_ASSERT(iter != allocated_blocks_.end());
404 
405  auto block = *iter;
406  RMM_LOGGING_ASSERT(block.size() == rmm::align_up(size, allocation_alignment));
407  allocated_blocks_.erase(iter);
408 
409  return block;
410 #else
411  auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));
412  return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};
413 #endif
414  }
415 
420  void release()
421  {
422  lock_guard lock(this->get_mutex());
423 
424  for (auto block : upstream_blocks_) {
425  get_upstream_resource().deallocate(block.pointer(), block.size());
426  }
427  upstream_blocks_.clear();
428 #ifdef RMM_POOL_TRACK_ALLOCATIONS
429  allocated_blocks_.clear();
430 #endif
431 
432  current_pool_size_ = 0;
433  }
434 
435 #ifdef RMM_DEBUG_PRINT
442  void print()
443  {
444  lock_guard lock(this->get_mutex());
445 
446  auto const [free, total] = rmm::available_device_memory();
447  std::cout << "GPU free memory: " << free << " total: " << total << "\n";
448 
449  std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";
450  std::size_t upstream_total{0};
451 
452  for (auto blocks : upstream_blocks_) {
453  blocks.print();
454  upstream_total += blocks.size();
455  }
456  std::cout << "total upstream: " << upstream_total << " B\n";
457 
458 #ifdef RMM_POOL_TRACK_ALLOCATIONS
459  std::cout << "allocated_blocks: " << allocated_blocks_.size() << "\n";
460  for (auto block : allocated_blocks_)
461  block.print();
462 #endif
463 
464  this->print_free_blocks();
465  }
466 #endif
467 
476  std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)
477  {
478  std::size_t largest{};
479  std::size_t total{};
480  std::for_each(blocks.cbegin(), blocks.cend(), [&largest, &total](auto const& block) {
481  total += block.size();
482  largest = std::max(largest, block.size());
483  });
484  return {largest, total};
485  }
486 
487  private:
488  // The "heap" to allocate the pool from
489  device_async_resource_ref upstream_mr_;
490  std::size_t current_pool_size_{};
491  std::optional<std::size_t> maximum_pool_size_{};
492 
493 #ifdef RMM_POOL_TRACK_ALLOCATIONS
494  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;
495 #endif
496 
497  // blocks allocated from upstream
498  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;
499 };
500  // end of group
502 } // namespace mr
503 } // namespace RMM_NAMESPACE
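A minimal usage sketch (not part of this header) showing how the pool is typically constructed over an upstream resource and used for stream-ordered allocation. The cuda_memory_resource upstream and the 1 MiB / 4 MiB sizes are illustrative choices; both sizes must be multiples of 256 bytes, as the constructors above enforce.

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;  // plain cudaMalloc/cudaFree upstream

  // Pool that starts at 1 MiB and may grow up to 4 MiB.
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{
    &upstream, 1u << 20, 4u << 20};

  // Stream-ordered suballocation from the pool.
  rmm::cuda_stream_view stream{};          // default stream
  void* ptr = pool.allocate(256, stream);
  pool.deallocate(ptr, 256, stream);
  return 0;
}

The reference entries below summarize the symbols used in this file and where they are defined.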
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:41
A coalescing best-fit suballocator which uses a pool of memory allocated from an upstream memory_resource.
Definition: pool_memory_resource.hpp:112
block_type free_block(void *ptr, std::size_t size) noexcept
Finds, frees and returns the block associated with pointer ptr.
Definition: pool_memory_resource.hpp:398
void initialize_pool(std::size_t initial_size, std::optional< std::size_t > maximum_size)
Allocate initial memory for the pool.
Definition: pool_memory_resource.hpp:287
split_block allocate_from_block(block_type const &block, std::size_t size)
Splits block if necessary to return a pointer to memory of size bytes.
Definition: pool_memory_resource.hpp:376
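To illustrate the split performed by allocate_from_block, here is a simplified, self-contained sketch using hypothetical stand-ins for block_type and split_block (not the library's types): the requested bytes are carved off the front of the block and any remainder goes back to the free list.

#include <cstddef>
#include <utility>

struct demo_block {          // hypothetical stand-in for block_type
  char* ptr{nullptr};
  std::size_t size{0};
};

// Carve `size` bytes off the front of `blk`; `second` is the unused remainder (empty if none).
inline std::pair<demo_block, demo_block> split_front(demo_block blk, std::size_t size)
{
  demo_block const alloc{blk.ptr, size};
  demo_block const rest = (blk.size > size) ? demo_block{blk.ptr + size, blk.size - size}
                                            : demo_block{};
  return {alloc, rest};
}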
device_async_resource_ref get_upstream_resource() const noexcept
rmm::device_async_resource_ref to the upstream resource
Definition: pool_memory_resource.hpp:211
std::size_t size_to_grow(std::size_t size) const
Given a minimum size, computes an appropriate size to grow the pool.
Definition: pool_memory_resource.hpp:332
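A worked illustration of the growth rule (demo_size_to_grow is a copy of the member above for exposition, not library API): with a 4 MiB maximum and a 1 MiB pool, a 256 KiB request grows the pool by max(256 KiB, remaining/2) = 1.5 MiB; with no maximum set, the pool grows by at least its current size.

#include <algorithm>
#include <cstddef>
#include <optional>

#include <rmm/aligned.hpp>

std::size_t demo_size_to_grow(std::size_t size,
                              std::size_t pool_size,
                              std::optional<std::size_t> maximum_pool_size)
{
  if (maximum_pool_size.has_value()) {
    auto const remaining =
      rmm::align_up(maximum_pool_size.value() - pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT);
    auto const aligned_size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
    return (aligned_size <= remaining) ? std::max(aligned_size, remaining / 2) : 0;  // 0 => cannot grow
  }
  return std::max(size, pool_size);  // unbounded pool: grow by at least the current size
}

// demo_size_to_grow(256u << 10, 1u << 20, 4u << 20) == 1572864   (1.5 MiB)
// demo_size_to_grow(256u << 10, 1u << 20, std::nullopt) == 1048576 (double the 1 MiB pool)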
free_list::block_type block_type
The type of block returned by the free list.
Definition: pool_memory_resource.hpp:227
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
Get the largest available block size and total free size in the specified free list.
Definition: pool_memory_resource.hpp:476
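The same reduction, sketched over a plain vector of block sizes instead of a free_list (illustration only):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Returns {largest block size, total free bytes}.
inline std::pair<std::size_t, std::size_t> demo_summary(std::vector<std::size_t> const& sizes)
{
  std::size_t largest{};
  std::size_t total{};
  for (auto size : sizes) {
    total += size;
    largest = std::max(largest, size);
  }
  return {largest, total};
}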
std::size_t get_maximum_allocation_size() const
Get the maximum size of allocations supported by this memory resource.
Definition: pool_memory_resource.hpp:240
block_type expand_pool(std::size_t size, free_list &blocks, cuda_stream_view stream)
Allocate space from upstream to supply the suballocation pool and return a sufficiently sized block.
Definition: pool_memory_resource.hpp:311
pool_memory_resource(Upstream2 &upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:189
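A short sketch of this reference-taking overload, assuming a cuda_memory_resource upstream and an illustrative 1 MiB initial size; it forwards to the pointer constructor, so the two spellings below are equivalent.

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> by_ref{upstream, 1u << 20};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> by_ptr{&upstream, 1u << 20};
  return 0;
}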
void release()
Free all memory allocated from the upstream memory_resource.
Definition: pool_memory_resource.hpp:420
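Because the destructor calls release(), every block obtained from upstream is returned when the pool goes out of scope. A minimal sketch, assuming a cuda_memory_resource upstream; outstanding pool allocations should be deallocated before the pool is destroyed.

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  {
    rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{&upstream, 1u << 20};
    void* ptr = pool.allocate(512);
    pool.deallocate(ptr, 512);
  }  // ~pool_memory_resource() -> release(): all upstream blocks are deallocated here
  return 0;
}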
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
Try to expand the pool by allocating a block of at least min_size bytes from upstream.
Definition: pool_memory_resource.hpp:260
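A stand-alone sketch of the back-off loop above, with a hypothetical try_alloc callback standing in for block_from_upstream(): start at the geometrically grown size and halve on failure until the minimum (requested) size has been tried once.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <new>
#include <optional>

// Returns the size actually obtained from `try_alloc`, or throws if even `min_size` fails.
inline std::size_t demo_try_to_expand(
  std::size_t try_size,
  std::size_t min_size,
  std::function<std::optional<std::size_t>(std::size_t)> const& try_alloc)
{
  while (try_size >= min_size) {
    if (auto got = try_alloc(try_size)) { return *got; }
    if (try_size == min_size) { break; }          // the minimum size is attempted only once
    try_size = std::max(min_size, try_size / 2);  // back off by half
  }
  throw std::bad_alloc{};  // stands in for RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory)
}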
std::lock_guard< std::mutex > lock_guard
Type of lock used to synchronize access.
Definition: pool_memory_resource.hpp:230
std::optional< block_type > block_from_upstream(std::size_t size, cuda_stream_view stream)
Allocate a block from upstream to expand the suballocation pool.
Definition: pool_memory_resource.hpp:351
std::size_t pool_size() const noexcept
Computes the size of the current pool.
Definition: pool_memory_resource.hpp:223
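pool_size() reports how many bytes the pool currently holds from upstream, whether allocated or free. A short sketch (cuda_memory_resource upstream and a 2 MiB initial size chosen for illustration):

#include <iostream>

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool{&upstream, 2u << 20};
  std::cout << pool.pool_size() << " bytes held by the pool\n";  // 2097152 right after construction
  return 0;
}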
~pool_memory_resource() override
Destroy the pool_memory_resource and deallocate all memory it allocated using the upstream resource.
Definition: pool_memory_resource.hpp:200
detail::coalescing_free_list free_list
The free list implementation.
Definition: pool_memory_resource.hpp:226
pool_memory_resource(Upstream *upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
Construct a pool_memory_resource and allocate the initial device memory pool using upstream_mr.
Definition: pool_memory_resource.hpp:159
Exception thrown when RMM runs out of memory.
Definition: error.hpp:87
std::pair< std::size_t, std::size_t > available_device_memory()
Returns the available and total device memory in bytes for the current device.
Definition: cuda_device.hpp:120
static const cuda_stream_view cuda_stream_legacy
Static cuda_stream_view of cudaStreamLegacy, for convenience.
Definition: cuda_stream_view.hpp:133
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
Alias for a cuda::mr::async_resource_ref with the property cuda::mr::device_accessible.
Definition: resource_ref.hpp:41
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)
Convert pointer to memory resource into device_async_resource_ref, checking for nullptr
Definition: resource_ref.hpp:79
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
Default alignment used for CUDA memory allocation.
Definition: aligned.hpp:43
constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
Checks whether a value is aligned to a multiple of a specified power of 2.
Definition: aligned.hpp:105
constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
Align up to nearest multiple of specified power of 2.
Definition: aligned.hpp:77
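Both helpers are constexpr, so the 256-byte pool-size preconditions checked by the constructors above can also be verified at compile time; a tiny sketch, assuming the default 256-byte CUDA_ALLOCATION_ALIGNMENT referenced in the error messages:

#include <rmm/aligned.hpp>

static_assert(rmm::align_up(1000, rmm::CUDA_ALLOCATION_ALIGNMENT) == 1024);  // rounded up to 256 B
static_assert(rmm::is_aligned(1u << 20, rmm::CUDA_ALLOCATION_ALIGNMENT));    // 1 MiB is a valid pool size
static_assert(!rmm::is_aligned(1000, rmm::CUDA_ALLOCATION_ALIGNMENT));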
Management of per-device device_memory_resources.
A helper class to remove the device_accessible property.
Definition: pool_memory_resource.hpp:70
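A sketch of what this helper and cuda::forward_property achieve together: when the upstream resource is device accessible, the pool advertises the same property, which can be checked at compile time (assuming the libcu++ property machinery pulled in by these headers).

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

// cuda_memory_resource is device accessible, so a pool built on it keeps the property;
// an upstream without it would have the property removed by maybe_remove_property.
static_assert(
  cuda::has_property<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>,
                     cuda::mr::device_accessible>);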