Memory Resources#
-
using allocate_callback_t = std::function<void*(std::size_t, cuda_stream_view, void*)>#
Callback function type used by callback_memory_resource for allocation.
The signature of the callback function is:
void* allocate_callback_t(std::size_t bytes, cuda_stream_view stream, void* arg);
Returns a pointer to an allocation of at least bytes usable immediately on stream. The stream-ordered behavior requirements are identical to allocate.
The arg is provided to the constructor of the callback_memory_resource and will be forwarded along to every invocation of the callback function.
-
using deallocate_callback_t = std::function<void(void*, std::size_t, cuda_stream_view, void*)>#
Callback function type used by callback_memory_resource for deallocation.
The signature of the callback function is:
void deallocate_callback_t(void* ptr, std::size_t bytes, cuda_stream_view stream, void* arg);
Deallocates memory pointed to by ptr. bytes specifies the size of the allocation in bytes, and must equal the value of bytes that was passed to the allocate callback function. The stream-ordered behavior requirements are identical to deallocate.
The arg is provided to the constructor of the callback_memory_resource and will be forwarded along to every invocation of the callback function.
-
using device_resource_ref = cuda::mr::synchronous_resource_ref<cuda::mr::device_accessible>#
Alias for a cuda::mr::synchronous_resource_ref with the property cuda::mr::device_accessible.
-
using device_async_resource_ref = cuda::mr::resource_ref<cuda::mr::device_accessible>#
Alias for a cuda::mr::resource_ref with the property cuda::mr::device_accessible.
-
using host_resource_ref = cuda::mr::synchronous_resource_ref<cuda::mr::host_accessible>#
Alias for a cuda::mr::synchronous_resource_ref with the property cuda::mr::host_accessible.
-
using host_async_resource_ref = cuda::mr::resource_ref<cuda::mr::host_accessible>#
Alias for a cuda::mr::resource_ref with the property cuda::mr::host_accessible.
-
using host_device_resource_ref = cuda::mr::synchronous_resource_ref<cuda::mr::host_accessible, cuda::mr::device_accessible>#
Alias for a cuda::mr::synchronous_resource_ref with the properties cuda::mr::host_accessible and cuda::mr::device_accessible.
-
using host_device_async_resource_ref = cuda::mr::resource_ref<cuda::mr::host_accessible, cuda::mr::device_accessible>#
Alias for a cuda::mr::resource_ref with the properties cuda::mr::host_accessible and cuda::mr::device_accessible.
-
template<class Resource, class = void>
constexpr bool is_resource_adaptor = false#
Trait to check whether a resource is a resource adaptor by checking for get_upstream_resource.
- Deprecated:
This trait will be removed in a future release.
-
inline device_async_resource_ref get_per_device_resource_ref(cuda_device_id device_id)#
Get the device_async_resource_ref for the specified device.
Returns a device_async_resource_ref for the specified device. The initial resource_ref references a cuda_memory_resource.
device_id.value() must be in the range [0, cudaGetDeviceCount()), otherwise behavior is undefined.
This function is thread-safe with respect to concurrent calls to set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The returned device_async_resource_ref should only be used when CUDA device device_id is the current device (e.g. set using cudaSetDevice()). The behavior of a device_async_resource_ref is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
- Parameters:
device_id – The id of the target device
- Returns:
The current device_async_resource_ref for device device_id
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> set_per_device_resource(cuda_device_id device_id, cuda::mr::any_resource<cuda::mr::device_accessible> new_resource)#
Set the memory resource for the specified device.
Takes ownership of the provided resource by value. The resource is moved into the per-device resource map.
device_id.value() must be in the range [0, cudaGetDeviceCount()), otherwise behavior is undefined.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The resource passed in new_resource must have been created when device device_id was the current CUDA device (e.g. set using cudaSetDevice()). The behavior of a memory resource is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
Note
The per-device resource map keeps the provided resource alive until process exit. Its destructor may therefore run during process termination. If the destructor may call CUDA APIs, it must consult rmm::process_is_exiting() and skip those calls when it returns true.
- Parameters:
device_id – The id of the target device
new_resource – New resource to use for device_id
- Returns:
An owning any_resource holding the previous resource for device_id
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> set_per_device_resource_ref(cuda_device_id device_id, device_async_resource_ref new_resource_ref)#
Set the device_async_resource_ref for the specified device to new_resource_ref.
- Deprecated:
Use set_per_device_resource instead.
device_id.value() must be in the range [0, cudaGetDeviceCount()), otherwise behavior is undefined.
The referenced resource is copied into an owning any_resource and moved into the per-device resource map.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The resource passed in new_resource_ref must have been created when device device_id was the current CUDA device (e.g. set using cudaSetDevice()). The behavior of a device_async_resource_ref is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
Note
The per-device resource map keeps the underlying resource alive until process exit. Its destructor may therefore run during process termination. If it may call CUDA APIs, it must consult rmm::process_is_exiting() and skip those calls when it returns true.
- Parameters:
device_id – The id of the target device
new_resource_ref – new device_async_resource_ref to use as new resource for device_id
- Returns:
An owning any_resource holding the previous resource for device_id
-
inline device_async_resource_ref get_current_device_resource_ref()#
Get the device_async_resource_ref for the current device.
Returns the device_async_resource_ref set for the current device. The initial resource_ref references a cuda_memory_resource.
The “current device” is the device returned by cudaGetDevice.
This function is thread-safe with respect to concurrent calls to set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The returned device_async_resource_ref should only be used with the current CUDA device. Changing the current device (e.g. using cudaSetDevice()) and then using the returned resource_ref can result in undefined behavior. The behavior of a device_async_resource_ref is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
- Returns:
device_async_resource_ref active for the current device
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> set_current_device_resource(cuda::mr::any_resource<cuda::mr::device_accessible> new_resource)#
Set the memory resource for the current device.
Takes ownership of the provided resource by value. The “current device” is the device returned by cudaGetDevice.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The resource passed in new_resource must have been created for the current CUDA device. The behavior of a memory resource is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
Note
The per-device resource map keeps the provided resource alive until process exit. Its destructor may therefore run during process termination. If the destructor may call CUDA APIs, it must consult rmm::process_is_exiting() and skip those calls when it returns true.
- Parameters:
new_resource – New resource to use for the current device
- Returns:
An owning any_resource holding the previous resource for the current device
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> set_current_device_resource_ref(device_async_resource_ref new_resource_ref)#
Set the device_async_resource_ref for the current device.
- Deprecated:
Use set_current_device_resource instead.
The “current device” is the device returned by cudaGetDevice.
The referenced resource is copied into an owning any_resource and moved into the per-device resource map.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
Note
The resource passed in new_resource_ref must have been created for the current CUDA device. The behavior of a device_async_resource_ref is undefined if used while the active CUDA device is a different device from the one that was active when the memory resource was created.
Note
The per-device resource map keeps the underlying resource alive until process exit. Its destructor may therefore run during process termination. If it may call CUDA APIs, it must consult rmm::process_is_exiting() and skip those calls when it returns true.
- Parameters:
new_resource_ref – New device_async_resource_ref to use for the current device
- Returns:
An owning any_resource holding the previous resource for the current device
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> reset_per_device_resource(cuda_device_id device_id)#
Reset the memory resource for the specified device to the initial resource.
Resets to the initial cuda_memory_resource.
device_id.value() must be in the range [0, cudaGetDeviceCount()), otherwise behavior is undefined.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
- Parameters:
device_id – The id of the target device
- Returns:
An owning any_resource holding the previous resource for device_id
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> reset_current_device_resource()#
Reset the memory resource for the current device to the initial resource.
Resets to the initial cuda_memory_resource. The “current device” is the device returned by cudaGetDevice.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
- Returns:
An owning any_resource holding the previous resource for the current device
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> reset_per_device_resource_ref(cuda_device_id device_id)#
Reset the device_async_resource_ref for the specified device to the initial resource.
- Deprecated:
Use reset_per_device_resource instead.
Resets to a reference to the initial cuda_memory_resource.
device_id.value() must be in the range [0, cudaGetDeviceCount()), otherwise behavior is undefined.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
- Parameters:
device_id – The id of the target device
- Returns:
An owning any_resource holding the previous resource for device_id
-
inline cuda::mr::any_resource<cuda::mr::device_accessible> reset_current_device_resource_ref()#
Reset the device_async_resource_ref for the current device to the initial resource.
- Deprecated:
Use reset_current_device_resource instead.
Resets to a reference to the initial cuda_memory_resource. The “current device” is the device returned by cudaGetDevice.
This function is thread-safe with respect to concurrent calls to set_per_device_resource, set_per_device_resource_ref, get_per_device_resource_ref, get_current_device_resource_ref, set_current_device_resource, set_current_device_resource_ref and reset_current_device_resource_ref. Concurrent calls to any of these functions will result in a valid state, but the order of execution is undefined.
- Returns:
An owning any_resource holding the previous resource for the current device
-
template<typename T, typename U>
bool operator==(polymorphic_allocator<T> const &lhs, polymorphic_allocator<U> const &rhs)#
Compare two polymorphic_allocators for equality.
Two polymorphic_allocators are equal if their underlying memory resources compare equal.
- Template Parameters:
T – Type of the first allocator
U – Type of the second allocator
- Parameters:
lhs – The first allocator to compare
rhs – The second allocator to compare
- Returns:
true if the two allocators are equal, false otherwise
-
template<typename T, typename U>
bool operator!=(polymorphic_allocator<T> const &lhs, polymorphic_allocator<U> const &rhs)#
Compare two polymorphic_allocators for inequality.
Two polymorphic_allocators are not equal if their underlying memory resources compare not equal.
- Template Parameters:
T – Type of the first allocator
U – Type of the second allocator
- Parameters:
lhs – The first allocator to compare
rhs – The second allocator to compare
- Returns:
true if the two allocators are not equal, false otherwise
-
template<typename A, typename O>
bool operator==(stream_allocator_adaptor<A> const &lhs, stream_allocator_adaptor<O> const &rhs)#
Compare two stream_allocator_adaptors for equality.
Two stream_allocator_adaptors are equal if their underlying allocators compare equal.
- Template Parameters:
A – Type of the first allocator
O – Type of the second allocator
- Parameters:
lhs – The first allocator to compare
rhs – The second allocator to compare
- Returns:
true if the two allocators are equal, false otherwise
-
template<typename A, typename O>
bool operator!=(stream_allocator_adaptor<A> const &lhs, stream_allocator_adaptor<O> const &rhs)#
Compare two stream_allocator_adaptors for inequality.
Two stream_allocator_adaptors are not equal if their underlying allocators compare not equal.
- Template Parameters:
A – Type of the first allocator
O – Type of the second allocator
- Parameters:
lhs – The first allocator to compare
rhs – The second allocator to compare
- Returns:
true if the two allocators are not equal, false otherwise
-
bool process_is_exiting() noexcept#
Returns true if the process has entered exit() / atexit handler execution.
Destructors of static objects, as well as atexit handlers registered by other DSOs, run during process termination after main() has returned. At that point calling into the CUDA runtime or driver is undefined behavior: the primary context may already be destroyed, and CUDA API calls may dereference released state and crash inside libcuda rather than returning an error.
Use this function from a memory resource destructor (or a helper invoked by a destructor, such as a release() method) when the resource may be held in RMM’s internal per-device resource map and destroyed during process termination. In that case the destructor may run after the CUDA primary context has been destroyed, and calling into the CUDA runtime is undefined behavior. Destructors can avoid that by:
- Never calling CUDA APIs from the destructor at all, or
- Consulting rmm::process_is_exiting() in the destructor (and in any helper invoked by the destructor, such as a release() method) and skipping CUDA API calls when it returns true. In that case, resources that would have been explicitly released should be leaked; the OS reclaims them when the process exits.
Storing RMM objects with static or thread-local scope is unsupported. Users should not create their own static containers of RMM objects and rely on rmm::process_is_exiting() to make those destructors safe.
Calling rmm::process_is_exiting() from a resource destructor is always safe: it performs a single atomic load (acquire semantics) and never calls into CUDA.
Example:
class my_resource final : public ... {
  ~my_resource() override {
    if (rmm::process_is_exiting()) { return; }
    RMM_ASSERT_CUDA_SUCCESS_SAFE_SHUTDOWN(cudaFree(ptr_));
  }
};
- Returns:
true if exit() has begun; false otherwise.
-
template<class Resource>
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)#
Convert pointer to memory resource into device_async_resource_ref, checking for nullptr.
- Template Parameters:
Resource – The type of the memory resource.
- Parameters:
res – A pointer to the memory resource.
- Throws:
std::logic_error – if the memory resource pointer is null.
- Returns:
A device_async_resource_ref to the memory resource.
-
class arena_memory_resource : public cuda::mr::shared_resource<detail::arena_memory_resource_impl>#
- #include <arena_memory_resource.hpp>
A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
Allocation and deallocation are thread-safe. Also, this class is compatible with CUDA per-thread default stream.
GPU memory is divided into a global arena, per-thread arenas for default streams, and per-stream arenas for non-default streams. Each arena allocates memory from the global arena in chunks called superblocks.
Blocks in each arena are allocated using address-ordered first fit. When a block is freed, it is coalesced with neighbouring free blocks if the addresses are contiguous. Free superblocks are returned to the global arena.
In real-world applications, allocation sizes tend to follow a power law distribution in which large allocations are rare, but small ones quite common. By handling small allocations in the per-thread arena, adequate performance can be achieved without introducing excessive memory fragmentation under high concurrency.
This design is inspired by several existing CPU memory allocators targeting multi-threaded applications (glibc malloc, Hoard, jemalloc, TCMalloc), albeit in a simpler form. Possible future improvements include using size classes, allocation caches, and more fine-grained locking or lock-free approaches.
This class is copyable and shares ownership of its internal state via cuda::mr::shared_resource.
See also
Wilson, P. R., Johnstone, M. S., Neely, M., & Boles, D. (1995, September). Dynamic storage allocation: A survey and critical review. In International Workshop on Memory Management (pp. 1-116). Springer, Berlin, Heidelberg.
See also
Berger, E. D., McKinley, K. S., Blumofe, R. D., & Wilson, P. R. (2000). Hoard: A scalable memory allocator for multithreaded applications. ACM Sigplan Notices, 35(11), 117-128.
See also
Evans, J. (2006, April). A scalable concurrent malloc (3) implementation for FreeBSD. In Proc. of the bsdcan conference, ottawa, canada.
See also
See also
See also
Public Functions
-
explicit arena_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::optional<std::size_t> arena_size = std::nullopt, bool dump_log_on_failure = false)#
Construct an arena_memory_resource.
- Parameters:
upstream – The resource from which to allocate blocks for the global arena.
arena_size – Size in bytes of the global arena. Defaults to half of the available memory on the current device.
dump_log_on_failure – If true, dump memory log when running out of memory.
-
~arena_memory_resource() = default#
Friends
-
inline friend void get_property(arena_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the cuda::mr::device_accessible property.
-
explicit arena_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::optional<std::size_t> arena_size = std::nullopt, bool dump_log_on_failure = false)#
-
class binning_memory_resource : public cuda::mr::shared_resource<detail::binning_memory_resource_impl>#
- #include <binning_memory_resource.hpp>
Allocates memory from upstream resources associated with bin sizes.
This class is copyable and shares ownership of its internal state, allowing multiple instances to safely reference the same underlying bins.
Public Functions
-
explicit binning_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream)#
Construct a new binning memory resource object.
Initially has no bins, so simply uses the upstream resource until bin resources are added with add_bin.
- Parameters:
upstream – The resource used to allocate bin pools.
-
binning_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, int8_t min_size_exponent, int8_t max_size_exponent)#
Construct a new binning memory resource object with a range of initial bins.
Constructs a new binning memory resource and adds bins backed by fixed_size_memory_resource in the range [2^min_size_exponent, 2^max_size_exponent]. For example, if min_size_exponent==18 and max_size_exponent==22, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and 4096KiB.
- Parameters:
upstream – The resource used to allocate bin pools.
min_size_exponent – The minimum base-2 exponent bin size.
max_size_exponent – The maximum base-2 exponent bin size.
-
~binning_memory_resource() = default#
-
device_async_resource_ref get_upstream_resource() const noexcept#
device_async_resource_ref to the upstream resource
- Returns:
device_async_resource_ref to the upstream resource
-
void add_bin(std::size_t allocation_size, std::optional<device_async_resource_ref> bin_resource = std::nullopt)#
Add a bin allocator to this resource.
Adds bin_resource if provided; otherwise constructs and adds a fixed_size_memory_resource.
This bin will be used for any allocation smaller than allocation_size that is larger than the next smaller bin’s allocation size.
If there is already a bin of the specified size, nothing is changed.
This function is not thread safe.
- Parameters:
allocation_size – The maximum size that this bin allocates
bin_resource – The memory resource for the bin
Friends
-
inline friend void get_property(binning_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the cuda::mr::device_accessible property.
This property declares that a binning_memory_resource provides device accessible memory.
-
explicit binning_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream)#
-
class callback_memory_resource : public cuda::mr::shared_resource<detail::callback_memory_resource_impl>#
- #include <callback_memory_resource.hpp>
A device memory resource that uses the provided callbacks for memory allocation and deallocation.
This class is copyable and shares ownership of its internal state via cuda::mr::shared_resource.
Public Functions
-
callback_memory_resource(allocate_callback_t allocate_callback, deallocate_callback_t deallocate_callback, void *allocate_callback_arg = nullptr, void *deallocate_callback_arg = nullptr)#
Construct a new callback memory resource.
Constructs a callback memory resource that uses the user-provided callbacks allocate_callback for allocation and deallocate_callback for deallocation.
- Parameters:
allocate_callback – The callback function used for allocation
deallocate_callback – The callback function used for deallocation
allocate_callback_arg – Additional context passed to allocate_callback. It is the caller’s responsibility to maintain the lifetime of the pointed-to data for the duration of the lifetime of the callback_memory_resource.
deallocate_callback_arg – Additional context passed to deallocate_callback. It is the caller’s responsibility to maintain the lifetime of the pointed-to data for the duration of the lifetime of the callback_memory_resource.
-
callback_memory_resource() = delete#
-
~callback_memory_resource() = default#
Friends
-
inline friend void get_property(callback_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the cuda::mr::device_accessible property.
-
callback_memory_resource(allocate_callback_t allocate_callback, deallocate_callback_t deallocate_callback, void *allocate_callback_arg = nullptr, void *deallocate_callback_arg = nullptr)#
-
class cuda_async_managed_memory_resource : public cuda::mr::shared_resource<detail::cuda_async_managed_memory_resource_impl>#
- #include <cuda_async_managed_memory_resource.hpp>
Memory resource that uses cudaMallocFromPoolAsync/cudaFreeFromPoolAsync with a managed memory pool for allocation/deallocation.
Public Functions
-
cuda_async_managed_memory_resource()#
Constructs a cuda_async_managed_memory_resource with the default managed memory pool for the current device.
The default managed memory pool is the pool that is created when the device is created. Pool properties such as the release threshold are not modified.
- Throws:
rmm::logic_error – if the CUDA version does not support cudaMallocFromPoolAsync with managed memory pool
-
cudaMemPool_t pool_handle() const noexcept#
Returns the underlying native handle to the CUDA pool.
- Returns:
cudaMemPool_t Handle to the underlying CUDA pool
-
~cuda_async_managed_memory_resource() = default#
-
cuda_async_managed_memory_resource(cuda_async_managed_memory_resource const&) = default#
Default copy constructor.
-
cuda_async_managed_memory_resource(cuda_async_managed_memory_resource&&) = default#
Default move constructor.
-
cuda_async_managed_memory_resource &operator=(cuda_async_managed_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
cuda_async_managed_memory_resource& Reference to the assigned object
-
cuda_async_managed_memory_resource &operator=(cuda_async_managed_memory_resource&&) = default#
Default move assignment operator.
- Returns:
cuda_async_managed_memory_resource& Reference to the assigned object
Friends
-
inline friend void get_property(cuda_async_managed_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the cuda::mr::device_accessible property.
-
inline friend void get_property(cuda_async_managed_memory_resource const&, cuda::mr::host_accessible) noexcept#
Enables the cuda::mr::host_accessible property.
-
cuda_async_managed_memory_resource()#
-
class cuda_async_memory_resource : public cuda::mr::shared_resource<detail::cuda_async_memory_resource_impl>#
- #include <cuda_async_memory_resource.hpp>
Memory resource that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocation.
Public Types
-
enum class allocation_handle_type : std::int32_t#
Flags for specifying memory allocation handle types.
Note
These values are exact copies from cudaMemAllocationHandleType. We need a placeholder that can be used consistently in the constructor of cuda_async_memory_resource with all supported versions of CUDA. See the cudaMemAllocationHandleType docs at https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html and ensure the enum values are kept in sync with the CUDA documentation.
Note
cudaMemHandleTypeFabric can be used instead of 0x8 once we require CUDA 12.4+.
Values:
-
enumerator none#
Does not allow any export mechanism.
-
enumerator posix_file_descriptor#
Allows a file descriptor to be used for exporting. Permitted only on POSIX systems.
-
enumerator win32#
Allows a Win32 NT handle to be used for exporting. (HANDLE)
-
enumerator win32_kmt#
Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
-
enumerator fabric#
Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)
-
enumerator none#
-
enum class mempool_usage : unsigned short#
Flags for specifying memory pool usage.
Note
These values are exact copies from the runtime API. See the cudaMemPoolProps docs at https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaMemPoolProps.html and ensure the enum values are kept in sync with the CUDA documentation.
cudaMemPoolCreateUsageHwDecompress is currently the only supported usage flag, introduced in CUDA 12.8 and documented in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
Values:
-
enumerator hw_decompress#
If set indicates that the memory can be used as a buffer for hardware accelerated decompression.
-
enumerator hw_decompress#
Public Functions
-
cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size = {}, std::optional<std::size_t> release_threshold = {}, std::optional<allocation_handle_type> export_handle_type = {})#
Constructs a cuda_async_memory_resource with the optionally specified initial pool size and release threshold.
If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next synchronization event.
- Throws:
rmm::logic_error – if the CUDA version does not support cudaMallocAsync
- Parameters:
initial_pool_size – Optional initial size in bytes of the pool. If provided, the pool will be primed by allocating and immediately deallocating this amount of memory on the default CUDA stream.
release_threshold – Optional release threshold size in bytes of the pool. If no value is provided, the release threshold is set to the maximum value of std::uint64_t, so that the pool retains memory across synchronization events unless the caller specifies otherwise.
export_handle_type – Optional cudaMemAllocationHandleType that allocations from this resource should support for interprocess communication (IPC). Default is cudaMemHandleTypeNone for no IPC support.
-
cudaMemPool_t pool_handle() const noexcept#
Returns the underlying native handle to the CUDA pool.
- Returns:
cudaMemPool_t Handle to the underlying CUDA pool
-
~cuda_async_memory_resource() = default#
-
cuda_async_memory_resource(cuda_async_memory_resource const&) = default#
Default copy constructor.
-
cuda_async_memory_resource(cuda_async_memory_resource&&) = default#
Default move constructor.
-
cuda_async_memory_resource &operator=(cuda_async_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
cuda_async_memory_resource& Reference to the assigned object
-
cuda_async_memory_resource &operator=(cuda_async_memory_resource&&) = default#
Default move assignment operator.
- Returns:
cuda_async_memory_resource& Reference to the assigned object
Friends
-
inline friend void get_property(cuda_async_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the cuda::mr::device_accessible property.
-
enum class allocation_handle_type : std::int32_t#
-
class cuda_async_view_memory_resource#
- #include <cuda_async_view_memory_resource.hpp>
Memory resource that uses
cudaMallocAsync/cudaFreeAsyncfor allocation/deallocation.Public Functions
-
inline cuda_async_view_memory_resource(cudaMemPool_t pool_handle)#
Constructs a cuda_async_view_memory_resource which uses an existing CUDA memory pool. The provided pool is not owned by cuda_async_view_memory_resource and must remain valid during the lifetime of the memory resource.
- Throws:
rmm::logic_error – if the CUDA version does not support
cudaMallocAsync- Parameters:
pool_handle – Handle to a CUDA memory pool which will be used to serve allocation requests.
-
inline cudaMemPool_t pool_handle() const noexcept#
Returns the underlying native handle to the CUDA pool.
- Returns:
cudaMemPool_t Handle to the underlying CUDA pool
-
cuda_async_view_memory_resource() = default#
-
~cuda_async_view_memory_resource() = default#
-
cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = default#
Default copy constructor.
-
cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) = default#
Default move constructor.
-
cuda_async_view_memory_resource &operator=(cuda_async_view_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
cuda_async_view_memory_resource& Reference to the assigned object
-
cuda_async_view_memory_resource &operator=(cuda_async_view_memory_resource&&) = default#
Default move assignment operator.
- Returns:
cuda_async_view_memory_resource& Reference to the assigned object
-
inline void *allocate(cuda::stream_ref stream, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytes.The returned pointer will have at minimum 256 byte alignment.
- Parameters:
stream – Stream on which to perform allocation
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate(cuda::stream_ref stream, void *ptr, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptr.- Parameters:
stream – Stream on which to perform deallocation
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation. This must be equal to the value of
bytesthat was passed to theallocatecall that returnedptr.alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline void *allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytessynchronously.- Parameters:
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate_sync(void *ptr, std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptrsynchronously.- Parameters:
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation
alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline bool operator==(cuda_async_view_memory_resource const &other) const noexcept#
Compare this resource to another.
- Parameters:
other – The other resource to compare to
- Returns:
true If the two resources are equivalent
- Returns:
false If the two resources are not equal
-
inline bool operator!=(cuda_async_view_memory_resource const &other) const noexcept#
Compare this resource to another.
- Parameters:
other – The other resource to compare to
- Returns:
true If the two resources are not equivalent
- Returns:
false If the two resources are equivalent
Friends
-
inline friend void get_property(cuda_async_view_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
cuda_async_view_memory_resourceprovides device accessible memory
-
inline cuda_async_view_memory_resource(cudaMemPool_t pool_handle)#
-
class cuda_memory_resource#
- #include <cuda_memory_resource.hpp>
Memory resource that uses cudaMalloc/Free for allocation/deallocation.
Public Functions
-
cuda_memory_resource() = default#
-
~cuda_memory_resource() = default#
-
cuda_memory_resource(cuda_memory_resource const&) = default#
Default copy constructor.
-
cuda_memory_resource(cuda_memory_resource&&) = default#
Default move constructor.
-
cuda_memory_resource &operator=(cuda_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
cuda_memory_resource& Reference to the assigned object
-
cuda_memory_resource &operator=(cuda_memory_resource&&) = default#
Default move assignment operator.
- Returns:
cuda_memory_resource& Reference to the assigned object
-
inline void *allocate([[maybe_unused]] cuda::stream_ref stream, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytes.The returned pointer will have at minimum 256 byte alignment.
The stream argument is ignored.
- Parameters:
stream – This argument is ignored
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate([[maybe_unused]] cuda::stream_ref stream, void *ptr, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptr.The stream argument is ignored.
- Parameters:
stream – This argument is ignored
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation. This must be equal to the value of
bytesthat was passed to theallocatecall that returnedptr.alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline void *allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytessynchronously.- Parameters:
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate_sync(void *ptr, std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptrsynchronously.- Parameters:
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation
alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline bool operator==(cuda_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of cuda_memory_resource are equivalent.
- Returns:
true Always
-
inline bool operator!=(cuda_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of cuda_memory_resource are equivalent.
- Returns:
false Always
Friends
-
inline friend void get_property(cuda_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
cuda_memory_resourceprovides device accessible memory
-
cuda_memory_resource() = default#
-
class fixed_size_memory_resource : public cuda::mr::shared_resource<detail::fixed_size_memory_resource_impl>#
- #include <fixed_size_memory_resource.hpp>
A memory resource which allocates memory blocks of a single fixed size.
Supports only allocations of size smaller than the configured block_size.
This class is copyable and shares ownership of its internal state, allowing multiple instances to safely reference the same underlying pool.
Public Functions
-
explicit fixed_size_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::size_t block_size = default_block_size, std::size_t blocks_to_preallocate = default_blocks_to_preallocate)#
Construct a new
fixed_size_memory_resourcethat allocates memory fromupstream.When the pool of blocks is all allocated, grows the pool by allocating
blocks_to_preallocatemore blocks fromupstream.- Parameters:
upstream – The resource from which to allocate blocks for the pool.
block_size – The size of blocks to allocate.
blocks_to_preallocate – The number of blocks to allocate to initialize the pool.
-
~fixed_size_memory_resource() = default#
-
device_async_resource_ref get_upstream_resource() const noexcept#
device_async_resource_ref to the upstream resource
- Returns:
device_async_resource_ref to the upstream resource
-
std::size_t get_block_size() const noexcept#
Get the size of blocks allocated by this memory resource.
- Returns:
std::size_t size in bytes of allocated blocks.
Public Static Attributes
-
static constexpr std::size_t default_block_size = 1 << 20#
Default allocation block size.
-
static constexpr std::size_t default_blocks_to_preallocate = 128#
The number of blocks that the pool starts out with, and also the number of blocks by which the pool grows when all of its current blocks are allocated
Friends
-
inline friend void get_property(fixed_size_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
fixed_size_memory_resourceprovides device accessible memory
-
explicit fixed_size_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::size_t block_size = default_block_size, std::size_t blocks_to_preallocate = default_blocks_to_preallocate)#
-
class managed_memory_resource#
- #include <managed_memory_resource.hpp>
Memory resource that uses cudaMallocManaged/Free for allocation/deallocation.
Public Functions
-
managed_memory_resource() = default#
-
~managed_memory_resource() = default#
-
managed_memory_resource(managed_memory_resource const&) = default#
Default copy constructor.
-
managed_memory_resource(managed_memory_resource&&) = default#
Default move constructor.
-
managed_memory_resource &operator=(managed_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
managed_memory_resource& Reference to the assigned object
-
managed_memory_resource &operator=(managed_memory_resource&&) = default#
Default move assignment operator.
- Returns:
managed_memory_resource& Reference to the assigned object
-
inline void *allocate([[maybe_unused]] cuda::stream_ref stream, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytes.The returned pointer will have at minimum 256 byte alignment.
The stream argument is ignored.
- Parameters:
stream – This argument is ignored
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate([[maybe_unused]] cuda::stream_ref stream, void *ptr, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptr.The stream argument is ignored.
- Parameters:
stream – This argument is ignored
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation. This must be equal to the value of
bytesthat was passed to theallocatecall that returnedptr.alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline void *allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytessynchronously.- Parameters:
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate_sync(void *ptr, std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptrsynchronously.- Parameters:
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation
alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline bool operator==(managed_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of managed_memory_resource are equivalent.
- Returns:
true Always
-
inline bool operator!=(managed_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of managed_memory_resource are equivalent.
- Returns:
false Always
Friends
-
inline friend void get_property(managed_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
managed_memory_resourceprovides device accessible memory
-
inline friend void get_property(managed_memory_resource const&, cuda::mr::host_accessible) noexcept#
Enables the
cuda::mr::host_accessibleproperty.This property declares that a
managed_memory_resourceprovides host accessible memory
-
managed_memory_resource() = default#
-
class pinned_host_memory_resource#
- #include <pinned_host_memory_resource.hpp>
Memory resource class for allocating pinned host memory.
This class uses CUDA’s cudaHostAlloc to allocate pinned host memory. It satisfies the cuda::mr::resource and cuda::mr::synchronous_resource concepts, and the cuda::mr::host_accessible and cuda::mr::device_accessible properties.
Public Functions
-
pinned_host_memory_resource() = default#
-
~pinned_host_memory_resource() = default#
-
pinned_host_memory_resource(pinned_host_memory_resource const&) = default#
Default copy constructor.
-
pinned_host_memory_resource(pinned_host_memory_resource&&) = default#
Default move constructor.
-
pinned_host_memory_resource &operator=(pinned_host_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
pinned_host_memory_resource& Reference to the assigned object
-
pinned_host_memory_resource &operator=(pinned_host_memory_resource&&) = default#
Default move assignment operator.
- Returns:
pinned_host_memory_resource& Reference to the assigned object
-
inline void *allocate([[maybe_unused]] cuda::stream_ref stream, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates pinned host memory of size at least
bytesbytes.The stream argument is ignored.
- Throws:
rmm::out_of_memory – if the requested allocation could not be fulfilled due to a CUDA out of memory error.
rmm::bad_alloc – if the requested allocation could not be fulfilled due to any other reason.
- Parameters:
stream – CUDA stream on which to perform the allocation (ignored).
bytes – The size, in bytes, of the allocation.
alignment – The alignment of the allocation
- Returns:
Pointer to the newly allocated memory.
-
inline void deallocate([[maybe_unused]] cuda::stream_ref stream, void *ptr, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptr.The stream argument is ignored.
- Parameters:
stream – This argument is ignored.
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation. This must be equal to the value of
bytesthat was passed to theallocatecall that returnedptr.alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline void *allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates pinned host memory of size at least
bytesbytes synchronously.- Parameters:
bytes – The size, in bytes, of the allocation.
alignment – The alignment of the allocation
- Returns:
Pointer to the newly allocated memory.
-
inline void deallocate_sync(void *ptr, std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptrsynchronously.- Parameters:
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation
alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline bool operator==(pinned_host_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of pinned_host_memory_resource are equivalent.
- Returns:
true Always
-
inline bool operator!=(pinned_host_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of pinned_host_memory_resource are equivalent.
- Returns:
false Always
Friends
-
inline friend void get_property(pinned_host_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
pinned_host_memory_resourceprovides device accessible memory
-
inline friend void get_property(pinned_host_memory_resource const&, cuda::mr::host_accessible) noexcept#
Enables the
cuda::mr::host_accessibleproperty.This property declares that a
pinned_host_memory_resourceprovides host accessible memory
-
pinned_host_memory_resource() = default#
-
template<typename T>
class polymorphic_allocator# - #include <polymorphic_allocator.hpp>
A stream-ordered Allocator using a device_async_resource_ref to satisfy (de)allocations.
Similar to std::pmr::polymorphic_allocator, uses the runtime polymorphism of type-erased resource refs to allow containers with polymorphic_allocator as their static allocator type to be interoperable, but exhibit different behavior depending on the resource used.
Unlike STL allocators, polymorphic_allocator’s allocate and deallocate functions are stream ordered. Use stream_allocator_adaptor to allow interoperability with interfaces that require standard, non-stream-ordered Allocator interfaces.
- Template Parameters:
T – The allocator's value type.
Public Functions
-
polymorphic_allocator() = default#
Construct a
polymorphic_allocatorusing the return value ofrmm::mr::get_current_device_resource_ref()as the underlying memory resource.
-
inline polymorphic_allocator(cuda::mr::any_resource<cuda::mr::device_accessible> mr)#
Construct a
polymorphic_allocatorusing the provided memory resource.This constructor provides an implicit conversion from
device_async_resource_ref.- Parameters:
mr – The upstream memory resource to use for allocation.
-
template<typename U>
inline polymorphic_allocator(polymorphic_allocator<U> const &other) noexcept# Construct a
polymorphic_allocatorusing the underlying memory resource ofother.- Parameters:
other – The
polymorphic_allocatorwhose memory resource will be used as the underlying resource of the newpolymorphic_allocator.
-
inline value_type *allocate(std::size_t num, cuda_stream_view stream)#
Allocates storage for
numobjects of typeTusing the underlying memory resource.- Parameters:
num – The number of objects to allocate storage for
stream – The stream on which to perform the allocation
- Returns:
Pointer to the allocated storage
-
inline void deallocate(value_type *ptr, std::size_t num, cuda_stream_view stream) noexcept#
Deallocates storage pointed to by ptr.
ptr must have been allocated from a memory resource r that compares equal to get_upstream_resource() using r.allocate(num * sizeof(T)).
- Parameters:
ptr – Pointer to memory to deallocate
num – Number of objects originally allocated
stream – Stream on which to perform the deallocation
-
inline rmm::device_async_resource_ref get_upstream_resource() const noexcept#
rmm::device_async_resource_ref to the upstream resource
- Returns:
rmm::device_async_resource_ref to the upstream resource
-
template<typename Allocator>
class stream_allocator_adaptor# - #include <polymorphic_allocator.hpp>
Adapts a stream-ordered allocator to provide a standard Allocator interface.
A stream-ordered allocator (i.e., allocate/deallocate use a cuda_stream_view) cannot be used in an interface that expects a standard C++ Allocator interface. stream_allocator_adaptor wraps a stream-ordered allocator and a stream to provide a standard Allocator interface. The adaptor uses the wrapped stream in calls to the underlying allocator’s allocate and deallocate functions.
Example:
my_stream_ordered_allocator<int> a{...};
cuda_stream_view s = // create stream;
auto adapted = stream_allocator_adaptor(a, s);
// Allocates storage for `n` ints on stream `s`
int * p = std::allocator_traits<decltype(adapted)>::allocate(adapted, n);
- Template Parameters:
Allocator – Stream ordered allocator type to adapt
Public Types
Public Functions
-
stream_allocator_adaptor() = delete#
-
inline stream_allocator_adaptor(Allocator const &allocator, cuda_stream_view stream)#
Construct a stream_allocator_adaptor using allocator as the underlying allocator.
Note
The stream must not be destroyed before the stream_allocator_adaptor, otherwise behavior is undefined.
- Parameters:
allocator – The stream ordered allocator to use as the underlying allocator
stream – The stream used with the underlying allocator
-
template<typename OtherAllocator>
inline stream_allocator_adaptor(stream_allocator_adaptor<OtherAllocator> const &other)# Construct a
stream_allocator_adaptorusingother.underlying_allocator()andother.stream()as the underlying allocator and stream.- Template Parameters:
OtherAllocator – Type of
other’s underlying allocator- Parameters:
other – The other
stream_allocator_adaptorwhose underlying allocator and stream will be copied
-
inline value_type *allocate(std::size_t num)#
Allocates storage for
numobjects of typeTusing the underlying allocator onstream().- Parameters:
num – The number of objects to allocate storage for
- Returns:
Pointer to the allocated storage
-
inline void deallocate(value_type *ptr, std::size_t num) noexcept#
Deallocates storage pointed to by ptr using the underlying allocator on stream().
ptr must have been allocated by an allocator a that compares equal to underlying_allocator() using a.allocate(num).
- Parameters:
ptr – Pointer to memory to deallocate
num – Number of objects originally allocated
-
inline cuda_stream_view stream() const noexcept#
The stream on which calls to the underlying allocator are made.
- Returns:
The stream on which calls to the underlying allocator are made
-
template<typename T>
struct rebind# - #include <polymorphic_allocator.hpp>
Rebinds the allocator to the specified type.
- Template Parameters:
T – The desired
value_typeof the rebound allocator type
Public Types
-
using other = stream_allocator_adaptor<typename std::allocator_traits<Allocator>::template rebind_alloc<T>>#
The type to bind to.
-
class pool_memory_resource : public cuda::mr::shared_resource<detail::pool_memory_resource_impl>#
- #include <pool_memory_resource.hpp>
A coalescing best-fit suballocator which uses a pool of memory allocated from an upstream memory_resource.
Allocation and deallocation are thread-safe. Also, this class is compatible with CUDA per-thread default stream.
This class is copyable and shares ownership of its internal state, allowing multiple instances to safely reference the same underlying pool.
Public Functions
-
explicit pool_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::size_t initial_pool_size, std::optional<std::size_t> maximum_pool_size = std::nullopt)#
Construct a
pool_memory_resourceand allocate the initial device memory pool usingupstream.- Throws:
rmm::logic_error – if
initial_pool_sizeis not aligned to a multiple of 256 bytes.rmm::logic_error – if
maximum_pool_sizeis neither the default nor aligned to a multiple of 256 bytes.
- Parameters:
upstream – The resource from which to allocate blocks for the pool.
initial_pool_size – Minimum size, in bytes, of the initial pool.
maximum_pool_size – Maximum size, in bytes, that the pool can grow to. Defaults to all of the available memory from the upstream resource.
-
device_async_resource_ref get_upstream_resource() const noexcept#
rmm::device_async_resource_ref to the upstream resource
- Returns:
rmm::device_async_resource_ref to the upstream resource
-
std::size_t pool_size() const noexcept#
Computes the size of the current pool.
Includes allocated as well as free memory.
- Returns:
std::size_t The total size of the currently allocated pool.
Friends
-
inline friend void get_property(pool_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
pool_memory_resourceprovides device accessible memory
-
explicit pool_memory_resource(cuda::mr::any_resource<cuda::mr::device_accessible> upstream, std::size_t initial_pool_size, std::optional<std::size_t> maximum_pool_size = std::nullopt)#
-
class sam_headroom_memory_resource : public cuda::mr::shared_resource<detail::sam_headroom_memory_resource_impl>#
- #include <sam_headroom_memory_resource.hpp>
Resource that uses system memory resource to allocate memory with a headroom.
System allocated memory (SAM) can be migrated to the GPU, but is never migrated back to the host. If GPU memory is over-subscribed, this can cause other CUDA calls to fail with out-of-memory errors. To work around this problem, when using a system memory resource, we reserve some GPU memory as headroom for other CUDA calls, and only conditionally set its preferred location to the GPU if the allocation would not eat into the headroom.
Since doing this check on every allocation can be expensive, the caller may choose to use other allocators (e.g.
binning_memory_resource) for small allocations, and use this allocator for large allocations only.Public Functions
-
explicit sam_headroom_memory_resource(std::size_t headroom)#
Construct a headroom memory resource.
- Parameters:
headroom – Size of the reserved GPU memory as headroom
-
sam_headroom_memory_resource() = delete#
-
~sam_headroom_memory_resource() = default#
-
sam_headroom_memory_resource(sam_headroom_memory_resource const&) = default#
Default copy constructor.
-
sam_headroom_memory_resource(sam_headroom_memory_resource&&) = default#
Default move constructor.
-
sam_headroom_memory_resource &operator=(sam_headroom_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
sam_headroom_memory_resource& Reference to the assigned object
-
sam_headroom_memory_resource &operator=(sam_headroom_memory_resource&&) = default#
Default move assignment operator.
- Returns:
sam_headroom_memory_resource& Reference to the assigned object
Friends
-
inline friend void get_property(sam_headroom_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.
-
inline friend void get_property(sam_headroom_memory_resource const&, cuda::mr::host_accessible) noexcept#
Enables the
cuda::mr::host_accessibleproperty.
-
explicit sam_headroom_memory_resource(std::size_t headroom)#
-
class system_memory_resource#
- #include <system_memory_resource.hpp>
Memory resource that uses malloc/free for allocation/deallocation.
There are two flavors of hardware/software environments that support accessing system allocated memory (SAM) from the GPU: HMM and ATS.
Heterogeneous Memory Management (HMM) is a software-based solution for PCIe-connected GPUs on x86 systems. Requirements:
NVIDIA CUDA 12.2 with the open-source r535_00 driver or newer.
A sufficiently recent Linux kernel: 6.1.24+, 6.2.11+, or 6.3+.
A GPU with one of the following supported architectures: NVIDIA Turing, NVIDIA Ampere, NVIDIA Ada Lovelace, NVIDIA Hopper, or newer.
A 64-bit x86 CPU.
For more information, see https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/.
Address Translation Services (ATS) is a hardware/software solution for the Grace Hopper Superchip that uses the NVLink Chip-2-Chip (C2C) interconnect to provide coherent memory. For more information, see https://developer.nvidia.com/blog/nvidia-grace-hopper-superchip-architecture-in-depth/.
Public Functions
-
inline system_memory_resource()#
-
~system_memory_resource() = default#
-
system_memory_resource(system_memory_resource const&) = default#
Default copy constructor.
-
system_memory_resource(system_memory_resource&&) = default#
Default move constructor.
-
system_memory_resource &operator=(system_memory_resource const&) = default#
Default copy assignment operator.
- Returns:
system_memory_resource& Reference to the assigned object
-
system_memory_resource &operator=(system_memory_resource&&) = default#
Default move assignment operator.
- Returns:
system_memory_resource& Reference to the assigned object
-
inline void *allocate([[maybe_unused]] cuda::stream_ref stream, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytes.The returned pointer will have at minimum 256 byte alignment.
The stream argument is ignored.
- Parameters:
stream – This argument is ignored
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate(cuda::stream_ref stream, void *ptr, std::size_t bytes, [[maybe_unused]] std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptr.This function synchronizes the stream before deallocating the memory.
- Parameters:
stream – The stream in which to order this deallocation
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation. This must be equal to the value of
bytesthat was passed to theallocatecall that returnedptr.alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline void *allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)#
Allocates memory of size at least
bytessynchronously.- Parameters:
bytes – The size of the allocation
alignment – The alignment of the allocation
- Returns:
void* Pointer to the newly allocated memory
-
inline void deallocate_sync(void *ptr, std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept#
Deallocate memory pointed to by
ptrsynchronously.- Parameters:
ptr – Pointer to be deallocated
bytes – The size in bytes of the allocation
alignment – The alignment that was passed to the
allocatecall that returnedptr
-
inline bool operator==(system_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of system_memory_resource are equivalent.
- Returns:
true Always
-
inline bool operator!=(system_memory_resource const&) const noexcept#
Compare this resource to another.
All instances of system_memory_resource are equivalent.
- Returns:
false Always
Friends
-
inline friend void get_property(system_memory_resource const&, cuda::mr::device_accessible) noexcept#
Enables the
cuda::mr::device_accessibleproperty.This property declares that a
system_memory_resourceprovides device-accessible memory
-
inline friend void get_property(system_memory_resource const&, cuda::mr::host_accessible) noexcept#
Enables the
cuda::mr::host_accessibleproperty.This property declares that a
system_memory_resourceprovides host-accessible memory