cuda_async_memory_resource.hpp
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/cuda_async_view_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <thrust/optional.h>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <limits>

#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
#ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC
#define RMM_CUDA_MALLOC_ASYNC_SUPPORT
#endif
#endif

namespace rmm::mr {

/**
 * @brief `device_memory_resource` derived class that uses `cudaMallocAsync`/`cudaFreeAsync` for
 * allocation/deallocation.
 */
class cuda_async_memory_resource final : public device_memory_resource {
 public:
  /**
   * @brief Flags for specifying memory allocation handle types.
   */
  enum class allocation_handle_type {
    none                  = 0x0,  ///< Does not allow any export mechanism.
    posix_file_descriptor = 0x1,  ///< Allows a file descriptor to be used for exporting.
    win32                 = 0x2,  ///< Allows a Win32 NT handle to be used for exporting. (HANDLE)
    win32_kmt             = 0x4   ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
  };

  /**
   * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool
   * size and release threshold.
   */
  // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
  cuda_async_memory_resource(thrust::optional<std::size_t> initial_pool_size             = {},
                             thrust::optional<std::size_t> release_threshold             = {},
                             thrust::optional<allocation_handle_type> export_handle_type = {})
  {
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
    // Check if cudaMallocAsync Memory pool supported
    RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
                "cudaMallocAsync not supported with this CUDA driver/runtime version");

    // Construct explicit pool
    cudaMemPoolProps pool_props{};
    pool_props.allocType   = cudaMemAllocationTypePinned;
    pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(
      export_handle_type.value_or(allocation_handle_type::none));
    RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
                "Requested IPC memory handle type not supported");
    pool_props.location.type = cudaMemLocationTypeDevice;
    pool_props.location.id   = rmm::detail::current_device().value();
    cudaMemPool_t cuda_pool_handle{};
    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
    pool_ = cuda_async_view_memory_resource{cuda_pool_handle};

    // CUDA drivers before 11.5 have known incompatibilities with the async allocator.
    // We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5.
    // See https://github.com/NVIDIA/spark-rapids/issues/4710.
    int driver_version{};
    RMM_CUDA_TRY(cudaDriverGetVersion(&driver_version));
    constexpr auto min_async_version{11050};
    if (driver_version < min_async_version) {
      int disabled{0};
      RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
        pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
    }

    auto const [free, total] = rmm::detail::available_device_memory();

    // Need an l-value to take address to pass to cudaMemPoolSetAttribute
    uint64_t threshold = release_threshold.value_or(total);
    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
      pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));

    // Allocate and immediately deallocate the initial_pool_size to prime the pool with the
    // specified size
    auto const pool_size = initial_pool_size.value_or(free / 2);
    auto* ptr            = do_allocate(pool_size, cuda_stream_default);
    do_deallocate(ptr, pool_size, cuda_stream_default);
#else
    RMM_FAIL(
      "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build");
#endif
  }

#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
#endif

  ~cuda_async_memory_resource() override
  {
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
    RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle()));
#endif
  }
  cuda_async_memory_resource(cuda_async_memory_resource const&)            = delete;
  cuda_async_memory_resource(cuda_async_memory_resource&&)                 = delete;
  cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
  cuda_async_memory_resource& operator=(cuda_async_memory_resource&&)      = delete;

  /**
   * @brief Query whether the resource supports use of non-null CUDA streams for
   * allocation/deallocation.
   */
  [[nodiscard]] bool supports_streams() const noexcept override { return true; }

  /**
   * @brief Query whether the resource supports the get_mem_info API.
   */
  [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }

 private:
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
  cuda_async_view_memory_resource pool_{};
#endif

  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
  {
    void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
    ptr = pool_.allocate(bytes, stream);
#else
    (void)bytes;
    (void)stream;
#endif
    return ptr;
  }

  void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view stream) override
  {
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
    pool_.deallocate(ptr, size, stream);
#else
    (void)ptr;
    (void)size;
    (void)stream;
#endif
  }

  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
  {
    auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
    return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
#else
    return async_mr != nullptr;
#endif
  }

  [[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info(
    rmm::cuda_stream_view) const override
  {
    return std::make_pair(0, 0);
  }
};

}  // namespace rmm::mr
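
A minimal usage sketch, separate from the header above: it constructs the resource with an explicit initial pool size and release threshold, optionally installs it as the current device resource via rmm::mr::set_current_device_resource (from <rmm/mr/device/per_device_resource.hpp>), and performs a stream-ordered allocation. The sizes and the file name are illustrative assumptions; it builds only against CUDA 11.2+ with a driver that supports cudaMallocAsync.

// usage_sketch.cpp -- illustrative only, not part of the header.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstddef>

int main()
{
  std::size_t const initial_size      = std::size_t{1} << 30;  // prime the pool with 1 GiB
  std::size_t const release_threshold = std::size_t{2} << 30;  // keep up to 2 GiB cached

  // The pool is created on the current CUDA device; memory held above the release
  // threshold is returned to the OS at the next synchronization event.
  rmm::mr::cuda_async_memory_resource mr{initial_size, release_threshold};

  // Optionally route default-resource allocations on this device through the pool.
  rmm::mr::set_current_device_resource(&mr);

  // Stream-ordered allocation and deallocation of 4 MiB on the default stream.
  void* ptr = mr.allocate(std::size_t{4} << 20, rmm::cuda_stream_default);
  mr.deallocate(ptr, std::size_t{4} << 20, rmm::cuda_stream_default);

  return 0;
}

Passing a third argument of allocation_handle_type (for example posix_file_descriptor) would additionally make the pool's allocations exportable for IPC, subject to is_export_handle_type_supported.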