RMM: cuda_async_memory_resource.hpp Source File

 /*

  * Copyright (c) 2021-2022, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <rmm/cuda_device.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/cuda_util.hpp>

 #include <rmm/detail/dynamic_load_runtime.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/mr/device/cuda_async_view_memory_resource.hpp>

 #include <rmm/mr/device/device_memory_resource.hpp>


 #include <rmm/detail/thrust_namespace.h>

 #include <thrust/optional.h>


 #include <cuda_runtime_api.h>


 #include <cstddef>

 #include <limits>


 #if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync

 #ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC

 #define RMM_CUDA_MALLOC_ASYNC_SUPPORT

 #endif

 #endif


 namespace rmm::mr {

 class cuda_async_memory_resource final : public device_memory_resource {

  public:

   enum class allocation_handle_type {

     none                  = 0x0,

     posix_file_descriptor = 0x1,

     win32     = 0x2,

     win32_kmt = 0x4

   };


   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)

   cuda_async_memory_resource(thrust::optional<std::size_t> initial_pool_size             = {},

                              thrust::optional<std::size_t> release_threshold             = {},

                              thrust::optional<allocation_handle_type> export_handle_type = {})

   {

 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

     // Check if cudaMallocAsync Memory pool supported

     RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),

                 "cudaMallocAsync not supported with this CUDA driver/runtime version");


     // Construct explicit pool

     cudaMemPoolProps pool_props{};

     pool_props.allocType   = cudaMemAllocationTypePinned;

     pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(

       export_handle_type.value_or(allocation_handle_type::none));

     RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),

                 "Requested IPC memory handle type not supported");

     pool_props.location.type = cudaMemLocationTypeDevice;

     pool_props.location.id   = rmm::get_current_cuda_device().value();

     cudaMemPool_t cuda_pool_handle{};

     RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));

     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};


     // CUDA drivers before 11.5 have known incompatibilities with the async allocator.

     // We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5.

     // See https://github.com/NVIDIA/spark-rapids/issues/4710.

     int driver_version{};

     RMM_CUDA_TRY(cudaDriverGetVersion(&driver_version));

     constexpr auto min_async_version{11050};

     if (driver_version < min_async_version) {

       int disabled{0};

       RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(

         pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));

     }


     auto const [free, total] = rmm::detail::available_device_memory();


     // Need an l-value to take address to pass to cudaMemPoolSetAttribute

     uint64_t threshold = release_threshold.value_or(total);

     RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(

       pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));


     // Allocate and immediately deallocate the initial_pool_size to prime the pool with the

     // specified size

     auto const pool_size = initial_pool_size.value_or(free / 2);

     auto* ptr            = do_allocate(pool_size, cuda_stream_default);

     do_deallocate(ptr, pool_size, cuda_stream_default);

 #else

     RMM_FAIL(

       "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build");

 #endif

   }


 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }

 #endif


   ~cuda_async_memory_resource() override

   {

 #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)

     RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle()));

 #endif

   }

   cuda_async_memory_resource(cuda_async_memory_resource const&)            = delete;

   cuda_async_memory_resource(cuda_async_memory_resource&&)                 = delete;

   cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;

   cuda_async_memory_resource& operator=(cuda_async_memory_resource&&)      = delete;


   [[nodiscard]] bool supports_streams() const noexcept override { return true; }


   [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }


  private:

 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

   cuda_async_view_memory_resource pool_{};

 #endif


   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override

   {

     void* ptr{nullptr};

 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

     ptr = pool_.allocate(bytes, stream);

 #else

     (void)bytes;

     (void)stream;

 #endif

     return ptr;

   }


   void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override

   {

 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

     pool_.deallocate(ptr, bytes, stream);

 #else

     (void)ptr;

     (void)bytes;

     (void)stream;

 #endif

   }


   [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override

   {

     auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);

 #ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT

     return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());

 #else

     return async_mr != nullptr;

 #endif

   }


   [[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info(

     rmm::cuda_stream_view) const override

   {

     return std::make_pair(0, 0);

   }

 };

   // end of group

 }  // namespace rmm::mr

rmm::cuda_stream_view
Strongly-typed non-owning wrapper for CUDA streams with default constructor.
Definition: cuda_stream_view.hpp:41

rmm::mr::cuda_async_memory_resource
device_memory_resource derived class that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocat...
Definition: cuda_async_memory_resource.hpp:51

rmm::mr::cuda_async_memory_resource::allocation_handle_type
allocation_handle_type
Flags for specifying memory allocation handle types.
Definition: cuda_async_memory_resource.hpp:63

rmm::mr::cuda_async_memory_resource::allocation_handle_type::none
@ none
Does not allow any export mechanism.

rmm::mr::cuda_async_memory_resource::allocation_handle_type::win32_kmt
@ win32_kmt
Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)

rmm::mr::cuda_async_memory_resource::allocation_handle_type::win32
@ win32
Allows a Win32 NT handle to be used for exporting. (HANDLE)

rmm::mr::cuda_async_memory_resource::allocation_handle_type::posix_file_descriptor
@ posix_file_descriptor

rmm::mr::cuda_async_memory_resource::supports_streams
bool supports_streams() const noexcept override
Query whether the resource supports use of non-null CUDA streams for allocation/deallocation....
Definition: cuda_async_memory_resource.hpp:166

rmm::mr::cuda_async_memory_resource::supports_get_mem_info
bool supports_get_mem_info() const noexcept override
Query whether the resource supports the get_mem_info API.
Definition: cuda_async_memory_resource.hpp:173

rmm::mr::cuda_async_memory_resource::cuda_async_memory_resource
cuda_async_memory_resource(thrust::optional< std::size_t > initial_pool_size={}, thrust::optional< std::size_t > release_threshold={}, thrust::optional< allocation_handle_type > export_handle_type={})
Constructs a cuda_async_memory_resource with the optionally specified initial pool size and release t...
Definition: cuda_async_memory_resource.hpp:89

rmm::mr::cuda_async_view_memory_resource
device_memory_resource derived class that uses cudaMallocAsync/cudaFreeAsync for allocation/deallocat...
Definition: cuda_async_view_memory_resource.hpp:48

rmm::mr::device_memory_resource
Base class for all libcudf device memory allocation.
Definition: device_memory_resource.hpp:89

rmm::mr::device_memory_resource::allocate
void * allocate(std::size_t bytes, cuda_stream_view stream=cuda_stream_view{})
Allocates memory of size at least bytes.
Definition: device_memory_resource.hpp:116

cuda_async_view_memory_resource.hpp

cuda_device.hpp

cuda_stream_view.hpp

device_memory_resource.hpp

rmm::get_current_cuda_device
cuda_device_id get_current_cuda_device()
Returns a cuda_device_id for the current device.
Definition: cuda_device.hpp:86

rmm::cuda_device_id::value
constexpr value_type value() const noexcept
The wrapped integer value.
Definition: cuda_device.hpp:44