utils.hpp
1 /*
2  * Copyright (c) 2021-2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
#include <chrono>
#include <cstddef>
#include <cstring>
#include <future>
#include <iostream>
#include <limits>
#include <map>
#include <optional>
#include <tuple>

#include <kvikio/error.hpp>
#include <kvikio/shim/cuda.hpp>
28 
29 namespace kvikio {
30 
// cuFile defines the page size to be 4 KiB
32 inline constexpr std::size_t page_size = 4096;
33 
34 [[nodiscard]] inline off_t convert_size2off(std::size_t x)
35 {
36  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
37  throw CUfileException("size_t argument too large to fit off_t");
38  }
39  return static_cast<off_t>(x);
40 }
41 
42 [[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x)
43 {
44  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
45  throw CUfileException("size_t argument too large to fit ssize_t");
46  }
47  return static_cast<ssize_t>(x);
48 }
49 
50 [[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr)
51 {
52  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
53  return reinterpret_cast<CUdeviceptr>(devPtr);
54 }
55 
64 inline bool is_host_memory(const void* ptr)
65 {
66  CUpointer_attribute attrs[1] = {
67  CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
68  };
69  CUmemorytype memtype{};
70  void* data[1] = {&memtype};
71  CUresult result =
72  cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
73 
74  // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED
75  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
76  CUDA_DRIVER_TRY(result);
77 
78  // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
79  // is unregistered host memory. This is undocumented but how the Runtime CUDA API
80  // does it to support `cudaMemoryTypeUnregistered`.
81  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
82 }
83 
90 [[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
91 {
92  int ret = 0;
93  CUDA_DRIVER_TRY(
94  cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
95  return ret;
96 }
97 
102  public:
103  CUdevice dev{};
104  CUcontext ctx{};
105 
106  CudaPrimaryContext(int device_ordinal)
107  {
108  CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
109  CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
110  }
111  CudaPrimaryContext(const CudaPrimaryContext&) = delete;
112  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
114  CudaPrimaryContext&& operator=(CudaPrimaryContext&&) = delete;
116  {
117  try {
118  CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
119  } catch (const CUfileException& e) {
120  std::cerr << e.what() << std::endl;
121  }
122  }
123 };
124 
133 [[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
134 {
135  static std::map<int, CudaPrimaryContext> _primary_contexts;
136  _primary_contexts.try_emplace(ordinal, ordinal);
137  return _primary_contexts.at(ordinal).ctx;
138 }
139 
146 [[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
147 {
148  CUcontext ctx = nullptr;
149  const CUresult err =
150  cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
151  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
152  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
153  return {};
154 }
155 
162 [[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
163 {
164  CUdeviceptr current_ctx_dev_ptr{};
165  const CUresult err = cudaAPI::instance().PointerGetAttribute(
166  &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
167  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
168  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
169  return false;
170 }
171 
188 [[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
189 {
190  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
191 
192  // First we check if a context has been associated with `devPtr`.
193  {
194  auto ctx = get_context_associated_pointer(dev_ptr);
195  if (ctx.has_value()) { return ctx.value(); }
196  }
197 
198  // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we
199  // return the current context.
200  {
201  CUcontext ctx = nullptr;
202  CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
203  if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
204  }
205 
206  // Finally, if we didn't find any usable context, we return the primary context of the
207  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
208  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
209 }
210 
215  private:
216  CUcontext _ctx;
217 
218  public:
219  PushAndPopContext(CUcontext ctx) : _ctx{ctx}
220  {
221  CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
222  }
223  PushAndPopContext(const PushAndPopContext&) = delete;
224  PushAndPopContext& operator=(PushAndPopContext const&) = delete;
226  PushAndPopContext&& operator=(PushAndPopContext&&) = delete;
228  {
229  try {
230  CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
231  } catch (const CUfileException& e) {
232  std::cerr << e.what() << std::endl;
233  }
234  }
235 };
236 
237 // Find the base and offset of the memory allocation `devPtr` is in
238 inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
239  CUcontext* ctx = nullptr)
240 {
241  auto dev = convert_void2deviceptr(devPtr);
242  CUdeviceptr base_ptr{};
243  std::size_t base_size{};
244  CUcontext _ctx{};
245  if (ctx != nullptr) {
246  _ctx = *ctx;
247  } else {
248  _ctx = get_context_from_pointer(devPtr);
249  }
250  PushAndPopContext context(_ctx);
251  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
252  std::size_t offset = dev - base_ptr;
253  // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast)
254  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
255 }
256 
/**
 * @brief Check whether a future is done, without blocking.
 *
 * True when a zero-timeout `wait_for` reports anything other than
 * `std::future_status::timeout` (i.e. the shared state is ready or deferred).
 */
template <typename T>
inline bool is_future_done(const T& future)
{
  auto const status = future.wait_for(std::chrono::seconds(0));
  return status != std::future_status::timeout;
}
262 
263 } // namespace kvikio
RAII wrapper for a CUDA primary context.
Definition: utils.hpp:101
Push CUDA context on creation and pop it on destruction.
Definition: utils.hpp:214