utils.hpp
/*
 * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <future>
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <stdexcept>
#include <tuple>
#include <type_traits>

#ifdef KVIKIO_CUDA_FOUND
#include <nvtx3/nvtx3.hpp>
#endif

#include <kvikio/error.hpp>
#include <kvikio/shim/cuda.hpp>

// Macros used for defining symbol visibility; only GLIBC is supported.
// Since KvikIO is header-only, we rely on the linker to disambiguate inline functions
// that have (or return) static references. To do this, the relevant function must have
// `__attribute__((visibility("default")))`. If not, and KvikIO is used in two
// different DSOs, the function will appear twice, and there will be two static objects.
// See <https://github.com/rapidsai/kvikio/issues/442>.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MINGW32__) && !defined(__MINGW64__)
#define KVIKIO_EXPORT __attribute__((visibility("default")))
#define KVIKIO_HIDDEN __attribute__((visibility("hidden")))
#else
#define KVIKIO_EXPORT
#define KVIKIO_HIDDEN
#endif

namespace kvikio {

// cuFile defines the page size as 4 KiB
inline constexpr std::size_t page_size = 4096;

[[nodiscard]] inline off_t convert_size2off(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
    throw CUfileException("size_t argument too large to fit off_t");
  }
  return static_cast<off_t>(x);
}

[[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
    throw CUfileException("size_t argument too large to fit ssize_t");
  }
  return static_cast<ssize_t>(x);
}

[[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr)
{
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
  return reinterpret_cast<CUdeviceptr>(devPtr);
}

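/**
 * @brief Convert an integral value to a 64-bit signed integer, throwing on overflow.
 */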
template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
[[nodiscard]] std::int64_t convert_to_64bit(T value)
{
  if constexpr (std::numeric_limits<T>::max() > std::numeric_limits<std::int64_t>::max()) {
    if (value > std::numeric_limits<std::int64_t>::max()) {
      throw std::overflow_error("convert_to_64bit(x): x too large to fit std::int64_t");
    }
  }
  return std::int64_t(value);
}

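/**
 * @brief Convert a floating-point value to a 64-bit `double`.
 */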
template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
[[nodiscard]] double convert_to_64bit(T value)
{
  return double(value);
}

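/**
 * @brief Check whether `ptr` points to host (as opposed to device) memory.
 */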
#ifdef KVIKIO_CUDA_FOUND
inline bool is_host_memory(const void* ptr)
{
  CUpointer_attribute attrs[1] = {
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
  };
  CUmemorytype memtype{};
  void* data[1] = {&memtype};
  CUresult result =
    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));

  // We assume that `ptr` is host memory when the driver returns CUDA_ERROR_NOT_INITIALIZED
  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
  CUDA_DRIVER_TRY(result);

  // Notice, querying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
  // is unregistered host memory. This is undocumented, but it is how the CUDA Runtime API
  // supports `cudaMemoryTypeUnregistered`.
  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}
#else
constexpr bool is_host_memory(const void* ptr) { return true; }
#endif

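/**
 * @brief Return the device ordinal of the device on which `dev_ptr` was allocated.
 */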
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
  int ret = 0;
  CUDA_DRIVER_TRY(
    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
  return ret;
}

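/**
 * @brief Given a device ordinal, return the primary context of the device.
 *
 * The primary context is retained on first use and cached; it is deliberately never
 * released (see the comment in the function body).
 */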
[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal)
{
  static std::map<int, CUcontext> _cache;
  static std::mutex _mutex;
  std::lock_guard const lock(_mutex);

  if (_cache.find(ordinal) == _cache.end()) {
    CUdevice dev{};
    CUcontext ctx{};
    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));

    // Notice, we let the primary context leak at program exit. We do this because `_cache`
    // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main:
    // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
    _cache.emplace(ordinal, ctx);
  }
  return _cache.at(ordinal);
}

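/**
 * @brief Return the CUDA context associated with `dev_ptr`, if any.
 *
 * @return The context, or `std::nullopt` if no context is associated with the pointer.
 */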
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
  CUcontext ctx = nullptr;
  const CUresult err =
    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return {};
}

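/**
 * @brief Check whether the current CUDA context can access `dev_ptr`.
 */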
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
  CUdeviceptr current_ctx_dev_ptr{};
  const CUresult err = cudaAPI::instance().PointerGetAttribute(
    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return false;
}

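/**
 * @brief Return a CUDA context that can be used with `devPtr`.
 *
 * The search order is:
 *  1) The context associated with `devPtr`, if one exists.
 *  2) The current context, if it exists and can access `devPtr`.
 *  3) The primary context of the device that owns `devPtr`.
 */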
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

  // First we check if a context has been associated with `devPtr`.
  {
    auto ctx = get_context_associated_pointer(dev_ptr);
    if (ctx.has_value()) { return ctx.value(); }
  }

  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
  // we return the current context.
  {
    CUcontext ctx = nullptr;
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
  }

  // Finally, if we didn't find any usable context, we return the primary context of the
  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}

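/**
 * @brief Push CUDA context on creation and pop it on destruction.
 */
class PushAndPopContext {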
 private:
  CUcontext _ctx;

 public:
  PushAndPopContext(CUcontext ctx) : _ctx{ctx}
  {
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
  }
  PushAndPopContext(const PushAndPopContext&)            = delete;
  PushAndPopContext& operator=(PushAndPopContext const&) = delete;
  PushAndPopContext(PushAndPopContext&&)                 = delete;
  PushAndPopContext& operator=(PushAndPopContext&&)      = delete;
  ~PushAndPopContext()
  {
    try {
      CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
    } catch (const CUfileException& e) {
      std::cerr << e.what() << std::endl;
    }
  }
};

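// Example (illustrative): run CUDA driver calls in the context that owns `ptr`,
// restoring the previous context automatically when the guard goes out of scope.
//
//   {
//     kvikio::PushAndPopContext ctx_guard{kvikio::get_context_from_pointer(ptr)};
//     // ... CUDA driver API calls that require `ptr`'s context ...
//   }  // context popped here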
// Find the base and offset of the memory allocation `devPtr` is in
inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
                                                                  CUcontext* ctx = nullptr)
{
  auto dev = convert_void2deviceptr(devPtr);
  CUdeviceptr base_ptr{};
  std::size_t base_size{};
  CUcontext _ctx{};
  if (ctx != nullptr) {
    _ctx = *ctx;
  } else {
    _ctx = get_context_from_pointer(devPtr);
  }
  PushAndPopContext context(_ctx);
  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
  std::size_t offset = dev - base_ptr;
  // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast)
  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
}

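// Example (illustrative): recover the allocation that contains `devPtr`.
//
//   auto [base, size, offset] = kvikio::get_alloc_info(devPtr);
//   // `devPtr` is `offset` bytes into the `size`-byte allocation starting at `base`.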
/**
 * @brief Check whether `future` is ready, without blocking.
 */
template <typename T>
inline bool is_future_done(const T& future)
{
  return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout;
}

#ifdef KVIKIO_CUDA_FOUND
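/**
 * @brief The NVTX domain in which all KvikIO ranges are placed.
 */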
struct libkvikio_domain {
  static constexpr char const* name{"libkvikio"};
};

// Macro overloads of KVIKIO_NVTX_FUNC_RANGE
#define KVIKIO_NVTX_FUNC_RANGE_1() NVTX3_FUNC_RANGE_IN(libkvikio_domain)
#define KVIKIO_NVTX_FUNC_RANGE_2(msg, val)                    \
  nvtx3::scoped_range_in<libkvikio_domain> _kvikio_nvtx_range \
  {                                                           \
    nvtx3::event_attributes                                   \
    {                                                         \
      msg, nvtx3::payload { convert_to_64bit(val) }           \
    }                                                         \
  }
#define GET_KVIKIO_NVTX_FUNC_RANGE_MACRO(_1, _2, NAME, ...) NAME
#endif

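/**
 * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain,
 * tied to the lifetime of the enclosing scope.
 *
 * Takes either two arguments (message, payload) or no arguments, in which case the
 * name of the enclosing function is used as the message. When KvikIO is built
 * without CUDA, the macro expands to a no-op.
 *
 * Example (illustrative):
 *
 *   void read_chunk(std::size_t nbytes) {
 *     KVIKIO_NVTX_FUNC_RANGE("read_chunk", nbytes);
 *     // ...
 *   }
 *
 *   void open_file() {
 *     KVIKIO_NVTX_FUNC_RANGE();  // message defaults to the function name
 *     // ...
 *   }
 */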
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_FUNC_RANGE(...)                                  \
  GET_KVIKIO_NVTX_FUNC_RANGE_MACRO(                                  \
    __VA_ARGS__, KVIKIO_NVTX_FUNC_RANGE_2, KVIKIO_NVTX_FUNC_RANGE_1) \
  (__VA_ARGS__)
#else
#define KVIKIO_NVTX_FUNC_RANGE(...) \
  do {                              \
  } while (0)
#endif

}  // namespace kvikio