utils.hpp
/*
 * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <future>
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <stdexcept>
#include <tuple>
#include <type_traits>

#ifdef KVIKIO_CUDA_FOUND
#include <nvtx3/nvtx3.hpp>
#endif

#include <kvikio/error.hpp>
#include <kvikio/shim/cuda.hpp>

namespace kvikio {

// cuFile defines the page size to be 4 KiB
inline constexpr std::size_t page_size = 4096;

[[nodiscard]] inline off_t convert_size2off(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
    throw CUfileException("size_t argument too large to fit off_t");
  }
  return static_cast<off_t>(x);
}

[[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
    throw CUfileException("size_t argument too large to fit ssize_t");
  }
  return static_cast<ssize_t>(x);
}

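// Example (a sketch): converting an unsigned byte count before passing it to
// POSIX-style APIs that take signed offsets/sizes. `nbytes` is hypothetical.
//
//   std::size_t nbytes = std::size_t{1} << 20;  // 1 MiB
//   off_t off    = convert_size2off(nbytes);    // throws CUfileException on overflow
//   ssize_t size = convert_size2ssize(nbytes);
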
[[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr)
{
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
  return reinterpret_cast<CUdeviceptr>(devPtr);
}

// Convert an integral value to std::int64_t, throwing std::overflow_error if it doesn't fit
template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
[[nodiscard]] std::int64_t convert_to_64bit(T value)
{
  if constexpr (std::numeric_limits<T>::max() > std::numeric_limits<std::int64_t>::max()) {
    if (value > std::numeric_limits<std::int64_t>::max()) {
      throw std::overflow_error("convert_to_64bit(x): x too large to fit std::int64_t");
    }
  }
  return std::int64_t(value);
}

// Convert a floating-point value to double
template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
[[nodiscard]] double convert_to_64bit(T value)
{
  return double(value);
}

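// Example (a sketch): both overloads of convert_to_64bit in use. The variables
// `nbytes` and `ratio` are hypothetical.
//
//   std::uint64_t nbytes = 42;
//   std::int64_t n = convert_to_64bit(nbytes);  // range-checked, may throw
//   float ratio = 0.5f;
//   double r = convert_to_64bit(ratio);  // plain widening conversion
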
// Check whether `ptr` points to host memory (as opposed to device memory)
#ifdef KVIKIO_CUDA_FOUND
inline bool is_host_memory(const void* ptr)
{
  CUpointer_attribute attrs[1] = {
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
  };
  CUmemorytype memtype{};
  void* data[1] = {&memtype};
  CUresult result =
    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));

  // If the CUDA driver isn't initialized, no device memory can exist yet, so we
  // assume that `ptr` is host memory when we get CUDA_ERROR_NOT_INITIALIZED.
  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
  CUDA_DRIVER_TRY(result);

  // Notice, querying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
  // is unregistered host memory. This is undocumented, but it is how the CUDA Runtime
  // API supports `cudaMemoryTypeUnregistered`.
  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}
#else
constexpr bool is_host_memory(const void* ptr) { return true; }
#endif

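// Example (a sketch): distinguishing host and device allocations. Error checking
// on the CUDA Runtime calls is omitted for brevity.
//
//   std::vector<char> host_buf(1024);
//   void* dev_buf{};
//   cudaMalloc(&dev_buf, 1024);
//   assert(is_host_memory(host_buf.data()));  // pageable host memory
//   assert(!is_host_memory(dev_buf));         // device memory
//   cudaFree(dev_buf);
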
// Return the device ordinal of the device on which `dev_ptr` was allocated
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
  int ret = 0;
  CUDA_DRIVER_TRY(
    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
  return ret;
}

// Return the primary context of device `ordinal`, retaining and caching it on first use
[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal)
{
  static std::map<int, CUcontext> _cache;
  static std::mutex _mutex;
  std::lock_guard const lock(_mutex);

  if (_cache.find(ordinal) == _cache.end()) {
    CUdevice dev{};
    CUcontext ctx{};
    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));

    // Notice, we let the primary context leak at program exit. We do this because `_cache`
    // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main:
    // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
    _cache.emplace(ordinal, ctx);
  }
  return _cache.at(ordinal);
}

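// Example (a sketch): repeated lookups for the same device hit the cache, so the
// retain call happens only once per device.
//
//   CUcontext ctx1 = get_primary_cuda_context(0);
//   CUcontext ctx2 = get_primary_cuda_context(0);
//   assert(ctx1 == ctx2);  // same retained primary context
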
// Return the CUDA context associated with `dev_ptr`, or std::nullopt if it has none
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
  CUcontext ctx = nullptr;
  const CUresult err =
    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return {};
}

// Check whether the current CUDA context can access the memory pointed to by `dev_ptr`
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
  CUdeviceptr current_ctx_dev_ptr{};
  const CUresult err = cudaAPI::instance().PointerGetAttribute(
    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return false;
}

// Return a CUDA context that can access `devPtr`, trying, in order: the context
// associated with `devPtr`, the current context, and the primary context of the
// device that owns `devPtr`
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

  // First we check if a context has been associated with `devPtr`.
  {
    auto ctx = get_context_associated_pointer(dev_ptr);
    if (ctx.has_value()) { return ctx.value(); }
  }

  // If this isn't the case, we check the current context. If it exists and can access
  // `devPtr`, we return the current context.
  {
    CUcontext ctx = nullptr;
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
  }

  // Finally, if we didn't find any usable context, we return the primary context of the
  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept
  // the failure.
  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}

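// Example (a sketch): memory allocated through the CUDA Runtime is associated with
// the device's primary context, so the first lookup step already succeeds here.
//
//   void* dev_buf{};
//   cudaMalloc(&dev_buf, 1024);
//   CUcontext ctx = get_context_from_pointer(dev_buf);
//   // `ctx` can be pushed before calling driver APIs that operate on `dev_buf`.
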
// RAII wrapper: push CUDA context on creation and pop it on destruction
class PushAndPopContext {
 private:
  CUcontext _ctx;

 public:
  PushAndPopContext(CUcontext ctx) : _ctx{ctx}
  {
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
  }
  PushAndPopContext(const PushAndPopContext&)            = delete;
  PushAndPopContext& operator=(PushAndPopContext const&) = delete;
  PushAndPopContext(PushAndPopContext&&)                 = delete;
  PushAndPopContext&& operator=(PushAndPopContext&&)     = delete;
  ~PushAndPopContext()
  {
    try {
      CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
    } catch (const CUfileException& e) {
      std::cerr << e.what() << std::endl;
    }
  }
};

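// Example (a sketch): temporarily activating the context that owns a device buffer.
//
//   {
//     PushAndPopContext guard(get_context_from_pointer(dev_buf));
//     // ... CUDA driver calls that require the context to be current ...
//   }  // the context is popped here, even if an exception was thrown
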
// Find the base pointer, size, and offset of the memory allocation `devPtr` is in
inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
                                                                  CUcontext* ctx = nullptr)
{
  auto dev = convert_void2deviceptr(devPtr);
  CUdeviceptr base_ptr{};
  std::size_t base_size{};
  CUcontext _ctx{};
  if (ctx != nullptr) {
    _ctx = *ctx;
  } else {
    _ctx = get_context_from_pointer(devPtr);
  }
  PushAndPopContext context(_ctx);
  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
  std::size_t offset = dev - base_ptr;
  // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast)
  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
}

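// Example (a sketch): querying allocation info for a pointer into the middle of
// a device buffer.
//
//   void* dev_buf{};
//   cudaMalloc(&dev_buf, 1024);
//   auto [base, size, offset] = get_alloc_info(static_cast<char*>(dev_buf) + 100);
//   // base == dev_buf and offset == 100; size covers the whole allocation
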
// Check whether a future is ready, i.e. whether waiting on it would not block
template <typename T>
inline bool is_future_done(const T& future)
{
  return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout;
}

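// Example (a sketch): polling an asynchronous task without blocking.
//
//   auto fut = std::async(std::launch::async, [] { return 42; });
//   while (!is_future_done(fut)) {
//     // ... do other work ...
//   }
//   int result = fut.get();
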
#ifdef KVIKIO_CUDA_FOUND
// The NVTX domain in which all of libkvikio's ranges and markers are placed
struct libkvikio_domain {
  static constexpr char const* name{"libkvikio"};
};

// Macro to concatenate two tokens x and y.
#define KVIKIO_CONCAT_HELPER(x, y) x##y
#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y)

// Macro to create a static, registered string that will not have a name conflict with any
// registered string defined in the same scope.
#define KVIKIO_REGISTER_STRING(msg)                                        \
  [](const char* a_msg) -> auto& {                                         \
    static nvtx3::registered_string_in<libkvikio_domain> a_reg_str{a_msg}; \
    return a_reg_str;                                                      \
  }(msg)

// Macro overloads of KVIKIO_NVTX_FUNC_RANGE
#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain)

#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val)                                        \
  nvtx3::scoped_range_in<libkvikio_domain> KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \
  {                                                                                    \
    nvtx3::event_attributes                                                            \
    {                                                                                  \
      KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) }            \
    }                                                                                  \
  }

#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \
  nvtx3::mark_in<libkvikio_domain>(       \
    nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}})

#endif

// Convenience macro for adding an NVTX range that covers the rest of the enclosing
// function scope and is named after the function. Expands to a no-op when CUDA
// isn't available.
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL()
#else
#define KVIKIO_NVTX_FUNC_RANGE(...) \
  do {                              \
  } while (0)
#endif

// Convenience macro for adding an NVTX range with a message and integer payload
// that covers the rest of the enclosing scope. Expands to a no-op when CUDA
// isn't available.
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val)
#else
#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \
  do {                                     \
  } while (0)
#endif

// Convenience macro for adding an NVTX marker (an instantaneous event) with a
// message and integer payload. Expands to a no-op when CUDA isn't available.
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload)
#else
#define KVIKIO_NVTX_MARKER(message, payload) \
  do {                                       \
  } while (0)
#endif
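
// Example (a sketch): instrumenting a read routine with all three macros. The
// function `read_impl` and its parameter are hypothetical.
//
//   void read_impl(std::size_t nbytes)
//   {
//     KVIKIO_NVTX_FUNC_RANGE();                  // range named after the function
//     KVIKIO_NVTX_SCOPED_RANGE("read", nbytes);  // range with message and payload
//     // ... perform the read ...
//     KVIKIO_NVTX_MARKER("read-done", nbytes);   // instantaneous event
//   }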

}  // namespace kvikio