26 #include <type_traits>
28 #ifdef KVIKIO_CUDA_FOUND
29 #include <nvtx3/nvtx3.hpp>
32 #include <kvikio/error.hpp>
33 #include <kvikio/shim/cuda.hpp>
41 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__MINGW32__) && !defined(__MINGW64__)
42 #define KVIKIO_EXPORT __attribute__((visibility("default")))
43 #define KVIKIO_HIDDEN __attribute__((visibility("hidden")))
// Page size in bytes assumed throughout KvikIO (4 KiB — the common Linux
// default; NOTE(review): huge pages / non-4K platforms are not reflected here).
inline constexpr std::size_t page_size = 4096;
54 [[nodiscard]]
inline off_t convert_size2off(std::size_t x)
56 if (x >=
static_cast<std::size_t
>(std::numeric_limits<off_t>::max())) {
57 throw CUfileException(
"size_t argument too large to fit off_t");
59 return static_cast<off_t
>(x);
62 [[nodiscard]]
inline ssize_t convert_size2ssize(std::size_t x)
64 if (x >=
static_cast<std::size_t
>(std::numeric_limits<ssize_t>::max())) {
65 throw CUfileException(
"size_t argument too large to fit ssize_t");
67 return static_cast<ssize_t
>(x);
70 [[nodiscard]]
inline CUdeviceptr convert_void2deviceptr(
const void* devPtr)
73 return reinterpret_cast<CUdeviceptr
>(devPtr);
/**
 * @brief Convert an integral value to `std::int64_t`, checking for overflow.
 *
 * Only instantiates the runtime range check when `T` can actually hold values
 * above `int64` max (e.g. `std::uint64_t`).
 *
 * @param value Integral value to convert.
 * @return `value` as a `std::int64_t`.
 * @throws std::overflow_error If `value` exceeds `std::int64_t` max.
 */
template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
[[nodiscard]] std::int64_t convert_to_64bit(T value)
{
  constexpr auto i64_max = std::numeric_limits<std::int64_t>::max();
  if constexpr (std::numeric_limits<T>::max() > i64_max) {
    if (value > i64_max) {
      throw std::overflow_error("convert_to_64bit(x): x too large to fit std::int64_t");
    }
  }
  return static_cast<std::int64_t>(value);
}
93 template <
typename T, std::enable_if_t<std::is_
floating_po
int_v<T>>* =
nullptr>
94 [[nodiscard]]
double convert_to_64bit(T value)
107 #ifdef KVIKIO_CUDA_FOUND
108 inline bool is_host_memory(
const void* ptr)
110 CUpointer_attribute attrs[1] = {
111 CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
113 CUmemorytype memtype{};
114 void* data[1] = {&memtype};
116 cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
119 if (result == CUDA_ERROR_NOT_INITIALIZED) {
return true; }
120 CUDA_DRIVER_TRY(result);
125 return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
// Fallback when built without CUDA support: every pointer is host memory.
// The parameter is unused but kept so call sites compile unchanged.
constexpr bool is_host_memory(const void* ptr) { return true; }
137 [[nodiscard]]
inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
141 cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
153 [[nodiscard]] KVIKIO_EXPORT
inline CUcontext get_primary_cuda_context(
int ordinal)
155 static std::map<int, CUcontext> _cache;
156 static std::mutex _mutex;
157 std::lock_guard
const lock(_mutex);
159 if (_cache.find(ordinal) == _cache.end()) {
162 CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));
167 CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
168 _cache.emplace(ordinal, ctx);
170 return _cache.at(ordinal);
179 [[nodiscard]]
inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
181 CUcontext ctx =
nullptr;
183 cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
184 if (err == CUDA_SUCCESS && ctx !=
nullptr) {
return ctx; }
185 if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
195 [[nodiscard]]
inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
197 CUdeviceptr current_ctx_dev_ptr{};
198 const CUresult err = cudaAPI::instance().PointerGetAttribute(
199 ¤t_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
200 if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) {
return true; }
201 if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
221 [[nodiscard]]
inline CUcontext get_context_from_pointer(
const void* devPtr)
223 CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
227 auto ctx = get_context_associated_pointer(dev_ptr);
228 if (ctx.has_value()) {
return ctx.value(); }
234 CUcontext ctx =
nullptr;
235 CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
236 if (ctx !=
nullptr && current_context_can_access_pointer(dev_ptr)) {
return ctx; }
241 return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
254 CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
263 CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx),
CUfileException);
265 std::cerr << e.what() << std::endl;
271 inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(
const void* devPtr,
272 CUcontext* ctx =
nullptr)
274 auto dev = convert_void2deviceptr(devPtr);
275 CUdeviceptr base_ptr{};
276 std::size_t base_size{};
278 if (ctx !=
nullptr) {
281 _ctx = get_context_from_pointer(devPtr);
283 PushAndPopContext context(_ctx);
284 CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
285 std::size_t offset = dev - base_ptr;
287 return std::make_tuple(
reinterpret_cast<void*
>(base_ptr), base_size, offset);
/**
 * @brief Check whether a future-like object is done waiting.
 *
 * A zero-timeout wait is used, so a deferred future also counts as "done"
 * (anything other than `timeout`).
 *
 * @param future Future to poll (must support `wait_for`).
 * @return true unless the future is still pending.
 */
template <typename T>
inline bool is_future_done(const T& future)
{
  auto const status = future.wait_for(std::chrono::seconds(0));
  return status != std::future_status::timeout;
}
296 #ifdef KVIKIO_CUDA_FOUND
// NVTX domain tag type: groups all KvikIO NVTX ranges under "libkvikio"
// in profiler timelines (e.g. Nsight Systems).
struct libkvikio_domain {
  static constexpr char const* name{"libkvikio"};
};
305 #define KVIKIO_NVTX_FUNC_RANGE_1() NVTX3_FUNC_RANGE_IN(libkvikio_domain)
306 #define KVIKIO_NVTX_FUNC_RANGE_2(msg, val) \
307 nvtx3::scoped_range_in<libkvikio_domain> _kvikio_nvtx_range \
309 nvtx3::event_attributes \
311 msg, nvtx3::payload { convert_to_64bit(val) } \
314 #define GET_KVIKIO_NVTX_FUNC_RANGE_MACRO(_1, _2, NAME, ...) NAME
336 #ifdef KVIKIO_CUDA_FOUND
337 #define KVIKIO_NVTX_FUNC_RANGE(...) \
338 GET_KVIKIO_NVTX_FUNC_RANGE_MACRO( \
339 __VA_ARGS__, KVIKIO_NVTX_FUNC_RANGE_2, KVIKIO_NVTX_FUNC_RANGE_1) \
342 #define KVIKIO_NVTX_FUNC_RANGE(...) \
// Push CUDA context on creation and pop it on destruction.