#include <chrono>
#include <cstddef>
#include <cstdint>
#include <future>
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#ifdef KVIKIO_CUDA_FOUND
#include <nvtx3/nvtx3.hpp>
#endif
#include <kvikio/error.hpp>
#include <kvikio/shim/cuda.hpp>

namespace kvikio {
inline constexpr std::size_t page_size = 4096;
[[nodiscard]] inline off_t convert_size2off(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
    throw CUfileException("size_t argument too large to fit off_t");
  }
  return static_cast<off_t>(x);
}
[[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x)
{
  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
    throw CUfileException("size_t argument too large to fit ssize_t");
  }
  return static_cast<ssize_t>(x);
}
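// Example (illustrative sketch): both helpers throw a CUfileException when the
// value cannot be represented in the narrower signed type.
//
//   off_t off  = convert_size2off(std::size_t{4096});                          // OK
//   ssize_t sz = convert_size2ssize(std::numeric_limits<std::size_t>::max());  // throws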
[[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr)
{
  return reinterpret_cast<CUdeviceptr>(devPtr);
}
template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
[[nodiscard]] std::int64_t convert_to_64bit(T value)
{
  if constexpr (std::numeric_limits<T>::max() > std::numeric_limits<std::int64_t>::max()) {
    if (value > std::numeric_limits<std::int64_t>::max()) {
      throw std::overflow_error("convert_to_64bit(x): x too large to fit std::int64_t");
    }
  }
  return std::int64_t(value);
}
template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
[[nodiscard]] double convert_to_64bit(T value)
{
  return double(value);
}
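// Example (illustrative sketch): overload resolution picks the integral
// overload for integer inputs and the floating-point overload otherwise.
//
//   std::int64_t a = convert_to_64bit(42u);    // integral       -> std::int64_t
//   double b       = convert_to_64bit(3.14f);  // floating-point -> double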
#ifdef KVIKIO_CUDA_FOUND
/**
 * @brief Check if `ptr` points to host memory (as opposed to device memory).
 */
inline bool is_host_memory(const void* ptr)
{
  CUpointer_attribute attrs[1] = {
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
  };
  CUmemorytype memtype{};
  void* data[1] = {&memtype};
  CUresult result =
    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
  // If CUDA hasn't been initialized, the pointer cannot be device memory.
  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
  CUDA_DRIVER_TRY(result);
  // Querying CU_POINTER_ATTRIBUTE_MEMORY_TYPE yields zero for unregistered host
  // memory, so treat both zero and CU_MEMORYTYPE_HOST as host memory.
  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}
#else
constexpr bool is_host_memory(const void* ptr) { return true; }
#endif
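// Example (illustrative sketch): ordinary host allocations report as host
// memory; when CUDA isn't found, the fallback above always returns true.
//
//   std::vector<char> buf(1024);
//   assert(is_host_memory(buf.data()));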
/**
 * @brief Return the ordinal of the device that owns `dev_ptr`.
 */
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
  int ret = 0;
  CUDA_DRIVER_TRY(
    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
  return ret;
}
/**
 * @brief Given a device ordinal, return the primary context of the device.
 *
 * The retrieved contexts are cached until program exit.
 */
[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal)
{
  static std::map<int, CUcontext> _cache;
  static std::mutex _mutex;
  std::lock_guard const lock(_mutex);

  if (_cache.find(ordinal) == _cache.end()) {
    CUdevice dev{};
    CUcontext ctx{};
    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));
    // The retained primary context is intentionally never released: `_cache` is
    // static, and releasing the context after main() exits is not allowed.
    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
    _cache.emplace(ordinal, ctx);
  }
  return _cache.at(ordinal);
}
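// Example (illustrative sketch): repeated lookups for the same ordinal hit the
// cache, so the primary context is retained only once per device.
//
//   CUcontext a = get_primary_cuda_context(0);
//   CUcontext b = get_primary_cuda_context(0);
//   assert(a == b);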
/**
 * @brief Return the context associated with `dev_ptr`, if any.
 */
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
  CUcontext ctx = nullptr;
  const CUresult err =
    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return {};
}
/**
 * @brief Check if the current context can access `dev_ptr`.
 */
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
  CUdeviceptr current_ctx_dev_ptr{};
  const CUresult err = cudaAPI::instance().PointerGetAttribute(
    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
  return false;
}
/**
 * @brief Return a context that can be used with `devPtr`.
 */
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

  // First, check if a context is already associated with `devPtr`.
  {
    auto ctx = get_context_associated_pointer(dev_ptr);
    if (ctx.has_value()) { return ctx.value(); }
  }

  // Otherwise, use the current context if it exists and can access `devPtr`.
  {
    CUcontext ctx = nullptr;
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
  }

  // Finally, fall back to the primary context of the device owning `devPtr`.
  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}
/**
 * @brief Push CUDA context on creation and pop it on destruction.
 */
class PushAndPopContext {
 private:
  CUcontext _ctx;

 public:
  PushAndPopContext(CUcontext ctx) : _ctx{ctx}
  {
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
  }
  ~PushAndPopContext()
  {
    try {
      CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
    } catch (CUfileException const& e) {
      std::cerr << e.what() << std::endl;
    }
  }
};
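// Example (illustrative sketch): make a pointer's context current for the
// duration of a scope; the destructor pops it even on exceptions.
// `dev_buf` is assumed to be a valid device allocation.
//
//   {
//     PushAndPopContext guard(get_context_from_pointer(dev_buf));
//     // Driver API calls requiring a current context go here.
//   }  // context popped here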
/**
 * @brief Find the base pointer, size, and offset of the allocation containing `devPtr`.
 */
inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
                                                                  CUcontext* ctx = nullptr)
{
  auto dev = convert_void2deviceptr(devPtr);
  CUdeviceptr base_ptr{};
  std::size_t base_size{};
  CUcontext _ctx{};
  if (ctx != nullptr) {
    _ctx = *ctx;
  } else {
    _ctx = get_context_from_pointer(devPtr);
  }
  PushAndPopContext context(_ctx);
  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
  std::size_t offset = dev - base_ptr;
  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
}
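// Example (illustrative sketch): query the allocation containing an interior
// pointer. `dev_buf` is assumed to be the start of a device allocation.
//
//   auto [base, size, offset] = get_alloc_info(static_cast<char*>(dev_buf) + 16);
//   // base == dev_buf, offset == 16, size == total allocation size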
/**
 * @brief Check whether a future-like object is ready without blocking.
 */
template <typename T>
inline bool is_future_done(const T& future)
{
  return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout;
}
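// Example (illustrative sketch): poll a future without blocking.
//
//   auto fut = std::async(std::launch::async, [] { return 42; });
//   while (!is_future_done(fut)) { /* do other work */ }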
#ifdef KVIKIO_CUDA_FOUND
/**
 * @brief Tag type for libkvikio's NVTX domain.
 */
struct libkvikio_domain {
  static constexpr char const* name{"libkvikio"};
};
// Concatenate two token expansions.
#define KVIKIO_CONCAT_HELPER(x, y) x##y
#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y)
// Create a static registered string whose name cannot clash with other
// registered strings defined in the same scope.
#define KVIKIO_REGISTER_STRING(msg)                                        \
  [](const char* a_msg) -> auto& {                                         \
    static nvtx3::registered_string_in<libkvikio_domain> a_reg_str{a_msg}; \
    return a_reg_str;                                                      \
  }(msg)
// Implementation of KVIKIO_NVTX_FUNC_RANGE().
#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain)
// Implementation of KVIKIO_NVTX_SCOPED_RANGE(): declares a uniquely named
// nvtx3::scoped_range_in with a registered string and a 64-bit payload.
#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val)                                        \
  nvtx3::scoped_range_in<libkvikio_domain> KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \
  {                                                                                    \
    nvtx3::event_attributes                                                            \
    {                                                                                  \
      KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) }            \
    }                                                                                  \
  }
// Implementation of KVIKIO_NVTX_MARKER().
#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \
  nvtx3::mark_in<libkvikio_domain>(       \
    nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}})
#endif  // KVIKIO_CUDA_FOUND
/**
 * @brief Convenience macro for adding an NVTX range, in the `libkvikio` domain,
 * that spans the lifetime of the enclosing function. No-op when CUDA isn't found.
 */
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL()
#else
#define KVIKIO_NVTX_FUNC_RANGE(...) \
  do {                              \
  } while (0)
#endif
/**
 * @brief Convenience macro for adding an NVTX scoped range, in the `libkvikio`
 * domain, with a message and a numeric payload. No-op when CUDA isn't found.
 */
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val)
#else
#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \
  do {                                     \
  } while (0)
#endif
/**
 * @brief Convenience macro for adding an NVTX marker, in the `libkvikio` domain,
 * with a message and a numeric payload. No-op when CUDA isn't found.
 */
#ifdef KVIKIO_CUDA_FOUND
#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload)
#else
#define KVIKIO_NVTX_MARKER(message, payload) \
  do {                                       \
  } while (0)
#endif
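// Example (illustrative sketch): annotate a function for profiling. All three
// macros compile to no-ops when CUDA isn't found.
//
//   void read_chunk(std::size_t nbytes)
//   {
//     KVIKIO_NVTX_FUNC_RANGE();                  // range spanning the function
//     KVIKIO_NVTX_MARKER("chunk-read", nbytes);  // instantaneous event with payload
//   }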
}  // namespace kvikio