libkvikio: posix_io.hpp Source File

 /*

  * Copyright (c) 2022-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <sys/types.h>
 #include <unistd.h>

 #include <algorithm>
 #include <cerrno>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
 #include <map>
 #include <string>
 #include <thread>
 #include <utility>

 #include <kvikio/bounce_buffer.hpp>
 #include <kvikio/error.hpp>
 #include <kvikio/nvtx.hpp>
 #include <kvikio/shim/cuda.hpp>
 #include <kvikio/utils.hpp>


 namespace kvikio::detail {


 /**
  * @brief Compile-time selector for the direction of a POSIX transfer.
  *
  * Used as a template argument: `posix_host_io()` issues `pread` for `READ`
  * and `pwrite` otherwise.
  */
 enum class IOOperationType : uint8_t {
   READ  = 0,  ///< File-to-buffer transfer (`pread`).
   WRITE = 1,  ///< Buffer-to-file transfer (`pwrite`).
 };


 /**
  * @brief Compile-time switch controlling whether a short transfer is acceptable.
  *
  * Used as a template argument of `posix_host_io()`.
  */
 enum class PartialIO : uint8_t {
   YES = 0,  ///< Return right after the first successful `pread`/`pwrite` call.
   NO  = 1,  ///< Keep issuing calls until every requested byte has been transferred.
 };


 /**
  * @brief Singleton class to retrieve a CUDA stream for device-host copying.
  *
  * Streams are stored per (CUDA context, thread id) pair. Instances can be
  * neither copied nor moved.
  */
 class StreamsByThread {
  public:
   StreamsByThread() = default;

   // Neither copyable nor movable.
   StreamsByThread(StreamsByThread const&)            = delete;
   StreamsByThread(StreamsByThread&&)                 = delete;
   StreamsByThread& operator=(StreamsByThread const&) = delete;
   StreamsByThread& operator=(StreamsByThread&&)      = delete;

   // The destructor deliberately does NOT destroy the CUDA resources (e.g. CUstream)
   // that have static storage duration; they are left to leak at program termination.
   // Destroying them at that point would be undefined behavior in CUDA, see
   // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
   // Leaking also avoids a crash (segmentation fault) when clients call
   // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination.
   ~StreamsByThread() = default;

   /**
    * @brief Retrieve the stream associated with the given CUDA context and thread.
    *
    * @param ctx CUDA context part of the lookup key.
    * @param thd_id Thread id part of the lookup key.
    * @return The CUDA stream for that (context, thread) pair.
    */
   KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id);

   /**
    * @brief Retrieve a stream for the current CUDA context and thread.
    *
    * @return The CUDA stream.
    */
   static CUstream get();

  private:
   // Streams keyed by (CUDA context, thread id).
   std::map<std::pair<CUcontext, std::thread::id>, CUstream> _streams;
 };


 template <IOOperationType Operation, PartialIO PartialIOStatus>

 ssize_t posix_host_io(int fd, void const* buf, size_t count, off_t offset)

 {

   off_t cur_offset      = offset;

   size_t byte_remaining = count;

   char* buffer          = const_cast<char*>(static_cast<char const*>(buf));

   while (byte_remaining > 0) {

     ssize_t nbytes = 0;

     if constexpr (Operation == IOOperationType::READ) {

       nbytes = ::pread(fd, buffer, byte_remaining, cur_offset);

     } else {

       nbytes = ::pwrite(fd, buffer, byte_remaining, cur_offset);

     }

     if (nbytes == -1) {

       std::string const name = (Operation == IOOperationType::READ) ? "pread" : "pwrite";

       KVIKIO_EXPECT(errno != EBADF, "POSIX error: Operation not permitted");

       KVIKIO_FAIL("POSIX error on " + name + ": " + strerror(errno));

     }

     if constexpr (Operation == IOOperationType::READ) {

       KVIKIO_EXPECT(nbytes != 0, "POSIX error on pread: EOF");

     }

     if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes; }

     buffer += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)

     cur_offset += nbytes;

     byte_remaining -= nbytes;

   }

   return convert_size2ssize(count);

 }


 template <IOOperationType Operation>

 std::size_t posix_device_io(int fd,

                             void const* devPtr_base,

                             std::size_t size,

                             std::size_t file_offset,

                             std::size_t devPtr_offset)

 {

   auto alloc              = AllocRetain::instance().get();

   CUdeviceptr devPtr      = convert_void2deviceptr(devPtr_base) + devPtr_offset;

   off_t cur_file_offset   = convert_size2off(file_offset);

   off_t byte_remaining    = convert_size2off(size);

   off_t const chunk_size2 = convert_size2off(alloc.size());


   // Get a stream for the current CUDA context and thread

   CUstream stream = StreamsByThread::get();


   while (byte_remaining > 0) {

     off_t const nbytes_requested = std::min(chunk_size2, byte_remaining);

     ssize_t nbytes_got           = nbytes_requested;

     if constexpr (Operation == IOOperationType::READ) {

       nbytes_got = posix_host_io<IOOperationType::READ, PartialIO::YES>(

         fd, alloc.get(), nbytes_requested, cur_file_offset);

       CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream));

       CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));

     } else {  // Is a write operation

       CUDA_DRIVER_TRY(

         cudaAPI::instance().MemcpyDtoHAsync(alloc.get(), devPtr, nbytes_requested, stream));

       CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream));

       posix_host_io<IOOperationType::WRITE, PartialIO::NO>(

         fd, alloc.get(), nbytes_requested, cur_file_offset);

     }

     cur_file_offset += nbytes_got;

     devPtr += nbytes_got;

     byte_remaining -= nbytes_got;

   }

   return size;

 }


 /**
  * @brief Read from a file into a host buffer via `posix_host_io()`.
  *
  * @tparam PartialIOStatus Forwarded to `posix_host_io()`.
  * @param fd File descriptor.
  * @param buf Destination host buffer.
  * @param size Number of bytes requested.
  * @param file_offset Byte offset into the file.
  * @return The value returned by `posix_host_io()`, converted to `std::size_t`.
  */
 template <PartialIO PartialIOStatus>
 std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t file_offset)
 {
   KVIKIO_NVTX_SCOPED_RANGE("posix_host_read()", size);
   off_t const start = convert_size2off(file_offset);
   ssize_t const nbytes_read =
     detail::posix_host_io<IOOperationType::READ, PartialIOStatus>(fd, buf, size, start);
   return static_cast<std::size_t>(nbytes_read);
 }


 /**
  * @brief Write a host buffer to a file via `posix_host_io()`.
  *
  * @tparam PartialIOStatus Forwarded to `posix_host_io()`.
  * @param fd File descriptor.
  * @param buf Source host buffer.
  * @param size Number of bytes requested.
  * @param file_offset Byte offset into the file.
  * @return The value returned by `posix_host_io()`, converted to `std::size_t`.
  */
 template <PartialIO PartialIOStatus>
 std::size_t posix_host_write(int fd, void const* buf, std::size_t size, std::size_t file_offset)
 {
   KVIKIO_NVTX_SCOPED_RANGE("posix_host_write()", size);
   off_t const start = convert_size2off(file_offset);
   ssize_t const nbytes_written =
     detail::posix_host_io<IOOperationType::WRITE, PartialIOStatus>(fd, buf, size, start);
   return static_cast<std::size_t>(nbytes_written);
 }


 /**
  * @brief Declaration only; the definition is not visible in this part of the file.
  *
  * NOTE(review): the parameter list is identical to that of the
  * `posix_device_io()` template, so this is presumably its
  * `IOOperationType::READ` entry point (file -> device memory) -- confirm
  * against the definition.
  *
  * @param fd File descriptor.
  * @param devPtr_base Base address of the device buffer.
  * @param size Size in bytes -- presumably the number of bytes to read; verify.
  * @param file_offset Offset into the file -- presumably in bytes; verify.
  * @param devPtr_offset Offset relative to `devPtr_base` -- presumably in bytes; verify.
  * @return Presumably the number of bytes read; verify against the definition.
  */
 std::size_t posix_device_read(int fd,

                               void const* devPtr_base,

                               std::size_t size,

                               std::size_t file_offset,

                               std::size_t devPtr_offset);


 /**
  * @brief Declaration only; the definition is not visible in this part of the file.
  *
  * NOTE(review): the parameter list is identical to that of the
  * `posix_device_io()` template, so this is presumably its
  * `IOOperationType::WRITE` entry point (device memory -> file) -- confirm
  * against the definition.
  *
  * @param fd File descriptor.
  * @param devPtr_base Base address of the device buffer.
  * @param size Size in bytes -- presumably the number of bytes to write; verify.
  * @param file_offset Offset into the file -- presumably in bytes; verify.
  * @param devPtr_offset Offset relative to `devPtr_base` -- presumably in bytes; verify.
  * @return Presumably the number of bytes written; verify against the definition.
  */
 std::size_t posix_device_write(int fd,

                                void const* devPtr_base,

                                std::size_t size,

                                std::size_t file_offset,

                                std::size_t devPtr_offset);


 }  // namespace kvikio::detail

kvikio::detail::StreamsByThread
Singleton class to retrieve a CUDA stream for device-host copying.
Definition: posix_io.hpp:54

KVIKIO_EXPECT
#define KVIKIO_EXPECT(...)
Macro for checking pre-conditions or conditions that throws an exception when a condition is violated...
Definition: error.hpp:216

CUDA_DRIVER_TRY
#define CUDA_DRIVER_TRY(...)
Error checking macro for CUDA driver API functions.
Definition: error.hpp:68

KVIKIO_FAIL
#define KVIKIO_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:252