datasource.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/types.hpp>
10 #include <cudf/utilities/export.hpp>
11 #include <cudf/utilities/span.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <future>
16 #include <memory>
17 #include <optional>
18 
19 namespace CUDF_EXPORT cudf {
21 namespace io {
22 
32 class datasource {
33  public:
34  template <typename Container>
35  class owning_buffer; // forward declaration
41  class buffer {
42  public:
/** @brief Returns the buffer size in bytes. */
48  [[nodiscard]] virtual size_t size() const = 0;
49 
/** @brief Returns the address of the data in the buffer. */
55  [[nodiscard]] virtual uint8_t const* data() const = 0;
56 
/** @brief Base class destructor. */
60  virtual ~buffer() = default;
61 
71  {
72  return cudf::host_span<uint8_t const>{data(), size()};
73  }
74 
82  template <typename Container>
83  static std::unique_ptr<buffer> create(Container&& data_owner)
84  {
85  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
86  }
87  };
88 
/**
 * @brief Creates a source from a file path.
 *
 * NOTE(review): the exact roles of `offset`, `max_size_estimate` and `known_size` are not visible
 * in this listing -- confirm against the implementation before relying on them.
 */
106  static std::unique_ptr<datasource> create(std::string const& filepath,
107  size_t offset = 0,
108  size_t max_size_estimate = 0,
109  std::optional<std::size_t> known_size = std::nullopt);
110 
/** @brief Creates a source from a host memory buffer. */
117  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
118 
/** @brief Creates a source from a device memory buffer. */
125  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
126 
/** @brief Creates a source from a user-implemented datasource object. */
133  static std::unique_ptr<datasource> create(datasource* source);
134 
141  template <typename T>
142  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
143  {
144  std::vector<std::unique_ptr<datasource>> sources;
145  sources.reserve(args.size());
146  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
147  return datasource::create(arg);
148  });
149  return sources;
150  }
151 
155  virtual ~datasource() = default;
156 
/** @brief Returns a buffer with a subset of data from the source. */
165  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
166 
/**
 * @brief Asynchronously reads a specified portion of data from the datasource.
 *
 * Not pure virtual: a base implementation is defined outside this header.
 */
179  virtual std::future<std::unique_ptr<datasource::buffer>> host_read_async(size_t offset,
180  size_t size);
181 
/** @brief Reads a selected range into a preallocated buffer. */
191  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
192 
/**
 * @brief Asynchronously reads data from the source into the provided host memory buffer.
 *
 * Not pure virtual: a base implementation is defined outside this header.
 */
207  virtual std::future<size_t> host_read_async(size_t offset, size_t size, uint8_t* dst);
208 
221  [[nodiscard]] virtual bool supports_device_read() const { return false; }
222 
229  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
230  {
231  return supports_device_read();
232  }
233 
250  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
251  size_t size,
252  rmm::cuda_stream_view stream)
253  {
254  CUDF_FAIL("datasource classes that support device_read must override it.");
255  }
256 
274  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
275  {
276  CUDF_FAIL("datasource classes that support device_read must override it.");
277  }
278 
302  virtual std::future<size_t> device_read_async(size_t offset,
303  size_t size,
304  uint8_t* dst,
305  rmm::cuda_stream_view stream)
306  {
307  CUDF_FAIL("datasource classes that support device_read_async must override it.");
308  }
309 
/** @brief Returns the size of the data in the source. */
315  [[nodiscard]] virtual size_t size() const = 0;
316 
322  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
323 
327  class non_owning_buffer : public buffer {
328  public:
329  non_owning_buffer() = default;
330 
337  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
338 
344  [[nodiscard]] size_t size() const override { return _size; }
345 
351  [[nodiscard]] uint8_t const* data() const override { return _data; }
352 
353  private:
354  uint8_t const* _data{nullptr};
355  size_t _size{0};
356  };
357 
365  template <typename Container>
366  class owning_buffer : public buffer {
367  public:
368  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
369  // reference).
370  static_assert(std::is_rvalue_reference_v<Container&&>,
371  "The container argument passed to the constructor must be an rvalue.");
372 
379  owning_buffer(Container&& moved_data_owner)
380  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
381  {
382  }
383 
393  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
394  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
395  {
396  }
397 
403  [[nodiscard]] size_t size() const override { return _size; }
404 
410  [[nodiscard]] uint8_t const* data() const override
411  {
412  return static_cast<uint8_t const*>(_data_ptr);
413  }
414 
415  private:
416  Container _data;
417  void const* _data_ptr;
418  size_t _size;
419  };
420 };
421 
/**
 * @brief Constructs datasources from dataset source information.
 *
 * NOTE(review): `offset` and `max_size_estimate` both default to 0; their exact meaning is not
 * visible in this listing -- confirm against the implementation.
 */
433 std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
434  size_t offset = 0,
435  size_t max_size_estimate = 0);
436  // end of group
438 } // namespace io
439 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:41
virtual ~buffer()=default
Base class destructor.
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:83
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation of a non-owning buffer, where the datasource holds the buffer until destruction.
Definition: datasource.hpp:327
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:344
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:351
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non owning buffer object.
Definition: datasource.hpp:337
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:366
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:379
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:393
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:403
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:410
Interface class for providing input data to the readers.
Definition: datasource.hpp:32
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0, std::optional< std::size_t > known_size=std::nullopt)
Creates a source from a file path.
virtual ~datasource()=default
Base class destructor.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:142
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:221
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual std::future< std::unique_ptr< datasource::buffer > > host_read_async(size_t offset, size_t size)
Asynchronously reads a specified portion of data from the datasource.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:274
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more efficient for the given size.
Definition: datasource.hpp:229
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:302
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:322
virtual std::future< size_t > host_read_async(size_t offset, size_t size, uint8_t *dst)
Asynchronously reads data from the source into the provided host memory buffer.
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:250
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
std::vector< std::unique_ptr< cudf::io::datasource > > make_datasources(source_info const &info, size_t offset=0, size_t max_size_estimate=0)
Constructs datasources from dataset source information.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, output_nullability null_policy=output_nullability::PRESERVE, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:186
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
Definition: span.hpp:320
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
APIs for spans.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Source information for read interfaces.
Definition: io/types.hpp:328