datasource.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/types.hpp>
10 #include <cudf/utilities/export.hpp>
11 #include <cudf/utilities/span.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <future>
16 #include <memory>
17 
18 namespace CUDF_EXPORT cudf {
20 namespace io {
21 
31 class datasource {
32  public:
33  template <typename Container>
34  class owning_buffer; // forward declaration
/**
 * @brief Interface class for buffers that the datasource returns to the caller.
 *
 * Implementations expose a read-only byte range through `data()` and `size()`.
 */
40  class buffer {
41  public:
/**
 * @brief Returns the buffer size in bytes.
 *
 * @return Buffer size in bytes
 */
47  [[nodiscard]] virtual size_t size() const = 0;
48 
/**
 * @brief Returns the address of the data in the buffer.
 *
 * @return Address of the data in the buffer
 */
54  [[nodiscard]] virtual uint8_t const* data() const = 0;
55 
/**
 * @brief Base class destructor.
 */
59  virtual ~buffer() = default;
60 
// NOTE(review): the declaration that owns the following body is not present in this listing
// (source lines 61-69 are not shown). The body builds a `cudf::host_span<uint8_t const>` over
// `data()` and `size()`, so it presumably belongs to a host_span conversion or accessor -
// confirm against the header before relying on this.
70  {
71  return cudf::host_span<uint8_t const>{data(), size()};
72  }
73 
/**
 * @brief Factory to construct a datasource buffer object from a container.
 *
 * The container is perfectly forwarded into an `owning_buffer<Container>`, which is returned
 * through a pointer to the `buffer` interface.
 *
 * @tparam Container Type of the container to construct the buffer from
 * @param data_owner Container forwarded into the new `owning_buffer`
 * @return Owning pointer to the newly created buffer
 */
81  template <typename Container>
82  static std::unique_ptr<buffer> create(Container&& data_owner)
83  {
84  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
85  }
86  };
87 
/**
 * @brief Creates a source from a file path.
 *
 * @param filepath Path of the file to use as the source
 * @param offset Defaults to 0; presumably the starting byte offset within the file - TODO confirm
 * @param max_size_estimate Defaults to 0; presumably an upper estimate of the number of bytes
 * that will be read - TODO confirm
 * @return Owning pointer to the newly created datasource
 */
103  static std::unique_ptr<datasource> create(std::string const& filepath,
104  size_t offset = 0,
105  size_t max_size_estimate = 0);
106 
/**
 * @brief Creates a source from a host memory buffer.
 *
 * @param buffer Read-only span of host bytes to use as the source
 * (NOTE(review): whether the bytes are copied or only referenced is not visible here - confirm)
 * @return Owning pointer to the newly created datasource
 */
113  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
114 
/**
 * @brief Creates a source from a device memory buffer.
 *
 * @param buffer Read-only span of device bytes to use as the source
 * (NOTE(review): whether the bytes are copied or only referenced is not visible here - confirm)
 * @return Owning pointer to the newly created datasource
 */
121  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
122 
/**
 * @brief Creates a source from a user-implemented datasource object.
 *
 * @param source Pointer to the user-implemented datasource
 * (NOTE(review): ownership and lifetime expectations for `source` are not visible here - confirm)
 * @return Owning pointer to the newly created datasource
 */
129  static std::unique_ptr<datasource> create(datasource* source);
130 
/**
 * @brief Creates a vector of datasources, one per element in the input vector.
 *
 * Each element is passed to the matching single-argument `datasource::create` overload, and the
 * results are returned in the same order as the inputs.
 *
 * NOTE(review): this uses `std::transform`, `std::back_inserter` and `std::vector`, but
 * `<algorithm>`, `<iterator>` and `<vector>` are not among the includes visible in this listing -
 * confirm they are included directly rather than relied on transitively.
 *
 * @tparam T Argument type accepted by one of the `datasource::create` overloads
 * @param args Vector of arguments, one per datasource to create
 * @return Vector of owning pointers to the created datasources
 */
137  template <typename T>
138  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
139  {
140  std::vector<std::unique_ptr<datasource>> sources;
// Reserve up front so appending one source per argument does not reallocate.
141  sources.reserve(args.size());
142  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
143  return datasource::create(arg);
144  });
145  return sources;
146  }
147 
/**
 * @brief Base class destructor.
 */
151  virtual ~datasource() = default;
152 
/**
 * @brief Returns a buffer with a subset of data from the source.
 *
 * Pure virtual: every concrete datasource must implement it.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @return Owning pointer to a buffer holding the requested data
 */
161  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
162 
/**
 * @brief Asynchronously reads a specified portion of data from the datasource.
 *
 * Virtual but not pure; only the declaration is visible in this listing.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @return Future that yields an owning pointer to a buffer holding the requested data
 */
175  virtual std::future<std::unique_ptr<datasource::buffer>> host_read_async(size_t offset,
176  size_t size);
177 
/**
 * @brief Reads a selected range into a preallocated buffer.
 *
 * Pure virtual: every concrete datasource must implement it.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @param dst Address of the preallocated destination buffer
 * @return A `size_t` result (presumably the number of bytes read - confirm)
 */
187  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
188 
/**
 * @brief Asynchronously reads data from the source into the provided host memory buffer.
 *
 * Virtual but not pure; only the declaration is visible in this listing.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @param dst Address of the destination host memory buffer
 * @return Future yielding a `size_t` (presumably the number of bytes read - confirm)
 */
203  virtual std::future<size_t> host_read_async(size_t offset, size_t size, uint8_t* dst);
204 
/**
 * @brief Whether or not this source supports reading directly into device memory.
 *
 * The base implementation returns false; sources that implement the `device_read` family are
 * expected to override this.
 *
 * @return true if the source supports direct device reads, false otherwise
 */
217  [[nodiscard]] virtual bool supports_device_read() const { return false; }
218 
/**
 * @brief Estimates whether a direct device read would be more optimal for the given size.
 *
 * The base implementation ignores `size` and simply reports `supports_device_read()`.
 *
 * @param size Size of the prospective read (presumably in bytes - confirm)
 * @return true if a device read is preferred for this size, false otherwise
 */
225  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
226  {
227  return supports_device_read();
228  }
229 
/**
 * @brief Returns a device buffer with a subset of data from the source.
 *
 * The base implementation does not read anything: it unconditionally invokes `CUDF_FAIL`, so
 * sources that support device reads must override this function.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @param stream CUDA stream passed to the read (unused by the base implementation)
 * @return Owning pointer to a buffer holding the requested data
 */
246  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
247  size_t size,
248  rmm::cuda_stream_view stream)
249  {
250  CUDF_FAIL("datasource classes that support device_read must override it.");
251  }
252 
/**
 * @brief Reads a selected range into a preallocated device buffer.
 *
 * The base implementation does not read anything: it unconditionally invokes `CUDF_FAIL`, so
 * sources that support device reads must override this function.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @param dst Address of the preallocated device destination buffer
 * @param stream CUDA stream passed to the read (unused by the base implementation)
 * @return A `size_t` result (presumably the number of bytes read - confirm)
 */
270  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
271  {
272  CUDF_FAIL("datasource classes that support device_read must override it.");
273  }
274 
/**
 * @brief Asynchronously reads a selected range into a preallocated device buffer.
 *
 * The base implementation does not start a read: it unconditionally invokes `CUDF_FAIL`, so
 * sources that support asynchronous device reads must override this function.
 *
 * @param offset Position of the requested range within the source (presumably in bytes - confirm)
 * @param size Length of the requested range (presumably in bytes - confirm)
 * @param dst Address of the preallocated device destination buffer
 * @param stream CUDA stream passed to the read (unused by the base implementation)
 * @return Future yielding a `size_t` (presumably the number of bytes read - confirm)
 */
298  virtual std::future<size_t> device_read_async(size_t offset,
299  size_t size,
300  uint8_t* dst,
301  rmm::cuda_stream_view stream)
302  {
303  CUDF_FAIL("datasource classes that support device_read_async must override it.");
304  }
305 
/**
 * @brief Returns the size of the data in the source.
 *
 * Pure virtual: every concrete datasource must implement it.
 *
 * @return Size of the data in the source (presumably in bytes - confirm)
 */
311  [[nodiscard]] virtual size_t size() const = 0;
312 
/**
 * @brief Returns whether the source contains any data.
 *
 * The base implementation reports true exactly when `size()` is zero.
 *
 * @return true if the source is empty, false otherwise
 */
318  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
319 
/**
 * @brief Implementation for a non-owning buffer where the datasource holds the buffer until
 * destruction.
 *
 * Only a pointer and a size are stored; this class never allocates or frees the memory they
 * describe.
 */
323  class non_owning_buffer : public buffer {
324  public:
/**
 * @brief Constructs an empty buffer: null data pointer and zero size (see member initializers).
 */
325  non_owning_buffer() = default;
326 
/**
 * @brief Construct a new non-owning buffer object.
 *
 * @param data Address of the data the buffer refers to
 * @param size Size of the buffer
 */
333  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
334 
/**
 * @brief Returns the size of the buffer.
 *
 * @return The size value supplied at construction (0 when default-constructed)
 */
340  [[nodiscard]] size_t size() const override { return _size; }
341 
/**
 * @brief Returns the pointer to the buffer.
 *
 * @return The pointer supplied at construction (nullptr when default-constructed)
 */
347  [[nodiscard]] uint8_t const* data() const override { return _data; }
348 
349  private:
// Non-owning view of the data; null for a default-constructed buffer.
350  uint8_t const* _data{nullptr};
// Size reported by size(); zero for a default-constructed buffer.
351  size_t _size{0};
352  };
353 
/**
 * @brief Derived implementation of `buffer` that owns the data.
 *
 * The container is moved into the object and kept alive for the object's lifetime, so the
 * pointer returned by `data()` is backed by storage owned by this buffer.
 *
 * @tparam Container Type of the container object that owns the data
 */
361  template <typename Container>
362  class owning_buffer : public buffer {
363  public:
364  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
365  // reference).
// If Container is itself an lvalue reference type, `Container&&` collapses to an lvalue
// reference and this assertion rejects the instantiation at compile time.
366  static_assert(std::is_rvalue_reference_v<Container&&>,
367  "The container argument passed to the constructor must be an rvalue.");
368 
/**
 * @brief Moves the input container into the newly created object.
 *
 * The exposed range is the whole container: `_data.data()` and `_data.size()` are read from the
 * member after the move. This relies on `_data` being declared before `_data_ptr` and `_size`,
 * since members are initialized in declaration order.
 *
 * NOTE(review): this single-argument constructor is not `explicit`, so it also acts as an
 * implicit conversion from `Container&&` - confirm that is intended.
 *
 * @param moved_data_owner The container to construct the buffer from; moved from on return
 */
375  owning_buffer(Container&& moved_data_owner)
376  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
377  {
378  }
379 
/**
 * @brief Moves the input container into the newly created object, and exposes a subspan of the
 * buffer.
 *
 * The pointer and size are stored as given; they are not checked against the container.
 *
 * @param moved_data_owner The container to construct the buffer from; moved from on return
 * @param data_ptr Pointer to the start of the exposed range
 * @param size Size of the exposed range
 */
389  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
390  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
391  {
392  }
393 
/**
 * @brief Returns the size of the buffer.
 *
 * @return The size of the exposed range
 */
399  [[nodiscard]] size_t size() const override { return _size; }
400 
/**
 * @brief Returns the pointer to the data in the buffer.
 *
 * @return The stored data pointer, converted from `void const*` to `uint8_t const*`
 */
406  [[nodiscard]] uint8_t const* data() const override
407  {
408  return static_cast<uint8_t const*>(_data_ptr);
409  }
410 
411  private:
// Owner of the underlying storage; declared first so it is initialized before the members below.
412  Container _data;
// Start of the exposed range; kept as void const* and cast to uint8_t const* in data().
413  void const* _data_ptr;
// Size of the exposed range, as reported by size().
414  size_t _size;
415  };
416 };
417 
/**
 * @brief Constructs datasources from dataset source information.
 *
 * @param info Source information for the read interfaces
 * @param offset Defaults to 0; presumably the starting byte offset applied to the sources -
 * TODO confirm
 * @param max_size_estimate Defaults to 0; presumably an upper estimate of the number of bytes
 * that will be read - TODO confirm
 * @return Vector of owning pointers to the constructed datasources
 */
429 std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
430  size_t offset = 0,
431  size_t max_size_estimate = 0);
432  // end of group
434 } // namespace io
435 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:40
virtual ~buffer()=default
Base class destructor.
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:82
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation for a non-owning buffer where the datasource holds the buffer until destruction.
Definition: datasource.hpp:323
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:340
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:347
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non-owning buffer object.
Definition: datasource.hpp:333
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:362
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:375
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:389
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:399
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:406
Interface class for providing input data to the readers.
Definition: datasource.hpp:31
virtual ~datasource()=default
Base class destructor.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:138
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:217
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual std::future< std::unique_ptr< datasource::buffer > > host_read_async(size_t offset, size_t size)
Asynchronously reads a specified portion of data from the datasource.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:270
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:225
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:298
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:318
virtual std::future< size_t > host_read_async(size_t offset, size_t size, uint8_t *dst)
Asynchronously reads data from the source into the provided host memory buffer.
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:246
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0)
Creates a source from a file path.
std::vector< std::unique_ptr< cudf::io::datasource > > make_datasources(source_info const &info, size_t offset=0, size_t max_size_estimate=0)
Constructs datasources from dataset source information.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, output_nullability null_policy=output_nullability::PRESERVE, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:182
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
Definition: span.hpp:320
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
APIs for spans.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Source information for read interfaces.
Definition: io/types.hpp:316