datasource.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/types.hpp>
10 #include <cudf/utilities/export.hpp>
11 #include <cudf/utilities/span.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <future>
16 #include <memory>
17 
18 namespace CUDF_EXPORT cudf {
20 namespace io {
21 
31 class datasource {
32  public:
33  template <typename Container>
34  class owning_buffer; // forward declaration
40  class buffer {
41  public:
47  [[nodiscard]] virtual size_t size() const = 0;
48 
54  [[nodiscard]] virtual uint8_t const* data() const = 0;
55 
59  virtual ~buffer() = default;
60 
68  template <typename Container>
69  static std::unique_ptr<buffer> create(Container&& data_owner)
70  {
71  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
72  }
73  };
74 
90  static std::unique_ptr<datasource> create(std::string const& filepath,
91  size_t offset = 0,
92  size_t max_size_estimate = 0);
93 
100  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
101 
108  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
109 
116  static std::unique_ptr<datasource> create(datasource* source);
117 
124  template <typename T>
125  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
126  {
127  std::vector<std::unique_ptr<datasource>> sources;
128  sources.reserve(args.size());
129  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
130  return datasource::create(arg);
131  });
132  return sources;
133  }
134 
138  virtual ~datasource() = default;
139 
148  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
149 
162  virtual std::future<std::unique_ptr<datasource::buffer>> host_read_async(size_t offset,
163  size_t size);
164 
174  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
175 
190  virtual std::future<size_t> host_read_async(size_t offset, size_t size, uint8_t* dst);
191 
204  [[nodiscard]] virtual bool supports_device_read() const { return false; }
205 
212  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
213  {
214  return supports_device_read();
215  }
216 
233  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
234  size_t size,
235  rmm::cuda_stream_view stream)
236  {
237  CUDF_FAIL("datasource classes that support device_read must override it.");
238  }
239 
257  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
258  {
259  CUDF_FAIL("datasource classes that support device_read must override it.");
260  }
261 
285  virtual std::future<size_t> device_read_async(size_t offset,
286  size_t size,
287  uint8_t* dst,
288  rmm::cuda_stream_view stream)
289  {
290  CUDF_FAIL("datasource classes that support device_read_async must override it.");
291  }
292 
298  [[nodiscard]] virtual size_t size() const = 0;
299 
305  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
306 
310  class non_owning_buffer : public buffer {
311  public:
312  non_owning_buffer() = default;
313 
320  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
321 
327  [[nodiscard]] size_t size() const override { return _size; }
328 
334  [[nodiscard]] uint8_t const* data() const override { return _data; }
335 
336  private:
337  uint8_t const* _data{nullptr};
338  size_t _size{0};
339  };
340 
348  template <typename Container>
349  class owning_buffer : public buffer {
350  public:
351  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
352  // reference).
353  static_assert(std::is_rvalue_reference_v<Container&&>,
354  "The container argument passed to the constructor must be an rvalue.");
355 
362  owning_buffer(Container&& moved_data_owner)
363  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
364  {
365  }
366 
376  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
377  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
378  {
379  }
380 
386  [[nodiscard]] size_t size() const override { return _size; }
387 
393  [[nodiscard]] uint8_t const* data() const override
394  {
395  return static_cast<uint8_t const*>(_data_ptr);
396  }
397 
398  private:
399  Container _data;
400  void const* _data_ptr;
401  size_t _size;
402  };
403 };
404  // end of group
406 } // namespace io
407 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:40
virtual ~buffer()=default
Base class destructor.
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:69
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation of a non-owning buffer, where the datasource holds the buffer until destruction.
Definition: datasource.hpp:310
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:327
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:334
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non owning buffer object.
Definition: datasource.hpp:320
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:349
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:362
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:376
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:386
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:393
Interface class for providing input data to the readers.
Definition: datasource.hpp:31
virtual ~datasource()=default
Base class destructor.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:125
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:204
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual std::future< std::unique_ptr< datasource::buffer > > host_read_async(size_t offset, size_t size)
Asynchronously reads a specified portion of data from the datasource.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:257
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:212
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:285
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:305
virtual std::future< size_t > host_read_async(size_t offset, size_t size, uint8_t *dst)
Asynchronously reads data from the source into the provided host memory buffer.
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:233
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0)
Creates a source from a file path.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:182
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:323
C++20 std::span with reduced feature set.
Definition: span.hpp:182