datasource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
20 #include <cudf/utilities/error.hpp>
21 #include <cudf/utilities/export.hpp>
22 #include <cudf/utilities/span.hpp>
23 
24 #include <rmm/cuda_stream_view.hpp>
25 
26 #include <future>
27 #include <memory>
28 
29 namespace CUDF_EXPORT cudf {
31 namespace io {
32 
42 class datasource {
43  public:
/**
 * Forward declaration of the nested owning_buffer class template. The full
 * definition appears later in datasource; the name must already be declared
 * here because buffer::create() refers to owning_buffer<Container> in its body.
 */
44  template <typename Container>
45  class owning_buffer; // forward declaration
/**
 * @brief Interface class for buffers that the datasource returns to the caller.
 *
 * Derived classes override size() and data(); both are pure virtual here.
 */
51  class buffer {
52  public:
    /**
     * @brief Returns the buffer size in bytes.
     *
     * @return Buffer size in bytes
     */
58  [[nodiscard]] virtual size_t size() const = 0;
59 
    /**
     * @brief Returns the address of the data in the buffer.
     *
     * @return Address of the data, as a pointer to const bytes
     */
65  [[nodiscard]] virtual uint8_t const* data() const = 0;
66 
    /**
     * @brief Base class destructor.
     *
     * Virtual so that a derived buffer can be destroyed through a pointer to buffer.
     */
70  virtual ~buffer() = default;
71 
    /**
     * @brief Factory to construct a datasource buffer object from a container.
     *
     * Forwards `data_owner` into a newly allocated owning_buffer<Container> and
     * returns it through a pointer to the buffer base class. owning_buffer
     * static_asserts that `Container&&` is an rvalue reference, so passing an
     * lvalue (which deduces `Container` as an lvalue reference) fails to compile.
     *
     * @tparam Container Type of the container to construct the buffer from
     * @param data_owner The container to move into the new buffer
     * @return Buffer that holds the moved-in container
     */
79  template <typename Container>
80  static std::unique_ptr<buffer> create(Container&& data_owner)
81  {
82  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
83  }
84  };
85 
/**
 * @brief Creates a source from a file path.
 *
 * Only the declaration is visible in this header; the definition lives elsewhere.
 *
 * @param filepath Path to the file to use
 * @param offset Defaults to 0. NOTE(review): presumably a starting byte offset
 * into the file -- confirm against the implementation
 * @param max_size_estimate Defaults to 0. NOTE(review): presumably an
 * upper-bound estimate of how many bytes will be read -- confirm against the
 * implementation
 * @return Constructed datasource object
 */
101  static std::unique_ptr<datasource> create(std::string const& filepath,
102  size_t offset = 0,
103  size_t max_size_estimate = 0);
104 
/**
 * @brief Creates a source from a host memory buffer.
 *
 * NOTE(review): this header does not show whether the bytes are copied or only
 * referenced; confirm how long the memory behind `buffer` must stay valid.
 *
 * @param buffer Span of read-only bytes in host memory
 * @return Constructed datasource object
 */
111  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
112 
/**
 * @brief Creates a source from a device memory buffer.
 *
 * NOTE(review): this header does not show whether the bytes are copied or only
 * referenced; confirm how long the memory behind `buffer` must stay valid.
 *
 * @param buffer Span of read-only bytes in device memory
 * @return Constructed datasource object
 */
119  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
120 
/**
 * @brief Creates a source from a user-implemented datasource object.
 *
 * NOTE(review): `source` is a raw pointer and this header does not show
 * whether the returned object takes ownership of it or merely wraps it;
 * confirm the lifetime contract against the implementation.
 *
 * @param source Pointer to the user-implemented datasource
 * @return Constructed datasource object
 */
127  static std::unique_ptr<datasource> create(datasource* source);
128 
/**
 * @brief Creates a vector of datasources, one per element in the input vector.
 *
 * Each element is passed to the single-argument datasource::create() overload
 * selected by its type, and the results are appended in input order.
 *
 * @tparam T Element type; must be accepted by one of the create() overloads
 * @param args Arguments from which to build the datasources, one per source
 * @return Vector of constructed datasource objects, the same length as `args`
 */
135  template <typename T>
136  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
137  {
138  std::vector<std::unique_ptr<datasource>> sources;
    // The output size is known up front, so allocate once before appending.
139  sources.reserve(args.size());
140  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
141  return datasource::create(arg);
142  });
143  return sources;
144  }
145 
/**
 * @brief Base class destructor.
 *
 * Virtual so that a derived datasource can be destroyed through a pointer to
 * datasource.
 */
149  virtual ~datasource() = default;
150 
/**
 * @brief Returns a buffer with a subset of data from the source.
 *
 * Pure virtual: every datasource implementation must provide it.
 *
 * @param offset Start of the requested range (presumably in bytes from the
 * beginning of the source -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @return Buffer holding the requested data
 */
159  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
160 
/**
 * @brief Asynchronously reads a specified portion of data from the datasource.
 *
 * Virtual but not pure: a base implementation is declared here and defined
 * outside this header, so derived classes may override it but need not.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @return Future that yields a buffer holding the requested data
 */
173  virtual std::future<std::unique_ptr<datasource::buffer>> host_read_async(size_t offset,
174  size_t size);
175 
/**
 * @brief Reads a selected range into a preallocated buffer.
 *
 * Pure virtual: every datasource implementation must provide it.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @param dst Destination memory, allocated by the caller
 * @return A size_t result (presumably the number of bytes actually read --
 * confirm against the implementations)
 */
185  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
186 
/**
 * @brief Asynchronously reads data from the source into the provided host
 * memory buffer.
 *
 * Virtual but not pure: a base implementation is declared here and defined
 * outside this header, so derived classes may override it but need not.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @param dst Destination host memory, allocated by the caller
 * @return Future that yields a size_t (presumably the number of bytes read --
 * confirm against the implementation)
 */
201  virtual std::future<size_t> host_read_async(size_t offset, size_t size, uint8_t* dst);
202 
/**
 * @brief Whether or not this source supports reading directly into device
 * memory.
 *
 * The base implementation returns false; sources that implement the
 * device_read functions are expected to override this to return true.
 *
 * @return true if direct device reads are supported, false otherwise
 */
215  [[nodiscard]] virtual bool supports_device_read() const { return false; }
216 
/**
 * @brief Estimates whether a direct device read would be more optimal for the
 * given size.
 *
 * The base implementation does not consult `size`: it simply reports whether
 * device reads are supported at all. Derived classes may override it with a
 * size-dependent estimate.
 *
 * @param size Size of the read under consideration (unused by this default)
 * @return Result of supports_device_read() in this default implementation
 */
223  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
224  {
225  return supports_device_read();
226  }
227 
/**
 * @brief Returns a device buffer with a subset of data from the source.
 *
 * The base implementation performs no read: it unconditionally reports an
 * error through CUDF_FAIL. Sources that support device reads must override it.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @param stream CUDA stream view passed to the read (unused by this default)
 * @return Buffer holding the requested data (from overriding implementations)
 */
244  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
245  size_t size,
246  rmm::cuda_stream_view stream)
247  {
    // No return statement follows; this relies on CUDF_FAIL not returning
    // normally (NOTE(review): confirm against its definition in error.hpp).
248  CUDF_FAIL("datasource classes that support device_read must override it.");
249  }
250 
/**
 * @brief Reads a selected range into a preallocated device buffer.
 *
 * The base implementation performs no read: it unconditionally reports an
 * error through CUDF_FAIL. Sources that support device reads must override it.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @param dst Destination device memory, allocated by the caller
 * @param stream CUDA stream view passed to the read (unused by this default)
 * @return A size_t result from overriding implementations (presumably the
 * number of bytes read -- confirm)
 */
268  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
269  {
    // No return statement follows; this relies on CUDF_FAIL not returning
    // normally (NOTE(review): confirm against its definition in error.hpp).
270  CUDF_FAIL("datasource classes that support device_read must override it.");
271  }
272 
/**
 * @brief Asynchronously reads a selected range into a preallocated device
 * buffer.
 *
 * The base implementation performs no read: it unconditionally reports an
 * error through CUDF_FAIL. Sources that support asynchronous device reads must
 * override it.
 *
 * @param offset Start of the requested range (presumably in bytes -- confirm)
 * @param size Length of the requested range (presumably in bytes -- confirm)
 * @param dst Destination device memory, allocated by the caller
 * @param stream CUDA stream view passed to the read (unused by this default)
 * @return Future that yields a size_t from overriding implementations
 * (presumably the number of bytes read -- confirm)
 */
296  virtual std::future<size_t> device_read_async(size_t offset,
297  size_t size,
298  uint8_t* dst,
299  rmm::cuda_stream_view stream)
300  {
    // No return statement follows; this relies on CUDF_FAIL not returning
    // normally (NOTE(review): confirm against its definition in error.hpp).
301  CUDF_FAIL("datasource classes that support device_read_async must override it.");
302  }
303 
/**
 * @brief Returns the size of the data in the source.
 *
 * Pure virtual: every datasource implementation must provide it.
 *
 * @return Size of the data in the source (presumably in bytes -- confirm)
 */
309  [[nodiscard]] virtual size_t size() const = 0;
310 
/**
 * @brief Returns whether the source contains any data.
 *
 * The base implementation compares size() against zero; it is virtual, so a
 * derived class may replace it.
 *
 * @return true if size() is 0, false otherwise
 */
316  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
317 
/**
 * @brief Implementation of buffer that does not own the data it refers to.
 *
 * It stores only a pointer and a size and never frees anything, so the memory
 * must be kept alive by something else (per the class's original brief, the
 * datasource holds the data until its destruction).
 */
321  class non_owning_buffer : public buffer {
322  public:
    /// Constructs an empty buffer: data() is nullptr and size() is 0.
323  non_owning_buffer() = default;
324 
    /**
     * @brief Constructs a new non-owning buffer object.
     *
     * @param data Pointer to the data; stored as given, not copied
     * @param size Size of the data; returned unchanged by size()
     */
331  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
332 
    /**
     * @brief Returns the size of the buffer.
     *
     * @return The size passed at construction, or 0 if default-constructed
     */
338  [[nodiscard]] size_t size() const override { return _size; }
339 
    /**
     * @brief Returns the pointer to the buffer.
     *
     * @return The pointer passed at construction, or nullptr if
     * default-constructed
     */
345  [[nodiscard]] uint8_t const* data() const override { return _data; }
346 
347  private:
348  uint8_t const* _data{nullptr};  // Not owned; never freed by this class
349  size_t _size{0};                // Size reported by size()
350  };
351 
/**
 * @brief Derived implementation of buffer that owns the data.
 *
 * The container is moved into the object and stored as a member, so it lives
 * exactly as long as the buffer does.
 *
 * @tparam Container Type of the container object that owns the data. The
 * constructors use `.data()` and `.size()` on it.
 */
359  template <typename Container>
360  class owning_buffer : public buffer {
361  public:
362  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
363  // reference).
364  static_assert(std::is_rvalue_reference_v<Container&&>,
365  "The container argument passed to the constructor must be an rvalue.");
366 
    /**
     * @brief Moves the input container into the newly created object.
     *
     * The exposed pointer and size are taken from the stored container after
     * the move (`_data.data()` and `_data.size()`), not from the argument.
     * NOTE(review): size() therefore reports the container's element count,
     * which equals a byte count only for containers of one-byte elements.
     *
     * @param moved_data_owner The container to construct the buffer from;
     * it is moved from
     */
373  owning_buffer(Container&& moved_data_owner)
374  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
375  {
376  }
377 
    /**
     * @brief Moves the input container into the newly created object, and
     * exposes a subspan of the buffer.
     *
     * `data_ptr` and `size` are stored as given; nothing here checks that they
     * lie within the container. NOTE(review): `data_ptr` is computed by the
     * caller before the move, so it stays valid only if moving the container
     * does not relocate its storage.
     *
     * @param moved_data_owner The container to construct the buffer from;
     * it is moved from
     * @param data_ptr Pointer to the start of the range to expose
     * @param size Size of the range to expose
     */
387  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
388  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
389  {
390  }
391 
    /**
     * @brief Returns the size of the buffer.
     *
     * @return The size recorded at construction
     */
397  [[nodiscard]] size_t size() const override { return _size; }
398 
    /**
     * @brief Returns the pointer to the data in the buffer.
     *
     * @return The pointer recorded at construction, viewed as const bytes
     */
404  [[nodiscard]] uint8_t const* data() const override
405  {
    // Stored as void const* because the container's element type is not
    // fixed; converted back to a byte pointer for the buffer interface.
406  return static_cast<uint8_t const*>(_data_ptr);
407  }
408 
409  private:
    // Declaration order matters: _data is initialized first, and the
    // single-argument constructor then reads _data to set the other two.
410  Container _data;
411  void const* _data_ptr;
412  size_t _size;
413  };
414 };
415  // end of group
417 } // namespace io
418 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:51
virtual ~buffer()=default
Base class destructor.
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:80
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation of a non-owning buffer, where the datasource holds the buffer until destruction.
Definition: datasource.hpp:321
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:338
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:345
non_owning_buffer(uint8_t const *data, size_t size)
Constructs a new non-owning buffer object.
Definition: datasource.hpp:331
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:360
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:373
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:387
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:397
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:404
Interface class for providing input data to the readers.
Definition: datasource.hpp:42
virtual ~datasource()=default
Base class destructor.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:136
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:215
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual std::future< std::unique_ptr< datasource::buffer > > host_read_async(size_t offset, size_t size)
Asynchronously reads a specified portion of data from the datasource.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:268
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:223
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:296
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:316
virtual std::future< size_t > host_read_async(size_t offset, size_t size, uint8_t *dst)
Asynchronously reads data from the source into the provided host memory buffer.
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:244
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0)
Creates a source from a file path.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:193
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:37
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:355
C++20 std::span with reduced feature set.
Definition: span.hpp:194