datasource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
20 #include <cudf/utilities/error.hpp>
21 #include <cudf/utilities/export.hpp>
22 #include <cudf/utilities/span.hpp>
23 
24 #include <rmm/cuda_stream_view.hpp>
25 
26 #include <future>
27 #include <memory>
28 
29 namespace CUDF_EXPORT cudf {
31 namespace io {
32 
42 class datasource {
43  public:
44  template <typename Container>
45  class owning_buffer; // forward declaration
51  class buffer {
52  public:
58  [[nodiscard]] virtual size_t size() const = 0;
59 
65  [[nodiscard]] virtual uint8_t const* data() const = 0;
66 
70  virtual ~buffer() {}
71 
79  template <typename Container>
80  static std::unique_ptr<buffer> create(Container&& data_owner)
81  {
82  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
83  }
84  };
85 
101  static std::unique_ptr<datasource> create(std::string const& filepath,
102  size_t offset = 0,
103  size_t max_size_estimate = 0);
104 
113  static std::unique_ptr<datasource> create(host_buffer const& buffer);
114 
121  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
122 
129  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
130 
137  static std::unique_ptr<datasource> create(datasource* source);
138 
145  template <typename T>
146  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
147  {
148  std::vector<std::unique_ptr<datasource>> sources;
149  sources.reserve(args.size());
150  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
151  return datasource::create(arg);
152  });
153  return sources;
154  }
155 
159  virtual ~datasource(){};
160 
169  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
170 
180  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
181 
194  [[nodiscard]] virtual bool supports_device_read() const { return false; }
195 
202  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
203  {
204  return supports_device_read();
205  }
206 
223  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
224  size_t size,
225  rmm::cuda_stream_view stream)
226  {
227  CUDF_FAIL("datasource classes that support device_read must override it.");
228  }
229 
247  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
248  {
249  CUDF_FAIL("datasource classes that support device_read must override it.");
250  }
251 
272  virtual std::future<size_t> device_read_async(size_t offset,
273  size_t size,
274  uint8_t* dst,
275  rmm::cuda_stream_view stream)
276  {
277  CUDF_FAIL("datasource classes that support device_read_async must override it.");
278  }
279 
285  [[nodiscard]] virtual size_t size() const = 0;
286 
292  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
293 
297  class non_owning_buffer : public buffer {
298  public:
299  non_owning_buffer() {}
300 
307  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
308 
314  [[nodiscard]] size_t size() const override { return _size; }
315 
321  [[nodiscard]] uint8_t const* data() const override { return _data; }
322 
323  private:
324  uint8_t const* _data{nullptr};
325  size_t _size{0};
326  };
327 
335  template <typename Container>
336  class owning_buffer : public buffer {
337  public:
338  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
339  // reference).
340  static_assert(std::is_rvalue_reference_v<Container&&>,
341  "The container argument passed to the constructor must be an rvalue.");
342 
349  owning_buffer(Container&& moved_data_owner)
350  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
351  {
352  }
353 
363  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
364  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
365  {
366  }
367 
373  [[nodiscard]] size_t size() const override { return _size; }
374 
380  [[nodiscard]] uint8_t const* data() const override
381  {
382  return static_cast<uint8_t const*>(_data_ptr);
383  }
384 
385  private:
386  Container _data;
387  void const* _data_ptr;
388  size_t _size;
389  };
390 };
391  // end of group
393 } // namespace io
394 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:51
virtual ~buffer()
Base class destructor.
Definition: datasource.hpp:70
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:80
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation for non owning buffer where datasource holds buffer until destruction.
Definition: datasource.hpp:297
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:314
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:321
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non owning buffer object.
Definition: datasource.hpp:307
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:336
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:349
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:363
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:373
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:380
Interface class for providing input data to the readers.
Definition: datasource.hpp:42
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:146
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:194
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:247
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more efficient for the given size.
Definition: datasource.hpp:202
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:272
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:292
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
static std::unique_ptr< datasource > create(host_buffer const &buffer)
Creates a source from a host memory buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:223
virtual ~datasource()
Base class destructor.
Definition: datasource.hpp:159
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0)
Creates a source from a file path.
std::unique_ptr< column > transform(column_view const &input, std::string const &unary_udf, data_type output_type, bool is_ptx, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a unary function against every element of an input column.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:338
C++20 std::span with reduced feature set.
Definition: span.hpp:219
Non-owning view of a host memory buffer.
Definition: io/types.hpp:304