datasource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
20 #include <cudf/utilities/error.hpp>
21 #include <cudf/utilities/export.hpp>
22 #include <cudf/utilities/span.hpp>
23 
24 #include <rmm/cuda_stream_view.hpp>
25 
26 #include <future>
27 #include <memory>
28 
29 namespace CUDF_EXPORT cudf {
31 namespace io {
32 
42 class datasource {
43  public:
44  template <typename Container>
45  class owning_buffer; // forward declaration
51  class buffer {
52  public:
58  [[nodiscard]] virtual size_t size() const = 0;
59 
65  [[nodiscard]] virtual uint8_t const* data() const = 0;
66 
70  virtual ~buffer() {}
71 
79  template <typename Container>
80  static std::unique_ptr<buffer> create(Container&& data_owner)
81  {
82  return std::make_unique<owning_buffer<Container>>(std::move(data_owner));
83  }
84  };
85 
94  static std::unique_ptr<datasource> create(std::string const& filepath,
95  size_t offset = 0,
96  size_t size = 0);
97 
106  static std::unique_ptr<datasource> create(host_buffer const& buffer);
107 
114  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
115 
122  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
123 
130  static std::unique_ptr<datasource> create(datasource* source);
131 
138  template <typename T>
139  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
140  {
141  std::vector<std::unique_ptr<datasource>> sources;
142  sources.reserve(args.size());
143  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
144  return datasource::create(arg);
145  });
146  return sources;
147  }
148 
152  virtual ~datasource(){};
153 
162  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
163 
173  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
174 
187  [[nodiscard]] virtual bool supports_device_read() const { return false; }
188 
195  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
196  {
197  return supports_device_read();
198  }
199 
216  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
217  size_t size,
218  rmm::cuda_stream_view stream)
219  {
220  CUDF_FAIL("datasource classes that support device_read must override it.");
221  }
222 
240  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
241  {
242  CUDF_FAIL("datasource classes that support device_read must override it.");
243  }
244 
265  virtual std::future<size_t> device_read_async(size_t offset,
266  size_t size,
267  uint8_t* dst,
268  rmm::cuda_stream_view stream)
269  {
270  CUDF_FAIL("datasource classes that support device_read_async must override it.");
271  }
272 
278  [[nodiscard]] virtual size_t size() const = 0;
279 
285  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
286 
290  class non_owning_buffer : public buffer {
291  public:
292  non_owning_buffer() {}
293 
300  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
301 
307  [[nodiscard]] size_t size() const override { return _size; }
308 
314  [[nodiscard]] uint8_t const* data() const override { return _data; }
315 
316  private:
317  uint8_t const* _data{nullptr};
318  size_t _size{0};
319  };
320 
328  template <typename Container>
329  class owning_buffer : public buffer {
330  public:
336  owning_buffer(Container&& data_owner)
337  : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size())
338  {
339  }
340 
349  owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size)
350  : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size)
351  {
352  }
353 
359  [[nodiscard]] size_t size() const override { return _size; }
360 
366  [[nodiscard]] uint8_t const* data() const override
367  {
368  return static_cast<uint8_t const*>(_data_ptr);
369  }
370 
371  private:
372  Container _data;
373  void const* _data_ptr;
374  size_t _size;
375  };
376 };
377  // end of group
379 } // namespace io
380 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:51
virtual ~buffer()
Base class destructor.
Definition: datasource.hpp:70
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:80
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation for non owning buffer where datasource holds buffer until destruction.
Definition: datasource.hpp:290
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:307
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:314
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non owning buffer object.
Definition: datasource.hpp:300
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:329
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:359
owning_buffer(Container &&data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:336
owning_buffer(Container &&data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:349
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:366
Interface class for providing input data to the readers.
Definition: datasource.hpp:42
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t size=0)
Creates a source from a file path.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:139
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:187
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:240
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:195
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:265
virtual bool is_empty() const
Returns whether the source is empty, i.e. contains no data.
Definition: datasource.hpp:285
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
static std::unique_ptr< datasource > create(host_buffer const &buffer)
Creates a source from a host memory buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:216
virtual ~datasource()
Base class destructor.
Definition: datasource.hpp:152
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
std::unique_ptr< column > transform(column_view const &input, std::string const &unary_udf, data_type output_type, bool is_ptx, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Creates a new column by applying a unary function against every element of an input column.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:328
C++20 std::span with reduced feature set.
Definition: span.hpp:231
Non-owning view of a host memory buffer.
Definition: io/types.hpp:304