datasource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
#include <cudf/io/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <future>
#include <iterator>
#include <memory>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
27 
28 namespace cudf {
30 namespace io {
31 
41 class datasource {
42  public:
43  template <typename Container>
44  class owning_buffer; // forward declaration
50  class buffer {
51  public:
57  [[nodiscard]] virtual size_t size() const = 0;
58 
64  [[nodiscard]] virtual uint8_t const* data() const = 0;
65 
69  virtual ~buffer() {}
70 
78  template <typename Container>
79  static std::unique_ptr<buffer> create(Container&& data_owner)
80  {
81  return std::make_unique<owning_buffer<Container>>(std::move(data_owner));
82  }
83  };
84 
93  static std::unique_ptr<datasource> create(std::string const& filepath,
94  size_t offset = 0,
95  size_t size = 0);
96 
105  static std::unique_ptr<datasource> create(host_buffer const& buffer);
106 
113  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
114 
121  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
122 
129  static std::unique_ptr<datasource> create(datasource* source);
130 
137  template <typename T>
138  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
139  {
140  std::vector<std::unique_ptr<datasource>> sources;
141  sources.reserve(args.size());
142  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
143  return datasource::create(arg);
144  });
145  return sources;
146  }
147 
151  virtual ~datasource(){};
152 
161  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
162 
172  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
173 
186  [[nodiscard]] virtual bool supports_device_read() const { return false; }
187 
194  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
195  {
196  return supports_device_read();
197  }
198 
215  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
216  size_t size,
217  rmm::cuda_stream_view stream)
218  {
219  CUDF_FAIL("datasource classes that support device_read must override it.");
220  }
221 
239  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
240  {
241  CUDF_FAIL("datasource classes that support device_read must override it.");
242  }
243 
264  virtual std::future<size_t> device_read_async(size_t offset,
265  size_t size,
266  uint8_t* dst,
267  rmm::cuda_stream_view stream)
268  {
269  CUDF_FAIL("datasource classes that support device_read_async must override it.");
270  }
271 
277  [[nodiscard]] virtual size_t size() const = 0;
278 
284  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
285 
289  class non_owning_buffer : public buffer {
290  public:
291  non_owning_buffer() {}
292 
299  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
300 
306  [[nodiscard]] size_t size() const override { return _size; }
307 
313  [[nodiscard]] uint8_t const* data() const override { return _data; }
314 
315  private:
316  uint8_t const* _data{nullptr};
317  size_t _size{0};
318  };
319 
327  template <typename Container>
328  class owning_buffer : public buffer {
329  public:
335  owning_buffer(Container&& data_owner)
336  : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size())
337  {
338  }
339 
348  owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size)
349  : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size)
350  {
351  }
352 
358  [[nodiscard]] size_t size() const override { return _size; }
359 
365  [[nodiscard]] uint8_t const* data() const override
366  {
367  return static_cast<uint8_t const*>(_data_ptr);
368  }
369 
370  private:
371  Container _data;
372  void const* _data_ptr;
373  size_t _size;
374  };
375 };
376  // end of group
378 } // namespace io
379 } // namespace cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:50
virtual ~buffer()
Base class destructor.
Definition: datasource.hpp:69
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:79
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation of a non-owning buffer, where the datasource holds the buffer until destruction.
Definition: datasource.hpp:289
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:306
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:313
non_owning_buffer(uint8_t const *data, size_t size)
Construct a new non-owning buffer object.
Definition: datasource.hpp:299
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:328
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:358
owning_buffer(Container &&data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:335
owning_buffer(Container &&data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:348
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:365
Interface class for providing input data to the readers.
Definition: datasource.hpp:41
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t size=0)
Creates a source from a file path.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:138
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:186
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:239
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:194
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:264
virtual bool is_empty() const
Returns whether the source contains any data.
Definition: datasource.hpp:284
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
static std::unique_ptr< datasource > create(host_buffer const &buffer)
Creates a source from a host memory buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:215
virtual ~datasource()
Base class destructor.
Definition: datasource.hpp:151
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
std::unique_ptr< column > transform(column_view const &input, std::string const &unary_udf, data_type output_type, bool is_ptx, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Creates a new column by applying a unary function against every element of an input column.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:215
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:34
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:291
C++20 std::span with reduced feature set.
Definition: span.hpp:224
Non-owning view of a host memory buffer.
Definition: io/types.hpp:261