io/types.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
11 #pragma once
12 
13 #include <cudf/table/table.hpp>
14 #include <cudf/types.hpp>
15 #include <cudf/utilities/span.hpp>
16 
17 #include <map>
18 #include <memory>
19 #include <optional>
20 #include <string>
21 #include <unordered_map>
22 #include <utility>
23 #include <vector>
24 
25 namespace CUDF_EXPORT cudf {
27 namespace io {
28 class data_sink;
29 class datasource;
30 } // namespace io
31 } // namespace CUDF_EXPORT cudf
32 
34 namespace CUDF_EXPORT cudf {
36 namespace io {
/**
 * @brief Compression algorithms.
 *
 * The numeric values are written out explicitly; they are the same values the
 * enumerators receive from implicit, in-order numbering.
 */
enum class compression_type : int32_t {
  NONE   = 0,   ///< No compression
  AUTO   = 1,   ///< Automatically detect or select compression format
  SNAPPY = 2,   ///< Snappy format
  GZIP   = 3,   ///< GZIP format
  BZIP2  = 4,   ///< BZIP2 format, using Burrows-Wheeler transform
  BROTLI = 5,   ///< BROTLI format
  ZIP    = 6,   ///< ZIP format, using DEFLATE algorithm
  XZ     = 7,   ///< XZ format, using LZMA(2) algorithm
  ZLIB   = 8,   ///< ZLIB format
  LZ4    = 9,   ///< LZ4 format
  LZO    = 10,  ///< LZO format
  ZSTD   = 11,  ///< ZSTD format
};
60 
/**
 * @brief Data source or destination types.
 *
 * NOTE(review): DEVICE_BUFFER and USER_IMPLEMENTED are referenced by the
 * source_info / sink_info constructors in this header, so they must be declared
 * here. Their positions (third and fifth) should be confirmed against the
 * upstream header, since they determine the numeric values.
 */
enum class io_type : int32_t {
  FILEPATH,          ///< Input/output is a file path
  HOST_BUFFER,       ///< Input/output is a buffer in host memory
  DEVICE_BUFFER,     ///< Input/output is a buffer in device memory
  VOID,              ///< Input/output is nothing. No work is done. Useful for benchmarking
  USER_IMPLEMENTED,  ///< Input/output is handled by a custom user class
};
71 
/**
 * @brief Behavior when handling quotations in field data.
 *
 * Values are spelled out and equal the implicit in-order numbering.
 */
enum class quote_style : int32_t {
  MINIMAL    = 0,  ///< Quote only fields which contain special characters
  ALL        = 1,  ///< Quote all fields
  NONNUMERIC = 2,  ///< Quote all non-numeric fields
  NONE       = 3,  ///< Do not quote fields
};
81 
/**
 * @brief Column statistics granularity type for parquet/orc writers.
 *
 * Unscoped enum: the enumerators are visible directly in the enclosing namespace.
 */
enum statistics_freq : int32_t {
  STATISTICS_NONE     = 0,  ///< No column statistics
  STATISTICS_ROWGROUP = 1,  ///< Per-Rowgroup column statistics
  STATISTICS_PAGE     = 2,  ///< Per-page column statistics
  STATISTICS_COLUMN   = 3,  ///< Full column and offset indices. Implies STATISTICS_ROWGROUP
};
91 
/**
 * @brief Valid encodings for use with `column_in_metadata::set_encoding()`
 *
 * Values are written out explicitly; they equal the implicit numbering that
 * follows from USE_DEFAULT being -1.
 */
enum class column_encoding : int32_t {
  // Common encodings:
  USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
  DICTIONARY  = 0,   ///< Use dictionary encoding

  // Parquet encodings:
  PLAIN                   = 1,  ///< Use plain encoding
  DELTA_BINARY_PACKED     = 2,  ///< Use DELTA_BINARY_PACKED encoding
  DELTA_LENGTH_BYTE_ARRAY = 3,  ///< Use DELTA_LENGTH_BYTE_ARRAY encoding
  DELTA_BYTE_ARRAY        = 4,  ///< Use DELTA_BYTE_ARRAY encoding
  BYTE_STREAM_SPLIT       = 5,  ///< Use BYTE_STREAM_SPLIT encoding

  // ORC encodings:
  DIRECT        = 6,  ///< Use DIRECT encoding
  DIRECT_V2     = 7,  ///< Use DIRECT_V2 encoding
  DICTIONARY_V2 = 8,  ///< Use DICTIONARY_V2 encoding
};
112 
/**
 * @brief Statistics about compression performed by a writer.
 *
 * Tracks four byte counters: input bytes that were compressed, input bytes in
 * blocks that failed to compress, input bytes that were skipped, and the output
 * size of the successfully compressed blocks.
 */
class writer_compression_statistics {
 public:
  /**
   * @brief Default constructor. All counters start at zero.
   */
  writer_compression_statistics() = default;

  /**
   * @brief Constructor with initial values.
   *
   * @param num_compressed_bytes The number of bytes that were successfully compressed
   * @param num_failed_bytes The number of bytes that failed to compress
   * @param num_skipped_bytes The number of bytes that were skipped during compression
   * @param num_compressed_output_bytes The number of bytes in the compressed output
   */
  writer_compression_statistics(size_t num_compressed_bytes,
                                size_t num_failed_bytes,
                                size_t num_skipped_bytes,
                                size_t num_compressed_output_bytes)
    : _num_compressed_bytes(num_compressed_bytes),
      _num_failed_bytes(num_failed_bytes),
      _num_skipped_bytes(num_skipped_bytes),
      _num_compressed_output_bytes(num_compressed_output_bytes)
  {
  }

  /**
   * @brief Adds the values from another `writer_compression_statistics` object.
   *
   * @param other The other writer_compression_statistics object
   * @return This object, with every counter increased by the matching counter of `other`
   */
  writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept
  {
    _num_compressed_bytes += other._num_compressed_bytes;
    _num_failed_bytes += other._num_failed_bytes;
    _num_skipped_bytes += other._num_skipped_bytes;
    _num_compressed_output_bytes += other._num_compressed_output_bytes;
    return *this;
  }

  /**
   * @brief Returns the number of bytes in blocks that were successfully compressed.
   *
   * @return size_t The number of input bytes that were successfully compressed
   */
  [[nodiscard]] auto num_compressed_bytes() const noexcept { return _num_compressed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that failed to compress.
   *
   * @return size_t The number of input bytes in blocks that failed to compress
   */
  [[nodiscard]] auto num_failed_bytes() const noexcept { return _num_failed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that were skipped during compression.
   *
   * @return size_t The number of input bytes in skipped blocks
   */
  [[nodiscard]] auto num_skipped_bytes() const noexcept { return _num_skipped_bytes; }

  /**
   * @brief Returns the total size of compression inputs.
   *
   * @return size_t The sum of the compressed, failed and skipped byte counts
   */
  [[nodiscard]] auto num_total_input_bytes() const noexcept
  {
    return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes();
  }

  /**
   * @brief Returns the compression ratio for the successfully compressed blocks.
   *
   * There is no guard for a zero output size: the division is done in floating
   * point, so with IEEE-754 doubles the result is NaN when both counters are zero
   * and infinity when only the output size is zero.
   *
   * @return double Compressed input bytes divided by compressed output bytes
   */
  [[nodiscard]] auto compression_ratio() const noexcept
  {
    return static_cast<double>(num_compressed_bytes()) / _num_compressed_output_bytes;
  }

 private:
  std::size_t _num_compressed_bytes        = 0;  ///< Input bytes that were successfully compressed
  std::size_t _num_failed_bytes            = 0;  ///< Input bytes in blocks that failed to compress
  std::size_t _num_skipped_bytes           = 0;  ///< Input bytes in blocks that were skipped
  std::size_t _num_compressed_output_bytes = 0;  ///< Output bytes of the compressed blocks
};
210 
/**
 * @brief Control use of dictionary encoding for parquet writer.
 *
 * Unscoped enum: NEVER, ADAPTIVE and ALWAYS are visible directly in the
 * enclosing namespace.
 */
enum dictionary_policy : int32_t {
  NEVER    = 0,  ///< Never use dictionary encoding
  ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
  ALWAYS   = 2,  ///< Use dictionary regardless of impact on compression
};
219 
/**
 * @brief Detailed name (and optionally nullability) information for output columns.
 *
 * The struct is recursive: each entry carries the same information for its
 * child columns in `children`.
 */
struct column_name_info {
  std::string name;                        ///< Column name
  std::optional<bool> is_nullable;         ///< Column nullability
  std::optional<bool> is_binary;           ///< Column is binary (i.e. not a list)
  std::optional<int32_t> type_length;      ///< Byte width of data (for fixed length data)
  std::vector<column_name_info> children;  ///< Child column names

  /**
   * @brief Construct a column name info with a name, optional nullability, and no children.
   *
   * `type_length` is left unset and `children` is left empty.
   *
   * @param _name Column name
   * @param _is_nullable True if column is nullable
   * @param _is_binary True if column is binary data
   */
  column_name_info(std::string _name,
                   std::optional<bool> _is_nullable = std::nullopt,
                   std::optional<bool> _is_binary   = std::nullopt)
    : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
  {
  }

  column_name_info() = default;

  /**
   * @brief Compares two column name info structs for equality.
   *
   * Every field takes part in the comparison, including the children.
   *
   * @param rhs column name info struct to compare against
   * @return true if all fields of `this` and `rhs` compare equal
   */
  bool operator==(column_name_info const& rhs) const
  {
    return ((name == rhs.name) && (is_nullable == rhs.is_nullable) &&
            (is_binary == rhs.is_binary) && (type_length == rhs.type_length) &&
            (children == rhs.children));
  }
};
262 
267  std::vector<column_name_info>
269  std::vector<size_t> num_rows_per_source;
272  std::map<std::string, std::string> user_data;
274  std::vector<std::unordered_map<std::string, std::string>>
276 
277  // The following variables are currently only computed for Parquet reader
278  size_type num_input_row_groups{0};
279  std::optional<size_type>
283  std::optional<size_type>
287 };
288 
293  std::unique_ptr<table> tbl;
295 };
296 
/**
 * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to
 * bytes.
 *
 * cv-qualifiers on `T` are ignored. The accepted types are `int8_t`, `char`,
 * `uint8_t`, `unsigned char` and `std::byte`.
 *
 * @tparam T The representation type
 * @return `true` if the type is considered a byte-like type
 */
template <typename T>
constexpr inline auto is_byte_like_type()
{
  using bare_t = std::remove_cv_t<T>;
  return std::disjunction_v<std::is_same<bare_t, int8_t>,
                            std::is_same<bare_t, char>,
                            std::is_same<bare_t, uint8_t>,
                            std::is_same<bare_t, unsigned char>,
                            std::is_same<bare_t, std::byte>>;
}
312 
316 struct source_info {
320  source_info() = default;
321 
327  explicit source_info(std::vector<std::string> file_paths)
328  : _type(io_type::FILEPATH), _num_sources(file_paths.size()), _filepaths(std::move(file_paths))
329  {
330  }
331 
337  explicit source_info(std::string file_path)
338  : _type(io_type::FILEPATH), _num_sources(1), _filepaths({std::move(file_path)})
339  {
340  }
341 
347  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
348  explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
349  : _type(io_type::HOST_BUFFER), _num_sources(host_buffers.size())
350  {
351  if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
352  _host_buffers.reserve(host_buffers.size());
353  std::transform(host_buffers.begin(),
354  host_buffers.end(),
355  std::back_inserter(_host_buffers),
356  [](auto const s) {
357  return cudf::host_span<std::byte const>{
358  reinterpret_cast<std::byte const*>(s.data()), s.size()};
359  });
360  } else {
361  _host_buffers.assign(host_buffers.begin(), host_buffers.end());
362  }
363  }
364 
370  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
371  explicit source_info(cudf::host_span<T> host_data)
372  : _type(io_type::HOST_BUFFER),
373  _num_sources(1),
374  _host_buffers{cudf::host_span<std::byte const>(
375  reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
376  {
377  }
378 
385  : _type(io_type::DEVICE_BUFFER),
386  _num_sources(device_buffers.size()),
387  _device_buffers(device_buffers.begin(), device_buffers.end())
388  {
389  }
390 
397  : _type(io_type::DEVICE_BUFFER), _num_sources(1), _device_buffers({{d_buffer}})
398  {
399  }
400 
406  explicit source_info(std::vector<cudf::io::datasource*> const& sources)
407  : _type(io_type::USER_IMPLEMENTED), _num_sources(sources.size()), _user_sources(sources)
408  {
409  }
410 
417  : _type(io_type::USER_IMPLEMENTED), _num_sources(1), _user_sources({source})
418  {
419  }
420 
426  [[nodiscard]] auto type() const { return _type; }
432  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
438  [[nodiscard]] auto const& host_buffers() const { return _host_buffers; }
444  [[nodiscard]] auto const& device_buffers() const { return _device_buffers; }
450  [[nodiscard]] auto const& user_sources() const { return _user_sources; }
451 
457  [[nodiscard]] auto num_sources() const { return _num_sources; }
458 
459  private:
460  io_type _type = io_type::VOID;
461  size_t _num_sources = 0;
462  std::vector<std::string> _filepaths;
463  std::vector<cudf::host_span<std::byte const>> _host_buffers;
464  std::vector<cudf::device_span<std::byte const>> _device_buffers;
465  std::vector<cudf::io::datasource*> _user_sources;
466 };
467 
471 struct sink_info {
472  sink_info() = default;
478  sink_info(size_t num_sinks) : _num_sinks(num_sinks) {}
479 
485  explicit sink_info(std::vector<std::string> file_paths)
486  : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(std::move(file_paths))
487  {
488  }
489 
495  explicit sink_info(std::string file_path)
496  : _type(io_type::FILEPATH), _filepaths({std::move(file_path)})
497  {
498  }
499 
505  explicit sink_info(std::vector<std::vector<char>*> buffers)
506  : _type(io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(std::move(buffers))
507  {
508  }
514  explicit sink_info(std::vector<char>* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {}
515 
521  explicit sink_info(std::vector<cudf::io::data_sink*> const& user_sinks)
522  : _type(io_type::USER_IMPLEMENTED),
523  _num_sinks(user_sinks.size()),
524  _user_sinks(std::move(user_sinks))
525  {
526  }
527 
533  explicit sink_info(class cudf::io::data_sink* user_sink)
534  : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink})
535  {
536  }
537 
543  [[nodiscard]] auto type() const { return _type; }
549  [[nodiscard]] auto num_sinks() const { return _num_sinks; }
555  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
561  [[nodiscard]] auto const& buffers() const { return _buffers; }
567  [[nodiscard]] auto const& user_sinks() const { return _user_sinks; }
568 
569  private:
570  io_type _type = io_type::VOID;
571  size_t _num_sinks = 1;
572  std::vector<std::string> _filepaths;
573  std::vector<std::vector<char>*> _buffers;
574  std::vector<cudf::io::data_sink*> _user_sinks;
575 };
576 
577 class table_input_metadata;
578 
583  friend table_input_metadata;
584  std::string _name = "";
585  std::optional<bool> _nullable;
586  bool _list_column_is_map = false;
587  bool _use_int96_timestamp = false;
588  bool _output_as_binary = false;
589  bool _skip_compression = false;
590  std::optional<uint8_t> _decimal_precision;
591  std::optional<int32_t> _parquet_field_id;
592  std::optional<int32_t> _type_length;
593  std::vector<column_in_metadata> children;
594  column_encoding _encoding = column_encoding::USE_DEFAULT;
595 
596  public:
597  column_in_metadata() = default;
603  column_in_metadata(std::string_view name) : _name{name} {}
611  {
612  children.push_back(child);
613  return *this;
614  }
615 
622  column_in_metadata& set_name(std::string const& name) noexcept
623  {
624  _name = name;
625  return *this;
626  }
627 
635  {
636  _nullable = nullable;
637  return *this;
638  }
639 
648  {
649  _list_column_is_map = true;
650  return *this;
651  }
652 
662  {
663  _use_int96_timestamp = req;
664  return *this;
665  }
666 
674  column_in_metadata& set_decimal_precision(uint8_t precision) noexcept
675  {
676  _decimal_precision = precision;
677  return *this;
678  }
679 
687  column_in_metadata& set_type_length(int32_t length) noexcept
688  {
689  _type_length = length;
690  return *this;
691  }
692 
699  column_in_metadata& set_parquet_field_id(int32_t field_id) noexcept
700  {
701  _parquet_field_id = field_id;
702  return *this;
703  }
704 
714  {
715  _output_as_binary = binary;
716  if (_output_as_binary and children.size() == 1) {
717  children.emplace_back();
718  } else if (!_output_as_binary and children.size() == 2) {
719  children.pop_back();
720  }
721  return *this;
722  }
723 
732  {
733  _skip_compression = skip;
734  return *this;
735  }
736 
748  {
749  _encoding = encoding;
750  return *this;
751  }
752 
759  column_in_metadata& child(size_type i) noexcept { return children[i]; }
760 
767  [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept { return children[i]; }
768 
774  [[nodiscard]] std::string const& get_name() const noexcept { return _name; }
775 
781  [[nodiscard]] bool is_nullability_defined() const noexcept { return _nullable.has_value(); }
782 
790  [[nodiscard]] bool nullable() const { return _nullable.value(); }
791 
797  [[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; }
798 
805  [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; }
806 
812  [[nodiscard]] bool is_decimal_precision_set() const noexcept
813  {
814  return _decimal_precision.has_value();
815  }
816 
824  [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
825 
831  [[nodiscard]] bool is_type_length_set() const noexcept { return _type_length.has_value(); }
832 
840  [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); }
841 
847  [[nodiscard]] bool is_parquet_field_id_set() const noexcept
848  {
849  return _parquet_field_id.has_value();
850  }
851 
859  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
860 
866  [[nodiscard]] size_type num_children() const noexcept { return children.size(); }
867 
873  [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; }
874 
880  [[nodiscard]] bool is_enabled_skip_compression() const noexcept { return _skip_compression; }
881 
887  [[nodiscard]] column_encoding get_encoding() const { return _encoding; }
888 };
889 
894  public:
895  table_input_metadata() = default; // Required by cython
896 
905 
914  explicit table_input_metadata(table_metadata const& metadata);
915 
916  std::vector<column_in_metadata> column_metadata;
917 };
918 
928 
929  partition_info() = default;
936  partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows)
937  {
938  }
939 };
940 
946  // Whether to read binary data as a string column
947  bool _convert_binary_to_strings{true};
948  int32_t _type_length{0};
949 
950  std::vector<reader_column_schema> children;
951 
952  public:
953  reader_column_schema() = default;
954 
960  reader_column_schema(size_type number_of_children) { children.resize(number_of_children); }
961 
968  {
969  children.assign(child_span.begin(), child_span.end());
970  }
971 
979  {
980  children.push_back(child);
981  return *this;
982  }
983 
990  [[nodiscard]] reader_column_schema& child(size_type i) { return children[i]; }
991 
998  [[nodiscard]] reader_column_schema const& child(size_type i) const { return children[i]; }
999 
1009  {
1010  _convert_binary_to_strings = convert_to_string;
1011  return *this;
1012  }
1013 
1021  {
1022  _type_length = type_length;
1023  return *this;
1024  }
1025 
1031  [[nodiscard]] bool is_enabled_convert_binary_to_strings() const
1032  {
1033  return _convert_binary_to_strings;
1034  }
1035 
1041  [[nodiscard]] int32_t get_type_length() const { return _type_length; }
1042 
1048  [[nodiscard]] size_t get_num_children() const { return children.size(); }
1049 };
1050  // end of group
1052 } // namespace io
1053 } // namespace CUDF_EXPORT cudf
constexpr CUDF_HOST_DEVICE iterator end() const noexcept
Returns an iterator to the element following the last element of the span.
Definition: span.hpp:93
constexpr CUDF_HOST_DEVICE iterator begin() const noexcept
Returns an iterator to the first element of the span.
Definition: span.hpp:85
Metadata for a column.
Definition: io/types.hpp:582
column_in_metadata & set_name(std::string const &name) noexcept
Set the name of this column.
Definition: io/types.hpp:622
column_in_metadata & add_child(column_in_metadata const &child)
Add the children metadata of this column.
Definition: io/types.hpp:610
bool is_enabled_output_as_binary() const noexcept
Get whether to encode this column as binary or string data.
Definition: io/types.hpp:873
column_in_metadata & set_parquet_field_id(int32_t field_id) noexcept
Set the parquet field id of this column.
Definition: io/types.hpp:699
column_in_metadata & set_int96_timestamps(bool req) noexcept
Specifies whether this timestamp column should be encoded using the deprecated int96 physical type....
Definition: io/types.hpp:661
column_in_metadata & set_decimal_precision(uint8_t precision) noexcept
Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.
Definition: io/types.hpp:674
bool is_enabled_int96_timestamps() const noexcept
Get whether to encode this timestamp column using deprecated int96 physical type.
Definition: io/types.hpp:805
bool is_parquet_field_id_set() const noexcept
Get whether parquet field id has been set for this column.
Definition: io/types.hpp:847
bool is_decimal_precision_set() const noexcept
Get whether precision has been set for this decimal column.
Definition: io/types.hpp:812
bool is_type_length_set() const noexcept
Get whether type length has been set for this column.
Definition: io/types.hpp:831
bool nullable() const
Gets the explicitly set nullability for this column.
Definition: io/types.hpp:790
size_type num_children() const noexcept
Get the number of children of this column.
Definition: io/types.hpp:866
bool is_map() const noexcept
If this is the metadata of a list column, returns whether it is to be encoded as a map.
Definition: io/types.hpp:797
uint8_t get_decimal_precision() const
Get the decimal precision that was set for this column.
Definition: io/types.hpp:824
column_in_metadata & set_encoding(column_encoding encoding) noexcept
Sets the encoding to use for this column.
Definition: io/types.hpp:747
column_encoding get_encoding() const
Get the encoding that was set for this column.
Definition: io/types.hpp:887
uint8_t get_type_length() const
Get the type length that was set for this column.
Definition: io/types.hpp:840
column_in_metadata & set_output_as_binary(bool binary) noexcept
Specifies whether this column should be written as binary or string data Only valid for the following...
Definition: io/types.hpp:713
column_in_metadata & child(size_type i) noexcept
Get reference to a child of this column.
Definition: io/types.hpp:759
column_in_metadata & set_type_length(int32_t length) noexcept
Set the data length of the column. Only valid if this column is a fixed-length byte array.
Definition: io/types.hpp:687
bool is_enabled_skip_compression() const noexcept
Get whether to skip compressing this column.
Definition: io/types.hpp:880
int32_t get_parquet_field_id() const
Get the parquet field id that was set for this column.
Definition: io/types.hpp:859
column_in_metadata & set_list_column_as_map() noexcept
Specify that this list column should be encoded as a map in the written file.
Definition: io/types.hpp:647
column_in_metadata(std::string_view name)
Construct a new column in metadata object.
Definition: io/types.hpp:603
std::string const & get_name() const noexcept
Get the name of this column.
Definition: io/types.hpp:774
column_in_metadata & set_nullability(bool nullable) noexcept
Set the nullability of this column.
Definition: io/types.hpp:634
bool is_nullability_defined() const noexcept
Get whether nullability has been explicitly set for this column.
Definition: io/types.hpp:781
column_in_metadata const & child(size_type i) const noexcept
Get const reference to a child of this column.
Definition: io/types.hpp:767
column_in_metadata & set_skip_compression(bool skip) noexcept
Specifies whether this column should not be compressed regardless of the compression codec specified ...
Definition: io/types.hpp:731
Interface class for storing the output data from the writers.
Definition: data_sink.hpp:32
Interface class for providing input data to the readers.
Definition: datasource.hpp:31
schema element for reader
Definition: io/types.hpp:945
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
Definition: io/types.hpp:998
reader_column_schema & set_type_length(int32_t type_length)
Sets the length of fixed length data.
Definition: io/types.hpp:1020
bool is_enabled_convert_binary_to_strings() const
Get whether to encode this column as binary or string data.
Definition: io/types.hpp:1031
int32_t get_type_length() const
Get the length in bytes of this fixed length data.
Definition: io/types.hpp:1041
reader_column_schema(host_span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
Definition: io/types.hpp:967
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether this column should be written as binary or string data Only valid for the following...
Definition: io/types.hpp:1008
size_t get_num_children() const
Get the number of child objects.
Definition: io/types.hpp:1048
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
Definition: io/types.hpp:978
reader_column_schema & child(size_type i)
Get reference to a child of this column.
Definition: io/types.hpp:990
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
Definition: io/types.hpp:960
Metadata for a table.
Definition: io/types.hpp:893
table_input_metadata(table_view const &table)
Construct a new table_input_metadata from a table_view.
table_input_metadata(table_metadata const &metadata)
Construct a new table_input_metadata from a table_metadata object.
std::vector< column_in_metadata > column_metadata
List of column metadata.
Definition: io/types.hpp:916
Statistics about compression performed by a writer.
Definition: io/types.hpp:116
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
Definition: io/types.hpp:199
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
Definition: io/types.hpp:186
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
Definition: io/types.hpp:148
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
Definition: io/types.hpp:172
writer_compression_statistics()=default
Default constructor.
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
Definition: io/types.hpp:179
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
Definition: io/types.hpp:131
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
Definition: io/types.hpp:165
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:85
column_encoding
Valid encodings for use with column_in_metadata::set_encoding()
Definition: io/types.hpp:95
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:75
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
Definition: io/types.hpp:305
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:214
compression_type
Compression algorithms.
Definition: io/types.hpp:46
io_type
Data source or destination types.
Definition: io/types.hpp:64
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
Definition: io/types.hpp:89
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:87
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:86
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:88
@ USE_DEFAULT
No encoding has been requested, use default encoding.
@ MINIMAL
Quote only fields which contain special characters.
@ NONNUMERIC
Quote all non-numeric fields.
@ ALWAYS
Use dictionary regardless of impact on compression.
Definition: io/types.hpp:217
@ ADAPTIVE
Use dictionary when it will not impact compression.
Definition: io/types.hpp:216
@ NEVER
Never use dictionary encoding.
Definition: io/types.hpp:215
@ XZ
XZ format, using LZMA(2) algorithm.
@ ZIP
ZIP format, using DEFLATE algorithm.
@ BZIP2
BZIP2 format, using Burrows-Wheeler transform.
@ AUTO
Automatically detect or select compression format.
@ HOST_BUFFER
Input/output is a buffer in host memory.
@ USER_IMPLEMENTED
Input/output is handled by a custom user class.
@ VOID
Input/output is nothing. No work is done. Useful for benchmarking.
@ FILEPATH
Input/output is a file path.
@ DEVICE_BUFFER
Input/output is a buffer in device memory.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cuDF interfaces
Definition: host_udf.hpp:26
bool nullable(table_view const &view)
Returns True if any of the columns in the table is nullable. (not entire hierarchy)
@ ALL
All initialization steps (default behavior)
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:323
C++20 std::span with reduced feature set.
Definition: span.hpp:182
Detailed name (and optionally nullability) information for output columns.
Definition: io/types.hpp:226
std::optional< bool > is_nullable
Column nullability.
Definition: io/types.hpp:228
std::optional< bool > is_binary
Column is binary (i.e. not a list)
Definition: io/types.hpp:229
std::vector< column_name_info > children
Child column names.
Definition: io/types.hpp:231
bool operator==(column_name_info const &rhs) const
Compares two column name info structs for equality.
Definition: io/types.hpp:255
std::optional< int32_t > type_length
Byte width of data (for fixed length data)
Definition: io/types.hpp:230
std::string name
Column name.
Definition: io/types.hpp:227
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
Construct a column name info with a name, optional nullability, and no children.
Definition: io/types.hpp:240
Information used while writing partitioned datasets.
Definition: io/types.hpp:925
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
Definition: io/types.hpp:936
size_type start_row
The start row of the partition.
Definition: io/types.hpp:926
size_type num_rows
The number of rows in the partition.
Definition: io/types.hpp:927
Destination information for write interfaces.
Definition: io/types.hpp:471
auto const & buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:561
sink_info(std::vector< std::vector< char > * > buffers)
Construct a new sink info object for multiple host buffers.
Definition: io/types.hpp:505
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:555
sink_info(std::string file_path)
Construct a new sink info object for a single file.
Definition: io/types.hpp:495
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
Definition: io/types.hpp:533
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
Definition: io/types.hpp:521
auto num_sinks() const
Get the number of sinks.
Definition: io/types.hpp:549
auto const & user_sinks() const
Get the user sinks of the input.
Definition: io/types.hpp:567
sink_info(size_t num_sinks)
Construct a new sink info object.
Definition: io/types.hpp:478
auto type() const
Get the type of the input.
Definition: io/types.hpp:543
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
Definition: io/types.hpp:514
sink_info(std::vector< std::string > file_paths)
Construct a new sink info object for multiple files.
Definition: io/types.hpp:485
Source information for read interfaces.
Definition: io/types.hpp:316
auto const & device_buffers() const
Get the device buffers of the input.
Definition: io/types.hpp:444
source_info()=default
Default constructor for the next-gen parquet reader.
source_info(std::vector< std::string > file_paths)
Construct a new source info object for multiple files.
Definition: io/types.hpp:327
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:432
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
Definition: io/types.hpp:371
auto num_sources() const
Get the number of input sources.
Definition: io/types.hpp:457
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
Definition: io/types.hpp:348
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
Definition: io/types.hpp:396
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
Definition: io/types.hpp:416
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
Definition: io/types.hpp:406
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
Definition: io/types.hpp:384
auto const & host_buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:438
auto type() const
Get the type of the input.
Definition: io/types.hpp:426
source_info(std::string file_path)
Construct a new source info object for a single file.
Definition: io/types.hpp:337
auto const & user_sources() const
Get the user sources of the input.
Definition: io/types.hpp:450
Table metadata returned by IO readers.
Definition: io/types.hpp:266
std::vector< std::unordered_map< std::string, std::string > > per_file_user_data
Per file format-dependent metadata as key-values pairs.
Definition: io/types.hpp:275
std::optional< size_type > num_row_groups_after_stats_filter
Definition: io/types.hpp:280
std::optional< size_type > num_row_groups_after_bloom_filter
Definition: io/types.hpp:284
std::vector< size_t > num_rows_per_source
Definition: io/types.hpp:269
std::vector< column_name_info > schema_info
Detailed name information for the entire output hierarchy.
Definition: io/types.hpp:268
std::map< std::string, std::string > user_data
Definition: io/types.hpp:272
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
std::unique_ptr< table > tbl
Table.
Definition: io/types.hpp:293
table_metadata metadata
Table metadata.
Definition: io/types.hpp:294
Class definition for cudf::table.
Type declarations for libcudf.