io/types.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
11 #pragma once
12 
13 #include <cudf/table/table.hpp>
14 #include <cudf/types.hpp>
15 #include <cudf/utilities/span.hpp>
16 
17 #include <map>
18 #include <memory>
19 #include <optional>
20 #include <span>
21 #include <string>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 namespace CUDF_EXPORT cudf {
28 namespace io {
29 class data_sink;
30 class datasource;
31 } // namespace io
32 } // namespace CUDF_EXPORT cudf
33 
35 namespace CUDF_EXPORT cudf {
37 namespace io {
/**
 * @brief Compression algorithms.
 *
 * Enumerators take consecutive values starting at 0 in declaration order.
 */
enum class compression_type : int32_t {
  NONE,
  AUTO,  ///< Automatically detect or select compression format
  SNAPPY,
  GZIP,
  BZIP2,  ///< BZIP2 format, using Burrows-Wheeler transform
  BROTLI,
  ZIP,  ///< ZIP format, using DEFLATE algorithm
  XZ,   ///< XZ format, using LZMA(2) algorithm
  ZLIB,
  LZ4,
  LZO,
  ZSTD
};
61 
/**
 * @brief Data source or destination types.
 *
 * NOTE(review): DEVICE_BUFFER and USER_IMPLEMENTED are used by source_info and
 * sink_info but were dropped from this listing. They are restored at the two
 * gaps in the listing numbering; confirm their relative order against upstream.
 */
enum class io_type : int32_t {
  FILEPATH,          ///< Input/output is a file path
  HOST_BUFFER,       ///< Input/output is a buffer in host memory
  DEVICE_BUFFER,     ///< Input/output is a buffer in device memory
  VOID,              ///< Input/output is nothing. No work is done. Useful for benchmarking
  USER_IMPLEMENTED,  ///< Input/output is handled by a custom user class
};
72 
/**
 * @brief Behavior when handling quotations in field data.
 */
enum class quote_style : int32_t {
  MINIMAL,  ///< Quote only fields which contain special characters
  ALL,
  NONNUMERIC,  ///< Quote all non-numeric fields
  NONE
};
82 
/**
 * @brief Column statistics granularity type for parquet/orc writers.
 *
 * The enumerators were dropped from the listing; they are restored here in
 * their original declaration order.
 */
enum statistics_freq : int32_t {
  STATISTICS_NONE,      ///< No column statistics
  STATISTICS_ROWGROUP,  ///< Per-Rowgroup column statistics
  STATISTICS_PAGE,      ///< Per-page column statistics
  STATISTICS_COLUMN,    ///< Full column and offset indices. Implies STATISTICS_ROWGROUP
};
92 
/**
 * @brief Valid encodings for use with `column_in_metadata::set_encoding()`.
 */
enum class column_encoding : int32_t {
  // Common encodings:
  USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
  DICTIONARY,
  // Parquet encodings:
  PLAIN,
  DELTA_BINARY_PACKED,
  DELTA_LENGTH_BYTE_ARRAY,
  DELTA_BYTE_ARRAY,
  BYTE_STREAM_SPLIT,
  // ORC encodings:
  DIRECT,
  DIRECT_V2,
  DICTIONARY_V2,
};
113 
/**
 * @brief Statistics about compression performed by a writer.
 */
class writer_compression_statistics {
 public:
  /**
   * @brief Default constructor; every byte counter starts at zero.
   */
  writer_compression_statistics() = default;

  /**
   * @brief Constructor with initial values.
   *
   * @param num_compressed_bytes Number of bytes in blocks that were successfully compressed
   * @param num_failed_bytes Number of bytes in blocks that failed to compress
   * @param num_skipped_bytes Number of bytes in blocks that were skipped during compression
   * @param num_compressed_output_bytes Counter used as the denominator of `compression_ratio()`
   */
  writer_compression_statistics(size_t num_compressed_bytes,
                                size_t num_failed_bytes,
                                size_t num_skipped_bytes,
                                size_t num_compressed_output_bytes)
    : _num_compressed_bytes(num_compressed_bytes),
      _num_failed_bytes(num_failed_bytes),
      _num_skipped_bytes(num_skipped_bytes),
      _num_compressed_output_bytes(num_compressed_output_bytes)
  {
  }

  /**
   * @brief Adds the values from another `writer_compression_statistics` object.
   *
   * @param other The other object whose counters are added to this one
   * @return Reference to this object, after the addition
   */
  writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept
  {
    _num_compressed_bytes += other._num_compressed_bytes;
    _num_failed_bytes += other._num_failed_bytes;
    _num_skipped_bytes += other._num_skipped_bytes;
    _num_compressed_output_bytes += other._num_compressed_output_bytes;
    return *this;
  }

  /**
   * @brief Returns the number of bytes in blocks that were successfully compressed.
   *
   * @return Number of successfully compressed input bytes
   */
  [[nodiscard]] auto num_compressed_bytes() const noexcept { return _num_compressed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that failed to compress.
   *
   * @return Number of bytes in blocks that failed to compress
   */
  [[nodiscard]] auto num_failed_bytes() const noexcept { return _num_failed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that were skipped during compression.
   *
   * @return Number of bytes in skipped blocks
   */
  [[nodiscard]] auto num_skipped_bytes() const noexcept { return _num_skipped_bytes; }

  /**
   * @brief Returns the total size of compression inputs.
   *
   * @return Sum of the compressed, failed and skipped byte counters
   */
  [[nodiscard]] auto num_total_input_bytes() const noexcept
  {
    return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes();
  }

  /**
   * @brief Returns the compression ratio for the successfully compressed blocks.
   *
   * The division is performed in `double`, so a zero output-byte counter does not trap;
   * the result is then a non-finite value (NaN when both counters are zero).
   *
   * @return `num_compressed_bytes()` divided by the compressed output byte counter
   */
  [[nodiscard]] auto compression_ratio() const noexcept
  {
    return static_cast<double>(num_compressed_bytes()) / _num_compressed_output_bytes;
  }

 private:
  std::size_t _num_compressed_bytes        = 0;
  std::size_t _num_failed_bytes            = 0;
  std::size_t _num_skipped_bytes           = 0;
  std::size_t _num_compressed_output_bytes = 0;
};
211 
/**
 * @brief Control use of dictionary encoding for parquet writer.
 */
enum dictionary_policy : int32_t {
  NEVER    = 0,  ///< Never use dictionary encoding
  ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
  ALWAYS   = 2   ///< Use dictionary regardless of impact on compression
};
220 
/**
 * @brief Detailed name (and optionally nullability) information for output columns.
 */
struct column_name_info {
  std::string name;                        ///< Column name
  std::optional<bool> is_nullable;         ///< Column nullability
  std::optional<bool> is_binary;           ///< Column is binary (i.e. not a list)
  std::optional<int32_t> type_length;      ///< Byte width of data (for fixed length data)
  std::vector<column_name_info> children;  ///< Child column names

  /**
   * @brief Construct a column name info with a name, optional nullability, and no children.
   *
   * `type_length` is left unset by this constructor.
   *
   * @param _name Column name
   * @param _is_nullable Optional column nullability
   * @param _is_binary Optional flag telling whether the column is binary
   */
  column_name_info(std::string _name,
                   std::optional<bool> _is_nullable = std::nullopt,
                   std::optional<bool> _is_binary   = std::nullopt)
    : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
  {
  }

  column_name_info() = default;

  /**
   * @brief Compares two column name info structs for equality.
   *
   * Every field takes part in the comparison, including the children (recursively).
   *
   * @param rhs The struct to compare against
   * @return True when all fields of both structs compare equal
   */
  bool operator==(column_name_info const& rhs) const
  {
    return ((name == rhs.name) && (is_nullable == rhs.is_nullable) &&
            (is_binary == rhs.is_binary) && (type_length == rhs.type_length) &&
            (children == rhs.children));
  }
};
263 
268  std::vector<column_name_info>
270  std::vector<size_t> num_rows_per_source;
273  std::map<std::string, std::string> user_data;
275  std::vector<std::unordered_map<std::string, std::string>>
277 
278  // The following variables are currently only computed for Parquet reader
279  size_type num_input_row_groups{0};
280  std::optional<size_type>
284  std::optional<size_type>
288 };
289 
294  std::unique_ptr<table> tbl;
296 };
297 
/**
 * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer
 * to bytes.
 *
 * Top-level `const`/`volatile` qualifiers on `T` are ignored.
 *
 * @tparam T The type to check
 * @return `true` if `T` is `int8_t`, `char`, `uint8_t`, `unsigned char` or `std::byte`
 */
template <typename T>
constexpr inline auto is_byte_like_type()
{
  using non_cv_T = std::remove_cv_t<T>;
  return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
         std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
         std::is_same_v<non_cv_T, std::byte>;
}
313 
/**
 * @brief A file path with an optional known size in bytes.
 */
struct filepath_source {
  std::string path;                   ///< Path or URL of the input file
  std::optional<std::size_t> size{};  ///< Size of the file in bytes, when known; unset otherwise
};
324 
328 struct source_info {
332  source_info() = default;
333 
339  explicit source_info(std::vector<std::string> file_paths)
340  : _type(io_type::FILEPATH), _num_sources(file_paths.size())
341  {
342  _filepath_sources.reserve(file_paths.size());
343  for (auto& path : file_paths) {
344  _filepath_sources.push_back({std::move(path), std::nullopt});
345  }
346  rebuild_filepaths();
347  }
348 
354  explicit source_info(std::string file_path)
355  : source_info(std::vector<std::string>{std::move(file_path)})
356  {
357  }
358 
364  explicit source_info(std::vector<filepath_source> sources)
365  : _type(io_type::FILEPATH), _num_sources(sources.size()), _filepath_sources(std::move(sources))
366  {
367  rebuild_filepaths();
368  }
369 
375  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
376  explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
377  : _type(io_type::HOST_BUFFER), _num_sources(host_buffers.size())
378  {
379  if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
380  _host_buffers.reserve(host_buffers.size());
381  std::transform(host_buffers.begin(),
382  host_buffers.end(),
383  std::back_inserter(_host_buffers),
384  [](auto const s) {
385  return cudf::host_span<std::byte const>{
386  reinterpret_cast<std::byte const*>(s.data()), s.size()};
387  });
388  } else {
389  _host_buffers.assign(host_buffers.begin(), host_buffers.end());
390  }
391  }
392 
398  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
399  explicit source_info(cudf::host_span<T> host_data)
400  : _type(io_type::HOST_BUFFER),
401  _num_sources(1),
402  _host_buffers{cudf::host_span<std::byte const>(
403  reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
404  {
405  }
406 
413  : _type(io_type::DEVICE_BUFFER),
414  _num_sources(device_buffers.size()),
415  _device_buffers(device_buffers.begin(), device_buffers.end())
416  {
417  }
418 
425  : _type(io_type::DEVICE_BUFFER), _num_sources(1), _device_buffers({{d_buffer}})
426  {
427  }
428 
434  explicit source_info(std::vector<cudf::io::datasource*> const& sources)
435  : _type(io_type::USER_IMPLEMENTED), _num_sources(sources.size()), _user_sources(sources)
436  {
437  }
438 
445  : _type(io_type::USER_IMPLEMENTED), _num_sources(1), _user_sources({source})
446  {
447  }
448 
454  [[nodiscard]] auto type() const { return _type; }
460  [[nodiscard]] auto const& filepath_sources() const { return _filepath_sources; }
466  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
472  [[nodiscard]] auto const& host_buffers() const { return _host_buffers; }
478  [[nodiscard]] auto const& device_buffers() const { return _device_buffers; }
484  [[nodiscard]] auto const& user_sources() const { return _user_sources; }
485 
491  [[nodiscard]] auto num_sources() const { return _num_sources; }
492 
493  private:
494  void rebuild_filepaths()
495  {
496  _filepaths.clear();
497  _filepaths.reserve(_filepath_sources.size());
498  for (auto const& source : _filepath_sources) {
499  _filepaths.push_back(source.path);
500  }
501  }
502 
503  io_type _type = io_type::VOID;
504  size_t _num_sources = 0;
505  std::vector<filepath_source> _filepath_sources;
506  std::vector<std::string> _filepaths;
507  std::vector<cudf::host_span<std::byte const>> _host_buffers;
508  std::vector<cudf::device_span<std::byte const>> _device_buffers;
509  std::vector<cudf::io::datasource*> _user_sources;
510 };
511 
515 struct sink_info {
516  sink_info() = default;
522  sink_info(size_t num_sinks) : _num_sinks(num_sinks) {}
523 
529  explicit sink_info(std::vector<std::string> file_paths)
530  : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(std::move(file_paths))
531  {
532  }
533 
539  explicit sink_info(std::string file_path)
540  : _type(io_type::FILEPATH), _filepaths({std::move(file_path)})
541  {
542  }
543 
549  explicit sink_info(std::vector<std::vector<char>*> buffers)
550  : _type(io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(std::move(buffers))
551  {
552  }
558  explicit sink_info(std::vector<char>* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {}
559 
565  explicit sink_info(std::vector<cudf::io::data_sink*> const& user_sinks)
566  : _type(io_type::USER_IMPLEMENTED),
567  _num_sinks(user_sinks.size()),
568  _user_sinks(std::move(user_sinks))
569  {
570  }
571 
577  explicit sink_info(class cudf::io::data_sink* user_sink)
578  : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink})
579  {
580  }
581 
587  [[nodiscard]] auto type() const { return _type; }
593  [[nodiscard]] auto num_sinks() const { return _num_sinks; }
599  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
605  [[nodiscard]] auto const& buffers() const { return _buffers; }
611  [[nodiscard]] auto const& user_sinks() const { return _user_sinks; }
612 
613  private:
614  io_type _type = io_type::VOID;
615  size_t _num_sinks = 1;
616  std::vector<std::string> _filepaths;
617  std::vector<std::vector<char>*> _buffers;
618  std::vector<cudf::io::data_sink*> _user_sinks;
619 };
620 
621 class table_input_metadata;
622 
627  friend table_input_metadata;
628  std::string _name = "";
629  std::optional<bool> _nullable;
630  bool _list_column_is_map = false;
631  bool _use_int96_timestamp = false;
632  bool _output_as_binary = false;
633  bool _skip_compression = false;
634  std::optional<uint8_t> _decimal_precision;
635  std::optional<int32_t> _parquet_field_id;
636  std::optional<int32_t> _type_length;
637  std::vector<column_in_metadata> children;
638  column_encoding _encoding = column_encoding::USE_DEFAULT;
639 
640  public:
641  column_in_metadata() = default;
647  column_in_metadata(std::string_view name) : _name{name} {}
655  {
656  children.push_back(child);
657  return *this;
658  }
659 
666  column_in_metadata& set_name(std::string const& name) noexcept
667  {
668  _name = name;
669  return *this;
670  }
671 
679  {
680  _nullable = nullable;
681  return *this;
682  }
683 
692  {
693  _list_column_is_map = true;
694  return *this;
695  }
696 
706  {
707  _use_int96_timestamp = req;
708  return *this;
709  }
710 
718  column_in_metadata& set_decimal_precision(uint8_t precision) noexcept
719  {
720  _decimal_precision = precision;
721  return *this;
722  }
723 
731  column_in_metadata& set_type_length(int32_t length) noexcept
732  {
733  _type_length = length;
734  return *this;
735  }
736 
743  column_in_metadata& set_parquet_field_id(int32_t field_id) noexcept
744  {
745  _parquet_field_id = field_id;
746  return *this;
747  }
748 
758  {
759  _output_as_binary = binary;
760  if (_output_as_binary and children.size() == 1) {
761  children.emplace_back();
762  } else if (!_output_as_binary and children.size() == 2) {
763  children.pop_back();
764  }
765  return *this;
766  }
767 
776  {
777  _skip_compression = skip;
778  return *this;
779  }
780 
792  {
793  _encoding = encoding;
794  return *this;
795  }
796 
803  column_in_metadata& child(size_type i) noexcept { return children[i]; }
804 
811  [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept { return children[i]; }
812 
818  [[nodiscard]] std::string const& get_name() const noexcept { return _name; }
819 
825  [[nodiscard]] bool is_nullability_defined() const noexcept { return _nullable.has_value(); }
826 
834  [[nodiscard]] bool nullable() const { return _nullable.value(); }
835 
841  [[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; }
842 
849  [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; }
850 
856  [[nodiscard]] bool is_decimal_precision_set() const noexcept
857  {
858  return _decimal_precision.has_value();
859  }
860 
868  [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
869 
875  [[nodiscard]] bool is_type_length_set() const noexcept { return _type_length.has_value(); }
876 
884  [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); }
885 
891  [[nodiscard]] bool is_parquet_field_id_set() const noexcept
892  {
893  return _parquet_field_id.has_value();
894  }
895 
903  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
904 
910  [[nodiscard]] size_type num_children() const noexcept { return children.size(); }
911 
917  [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; }
918 
924  [[nodiscard]] bool is_enabled_skip_compression() const noexcept { return _skip_compression; }
925 
931  [[nodiscard]] column_encoding get_encoding() const { return _encoding; }
932 };
933 
938  public:
939  table_input_metadata() = default; // Required by cython
940 
949 
958  explicit table_input_metadata(table_metadata const& metadata);
959 
960  std::vector<column_in_metadata> column_metadata;
961 };
962 
972 
973  partition_info() = default;
980  partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows)
981  {
982  }
983 };
984 
990  // Whether to read binary data as a string column
991  bool _convert_binary_to_strings{true};
992  int32_t _type_length{0};
993 
994  std::vector<reader_column_schema> children;
995 
996  public:
997  reader_column_schema() = default;
998 
1004  reader_column_schema(size_type number_of_children) { children.resize(number_of_children); }
1005 
1011  reader_column_schema(std::span<reader_column_schema> const& child_span)
1012  {
1013  children.assign(child_span.begin(), child_span.end());
1014  }
1015 
1023  {
1024  children.push_back(child);
1025  return *this;
1026  }
1027 
1034  [[nodiscard]] reader_column_schema& child(size_type i) { return children[i]; }
1035 
1042  [[nodiscard]] reader_column_schema const& child(size_type i) const { return children[i]; }
1043 
1053  {
1054  _convert_binary_to_strings = convert_to_string;
1055  return *this;
1056  }
1057 
1065  {
1066  _type_length = type_length;
1067  return *this;
1068  }
1069 
1075  [[nodiscard]] bool is_enabled_convert_binary_to_strings() const
1076  {
1077  return _convert_binary_to_strings;
1078  }
1079 
1085  [[nodiscard]] int32_t get_type_length() const { return _type_length; }
1086 
1092  [[nodiscard]] size_t get_num_children() const { return children.size(); }
1093 };
1094  // end of group
1096 } // namespace io
1097 } // namespace CUDF_EXPORT cudf
Metadata for a column.
Definition: io/types.hpp:626
column_in_metadata & set_name(std::string const &name) noexcept
Set the name of this column.
Definition: io/types.hpp:666
column_in_metadata & add_child(column_in_metadata const &child)
Add the children metadata of this column.
Definition: io/types.hpp:654
bool is_enabled_output_as_binary() const noexcept
Get whether to encode this column as binary or string data.
Definition: io/types.hpp:917
column_in_metadata & set_parquet_field_id(int32_t field_id) noexcept
Set the parquet field id of this column.
Definition: io/types.hpp:743
column_in_metadata & set_int96_timestamps(bool req) noexcept
Specifies whether this timestamp column should be encoded using the deprecated int96 physical type....
Definition: io/types.hpp:705
column_in_metadata & set_decimal_precision(uint8_t precision) noexcept
Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.
Definition: io/types.hpp:718
bool is_enabled_int96_timestamps() const noexcept
Get whether to encode this timestamp column using deprecated int96 physical type.
Definition: io/types.hpp:849
bool is_parquet_field_id_set() const noexcept
Get whether parquet field id has been set for this column.
Definition: io/types.hpp:891
bool is_decimal_precision_set() const noexcept
Get whether precision has been set for this decimal column.
Definition: io/types.hpp:856
bool is_type_length_set() const noexcept
Get whether type length has been set for this column.
Definition: io/types.hpp:875
bool nullable() const
Gets the explicitly set nullability for this column.
Definition: io/types.hpp:834
size_type num_children() const noexcept
Get the number of children of this column.
Definition: io/types.hpp:910
bool is_map() const noexcept
If this is the metadata of a list column, returns whether it is to be encoded as a map.
Definition: io/types.hpp:841
uint8_t get_decimal_precision() const
Get the decimal precision that was set for this column.
Definition: io/types.hpp:868
column_in_metadata & set_encoding(column_encoding encoding) noexcept
Sets the encoding to use for this column.
Definition: io/types.hpp:791
column_encoding get_encoding() const
Get the encoding that was set for this column.
Definition: io/types.hpp:931
uint8_t get_type_length() const
Get the type length that was set for this column.
Definition: io/types.hpp:884
column_in_metadata & set_output_as_binary(bool binary) noexcept
Specifies whether this column should be written as binary or string data Only valid for the following...
Definition: io/types.hpp:757
column_in_metadata & child(size_type i) noexcept
Get reference to a child of this column.
Definition: io/types.hpp:803
column_in_metadata & set_type_length(int32_t length) noexcept
Set the data length of the column. Only valid if this column is a fixed-length byte array.
Definition: io/types.hpp:731
bool is_enabled_skip_compression() const noexcept
Get whether to skip compressing this column.
Definition: io/types.hpp:924
int32_t get_parquet_field_id() const
Get the parquet field id that was set for this column.
Definition: io/types.hpp:903
column_in_metadata & set_list_column_as_map() noexcept
Specify that this list column should be encoded as a map in the written file.
Definition: io/types.hpp:691
column_in_metadata(std::string_view name)
Construct a new column in metadata object.
Definition: io/types.hpp:647
std::string const & get_name() const noexcept
Get the name of this column.
Definition: io/types.hpp:818
column_in_metadata & set_nullability(bool nullable) noexcept
Set the nullability of this column.
Definition: io/types.hpp:678
bool is_nullability_defined() const noexcept
Get whether nullability has been explicitly set for this column.
Definition: io/types.hpp:825
column_in_metadata const & child(size_type i) const noexcept
Get const reference to a child of this column.
Definition: io/types.hpp:811
column_in_metadata & set_skip_compression(bool skip) noexcept
Specifies whether this column should not be compressed regardless of the compression codec specified ...
Definition: io/types.hpp:775
Interface class for storing the output data from the writers.
Definition: data_sink.hpp:32
Interface class for providing input data to the readers.
Definition: datasource.hpp:32
schema element for reader
Definition: io/types.hpp:989
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
Definition: io/types.hpp:1042
reader_column_schema & set_type_length(int32_t type_length)
Sets the length of fixed length data.
Definition: io/types.hpp:1064
bool is_enabled_convert_binary_to_strings() const
Get whether binary data in this column is read as a string column.
Definition: io/types.hpp:1075
int32_t get_type_length() const
Get the length in bytes of this fixed length data.
Definition: io/types.hpp:1085
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether binary data in this column should be read as a string column. Only valid for the following...
Definition: io/types.hpp:1052
size_t get_num_children() const
Get the number of child objects.
Definition: io/types.hpp:1092
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
Definition: io/types.hpp:1022
reader_column_schema(std::span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
Definition: io/types.hpp:1011
reader_column_schema & child(size_type i)
Get reference to a child of this column.
Definition: io/types.hpp:1034
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
Definition: io/types.hpp:1004
Metadata for a table.
Definition: io/types.hpp:937
table_input_metadata(table_view const &table)
Construct a new table_input_metadata from a table_view.
table_input_metadata(table_metadata const &metadata)
Construct a new table_input_metadata from a table_metadata object.
std::vector< column_in_metadata > column_metadata
List of column metadata.
Definition: io/types.hpp:960
Statistics about compression performed by a writer.
Definition: io/types.hpp:117
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
Definition: io/types.hpp:200
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
Definition: io/types.hpp:187
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
Definition: io/types.hpp:149
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
Definition: io/types.hpp:173
writer_compression_statistics()=default
Default constructor.
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
Definition: io/types.hpp:180
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
Definition: io/types.hpp:132
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
Definition: io/types.hpp:166
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:86
column_encoding
Valid encodings for use with column_in_metadata::set_encoding()
Definition: io/types.hpp:96
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:76
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
Definition: io/types.hpp:306
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:215
compression_type
Compression algorithms.
Definition: io/types.hpp:47
io_type
Data source or destination types.
Definition: io/types.hpp:65
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
Definition: io/types.hpp:90
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:88
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:87
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:89
@ USE_DEFAULT
No encoding has been requested, use default encoding.
@ MINIMAL
Quote only fields which contain special characters.
@ NONNUMERIC
Quote all non-numeric fields.
@ ALWAYS
Use dictionary regardless of impact on compression.
Definition: io/types.hpp:218
@ ADAPTIVE
Use dictionary when it will not impact compression.
Definition: io/types.hpp:217
@ NEVER
Never use dictionary encoding.
Definition: io/types.hpp:216
@ XZ
XZ format, using LZMA(2) algorithm.
@ ZIP
ZIP format, using DEFLATE algorithm.
@ BZIP2
BZIP2 format, using Burrows-Wheeler transform.
@ AUTO
Automatically detect or select compression format.
@ HOST_BUFFER
Input/output is a buffer in host memory.
@ USER_IMPLEMENTED
Input/output is handled by a custom user class.
@ VOID
Input/output is nothing. No work is done. Useful for benchmarking.
@ FILEPATH
Input/output is a file path.
@ DEVICE_BUFFER
Input/output is a buffer in device memory.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, output_nullability null_policy=output_nullability::PRESERVE, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
Definition: span.hpp:320
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
cuDF interfaces
Definition: host_udf.hpp:26
bool nullable(table_view const &view)
Returns True if any of the columns in the table is nullable. (not entire hierarchy)
@ ALL
All initialization steps (default behavior)
APIs for spans.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Detailed name (and optionally nullability) information for output columns.
Definition: io/types.hpp:227
std::optional< bool > is_nullable
Column nullability.
Definition: io/types.hpp:229
std::optional< bool > is_binary
Column is binary (i.e. not a list)
Definition: io/types.hpp:230
std::vector< column_name_info > children
Child column names.
Definition: io/types.hpp:232
bool operator==(column_name_info const &rhs) const
Compares two column name info structs for equality.
Definition: io/types.hpp:256
std::optional< int32_t > type_length
Byte width of data (for fixed length data)
Definition: io/types.hpp:231
std::string name
Column name.
Definition: io/types.hpp:228
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
Construct a column name info with a name, optional nullability, and no children.
Definition: io/types.hpp:241
A file path with an optional known size in bytes.
Definition: io/types.hpp:320
std::string path
Path or URL of the input file.
Definition: io/types.hpp:321
Information used while writing partitioned datasets.
Definition: io/types.hpp:969
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
Definition: io/types.hpp:980
size_type start_row
The start row of the partition.
Definition: io/types.hpp:970
size_type num_rows
The number of rows in the partition.
Definition: io/types.hpp:971
Destination information for write interfaces.
Definition: io/types.hpp:515
auto const & buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:605
sink_info(std::vector< std::vector< char > * > buffers)
Construct a new sink info object for multiple host buffers.
Definition: io/types.hpp:549
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:599
sink_info(std::string file_path)
Construct a new sink info object for a single file.
Definition: io/types.hpp:539
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
Definition: io/types.hpp:577
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
Definition: io/types.hpp:565
auto num_sinks() const
Get the number of sinks.
Definition: io/types.hpp:593
auto const & user_sinks() const
Get the user sinks of the input.
Definition: io/types.hpp:611
sink_info(size_t num_sinks)
Construct a new sink info object.
Definition: io/types.hpp:522
auto type() const
Get the type of the input.
Definition: io/types.hpp:587
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
Definition: io/types.hpp:558
sink_info(std::vector< std::string > file_paths)
Construct a new sink info object for multiple files.
Definition: io/types.hpp:529
Source information for read interfaces.
Definition: io/types.hpp:328
auto const & device_buffers() const
Get the device buffers of the input.
Definition: io/types.hpp:478
source_info()=default
Default constructor for the next-gen parquet reader.
source_info(std::vector< std::string > file_paths)
Construct a new source info object for multiple files.
Definition: io/types.hpp:339
auto const & filepath_sources() const
Get the filepath sources of the input.
Definition: io/types.hpp:460
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:466
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
Definition: io/types.hpp:399
source_info(std::vector< filepath_source > sources)
Construct a new source info object from filepath sources with optional known sizes.
Definition: io/types.hpp:364
auto num_sources() const
Get the number of input sources.
Definition: io/types.hpp:491
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
Definition: io/types.hpp:376
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
Definition: io/types.hpp:424
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
Definition: io/types.hpp:444
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
Definition: io/types.hpp:434
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
Definition: io/types.hpp:412
auto const & host_buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:472
auto type() const
Get the type of the input.
Definition: io/types.hpp:454
source_info(std::string file_path)
Construct a new source info object for a single file.
Definition: io/types.hpp:354
auto const & user_sources() const
Get the user sources of the input.
Definition: io/types.hpp:484
Table metadata returned by IO readers.
Definition: io/types.hpp:267
std::vector< std::unordered_map< std::string, std::string > > per_file_user_data
Per file format-dependent metadata as key-values pairs.
Definition: io/types.hpp:276
std::optional< size_type > num_row_groups_after_stats_filter
Definition: io/types.hpp:281
std::optional< size_type > num_row_groups_after_bloom_filter
Definition: io/types.hpp:285
std::vector< size_t > num_rows_per_source
Definition: io/types.hpp:270
std::vector< column_name_info > schema_info
Detailed name information for the entire output hierarchy.
Definition: io/types.hpp:269
std::map< std::string, std::string > user_data
Definition: io/types.hpp:273
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:293
std::unique_ptr< table > tbl
Table.
Definition: io/types.hpp:294
table_metadata metadata
Table metadata.
Definition: io/types.hpp:295
Class definition for cudf::table.
Type declarations for libcudf.