io/types.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
22 #pragma once
23 
24 #include <cudf/table/table.hpp>
25 #include <cudf/types.hpp>
26 #include <cudf/utilities/span.hpp>
27 
28 #include <map>
29 #include <memory>
30 #include <optional>
31 #include <string>
32 #include <unordered_map>
33 #include <utility>
34 #include <vector>
35 
36 namespace CUDF_EXPORT cudf {
38 namespace io {
39 class data_sink;
40 class datasource;
41 } // namespace io
42 } // namespace CUDF_EXPORT cudf
43 
45 namespace CUDF_EXPORT cudf {
47 namespace io {
/**
 * @brief Compression algorithms.
 *
 * Enumerator values are implicit and start at 0; do not reorder.
 */
enum class compression_type : int32_t {
  NONE,    ///< No compression
  AUTO,    ///< Automatically detect or select compression format
  SNAPPY,  ///< Snappy format
  GZIP,    ///< GZIP format
  BZIP2,   ///< BZIP2 format, using Burrows-Wheeler transform
  BROTLI,  ///< BROTLI format
  ZIP,     ///< ZIP format, using DEFLATE algorithm
  XZ,      ///< XZ format, using LZMA(2) algorithm
  ZLIB,    ///< ZLIB format
  LZ4,     ///< LZ4 format
  LZO,     ///< LZO format
  ZSTD     ///< ZSTD format
};
71 
/**
 * @brief Data source or destination types.
 *
 * Enumerator values are implicit and start at 0; do not reorder.
 */
enum class io_type : int32_t {
  FILEPATH,          ///< Input/output is a file path
  HOST_BUFFER,       ///< Input/output is a buffer in host memory
  DEVICE_BUFFER,     ///< Input/output is a buffer in device memory
  VOID,              ///< Input/output is nothing. No work is done. Useful for benchmarking
  USER_IMPLEMENTED,  ///< Input/output is handled by a custom user class
};
82 
/**
 * @brief Behavior when handling quotations in field data.
 */
enum class quote_style : int32_t {
  MINIMAL,     ///< Quote only fields which contain special characters
  ALL,         ///< Quote all fields
  NONNUMERIC,  ///< Quote all non-numeric fields
  NONE         ///< Never quote fields
};
92 
/**
 * @brief Column statistics granularity type for parquet/orc writers.
 *
 * Unscoped on purpose: the enumerators are referenced without qualification.
 */
enum statistics_freq : int32_t {
  STATISTICS_NONE     = 0,  ///< No column statistics
  STATISTICS_ROWGROUP = 1,  ///< Per-Rowgroup column statistics
  STATISTICS_PAGE     = 2,  ///< Per-page column statistics
  STATISTICS_COLUMN   = 3,  ///< Full column and offset indices. Implies STATISTICS_ROWGROUP
};
102 
/**
 * @brief Valid encodings for use with `column_in_metadata::set_encoding()`
 *
 * Only `USE_DEFAULT` has an explicit value (-1); the remaining enumerators count up from 0 in
 * declaration order, so they must not be reordered.
 */
enum class column_encoding : int32_t {
  // Common encodings:
  USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
  DICTIONARY,        ///< Use dictionary encoding
  // Parquet encodings:
  PLAIN,                    ///< Use plain encoding
  DELTA_BINARY_PACKED,      ///< Use DELTA_BINARY_PACKED encoding
  DELTA_LENGTH_BYTE_ARRAY,  ///< Use DELTA_LENGTH_BYTE_ARRAY encoding
  DELTA_BYTE_ARRAY,         ///< Use DELTA_BYTE_ARRAY encoding
  BYTE_STREAM_SPLIT,        ///< Use BYTE_STREAM_SPLIT encoding
  // ORC encodings:
  DIRECT,         ///< Use DIRECT encoding
  DIRECT_V2,      ///< Use DIRECT_V2 encoding
  DICTIONARY_V2,  ///< Use DICTIONARY_V2 encoding
};
123 
/**
 * @brief Statistics about compression performed by a writer.
 */
class writer_compression_statistics {
 public:
  /**
   * @brief Default constructor; all counters start at zero.
   */
  writer_compression_statistics() = default;

  /**
   * @brief Constructor with initial values.
   *
   * @param num_compressed_bytes The number of bytes that were successfully compressed
   * @param num_failed_bytes The number of bytes that failed to compress
   * @param num_skipped_bytes The number of bytes that were skipped during compression
   * @param num_compressed_output_bytes The number of bytes in the compressed output
   */
  writer_compression_statistics(size_t num_compressed_bytes,
                                size_t num_failed_bytes,
                                size_t num_skipped_bytes,
                                size_t num_compressed_output_bytes)
    : _num_compressed_bytes(num_compressed_bytes),
      _num_failed_bytes(num_failed_bytes),
      _num_skipped_bytes(num_skipped_bytes),
      _num_compressed_output_bytes(num_compressed_output_bytes)
  {
  }

  /**
   * @brief Adds the values from another `writer_compression_statistics` object.
   *
   * @param other The other writer_compression_statistics object
   * @return writer_compression_statistics& Reference to this object
   */
  writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept
  {
    _num_compressed_bytes += other._num_compressed_bytes;
    _num_failed_bytes += other._num_failed_bytes;
    _num_skipped_bytes += other._num_skipped_bytes;
    _num_compressed_output_bytes += other._num_compressed_output_bytes;
    return *this;
  }

  /**
   * @brief Returns the number of bytes in blocks that were successfully compressed.
   *
   * @return size_t The number of bytes that were successfully compressed
   */
  [[nodiscard]] auto num_compressed_bytes() const noexcept { return _num_compressed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that failed to compress.
   *
   * @return size_t The number of bytes that failed to compress
   */
  [[nodiscard]] auto num_failed_bytes() const noexcept { return _num_failed_bytes; }

  /**
   * @brief Returns the number of bytes in blocks that were skipped during compression.
   *
   * @return size_t The number of bytes that were skipped during compression
   */
  [[nodiscard]] auto num_skipped_bytes() const noexcept { return _num_skipped_bytes; }

  /**
   * @brief Returns the total size of compression inputs.
   *
   * @return size_t Sum of the compressed, failed and skipped byte counts
   */
  [[nodiscard]] auto num_total_input_bytes() const noexcept
  {
    return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes();
  }

  /**
   * @brief Returns the compression ratio for the successfully compressed blocks.
   *
   * Computed as (successfully compressed input bytes) / (compressed output bytes). There is no
   * guard against a zero output size: the division is done in floating point, so the result is
   * non-finite in that case (NaN for 0/0 under IEEE-754).
   *
   * @return double The compression ratio
   */
  [[nodiscard]] auto compression_ratio() const noexcept
  {
    return static_cast<double>(num_compressed_bytes()) / _num_compressed_output_bytes;
  }

 private:
  std::size_t _num_compressed_bytes        = 0;  ///< Input bytes that compressed successfully
  std::size_t _num_failed_bytes            = 0;  ///< Input bytes that failed to compress
  std::size_t _num_skipped_bytes           = 0;  ///< Input bytes skipped during compression
  std::size_t _num_compressed_output_bytes = 0;  ///< Bytes in the compressed output
};
221 
/**
 * @brief Control use of dictionary encoding for parquet writer.
 *
 * Unscoped on purpose: the enumerators are referenced without qualification.
 */
enum dictionary_policy : int32_t {
  NEVER    = 0,  ///< Never use dictionary encoding
  ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
  ALWAYS   = 2   ///< Use dictionary regardless of impact on compression
};
230 
/**
 * @brief Detailed name (and optionally nullability) information for output columns.
 *
 * The structure is recursive: each entry owns the entries of its child columns.
 */
struct column_name_info {
  std::string name;                        ///< Column name
  std::optional<bool> is_nullable;         ///< Column nullability
  std::optional<bool> is_binary;           ///< Column is binary (i.e. not a list)
  std::optional<int32_t> type_length;      ///< Byte width of data (for fixed length data)
  std::vector<column_name_info> children;  ///< Child column names

  /**
   * @brief Construct a column name info with a name, optional nullability, and no children
   *
   * `type_length` is left unset by this constructor.
   *
   * @param _name Column name
   * @param _is_nullable True if column is nullable
   * @param _is_binary True if column is binary data
   */
  column_name_info(std::string _name,
                   std::optional<bool> _is_nullable = std::nullopt,
                   std::optional<bool> _is_binary   = std::nullopt)
    : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
  {
  }

  column_name_info() = default;

  /**
   * @brief Compares two column name info structs for equality
   *
   * Every field is compared, and children are compared recursively.
   *
   * @param rhs column name info struct to compare against
   * @return boolean indicating if this and rhs are equal
   */
  bool operator==(column_name_info const& rhs) const
  {
    return ((name == rhs.name) && (is_nullable == rhs.is_nullable) &&
            (is_binary == rhs.is_binary) && (type_length == rhs.type_length) &&
            (children == rhs.children));
  }
};
273 
278  std::vector<column_name_info>
280  std::vector<size_t> num_rows_per_source;
283  std::map<std::string, std::string> user_data;
285  std::vector<std::unordered_map<std::string, std::string>>
287 
288  // The following variables are currently only computed for Parquet reader
289  size_type num_input_row_groups{0};
290  std::optional<size_type>
294  std::optional<size_type>
298 };
299 
304  std::unique_ptr<table> tbl;
306 };
307 
/**
 * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to
 * bytes.
 *
 * Top-level cv-qualifiers on `T` are ignored. The accepted types are `int8_t`, `char`, `uint8_t`,
 * `unsigned char` and `std::byte`.
 *
 * @tparam T The representation type
 * @return `true` if the type is considered a byte-like type
 */
template <typename T>
constexpr inline auto is_byte_like_type()
{
  using non_cv_T = std::remove_cv_t<T>;
  return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
         std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
         std::is_same_v<non_cv_T, std::byte>;
}
323 
327 struct source_info {
331  source_info() = default;
332 
338  explicit source_info(std::vector<std::string> file_paths)
339  : _type(io_type::FILEPATH), _num_sources(file_paths.size()), _filepaths(std::move(file_paths))
340  {
341  }
342 
348  explicit source_info(std::string file_path)
349  : _type(io_type::FILEPATH), _num_sources(1), _filepaths({std::move(file_path)})
350  {
351  }
352 
358  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
359  explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
360  : _type(io_type::HOST_BUFFER), _num_sources(host_buffers.size())
361  {
362  if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
363  _host_buffers.reserve(host_buffers.size());
364  std::transform(host_buffers.begin(),
365  host_buffers.end(),
366  std::back_inserter(_host_buffers),
367  [](auto const s) {
368  return cudf::host_span<std::byte const>{
369  reinterpret_cast<std::byte const*>(s.data()), s.size()};
370  });
371  } else {
372  _host_buffers.assign(host_buffers.begin(), host_buffers.end());
373  }
374  }
375 
381  template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
382  explicit source_info(cudf::host_span<T> host_data)
383  : _type(io_type::HOST_BUFFER),
384  _num_sources(1),
385  _host_buffers{cudf::host_span<std::byte const>(
386  reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
387  {
388  }
389 
396  : _type(io_type::DEVICE_BUFFER),
397  _num_sources(device_buffers.size()),
398  _device_buffers(device_buffers.begin(), device_buffers.end())
399  {
400  }
401 
408  : _type(io_type::DEVICE_BUFFER), _num_sources(1), _device_buffers({{d_buffer}})
409  {
410  }
411 
417  explicit source_info(std::vector<cudf::io::datasource*> const& sources)
418  : _type(io_type::USER_IMPLEMENTED), _num_sources(sources.size()), _user_sources(sources)
419  {
420  }
421 
428  : _type(io_type::USER_IMPLEMENTED), _num_sources(1), _user_sources({source})
429  {
430  }
431 
437  [[nodiscard]] auto type() const { return _type; }
443  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
449  [[nodiscard]] auto const& host_buffers() const { return _host_buffers; }
455  [[nodiscard]] auto const& device_buffers() const { return _device_buffers; }
461  [[nodiscard]] auto const& user_sources() const { return _user_sources; }
462 
468  [[nodiscard]] auto num_sources() const { return _num_sources; }
469 
470  private:
471  io_type _type = io_type::VOID;
472  size_t _num_sources = 0;
473  std::vector<std::string> _filepaths;
474  std::vector<cudf::host_span<std::byte const>> _host_buffers;
475  std::vector<cudf::device_span<std::byte const>> _device_buffers;
476  std::vector<cudf::io::datasource*> _user_sources;
477 };
478 
482 struct sink_info {
483  sink_info() = default;
489  sink_info(size_t num_sinks) : _num_sinks(num_sinks) {}
490 
496  explicit sink_info(std::vector<std::string> file_paths)
497  : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(std::move(file_paths))
498  {
499  }
500 
506  explicit sink_info(std::string file_path)
507  : _type(io_type::FILEPATH), _filepaths({std::move(file_path)})
508  {
509  }
510 
516  explicit sink_info(std::vector<std::vector<char>*> buffers)
517  : _type(io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(std::move(buffers))
518  {
519  }
525  explicit sink_info(std::vector<char>* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {}
526 
532  explicit sink_info(std::vector<cudf::io::data_sink*> const& user_sinks)
533  : _type(io_type::USER_IMPLEMENTED),
534  _num_sinks(user_sinks.size()),
535  _user_sinks(std::move(user_sinks))
536  {
537  }
538 
544  explicit sink_info(class cudf::io::data_sink* user_sink)
545  : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink})
546  {
547  }
548 
554  [[nodiscard]] auto type() const { return _type; }
560  [[nodiscard]] auto num_sinks() const { return _num_sinks; }
566  [[nodiscard]] auto const& filepaths() const { return _filepaths; }
572  [[nodiscard]] auto const& buffers() const { return _buffers; }
578  [[nodiscard]] auto const& user_sinks() const { return _user_sinks; }
579 
580  private:
581  io_type _type = io_type::VOID;
582  size_t _num_sinks = 1;
583  std::vector<std::string> _filepaths;
584  std::vector<std::vector<char>*> _buffers;
585  std::vector<cudf::io::data_sink*> _user_sinks;
586 };
587 
588 class table_input_metadata;
589 
594  friend table_input_metadata;
595  std::string _name = "";
596  std::optional<bool> _nullable;
597  bool _list_column_is_map = false;
598  bool _use_int96_timestamp = false;
599  bool _output_as_binary = false;
600  bool _skip_compression = false;
601  std::optional<uint8_t> _decimal_precision;
602  std::optional<int32_t> _parquet_field_id;
603  std::optional<int32_t> _type_length;
604  std::vector<column_in_metadata> children;
605  column_encoding _encoding = column_encoding::USE_DEFAULT;
606 
607  public:
608  column_in_metadata() = default;
614  column_in_metadata(std::string_view name) : _name{name} {}
622  {
623  children.push_back(child);
624  return *this;
625  }
626 
633  column_in_metadata& set_name(std::string const& name) noexcept
634  {
635  _name = name;
636  return *this;
637  }
638 
646  {
647  _nullable = nullable;
648  return *this;
649  }
650 
659  {
660  _list_column_is_map = true;
661  return *this;
662  }
663 
673  {
674  _use_int96_timestamp = req;
675  return *this;
676  }
677 
685  column_in_metadata& set_decimal_precision(uint8_t precision) noexcept
686  {
687  _decimal_precision = precision;
688  return *this;
689  }
690 
698  column_in_metadata& set_type_length(int32_t length) noexcept
699  {
700  _type_length = length;
701  return *this;
702  }
703 
710  column_in_metadata& set_parquet_field_id(int32_t field_id) noexcept
711  {
712  _parquet_field_id = field_id;
713  return *this;
714  }
715 
725  {
726  _output_as_binary = binary;
727  if (_output_as_binary and children.size() == 1) {
728  children.emplace_back();
729  } else if (!_output_as_binary and children.size() == 2) {
730  children.pop_back();
731  }
732  return *this;
733  }
734 
743  {
744  _skip_compression = skip;
745  return *this;
746  }
747 
759  {
760  _encoding = encoding;
761  return *this;
762  }
763 
770  column_in_metadata& child(size_type i) noexcept { return children[i]; }
771 
778  [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept { return children[i]; }
779 
785  [[nodiscard]] std::string const& get_name() const noexcept { return _name; }
786 
792  [[nodiscard]] bool is_nullability_defined() const noexcept { return _nullable.has_value(); }
793 
801  [[nodiscard]] bool nullable() const { return _nullable.value(); }
802 
808  [[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; }
809 
816  [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; }
817 
823  [[nodiscard]] bool is_decimal_precision_set() const noexcept
824  {
825  return _decimal_precision.has_value();
826  }
827 
835  [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
836 
842  [[nodiscard]] bool is_type_length_set() const noexcept { return _type_length.has_value(); }
843 
851  [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); }
852 
858  [[nodiscard]] bool is_parquet_field_id_set() const noexcept
859  {
860  return _parquet_field_id.has_value();
861  }
862 
870  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
871 
877  [[nodiscard]] size_type num_children() const noexcept { return children.size(); }
878 
884  [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; }
885 
891  [[nodiscard]] bool is_enabled_skip_compression() const noexcept { return _skip_compression; }
892 
898  [[nodiscard]] column_encoding get_encoding() const { return _encoding; }
899 };
900 
905  public:
906  table_input_metadata() = default; // Required by cython
907 
916 
925  explicit table_input_metadata(table_metadata const& metadata);
926 
927  std::vector<column_in_metadata> column_metadata;
928 };
929 
939 
940  partition_info() = default;
947  partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows)
948  {
949  }
950 };
951 
957  // Whether to read binary data as a string column
958  bool _convert_binary_to_strings{true};
959  int32_t _type_length{0};
960 
961  std::vector<reader_column_schema> children;
962 
963  public:
964  reader_column_schema() = default;
965 
971  reader_column_schema(size_type number_of_children) { children.resize(number_of_children); }
972 
979  {
980  children.assign(child_span.begin(), child_span.end());
981  }
982 
990  {
991  children.push_back(child);
992  return *this;
993  }
994 
1001  [[nodiscard]] reader_column_schema& child(size_type i) { return children[i]; }
1002 
1009  [[nodiscard]] reader_column_schema const& child(size_type i) const { return children[i]; }
1010 
1020  {
1021  _convert_binary_to_strings = convert_to_string;
1022  return *this;
1023  }
1024 
1032  {
1033  _type_length = type_length;
1034  return *this;
1035  }
1036 
1042  [[nodiscard]] bool is_enabled_convert_binary_to_strings() const
1043  {
1044  return _convert_binary_to_strings;
1045  }
1046 
1052  [[nodiscard]] int32_t get_type_length() const { return _type_length; }
1053 
1059  [[nodiscard]] size_t get_num_children() const { return children.size(); }
1060 };
1061  // end of group
1063 } // namespace io
1064 } // namespace CUDF_EXPORT cudf
constexpr CUDF_HOST_DEVICE iterator end() const noexcept
Returns an iterator to the element following the last element of the span.
Definition: span.hpp:104
constexpr CUDF_HOST_DEVICE iterator begin() const noexcept
Returns an iterator to the first element of the span.
Definition: span.hpp:96
Metadata for a column.
Definition: io/types.hpp:593
column_in_metadata & set_name(std::string const &name) noexcept
Set the name of this column.
Definition: io/types.hpp:633
column_in_metadata & add_child(column_in_metadata const &child)
Add the children metadata of this column.
Definition: io/types.hpp:621
bool is_enabled_output_as_binary() const noexcept
Get whether to encode this column as binary or string data.
Definition: io/types.hpp:884
column_in_metadata & set_parquet_field_id(int32_t field_id) noexcept
Set the parquet field id of this column.
Definition: io/types.hpp:710
column_in_metadata & set_int96_timestamps(bool req) noexcept
Specifies whether this timestamp column should be encoded using the deprecated int96 physical type....
Definition: io/types.hpp:672
column_in_metadata & set_decimal_precision(uint8_t precision) noexcept
Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.
Definition: io/types.hpp:685
bool is_enabled_int96_timestamps() const noexcept
Get whether to encode this timestamp column using deprecated int96 physical type.
Definition: io/types.hpp:816
bool is_parquet_field_id_set() const noexcept
Get whether parquet field id has been set for this column.
Definition: io/types.hpp:858
bool is_decimal_precision_set() const noexcept
Get whether precision has been set for this decimal column.
Definition: io/types.hpp:823
bool is_type_length_set() const noexcept
Get whether type length has been set for this column.
Definition: io/types.hpp:842
bool nullable() const
Gets the explicitly set nullability for this column.
Definition: io/types.hpp:801
size_type num_children() const noexcept
Get the number of children of this column.
Definition: io/types.hpp:877
bool is_map() const noexcept
If this is the metadata of a list column, returns whether it is to be encoded as a map.
Definition: io/types.hpp:808
uint8_t get_decimal_precision() const
Get the decimal precision that was set for this column.
Definition: io/types.hpp:835
column_in_metadata & set_encoding(column_encoding encoding) noexcept
Sets the encoding to use for this column.
Definition: io/types.hpp:758
column_encoding get_encoding() const
Get the encoding that was set for this column.
Definition: io/types.hpp:898
uint8_t get_type_length() const
Get the type length that was set for this column.
Definition: io/types.hpp:851
column_in_metadata & set_output_as_binary(bool binary) noexcept
Specifies whether this column should be written as binary or string data. Only valid for the following...
Definition: io/types.hpp:724
column_in_metadata & child(size_type i) noexcept
Get reference to a child of this column.
Definition: io/types.hpp:770
column_in_metadata & set_type_length(int32_t length) noexcept
Set the data length of the column. Only valid if this column is a fixed-length byte array.
Definition: io/types.hpp:698
bool is_enabled_skip_compression() const noexcept
Get whether to skip compressing this column.
Definition: io/types.hpp:891
int32_t get_parquet_field_id() const
Get the parquet field id that was set for this column.
Definition: io/types.hpp:870
column_in_metadata & set_list_column_as_map() noexcept
Specify that this list column should be encoded as a map in the written file.
Definition: io/types.hpp:658
column_in_metadata(std::string_view name)
Construct a new column in metadata object.
Definition: io/types.hpp:614
std::string const & get_name() const noexcept
Get the name of this column.
Definition: io/types.hpp:785
column_in_metadata & set_nullability(bool nullable) noexcept
Set the nullability of this column.
Definition: io/types.hpp:645
bool is_nullability_defined() const noexcept
Get whether nullability has been explicitly set for this column.
Definition: io/types.hpp:792
column_in_metadata const & child(size_type i) const noexcept
Get const reference to a child of this column.
Definition: io/types.hpp:778
column_in_metadata & set_skip_compression(bool skip) noexcept
Specifies whether this column should not be compressed regardless of the compression codec specified ...
Definition: io/types.hpp:742
Interface class for storing the output data from the writers.
Definition: data_sink.hpp:43
Interface class for providing input data to the readers.
Definition: datasource.hpp:42
schema element for reader
Definition: io/types.hpp:956
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
Definition: io/types.hpp:1009
reader_column_schema & set_type_length(int32_t type_length)
Sets the length of fixed length data.
Definition: io/types.hpp:1031
bool is_enabled_convert_binary_to_strings() const
Get whether to encode this column as binary or string data.
Definition: io/types.hpp:1042
int32_t get_type_length() const
Get the length in bytes of this fixed length data.
Definition: io/types.hpp:1052
reader_column_schema(host_span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
Definition: io/types.hpp:978
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether this column should be written as binary or string data Only valid for the following...
Definition: io/types.hpp:1019
size_t get_num_children() const
Get the number of child objects.
Definition: io/types.hpp:1059
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
Definition: io/types.hpp:989
reader_column_schema & child(size_type i)
Get reference to a child of this column.
Definition: io/types.hpp:1001
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
Definition: io/types.hpp:971
Metadata for a table.
Definition: io/types.hpp:904
table_input_metadata(table_view const &table)
Construct a new table_input_metadata from a table_view.
table_input_metadata(table_metadata const &metadata)
Construct a new table_input_metadata from a table_metadata object.
std::vector< column_in_metadata > column_metadata
List of column metadata.
Definition: io/types.hpp:927
Statistics about compression performed by a writer.
Definition: io/types.hpp:127
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
Definition: io/types.hpp:210
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
Definition: io/types.hpp:197
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
Definition: io/types.hpp:159
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
Definition: io/types.hpp:183
writer_compression_statistics()=default
Default constructor.
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
Definition: io/types.hpp:190
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
Definition: io/types.hpp:142
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
Definition: io/types.hpp:176
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:96
column_encoding
Valid encodings for use with column_in_metadata::set_encoding()
Definition: io/types.hpp:106
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:86
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
Definition: io/types.hpp:316
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:225
compression_type
Compression algorithms.
Definition: io/types.hpp:57
io_type
Data source or destination types.
Definition: io/types.hpp:75
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
Definition: io/types.hpp:100
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:98
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:97
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:99
@ USE_DEFAULT
No encoding has been requested, use default encoding.
@ MINIMAL
Quote only fields which contain special characters.
@ NONNUMERIC
Quote all non-numeric fields.
@ ALWAYS
Use dictionary regardless of impact on compression.
Definition: io/types.hpp:228
@ ADAPTIVE
Use dictionary when it will not impact compression.
Definition: io/types.hpp:227
@ NEVER
Never use dictionary encoding.
Definition: io/types.hpp:226
@ XZ
XZ format, using LZMA(2) algorithm.
@ ZIP
ZIP format, using DEFLATE algorithm.
@ BZIP2
BZIP2 format, using Burrows-Wheeler transform.
@ AUTO
Automatically detect or select compression format.
@ HOST_BUFFER
Input/output is a buffer in host memory.
@ USER_IMPLEMENTED
Input/output is handled by a custom user class.
@ VOID
Input/output is nothing. No work is done. Useful for benchmarking.
@ FILEPATH
Input/output is a file path.
@ DEVICE_BUFFER
Input/output is a buffer in device memory.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF interfaces
Definition: host_udf.hpp:37
bool nullable(table_view const &view)
Returns True if any of the columns in the table is nullable. (not entire hierarchy)
@ ALL
All initialization steps (default behavior)
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:334
C++20 std::span with reduced feature set.
Definition: span.hpp:193
Detailed name (and optionally nullability) information for output columns.
Definition: io/types.hpp:237
std::optional< bool > is_nullable
Column nullability.
Definition: io/types.hpp:239
std::optional< bool > is_binary
Column is binary (i.e. not a list)
Definition: io/types.hpp:240
std::vector< column_name_info > children
Child column names.
Definition: io/types.hpp:242
bool operator==(column_name_info const &rhs) const
Compares two column name info structs for equality.
Definition: io/types.hpp:266
std::optional< int32_t > type_length
Byte width of data (for fixed length data)
Definition: io/types.hpp:241
std::string name
Column name.
Definition: io/types.hpp:238
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
Construct a column name info with a name, optional nullability, and no children.
Definition: io/types.hpp:251
Information used while writing partitioned datasets.
Definition: io/types.hpp:936
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
Definition: io/types.hpp:947
size_type start_row
The start row of the partition.
Definition: io/types.hpp:937
size_type num_rows
The number of rows in the partition.
Definition: io/types.hpp:938
Destination information for write interfaces.
Definition: io/types.hpp:482
auto const & buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:572
sink_info(std::vector< std::vector< char > * > buffers)
Construct a new sink info object for multiple host buffers.
Definition: io/types.hpp:516
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:566
sink_info(std::string file_path)
Construct a new sink info object for a single file.
Definition: io/types.hpp:506
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
Definition: io/types.hpp:544
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
Definition: io/types.hpp:532
auto num_sinks() const
Get the number of sinks.
Definition: io/types.hpp:560
auto const & user_sinks() const
Get the user sinks of the input.
Definition: io/types.hpp:578
sink_info(size_t num_sinks)
Construct a new sink info object.
Definition: io/types.hpp:489
auto type() const
Get the type of the input.
Definition: io/types.hpp:554
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
Definition: io/types.hpp:525
sink_info(std::vector< std::string > file_paths)
Construct a new sink info object for multiple files.
Definition: io/types.hpp:496
Source information for read interfaces.
Definition: io/types.hpp:327
auto const & device_buffers() const
Get the device buffers of the input.
Definition: io/types.hpp:455
source_info()=default
Default constructor for the next-gen parquet reader.
source_info(std::vector< std::string > file_paths)
Construct a new source info object for multiple files.
Definition: io/types.hpp:338
auto const & filepaths() const
Get the filepaths of the input.
Definition: io/types.hpp:443
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
Definition: io/types.hpp:382
auto num_sources() const
Get the number of input sources.
Definition: io/types.hpp:468
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
Definition: io/types.hpp:359
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
Definition: io/types.hpp:407
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
Definition: io/types.hpp:427
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
Definition: io/types.hpp:417
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
Definition: io/types.hpp:395
auto const & host_buffers() const
Get the host buffers of the input.
Definition: io/types.hpp:449
auto type() const
Get the type of the input.
Definition: io/types.hpp:437
source_info(std::string file_path)
Construct a new source info object for a single file.
Definition: io/types.hpp:348
auto const & user_sources() const
Get the user sources of the input.
Definition: io/types.hpp:461
Table metadata returned by IO readers.
Definition: io/types.hpp:277
std::vector< std::unordered_map< std::string, std::string > > per_file_user_data
Per file format-dependent metadata as key-values pairs.
Definition: io/types.hpp:286
std::optional< size_type > num_row_groups_after_stats_filter
Definition: io/types.hpp:291
std::optional< size_type > num_row_groups_after_bloom_filter
Definition: io/types.hpp:295
std::vector< size_t > num_rows_per_source
Definition: io/types.hpp:280
std::vector< column_name_info > schema_info
Detailed name information for the entire output hierarchy.
Definition: io/types.hpp:279
std::map< std::string, std::string > user_data
Definition: io/types.hpp:283
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:303
std::unique_ptr< table > tbl
Table.
Definition: io/types.hpp:304
table_metadata metadata
Table metadata.
Definition: io/types.hpp:305
Class definition for cudf::table.
Type declarations for libcudf.