orc.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/detail/orc.hpp>
9 #include <cudf/io/types.hpp>
11 #include <cudf/types.hpp>
12 #include <cudf/utilities/export.hpp>
14 
15 #include <memory>
16 #include <optional>
17 #include <string>
18 #include <unordered_map>
19 #include <utility>
20 #include <vector>
21 
22 namespace CUDF_EXPORT cudf {
23 namespace io {
/// 64MB default ORC stripe size, in bytes (64 * 1024 * 1024).
30 constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024;
/// 1M rows default ORC stripe size, in rows.
31 constexpr size_type default_stripe_size_rows = 1000000;
33 
/// @brief Check if the compression type is supported for reading ORC files.
///
/// @param compression The compression type to check
/// @return Result of the support check (presumably `true` when supported; the definition is not
/// visible here)
43 [[nodiscard]] bool is_supported_read_orc(compression_type compression);
44 
/// @brief Check if the compression type is supported for writing ORC files.
///
/// @param compression The compression type to check
/// @return Result of the support check (presumably `true` when supported; the definition is not
/// visible here)
54 [[nodiscard]] bool is_supported_write_orc(compression_type compression);
55 
60 
65  source_info _source;
66 
67  // Names of columns to read; `nullopt` means all columns
68  std::optional<std::vector<std::string>> _columns;
69 
70  // List of individual stripes to read (ignored if empty)
71  std::vector<std::vector<size_type>> _stripes;
72  // Rows to skip from the start
73  int64_t _skip_rows = 0;
74  // Rows to read; `nullopt` is all
75  std::optional<int64_t> _num_rows;
76 
77  // Whether to use row index to speed-up reading
78  bool _use_index = true;
79 
80  // Whether to use numpy-compatible dtypes
81  bool _use_np_dtypes = true;
82  // Cast timestamp columns to a specific type
83  data_type _timestamp_type{type_id::EMPTY};
84 
85  // Columns that should be read as Decimal128
86  std::vector<std::string> _decimal128_columns;
87 
88  // Ignore writer timezone in the stripe footer, read as UTC timezone
89  bool _ignore_timezone_in_stripe_footer = false;
90 
92 
/// @brief Constructor from source info.
///
/// Declared before the `public:` label below; all other options keep their in-class defaults.
///
/// @param src Source information used to read the ORC file (moved into the options)
98  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
99 
100  public:
/// @brief Default constructor.
106  orc_reader_options() = default;
107 
115 
/// @brief Returns source info.
121  [[nodiscard]] source_info const& get_source() const { return _source; }
122 
/// @brief Returns names of the columns to read, if set (`nullopt` means all columns).
128  [[nodiscard]] auto const& get_columns() const { return _columns; }
129 
/// @brief Returns vector of vectors, stripes to read for each input source.
135  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
136 
/// @brief Returns number of rows to skip from the start.
142  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
143 
/// @brief Returns number of rows to read; `nullopt` if all rows are to be read.
150  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
151 
/// @brief Whether to use row index to speed-up reading.
157  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
158 
/// @brief Whether to use numpy-compatible dtypes.
164  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
165 
/// @brief Returns timestamp type to which timestamp column will be cast.
171  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
172 
/// @brief Returns fully qualified names of columns that should be read as 128-bit Decimal.
178  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const
179  {
180  return _decimal128_columns;
181  }
182 
/// @brief Returns whether to ignore writer timezone in the stripe footer.
188  [[nodiscard]] bool get_ignore_timezone_in_stripe_footer() const
189  {
190  return _ignore_timezone_in_stripe_footer;
191  }
192 
193  // Setters
194 
/// @brief Sets source info.
///
/// @param src The source info (moved into the options)
200  void set_source(source_info src) { _source = std::move(src); }
201 
/// @brief Sets names of the columns to read.
///
/// @param col_names Vector of column names (moved into the options)
207  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
208 
/// @brief Sets list of stripes to read for each input source.
///
/// A non-empty stripe selection is mutually exclusive with `skip_rows` and `num_rows`:
/// `CUDF_EXPECTS` fails if `stripes` is non-empty while `_skip_rows` is non-zero or `_num_rows`
/// holds a value. Passing an empty vector is always accepted.
///
/// @param stripes Vector of vectors of stripe indices (moved into the options)
219  void set_stripes(std::vector<std::vector<size_type>> stripes)
220  {
221  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
222  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
223  "Can't set stripes along with num_rows");
224  _stripes = std::move(stripes);
225  }
226 
/// @brief Sets number of rows to skip from the start.
///
/// `CUDF_EXPECTS` fails if `rows` is negative, or if `rows` is non-zero while a stripe selection
/// is set (row skipping and explicit stripes are mutually exclusive). Zero is always accepted.
///
/// @param rows Number of rows to skip
235  void set_skip_rows(int64_t rows)
236  {
237  CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
// Message wording mirrors the matching check in set_stripes().
238  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set skip_rows along with stripes");
239  _skip_rows = rows;
240  }
241 
/// @brief Sets number of rows to read.
///
/// `CUDF_EXPECTS` fails if `nrows` is negative or if a stripe selection is set. Unlike
/// set_skip_rows(), zero is not exempt from the stripes check, because any call stores a value
/// in `_num_rows`.
///
/// @param nrows Number of rows to read
250  void set_num_rows(int64_t nrows)
251  {
252  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
253  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
254  _num_rows = nrows;
255  }
256 
/// @brief Enable/Disable use of row index to speed-up reading.
///
/// @param use Boolean value to enable/disable row index use
262  void enable_use_index(bool use) { _use_index = use; }
263 
/// @brief Enable/Disable use of numpy-compatible dtypes.
///
/// @param use Boolean value to enable/disable
269  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
270 
/// @brief Sets timestamp type to which timestamp column will be cast.
///
/// @param type Type of timestamp
276  void set_timestamp_type(data_type type) { _timestamp_type = type; }
277 
/// @brief Set columns that should be read as 128-bit Decimal.
///
/// @param val Vector of column names (moved into the options)
283  void set_decimal128_columns(std::vector<std::string> val)
284  {
285  _decimal128_columns = std::move(val);
286  }
287 };
288 
293  orc_reader_options options;
294 
295  public:
/// @brief Default constructor.
301  explicit orc_reader_options_builder() = default;
302 
/// @brief Constructor from source info.
///
/// @param src The source information used to read the ORC file (moved into the options)
308  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
309 
/// @brief Sets names of the columns to read.
///
/// @param col_names Vector of column names (moved into the options)
/// @return this for chaining
316  orc_reader_options_builder& columns(std::vector<std::string> col_names)
317  {
318  options._columns = std::move(col_names);
319  return *this;
320  }
321 
/// @brief Sets list of individual stripes to read per source.
///
/// Delegates to orc_reader_options::set_stripes(), so the same checks against `skip_rows` and
/// `num_rows` apply.
///
/// @param stripes Vector of vectors of stripe indices (moved into the options)
/// @return this for chaining
328  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
329  {
330  options.set_stripes(std::move(stripes));
331  return *this;
332  }
333 
341  {
342  options.set_skip_rows(rows);
343  return *this;
344  }
345 
353  {
354  options.set_num_rows(nrows);
355  return *this;
356  }
357 
365  {
366  options._use_index = use;
367  return *this;
368  }
369 
377  {
378  options._use_np_dtypes = use;
379  return *this;
380  }
381 
389  {
390  options._timestamp_type = type;
391  return *this;
392  }
393 
/// @brief Columns that should be read as 128-bit Decimal.
///
/// @param val Vector of column names (moved into the options)
/// @return this for chaining
400  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
401  {
402  options._decimal128_columns = std::move(val);
403  return *this;
404  }
405 
413  {
414  options._ignore_timezone_in_stripe_footer = ignore;
415  return *this;
416  }
417 
/// @brief Implicit conversion that moves the orc_reader_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member, so the result must be
/// consumed while the builder is still alive.
421  operator orc_reader_options&&() { return std::move(options); }
422 
/// @brief Moves the orc_reader_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member (not a copy), so the result
/// must be consumed while the builder is still alive.
///
/// @return Rvalue reference to the built `orc_reader_options` object
430  orc_reader_options&& build() { return std::move(options); }
431 };
432 
451  orc_reader_options const& options,
454 
465  public:
472 
518  std::size_t chunk_read_limit,
519  std::size_t pass_read_limit,
520  size_type output_row_granularity,
521  orc_reader_options const& options,
524 
540  std::size_t chunk_read_limit,
541  std::size_t pass_read_limit,
542  orc_reader_options const& options,
545 
559  std::size_t chunk_read_limit,
560  orc_reader_options const& options,
563 
568 
/// @brief Check if there is any data in the given data sources that has not yet been read.
///
/// @return Result of the check (definition not visible here)
574  [[nodiscard]] bool has_next() const;
575 
/// @brief Read a chunk of rows in the given data sources.
///
/// @return The chunk as a table along with its metadata
587  [[nodiscard]] table_with_metadata read_chunk() const;
588 
589  private:
590  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
591 };
592  // end of group
604 
/// ORC-named alias for `statistics_freq::STATISTICS_ROWGROUP` (per-rowgroup column statistics in
/// the generic enum), i.e. statistics at ORC stripe granularity.
614 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
/// ORC-named alias for `statistics_freq::STATISTICS_PAGE` (per-page column statistics in the
/// generic enum), i.e. statistics at ORC row group granularity. Used as the writers' default.
615 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
616 
621  // Specify the sink to use for writer output
622  sink_info _sink;
623  // Specify the compression format to use
624  compression_type _compression = compression_type::SNAPPY;
625  // Specify frequency of statistics collection
626  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
627  // Maximum size of each stripe (unless smaller than a single row group)
628  size_t _stripe_size_bytes = default_stripe_size_bytes;
629  // Maximum number of rows in stripe (unless smaller than a single row group)
630  size_type _stripe_size_rows = default_stripe_size_rows;
631  // Row index stride (maximum number of rows in each row group)
632  size_type _row_index_stride = default_row_index_stride;
633  // Set of columns to output
634  table_view _table;
635  // Optional associated metadata
636  std::optional<table_input_metadata> _metadata;
637  // Optional footer key_value_metadata
638  std::map<std::string, std::string> _user_data;
639  // Optional compression statistics
640  std::shared_ptr<writer_compression_statistics> _compression_stats;
641  // Specify whether string dictionaries should be alphabetically sorted
642  bool _enable_dictionary_sort = true;
643 
645 
653  : _sink(std::move(sink)), _table(std::move(table))
654  {
655  }
656 
657  public:
/// @brief Default constructor; every option keeps its in-class default.
663  explicit orc_writer_options() = default;
664 
674 
/// @brief Returns sink info.
680  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
681 
/// @brief Returns compression type.
687  [[nodiscard]] compression_type get_compression() const { return _compression; }
688 
/// @brief Whether writing column statistics is enabled/disabled.
///
/// @return `true` unless the statistics frequency is `STATISTICS_NONE`
694  [[nodiscard]] bool is_enabled_statistics() const
695  {
696  return _stats_freq != statistics_freq::STATISTICS_NONE;
697  }
698 
/// @brief Returns frequency of statistics collection.
704  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
705 
/// @brief Returns maximum stripe size, in bytes.
711  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
712 
/// @brief Returns maximum stripe size, in rows.
718  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
719 
/// @brief Returns the row index stride.
///
/// The stored stride is first capped at the maximum stripe size in rows, then rounded down to a
/// multiple of 8, so the returned value can be smaller than the value passed to the setter.
///
/// NOTE(review): `std::min` needs `<algorithm>`, which is not among the standard headers visible
/// at the top of this listing — presumably pulled in transitively; confirm.
///
/// @return Effective row index stride
725  [[nodiscard]] auto get_row_index_stride() const
726  {
727  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
728  return unaligned_stride - unaligned_stride % 8;
729  }
730 
/// @brief Returns table to be written to output.
736  [[nodiscard]] table_view get_table() const { return _table; }
737 
/// @brief Returns associated metadata (an empty optional when none was set).
743  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
744 
/// @brief Returns Key-Value footer metadata information.
///
/// NOTE(review): `std::map` needs `<map>`, which is not among the standard headers visible at
/// the top of this listing — presumably pulled in transitively; confirm.
750  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
751  {
752  return _user_data;
753  }
754 
/// @brief Returns a shared pointer to the user-provided compression statistics.
///
/// Returned by value, so each call copies the `shared_ptr`.
760  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
761  {
762  return _compression_stats;
763  }
764 
/// @brief Returns whether string dictionaries should be sorted.
770  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
771 
772  // Setters
773 
780  {
781  _compression = comp;
782  if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
783  }
784 
/// @brief Choose granularity of statistics collection.
///
/// @param val Frequency of statistics collection
795  void enable_statistics(statistics_freq val) { _stats_freq = val; }
796 
/// @brief Sets the maximum stripe size, in bytes.
///
/// `CUDF_EXPECTS` fails if `size_bytes` is below 64KB.
///
/// @param size_bytes Maximum stripe size, in bytes
804  void set_stripe_size_bytes(size_t size_bytes)
805  {
// `<<` binds tighter than `>=`, so this compares against (64 << 10) == 65536 bytes.
806  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
807  _stripe_size_bytes = size_bytes;
808  }
809 
821  {
822  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
823  _stripe_size_rows = size_rows;
824  }
825 
836  {
837  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
838  _row_index_stride = stride;
839  }
840 
/// @brief Sets table to be written to output.
///
/// @param tbl Table for the output
846  void set_table(table_view tbl) { _table = tbl; }
847 
/// @brief Sets associated metadata.
///
/// @param meta Associated metadata (moved into the options)
853  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
854 
/// @brief Sets Key-Value footer metadata.
///
/// @param metadata Key-Value footer metadata (moved into the options)
860  void set_key_value_metadata(std::map<std::string, std::string> metadata)
861  {
862  _user_data = std::move(metadata);
863  }
864 
/// @brief Sets the pointer to the output compression statistics.
///
/// @param comp_stats Shared pointer to the compression statistics object (moved into the options)
870  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
871  {
872  _compression_stats = std::move(comp_stats);
873  }
874 
/// @brief Sets whether string dictionaries should be sorted.
///
/// @param val Boolean value to enable/disable
880  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
881 };
882 
887  orc_writer_options options;
888 
889  public:
896 
/// @brief Constructor from sink and table.
///
/// @param sink The sink used for writer output
/// @param table Table to be written to output
903  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
904  {
905  }
906 
914  {
915  options.set_compression(comp);
916  return *this;
917  }
918 
931  {
932  options._stats_freq = val;
933  return *this;
934  }
935 
943  {
944  options.set_stripe_size_bytes(val);
945  return *this;
946  }
947 
955  {
956  options.set_stripe_size_rows(val);
957  return *this;
958  }
959 
967  {
968  options.set_row_index_stride(val);
969  return *this;
970  }
971 
979  {
980  options._table = tbl;
981  return *this;
982  }
983 
991  {
992  options._metadata = std::move(meta);
993  return *this;
994  }
995 
/// @brief Sets Key-Value footer metadata.
///
/// @param metadata Key-Value footer metadata (moved into the options)
/// @return this for chaining
1002  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
1003  {
1004  options._user_data = std::move(metadata);
1005  return *this;
1006  }
1007 
1015  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1016  {
1017  options._compression_stats = comp_stats;
1018  return *this;
1019  }
1020 
1028  {
1029  options._enable_dictionary_sort = val;
1030  return *this;
1031  }
1032 
/// @brief Implicit conversion that moves the orc_writer_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member, so the result must be
/// consumed while the builder is still alive.
1036  operator orc_writer_options&&() { return std::move(options); }
1037 
/// @brief Moves the orc_writer_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member (not a copy), so the result
/// must be consumed while the builder is still alive.
///
/// @return Rvalue reference to the built `orc_writer_options` object
1045  orc_writer_options&& build() { return std::move(options); }
1046 };
1047 
1061 void write_orc(orc_writer_options const& options,
1063 
1068 
1073  // Specify the sink to use for writer output
1074  sink_info _sink;
1075  // Specify the compression format to use
1076  compression_type _compression = compression_type::SNAPPY;
1077  // Specify granularity of statistics collection
1078  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
1079  // Maximum size of each stripe (unless smaller than a single row group)
1080  size_t _stripe_size_bytes = default_stripe_size_bytes;
1081  // Maximum number of rows in stripe (unless smaller than a single row group)
1082  size_type _stripe_size_rows = default_stripe_size_rows;
1083  // Row index stride (maximum number of rows in each row group)
1084  size_type _row_index_stride = default_row_index_stride;
1085  // Optional associated metadata
1086  std::optional<table_input_metadata> _metadata;
1087  // Optional footer key_value_metadata
1088  std::map<std::string, std::string> _user_data;
1089  // Optional compression statistics
1090  std::shared_ptr<writer_compression_statistics> _compression_stats;
1091  // Specify whether string dictionaries should be alphabetically sorted
1092  bool _enable_dictionary_sort = true;
1093 
1095 
/// @brief Constructor from sink.
///
/// Marked `explicit` so a `sink_info` is never implicitly converted into writer options; this
/// matches the single-argument `orc_reader_options(source_info)` constructor. Direct
/// initialization such as `options{sink}`, as done by the builder, is unaffected.
///
/// @param sink The sink used for writer output (moved into the options)
1101  explicit chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {}
1102 
1103  public:
/// @brief Default constructor; every option keeps its in-class default.
1109  explicit chunked_orc_writer_options() = default;
1110 
1119 
/// @brief Returns sink info.
1125  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1126 
/// @brief Returns compression type.
1132  [[nodiscard]] compression_type get_compression() const { return _compression; }
1133 
/// @brief Returns granularity of statistics collection.
1139  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
1140 
/// @brief Returns maximum stripe size, in bytes.
1146  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
1147 
/// @brief Returns maximum stripe size, in rows.
1153  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
1154 
/// @brief Returns the row index stride.
///
/// The stored stride is first capped at the maximum stripe size in rows, then rounded down to a
/// multiple of 8, so the returned value can be smaller than the value passed to the setter.
///
/// NOTE(review): `std::min` needs `<algorithm>`, which is not among the standard headers visible
/// at the top of this listing — presumably pulled in transitively; confirm.
///
/// @return Effective row index stride
1160  [[nodiscard]] auto get_row_index_stride() const
1161  {
1162  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
1163  return unaligned_stride - unaligned_stride % 8;
1164  }
1165 
/// @brief Returns associated metadata (an empty optional when none was set).
1171  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1172 
/// @brief Returns Key-Value footer metadata information.
1178  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
1179  {
1180  return _user_data;
1181  }
1182 
/// @brief Returns a shared pointer to the user-provided compression statistics.
///
/// Returned by value, so each call copies the `shared_ptr`.
1188  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1189  {
1190  return _compression_stats;
1191  }
1192 
/// @brief Returns whether string dictionaries should be sorted.
1198  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1199 
1200  // Setters
1201 
1208  {
1209  _compression = comp;
1210  if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
1211  }
1212 
/// @brief Choose granularity of statistics collection.
///
/// @param val Frequency of statistics collection
1223  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1224 
/// @brief Sets the maximum stripe size, in bytes.
///
/// `CUDF_EXPECTS` fails if `size_bytes` is below 64KB.
///
/// @param size_bytes Maximum stripe size, in bytes
1232  void set_stripe_size_bytes(size_t size_bytes)
1233  {
// `<<` binds tighter than `>=`, so this compares against (64 << 10) == 65536 bytes.
1234  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1235  _stripe_size_bytes = size_bytes;
1236  }
1237 
1249  {
1250  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1251  _stripe_size_rows = size_rows;
1252  }
1253 
1264  {
1265  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1266  _row_index_stride = stride;
1267  }
1268 
/// @brief Sets associated metadata.
///
/// NOTE(review): this setter is named `metadata`, whereas the equivalent setter on
/// `orc_writer_options` is `set_metadata`; the inconsistency is kept for API compatibility.
///
/// @param meta Associated metadata (moved into the options)
1274  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1275 
/// @brief Sets Key-Value footer metadata.
///
/// @param metadata Key-Value footer metadata (moved into the options)
1281  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1282  {
1283  _user_data = std::move(metadata);
1284  }
1285 
/// @brief Sets the pointer to the output compression statistics.
///
/// @param comp_stats Shared pointer to the compression statistics object (moved into the options)
1291  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1292  {
1293  _compression_stats = std::move(comp_stats);
1294  }
1295 
/// @brief Sets whether string dictionaries should be sorted.
///
/// @param val Boolean value to enable/disable
1301  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1302 };
1303 
1309 
1310  public:
1317 
/// @brief Constructor from sink.
///
/// @param sink The sink used for writer output
1323  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1324 
1332  {
1333  options.set_compression(comp);
1334  return *this;
1335  }
1336 
1349  {
1350  options._stats_freq = val;
1351  return *this;
1352  }
1353 
1361  {
1362  options.set_stripe_size_bytes(val);
1363  return *this;
1364  }
1365 
1373  {
1374  options.set_stripe_size_rows(val);
1375  return *this;
1376  }
1377 
1385  {
1386  options.set_row_index_stride(val);
1387  return *this;
1388  }
1389 
1397  {
1398  options._metadata = std::move(meta);
1399  return *this;
1400  }
1401 
1409  std::map<std::string, std::string> metadata)
1410  {
1411  options._user_data = std::move(metadata);
1412  return *this;
1413  }
1414 
1422  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1423  {
1424  options._compression_stats = comp_stats;
1425  return *this;
1426  }
1427 
1435  {
1436  options._enable_dictionary_sort = val;
1437  return *this;
1438  }
1439 
/// @brief Implicit conversion that moves the chunked_orc_writer_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member, so the result must be
/// consumed while the builder is still alive.
1443  operator chunked_orc_writer_options&&() { return std::move(options); }
1444 
/// @brief Moves the chunked_orc_writer_options member once it's built.
///
/// Returns an rvalue reference to this builder's `options` member (not a copy), so the result
/// must be consumed while the builder is still alive.
///
/// @return Rvalue reference to the built `chunked_orc_writer_options` object
1452  chunked_orc_writer_options&& build() { return std::move(options); }
1453 };
1454 
1477  public:
1483 
1488 
1497 
1505 
/// @brief Finishes the chunked/streamed write process.
1509  void close();
1510 
/// Unique pointer to impl writer class.
1512  std::unique_ptr<orc::detail::writer> writer;
1513 };
1514  // end of group
1516 } // namespace io
1517 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:238
The chunked orc reader class to read an ORC file iteratively into a series of tables,...
Definition: orc.hpp:464
chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from output size limits along with other ORC reader options.
bool has_next() const
Check if there is any data in the given data sources that has not yet been read.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits along with other ORC reader options.
~chunked_orc_reader()
Destructor, destroying the internal reader instance.
table_with_metadata read_chunk() const
Read a chunk of rows in the given data sources.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits, output row granularity, along with other ORC read...
chunked_orc_reader()
Default constructor, this should never be used.
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1307
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1434
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member once it's built.
Definition: orc.hpp:1452
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1360
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1372
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1421
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1408
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1331
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1396
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1323
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1348
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1384
Settings to use for write_orc_chunked().
Definition: orc.hpp:1072
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1232
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1274
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1281
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1291
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:1125
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:1153
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:1160
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1263
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:1139
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1207
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:1178
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1301
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:1171
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1198
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:1132
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1248
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:1188
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:1146
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1223
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1476
~orc_chunked_writer()
Virtual destructor, added so we don't leak detail types.
orc_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1512
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
Builds settings to use for read_orc().
Definition: orc.hpp:292
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:364
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:400
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:376
orc_reader_options_builder & ignore_timezone_in_stripe_footer(bool ignore)
Set whether to ignore writer timezone in the stripe footer.
Definition: orc.hpp:412
orc_reader_options_builder & skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:340
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:308
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:328
orc_reader_options_builder & num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:352
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:316
orc_reader_options && build()
Moves the orc_reader_options member once it's built.
Definition: orc.hpp:430
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:388
Settings to use for read_orc().
Definition: orc.hpp:64
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:142
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:269
void set_num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:250
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:135
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:283
void set_skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:235
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:262
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to read.
Definition: orc.hpp:207
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:219
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:171
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:128
void set_source(source_info src)
Sets source info.
Definition: orc.hpp:200
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:150
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:121
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:164
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:157
bool get_ignore_timezone_in_stripe_footer() const
Returns whether to ignore writer timezone in the stripe footer.
Definition: orc.hpp:188
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:178
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:276
Builds settings to use for write_orc().
Definition: orc.hpp:886
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:978
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:966
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:930
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:990
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:903
orc_writer_options && build()
Moves the orc_writer_options member once it's built.
Definition: orc.hpp:1045
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1002
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1014
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:954
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1027
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:913
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:942
Settings to use for write_orc().
Definition: orc.hpp:620
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:795
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:743
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:750
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:760
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:694
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:711
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:820
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:860
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:718
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:736
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:853
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:704
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:870
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:725
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:846
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:779
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:880
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:835
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:687
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:770
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:804
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:680
Metadata for a table.
Definition: io/types.hpp:893
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
bool is_supported_read_orc(compression_type compression)
Check if the compression type is supported for reading ORC files.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:31
bool is_supported_write_orc(compression_type compression)
Check if the compression type is supported for writing ORC files.
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:32
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:30
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:85
compression_type
Compression algorithms.
Definition: io/types.hpp:46
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:87
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:86
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:88
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:143
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Destination information for write interfaces.
Definition: io/types.hpp:471
Source information for read interfaces.
Definition: io/types.hpp:316
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.