orc.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/orc.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/export.hpp>
25 
26 #include <memory>
27 #include <optional>
28 #include <string>
29 #include <unordered_map>
30 #include <utility>
31 #include <vector>
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io {
/// Default maximum ORC stripe size in bytes: 64 * 1024 * 1024 (64 MiB).
41 constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024;
/// Default maximum number of rows in an ORC stripe: 1,000,000.
/// NOTE(review): `default_row_index_stride`, used by the writer options below, is not
/// visible in this listing (listing line 43 is missing) - confirm its definition.
42 constexpr size_type default_stripe_size_rows = 1000000;
44 
/// Check if the compression type is supported for reading ORC files.
/// @param compression Compression type to query
/// @return Boolean result of the query (must not be discarded)
54 [[nodiscard]] bool is_supported_read_orc(compression_type compression);
55 
/// Check if the compression type is supported for writing ORC files.
/// @param compression Compression type to query
/// @return Boolean result of the query (must not be discarded)
65 [[nodiscard]] bool is_supported_write_orc(compression_type compression);
66 
71 
// Source information for the read
76  source_info _source;
77 
78  // Names of the columns to read; `nullopt` means all columns
79  std::optional<std::vector<std::string>> _columns;
80 
81  // Lists of individual stripes to read, one list per input source (ignored if empty)
82  std::vector<std::vector<size_type>> _stripes;
83  // Number of rows to skip from the start
84  int64_t _skip_rows = 0;
85  // Number of rows to read; `nullopt` means all rows
86  std::optional<int64_t> _num_rows;
87 
88  // Whether to use the row index to speed up reading
89  bool _use_index = true;
90 
91  // Whether to use numpy-compatible dtypes
92  bool _use_np_dtypes = true;
93  // Type to which timestamp columns are cast (defaults to type_id::EMPTY)
94  data_type _timestamp_type{type_id::EMPTY};
95 
96  // Names of the columns that should be read as Decimal128
97  std::vector<std::string> _decimal128_columns;
98 
99  // Whether to ignore the writer timezone in the stripe footer and read as UTC
100  bool _ignore_timezone_in_stripe_footer = false;
101 
103 
/// Constructor from source info; the argument is moved into `_source`.
/// All other settings keep their in-class defaults.
/// @param src source information used to read the ORC file
109  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
110 
111  public:
/// Default constructor: every member takes its in-class default initializer.
117  orc_reader_options() = default;
118 
126 
/// Returns source info.
132  [[nodiscard]] source_info const& get_source() const { return _source; }
133 
/// Returns names of the columns to read, if set (a reference to the stored optional).
139  [[nodiscard]] auto const& get_columns() const { return _columns; }
140 
/// Returns vector of vectors, stripes to read for each input source.
146  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
147 
/// Returns number of rows to skip from the start.
153  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
154 
/// Returns number of rows to read; the optional is empty when no limit was set.
161  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
162 
/// Whether to use row index to speed up reading.
168  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
169 
/// Whether to use numpy-compatible dtypes.
175  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
176 
/// Returns timestamp type to which timestamp columns will be cast.
182  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
183 
/// Returns fully qualified names of columns that should be read as 128-bit Decimal.
189  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const
190  {
191  return _decimal128_columns;
192  }
193 
/// Returns whether to ignore writer timezone in the stripe footer.
199  [[nodiscard]] bool get_ignore_timezone_in_stripe_footer() const
200  {
201  return _ignore_timezone_in_stripe_footer;
202  }
203 
204  // Setters
205 
/// Sets names of the columns to read; the vector is moved into the stored optional,
/// so the optional holds a value afterwards even if `col_names` is empty.
/// @param col_names Vector of column names
211  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
212 
/// Sets list of stripes to read for each input source.
/// A non-empty `stripes` is rejected (CUDF_EXPECTS throws) when skip_rows is non-zero
/// or when num_rows has been set; an empty `stripes` is always accepted.
/// @param stripes Vector of vectors, one list of stripe indices per input source
223  void set_stripes(std::vector<std::vector<size_type>> stripes)
224  {
225  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
226  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
227  "Can't set stripes along with num_rows");
228  _stripes = std::move(stripes);
229  }
230 
/// Sets number of rows to skip from the start.
/// CUDF_EXPECTS throws when `rows` is negative, or when `rows` is non-zero while a
/// stripe selection is set (`rows == 0` is always accepted).
/// The second message now reads "both ... and ...", matching the wording used by
/// set_num_rows; the exception raised is unchanged.
/// @param rows Number of rows to skip
239  void set_skip_rows(int64_t rows)
240  {
241  CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
242  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows and stripes");
243  _skip_rows = rows;
244  }
245 
/// Sets number of rows to read.
/// CUDF_EXPECTS throws when `nrows` is negative or when a stripe selection is set.
/// Unlike set_skip_rows, there is no exemption for `nrows == 0`.
/// @param nrows Number of rows to read
254  void set_num_rows(int64_t nrows)
255  {
256  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
257  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
258  _num_rows = nrows;
259  }
260 
/// Enable/Disable use of row index to speed up reading.
266  void enable_use_index(bool use) { _use_index = use; }
267 
/// Enable/Disable use of numpy-compatible dtypes.
273  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
274 
/// Sets timestamp type to which timestamp columns will be cast.
280  void set_timestamp_type(data_type type) { _timestamp_type = type; }
281 
/// Set columns that should be read as 128-bit Decimal; the vector is moved into the option.
/// NOTE(review): no setter for `_ignore_timezone_in_stripe_footer` is visible in this
/// class; in this listing it is only written through the builder.
287  void set_decimal128_columns(std::vector<std::string> val)
288  {
289  _decimal128_columns = std::move(val);
290  }
291 };
292 
// The options object being assembled by this builder
297  orc_reader_options options;
298 
299  public:
/// Default constructor; `options` is default-constructed.
305  explicit orc_reader_options_builder() = default;
306 
/// Constructor from source info; `src` is moved into the options being built.
/// @param src The source information used to read the ORC file
312  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
313 
/// Sets names of the columns to read (written directly to `options._columns`).
/// @param col_names Vector of column names
/// @return this for chaining
320  orc_reader_options_builder& columns(std::vector<std::string> col_names)
321  {
322  options._columns = std::move(col_names);
323  return *this;
324  }
325 
/// Sets list of individual stripes to read per source.
/// Delegates to orc_reader_options::set_stripes, so its validation checks apply.
/// @param stripes Vector of vectors, one list of stripe indices per input source
/// @return this for chaining
332  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
333  {
334  options.set_stripes(std::move(stripes));
335  return *this;
336  }
337 
345  {
346  options.set_skip_rows(rows);
347  return *this;
348  }
349 
357  {
358  options.set_num_rows(nrows);
359  return *this;
360  }
361 
369  {
370  options._use_index = use;
371  return *this;
372  }
373 
381  {
382  options._use_np_dtypes = use;
383  return *this;
384  }
385 
393  {
394  options._timestamp_type = type;
395  return *this;
396  }
397 
/// Columns that should be read as 128-bit Decimal.
/// @param val Vector of column names, moved into the options
/// @return this for chaining
404  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
405  {
406  options._decimal128_columns = std::move(val);
407  return *this;
408  }
409 
417  {
418  options._ignore_timezone_in_stripe_footer = ignore;
419  return *this;
420  }
421 
/// Implicit conversion to an rvalue reference to the builder's `options` member.
425  operator orc_reader_options&&() { return std::move(options); }
426 
/// Moves the orc_reader_options member once it's built.
/// Returns an rvalue reference to the member rather than a copy.
434  orc_reader_options&& build() { return std::move(options); }
435 };
436 
455  orc_reader_options const& options,
458 
469  public:
476 
522  std::size_t chunk_read_limit,
523  std::size_t pass_read_limit,
524  size_type output_row_granularity,
525  orc_reader_options const& options,
528 
544  std::size_t chunk_read_limit,
545  std::size_t pass_read_limit,
546  orc_reader_options const& options,
549 
563  std::size_t chunk_read_limit,
564  orc_reader_options const& options,
567 
572 
/// Check if there is any data in the given data sources that has not yet been read.
578  [[nodiscard]] bool has_next() const;
579 
/// Read a chunk of rows in the given data sources.
/// @return The read rows together with their metadata
591  [[nodiscard]] table_with_metadata read_chunk() const;
592 
593  private:
// Owning pointer to the detail-level chunked reader
594  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
595 };
596  // end of group
608 
/// ORC name for statistics_freq::STATISTICS_ROWGROUP.
618 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
/// ORC name for statistics_freq::STATISTICS_PAGE; used as the default `_stats_freq`
/// of the writer options below.
619 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
620 
625  // Sink to use for writer output
626  sink_info _sink;
627  // Compression format to use (defaults to SNAPPY)
628  compression_type _compression = compression_type::SNAPPY;
629  // Frequency of statistics collection (defaults to ORC_STATISTICS_ROW_GROUP)
630  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
631  // Maximum size of each stripe (unless smaller than a single row group)
632  size_t _stripe_size_bytes = default_stripe_size_bytes;
633  // Maximum number of rows in stripe (unless smaller than a single row group)
634  size_type _stripe_size_rows = default_stripe_size_rows;
635  // Row index stride (maximum number of rows in each row group)
636  size_type _row_index_stride = default_row_index_stride;
637  // Set of columns to output
638  table_view _table;
639  // Optional associated metadata
640  std::optional<table_input_metadata> _metadata;
641  // Optional footer key_value_metadata
// NOTE(review): std::map is used here but <map> is not among the include lines visible
// in this listing - presumably it arrives transitively; confirm.
642  std::map<std::string, std::string> _user_data;
643  // Optional compression statistics
644  std::shared_ptr<writer_compression_statistics> _compression_stats;
645  // Whether string dictionaries should be alphabetically sorted
646  bool _enable_dictionary_sort = true;
647 
649 
657  : _sink(std::move(sink)), _table(std::move(table))
658  {
659  }
660 
661  public:
/// Default constructor: every member takes its in-class default initializer.
667  explicit orc_writer_options() = default;
668 
678 
/// Returns sink info.
684  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
685 
/// Returns compression type.
691  [[nodiscard]] compression_type get_compression() const { return _compression; }
692 
/// Whether writing column statistics is enabled, i.e. the statistics frequency is
/// anything other than statistics_freq::STATISTICS_NONE.
698  [[nodiscard]] bool is_enabled_statistics() const
699  {
700  return _stats_freq != statistics_freq::STATISTICS_NONE;
701  }
702 
/// Returns frequency of statistics collection.
708  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
709 
/// Returns maximum stripe size, in bytes.
715  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
716 
/// Returns maximum stripe size, in rows.
722  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
723 
/// Returns the row index stride.
/// The stored stride is first capped at the maximum stripe size in rows, then rounded
/// down to a multiple of 8, so the result can be smaller than the value that was set.
/// NOTE(review): std::min needs <algorithm>, which is not among the include lines
/// visible in this listing - presumably it arrives transitively; confirm.
729  [[nodiscard]] auto get_row_index_stride() const
730  {
731  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
732  return unaligned_stride - unaligned_stride % 8;
733  }
734 
/// Returns table to be written to output (a copy of the stored view).
740  [[nodiscard]] table_view get_table() const { return _table; }
741 
/// Returns associated metadata (a reference to the stored optional).
747  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
748 
/// Returns Key-Value footer metadata information.
754  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
755  {
756  return _user_data;
757  }
758 
/// Returns a shared pointer to the user-provided compression statistics.
764  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
765  {
766  return _compression_stats;
767  }
768 
/// Returns whether string dictionaries should be sorted.
774  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
775 
776  // Setters
777 
784  {
785  _compression = comp;
786  if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
787  }
788 
/// Choose granularity of statistics collection.
/// Despite the `enable_` name this takes a frequency value, not a flag;
/// statistics_freq::STATISTICS_NONE makes is_enabled_statistics() return false.
/// @param val Frequency of statistics collection
799  void enable_statistics(statistics_freq val) { _stats_freq = val; }
800 
/// Sets the maximum stripe size, in bytes.
/// CUDF_EXPECTS throws when `size_bytes` is below `64 << 10` (65536 bytes, i.e. 64KB).
/// @param size_bytes Maximum stripe size, in bytes
808  void set_stripe_size_bytes(size_t size_bytes)
809  {
810  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
811  _stripe_size_bytes = size_bytes;
812  }
813 
825  {
826  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
827  _stripe_size_rows = size_rows;
828  }
829 
840  {
841  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
842  _row_index_stride = stride;
843  }
844 
/// Sets table to be written to output.
850  void set_table(table_view tbl) { _table = tbl; }
851 
/// Sets associated metadata; `meta` is moved into the stored optional.
857  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
858 
/// Sets Key-Value footer metadata; the map is moved into the option.
864  void set_key_value_metadata(std::map<std::string, std::string> metadata)
865  {
866  _user_data = std::move(metadata);
867  }
868 
/// Sets the pointer to the output compression statistics.
874  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
875  {
876  _compression_stats = std::move(comp_stats);
877  }
878 
/// Sets whether string dictionaries should be sorted.
884  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
885 };
886 
// The options object being assembled by this builder
891  orc_writer_options options;
892 
893  public:
900 
/// Constructor from sink and table; both are forwarded to the options constructor.
/// @param sink The sink used for writer output
/// @param table Table to be written to output
907  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
908  {
909  }
910 
918  {
919  options.set_compression(comp);
920  return *this;
921  }
922 
935  {
936  options._stats_freq = val;
937  return *this;
938  }
939 
947  {
948  options.set_stripe_size_bytes(val);
949  return *this;
950  }
951 
959  {
960  options.set_stripe_size_rows(val);
961  return *this;
962  }
963 
971  {
972  options.set_row_index_stride(val);
973  return *this;
974  }
975 
983  {
984  options._table = tbl;
985  return *this;
986  }
987 
995  {
996  options._metadata = std::move(meta);
997  return *this;
998  }
999 
/// Sets Key-Value footer metadata.
/// @param metadata Key-Value footer metadata, moved into the options
/// @return this for chaining
1006  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
1007  {
1008  options._user_data = std::move(metadata);
1009  return *this;
1010  }
1011 
1019  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1020  {
1021  options._compression_stats = comp_stats;
1022  return *this;
1023  }
1024 
1032  {
1033  options._enable_dictionary_sort = val;
1034  return *this;
1035  }
1036 
/// Implicit conversion to an rvalue reference to the builder's `options` member.
1040  operator orc_writer_options&&() { return std::move(options); }
1041 
/// Moves the orc_writer_options member once it's built.
/// Returns an rvalue reference to the member rather than a copy.
1049  orc_writer_options&& build() { return std::move(options); }
1050 };
1051 
1065 void write_orc(orc_writer_options const& options,
1067 
1072 
1077  // Sink to use for writer output
1078  sink_info _sink;
1079  // Compression format to use (defaults to SNAPPY)
1080  compression_type _compression = compression_type::SNAPPY;
1081  // Granularity of statistics collection (defaults to ORC_STATISTICS_ROW_GROUP)
1082  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
1083  // Maximum size of each stripe (unless smaller than a single row group)
1084  size_t _stripe_size_bytes = default_stripe_size_bytes;
1085  // Maximum number of rows in stripe (unless smaller than a single row group)
1086  size_type _stripe_size_rows = default_stripe_size_rows;
1087  // Row index stride (maximum number of rows in each row group)
1088  size_type _row_index_stride = default_row_index_stride;
1089  // Optional associated metadata
1090  std::optional<table_input_metadata> _metadata;
1091  // Optional footer key_value_metadata
1092  std::map<std::string, std::string> _user_data;
1093  // Optional compression statistics
1094  std::shared_ptr<writer_compression_statistics> _compression_stats;
1095  // Whether string dictionaries should be alphabetically sorted
1096  bool _enable_dictionary_sort = true;
1097 
1099 
/// Constructor from sink; the argument is moved into `_sink` and all other settings
/// keep their in-class defaults.
/// Marked `explicit` so a `sink_info` is never silently converted into an options
/// object, matching `orc_reader_options(source_info)`. The builder constructs it with
/// direct-initialization (`options{sink}`), which `explicit` still permits.
/// @param sink The sink used for writer output
1105  explicit chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {}
1106 
1107  public:
/// Default constructor: every member takes its in-class default initializer.
1113  explicit chunked_orc_writer_options() = default;
1114 
1123 
/// Returns sink info.
1129  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1130 
/// Returns compression type.
1136  [[nodiscard]] compression_type get_compression() const { return _compression; }
1137 
/// Returns granularity of statistics collection.
1143  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
1144 
/// Returns maximum stripe size, in bytes.
1150  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
1151 
/// Returns maximum stripe size, in rows.
1157  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
1158 
/// Returns the row index stride.
/// The stored stride is first capped at the maximum stripe size in rows, then rounded
/// down to a multiple of 8, so the result can be smaller than the value that was set.
1164  [[nodiscard]] auto get_row_index_stride() const
1165  {
1166  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
1167  return unaligned_stride - unaligned_stride % 8;
1168  }
1169 
/// Returns associated metadata (a reference to the stored optional).
1175  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1176 
/// Returns Key-Value footer metadata information.
1182  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
1183  {
1184  return _user_data;
1185  }
1186 
/// Returns a shared pointer to the user-provided compression statistics.
1192  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1193  {
1194  return _compression_stats;
1195  }
1196 
/// Returns whether string dictionaries should be sorted.
1202  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1203 
1204  // Setters
1205 
1212  {
1213  _compression = comp;
1214  if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
1215  }
1216 
/// Choose granularity of statistics collection.
/// Despite the `enable_` name this takes a frequency value, not a flag.
/// @param val Frequency of statistics collection
1227  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1228 
/// Sets the maximum stripe size, in bytes.
/// CUDF_EXPECTS throws when `size_bytes` is below `64 << 10` (65536 bytes, i.e. 64KB).
/// @param size_bytes Maximum stripe size, in bytes
1236  void set_stripe_size_bytes(size_t size_bytes)
1237  {
1238  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1239  _stripe_size_bytes = size_bytes;
1240  }
1241 
1253  {
1254  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1255  _stripe_size_rows = size_rows;
1256  }
1257 
1268  {
1269  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1270  _row_index_stride = stride;
1271  }
1272 
/// Sets associated metadata; `meta` is moved into the stored optional.
/// NOTE(review): this setter is named `metadata`, without the `set_` prefix used by the
/// other setters in this class and by `orc_writer_options::set_metadata`. Renaming it
/// would break callers, so the name is left as is.
1278  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1279 
/// Sets Key-Value footer metadata; the map is moved into the option.
1285  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1286  {
1287  _user_data = std::move(metadata);
1288  }
1289 
/// Sets the pointer to the output compression statistics.
1295  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1296  {
1297  _compression_stats = std::move(comp_stats);
1298  }
1299 
/// Sets whether string dictionaries should be sorted.
1305  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1306 };
1307 
1313 
1314  public:
1321 
/// Constructor from sink (this builder takes no table).
/// @param sink The sink used for writer output
1327  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1328 
1336  {
1337  options.set_compression(comp);
1338  return *this;
1339  }
1340 
1353  {
1354  options._stats_freq = val;
1355  return *this;
1356  }
1357 
1365  {
1366  options.set_stripe_size_bytes(val);
1367  return *this;
1368  }
1369 
1377  {
1378  options.set_stripe_size_rows(val);
1379  return *this;
1380  }
1381 
1389  {
1390  options.set_row_index_stride(val);
1391  return *this;
1392  }
1393 
1401  {
1402  options._metadata = std::move(meta);
1403  return *this;
1404  }
1405 
1413  std::map<std::string, std::string> metadata)
1414  {
1415  options._user_data = std::move(metadata);
1416  return *this;
1417  }
1418 
1426  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1427  {
1428  options._compression_stats = comp_stats;
1429  return *this;
1430  }
1431 
1439  {
1440  options._enable_dictionary_sort = val;
1441  return *this;
1442  }
1443 
/// Implicit conversion to an rvalue reference to the builder's `options` member.
1447  operator chunked_orc_writer_options&&() { return std::move(options); }
1448 
/// Moves the chunked_orc_writer_options member once it's built.
/// Returns an rvalue reference to the member rather than a copy.
1456  chunked_orc_writer_options&& build() { return std::move(options); }
1457 };
1458 
1481  public:
1487 
1492 
1501 
1509 
/// Finishes the chunked/streamed write process.
1513  void close();
1514 
/// Unique pointer to impl writer class.
1516  std::unique_ptr<orc::detail::writer> writer;
1517 };
1518  // end of group
1520 } // namespace io
1521 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:249
The chunked orc reader class to read an ORC file iteratively into a series of tables,...
Definition: orc.hpp:468
chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from output size limits along with other ORC reader options.
bool has_next() const
Check if there is any data in the given data sources that has not yet been read.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits along with other ORC reader options.
~chunked_orc_reader()
Destructor, destroying the internal reader instance.
table_with_metadata read_chunk() const
Read a chunk of rows in the given data sources.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits, output row granularity, along with other ORC read...
chunked_orc_reader()
Default constructor, this should never be used.
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1311
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1438
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member once it's built.
Definition: orc.hpp:1456
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1364
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1376
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1425
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1412
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1335
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1400
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1327
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1352
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1388
Settings to use for write_orc_chunked().
Definition: orc.hpp:1076
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1236
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1278
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1285
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1295
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:1129
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:1157
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:1164
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1267
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:1143
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1211
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:1182
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1305
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:1175
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1202
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:1136
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1252
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:1192
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:1150
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1227
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1480
~orc_chunked_writer()
Virtual destructor, added so we don't leak detail types.
orc_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1516
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
Builds settings to use for read_orc().
Definition: orc.hpp:296
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:368
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:404
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:380
orc_reader_options_builder & ignore_timezone_in_stripe_footer(bool ignore)
Set whether to ignore writer timezone in the stripe footer.
Definition: orc.hpp:416
orc_reader_options_builder & skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:344
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:312
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:332
orc_reader_options_builder & num_rows(int64_t nrows)
Sets number of row to read.
Definition: orc.hpp:356
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:320
orc_reader_options && build()
Moves the orc_reader_options member once it's built.
Definition: orc.hpp:434
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:392
Settings to use for read_orc().
Definition: orc.hpp:75
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:153
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:273
void set_num_rows(int64_t nrows)
Sets number of row to read.
Definition: orc.hpp:254
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:146
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:287
void set_skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:239
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:266
void set_columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:211
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:223
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:182
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:139
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:161
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:132
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:175
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:168
bool get_ignore_timezone_in_stripe_footer() const
Returns whether to ignore writer timezone in the stripe footer.
Definition: orc.hpp:199
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:189
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:280
Builds settings to use for write_orc().
Definition: orc.hpp:890
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:982
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:970
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:934
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:994
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:907
orc_writer_options && build()
Moves the orc_writer_options member once it's built.
Definition: orc.hpp:1049
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1006
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1018
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:958
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1031
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:917
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:946
Settings to use for write_orc().
Definition: orc.hpp:624
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:799
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:747
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:754
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:764
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:698
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:715
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:824
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:864
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:722
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:740
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:857
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:708
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:874
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:729
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:850
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:783
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:884
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:839
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:691
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:774
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:808
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:684
Metadata for a table.
Definition: io/types.hpp:890
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
bool is_supported_read_orc(compression_type compression)
Check if the compression type is supported for reading ORC files.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:42
bool is_supported_write_orc(compression_type compression)
Check if the compression type is supported for writing ORC files.
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:43
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:41
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:96
compression_type
Compression algorithms.
Definition: io/types.hpp:57
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:98
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:97
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:99
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::async_resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:154
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:37
Destination information for write interfaces.
Definition: io/types.hpp:468
Source information for read interfaces.
Definition: io/types.hpp:327
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:303
Class definitions for (mutable)_table_view
Type declarations for libcudf.