orc.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/orc.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/export.hpp>
25 
26 #include <memory>
27 #include <optional>
28 #include <string>
29 #include <unordered_map>
30 #include <utility>
31 #include <vector>
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io {
41 constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024;
42 constexpr size_type default_stripe_size_rows = 1000000;
44 
49 
54  source_info _source;
55 
56  // Names of column to read; `nullopt` is all
57  std::optional<std::vector<std::string>> _columns;
58 
59  // List of individual stripes to read (ignored if empty)
60  std::vector<std::vector<size_type>> _stripes;
61  // Rows to skip from the start
62  int64_t _skip_rows = 0;
63  // Rows to read; `nullopt` is all
64  std::optional<int64_t> _num_rows;
65 
66  // Whether to use row index to speed-up reading
67  bool _use_index = true;
68 
69  // Whether to use numpy-compatible dtypes
70  bool _use_np_dtypes = true;
71  // Cast timestamp columns to a specific type
72  data_type _timestamp_type{type_id::EMPTY};
73 
74  // Columns that should be read as Decimal128
75  std::vector<std::string> _decimal128_columns;
76 
78 
84  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
85 
86  public:
92  orc_reader_options() = default;
93 
101 
107  [[nodiscard]] source_info const& get_source() const { return _source; }
108 
114  [[nodiscard]] auto const& get_columns() const { return _columns; }
115 
121  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
122 
128  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
129 
136  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
137 
143  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
144 
150  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
151 
157  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
158 
164  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const
165  {
166  return _decimal128_columns;
167  }
168 
169  // Setters
170 
176  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
177 
188  void set_stripes(std::vector<std::vector<size_type>> stripes)
189  {
190  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
191  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
192  "Can't set stripes along with num_rows");
193  _stripes = std::move(stripes);
194  }
195 
204  void set_skip_rows(int64_t rows)
205  {
206  CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
207  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes");
208  _skip_rows = rows;
209  }
210 
219  void set_num_rows(int64_t nrows)
220  {
221  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
222  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
223  _num_rows = nrows;
224  }
225 
231  void enable_use_index(bool use) { _use_index = use; }
232 
238  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
239 
245  void set_timestamp_type(data_type type) { _timestamp_type = type; }
246 
252  void set_decimal128_columns(std::vector<std::string> val)
253  {
254  _decimal128_columns = std::move(val);
255  }
256 };
257 
262  orc_reader_options options;
263 
264  public:
270  explicit orc_reader_options_builder() = default;
271 
277  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
278 
285  orc_reader_options_builder& columns(std::vector<std::string> col_names)
286  {
287  options._columns = std::move(col_names);
288  return *this;
289  }
290 
297  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
298  {
299  options.set_stripes(std::move(stripes));
300  return *this;
301  }
302 
310  {
311  options.set_skip_rows(rows);
312  return *this;
313  }
314 
322  {
323  options.set_num_rows(nrows);
324  return *this;
325  }
326 
334  {
335  options._use_index = use;
336  return *this;
337  }
338 
346  {
347  options._use_np_dtypes = use;
348  return *this;
349  }
350 
358  {
359  options._timestamp_type = type;
360  return *this;
361  }
362 
369  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
370  {
371  options._decimal128_columns = std::move(val);
372  return *this;
373  }
374 
378  operator orc_reader_options&&() { return std::move(options); }
379 
387  orc_reader_options&& build() { return std::move(options); }
388 };
389 
408  orc_reader_options const& options,
411 
422  public:
429 
475  std::size_t chunk_read_limit,
476  std::size_t pass_read_limit,
477  size_type output_row_granularity,
478  orc_reader_options const& options,
481 
497  std::size_t chunk_read_limit,
498  std::size_t pass_read_limit,
499  orc_reader_options const& options,
502 
516  std::size_t chunk_read_limit,
517  orc_reader_options const& options,
520 
525 
531  [[nodiscard]] bool has_next() const;
532 
544  [[nodiscard]] table_with_metadata read_chunk() const;
545 
546  private:
547  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
548 };
549  // end of group
561 
571 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
572 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
573 
578  // Specify the sink to use for writer output
579  sink_info _sink;
580  // Specify the compression format to use
581  compression_type _compression = compression_type::AUTO;
582  // Specify frequency of statistics collection
583  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
584  // Maximum size of each stripe (unless smaller than a single row group)
585  size_t _stripe_size_bytes = default_stripe_size_bytes;
586  // Maximum number of rows in stripe (unless smaller than a single row group)
587  size_type _stripe_size_rows = default_stripe_size_rows;
588  // Row index stride (maximum number of rows in each row group)
589  size_type _row_index_stride = default_row_index_stride;
590  // Set of columns to output
591  table_view _table;
592  // Optional associated metadata
593  std::optional<table_input_metadata> _metadata;
594  // Optional footer key_value_metadata
595  std::map<std::string, std::string> _user_data;
596  // Optional compression statistics
597  std::shared_ptr<writer_compression_statistics> _compression_stats;
598  // Specify whether string dictionaries should be alphabetically sorted
599  bool _enable_dictionary_sort = true;
600 
602 
610  : _sink(std::move(sink)), _table(std::move(table))
611  {
612  }
613 
614  public:
620  explicit orc_writer_options() = default;
621 
631 
637  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
638 
644  [[nodiscard]] compression_type get_compression() const { return _compression; }
645 
651  [[nodiscard]] bool is_enabled_statistics() const
652  {
653  return _stats_freq != statistics_freq::STATISTICS_NONE;
654  }
655 
661  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
662 
668  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
669 
675  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
676 
682  [[nodiscard]] auto get_row_index_stride() const
683  {
684  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
685  return unaligned_stride - unaligned_stride % 8;
686  }
687 
693  [[nodiscard]] table_view get_table() const { return _table; }
694 
700  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
701 
707  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
708  {
709  return _user_data;
710  }
711 
717  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
718  {
719  return _compression_stats;
720  }
721 
727  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
728 
729  // Setters
730 
736  void set_compression(compression_type comp) { _compression = comp; }
737 
748  void enable_statistics(statistics_freq val) { _stats_freq = val; }
749 
757  void set_stripe_size_bytes(size_t size_bytes)
758  {
759  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
760  _stripe_size_bytes = size_bytes;
761  }
762 
774  {
775  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
776  _stripe_size_rows = size_rows;
777  }
778 
789  {
790  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
791  _row_index_stride = stride;
792  }
793 
799  void set_table(table_view tbl) { _table = tbl; }
800 
806  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
807 
813  void set_key_value_metadata(std::map<std::string, std::string> metadata)
814  {
815  _user_data = std::move(metadata);
816  }
817 
823  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
824  {
825  _compression_stats = std::move(comp_stats);
826  }
827 
833  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
834 };
835 
840  orc_writer_options options;
841 
842  public:
849 
856  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
857  {
858  }
859 
867  {
868  options._compression = comp;
869  return *this;
870  }
871 
884  {
885  options._stats_freq = val;
886  return *this;
887  }
888 
896  {
897  options.set_stripe_size_bytes(val);
898  return *this;
899  }
900 
908  {
909  options.set_stripe_size_rows(val);
910  return *this;
911  }
912 
920  {
921  options.set_row_index_stride(val);
922  return *this;
923  }
924 
932  {
933  options._table = tbl;
934  return *this;
935  }
936 
944  {
945  options._metadata = std::move(meta);
946  return *this;
947  }
948 
955  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
956  {
957  options._user_data = std::move(metadata);
958  return *this;
959  }
960 
968  std::shared_ptr<writer_compression_statistics> const& comp_stats)
969  {
970  options._compression_stats = comp_stats;
971  return *this;
972  }
973 
981  {
982  options._enable_dictionary_sort = val;
983  return *this;
984  }
985 
989  operator orc_writer_options&&() { return std::move(options); }
990 
998  orc_writer_options&& build() { return std::move(options); }
999 };
1000 
1014 void write_orc(orc_writer_options const& options,
1016 
1021 
1026  // Specify the sink to use for writer output
1027  sink_info _sink;
1028  // Specify the compression format to use
1029  compression_type _compression = compression_type::AUTO;
1030  // Specify granularity of statistics collection
1031  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
1032  // Maximum size of each stripe (unless smaller than a single row group)
1033  size_t _stripe_size_bytes = default_stripe_size_bytes;
1034  // Maximum number of rows in stripe (unless smaller than a single row group)
1035  size_type _stripe_size_rows = default_stripe_size_rows;
1036  // Row index stride (maximum number of rows in each row group)
1037  size_type _row_index_stride = default_row_index_stride;
1038  // Optional associated metadata
1039  std::optional<table_input_metadata> _metadata;
1040  // Optional footer key_value_metadata
1041  std::map<std::string, std::string> _user_data;
1042  // Optional compression statistics
1043  std::shared_ptr<writer_compression_statistics> _compression_stats;
1044  // Specify whether string dictionaries should be alphabetically sorted
1045  bool _enable_dictionary_sort = true;
1046 
1048 
1054  chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {}
1055 
1056  public:
1062  explicit chunked_orc_writer_options() = default;
1063 
1072 
1078  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1079 
1085  [[nodiscard]] compression_type get_compression() const { return _compression; }
1086 
1092  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
1093 
1099  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
1100 
1106  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
1107 
1113  [[nodiscard]] auto get_row_index_stride() const
1114  {
1115  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
1116  return unaligned_stride - unaligned_stride % 8;
1117  }
1118 
1124  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1125 
1131  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
1132  {
1133  return _user_data;
1134  }
1135 
1141  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1142  {
1143  return _compression_stats;
1144  }
1145 
1151  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1152 
1153  // Setters
1154 
1160  void set_compression(compression_type comp) { _compression = comp; }
1161 
1172  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1173 
1181  void set_stripe_size_bytes(size_t size_bytes)
1182  {
1183  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1184  _stripe_size_bytes = size_bytes;
1185  }
1186 
1198  {
1199  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1200  _stripe_size_rows = size_rows;
1201  }
1202 
1213  {
1214  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1215  _row_index_stride = stride;
1216  }
1217 
1223  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1224 
1230  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1231  {
1232  _user_data = std::move(metadata);
1233  }
1234 
1240  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1241  {
1242  _compression_stats = std::move(comp_stats);
1243  }
1244 
1250  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1251 };
1252 
1258 
1259  public:
1266 
1272  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1273 
1281  {
1282  options._compression = comp;
1283  return *this;
1284  }
1285 
1298  {
1299  options._stats_freq = val;
1300  return *this;
1301  }
1302 
1310  {
1311  options.set_stripe_size_bytes(val);
1312  return *this;
1313  }
1314 
1322  {
1323  options.set_stripe_size_rows(val);
1324  return *this;
1325  }
1326 
1334  {
1335  options.set_row_index_stride(val);
1336  return *this;
1337  }
1338 
1346  {
1347  options._metadata = std::move(meta);
1348  return *this;
1349  }
1350 
1358  std::map<std::string, std::string> metadata)
1359  {
1360  options._user_data = std::move(metadata);
1361  return *this;
1362  }
1363 
1371  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1372  {
1373  options._compression_stats = comp_stats;
1374  return *this;
1375  }
1376 
1384  {
1385  options._enable_dictionary_sort = val;
1386  return *this;
1387  }
1388 
1392  operator chunked_orc_writer_options&&() { return std::move(options); }
1393 
1401  chunked_orc_writer_options&& build() { return std::move(options); }
1402 };
1403 
1426  public:
1432 
1437 
1446 
1454 
1458  void close();
1459 
1461  std::unique_ptr<orc::detail::writer> writer;
1462 };
1463  // end of group
1465 } // namespace io
1466 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
The chunked orc reader class to read an ORC file iteratively into a series of tables,...
Definition: orc.hpp:421
chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from output size limits along with other ORC reader options.
bool has_next() const
Check if there is any data in the given data sources that has not yet been read.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits along with other ORC reader options.
~chunked_orc_reader()
Destructor, destroying the internal reader instance.
table_with_metadata read_chunk() const
Read a chunk of rows in the given data sources.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits, output row granularity, along with other ORC reader options.
chunked_orc_reader()
Default constructor, this should never be used.
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1256
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1383
chunked_orc_writer_options && build()
Move the chunked_orc_writer_options member once it's built.
Definition: orc.hpp:1401
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1309
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1321
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1370
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1357
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1280
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1345
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1272
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1297
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1333
Settings to use for write_orc_chunked().
Definition: orc.hpp:1025
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1181
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1223
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1230
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1240
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:1078
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:1106
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:1113
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1212
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:1092
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1160
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:1131
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1250
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:1124
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1151
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:1085
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1197
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:1141
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:1099
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1172
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1425
~orc_chunked_writer()
Virtual destructor, added so we don't leak detail types.
orc_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1461
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
Builds settings to use for read_orc().
Definition: orc.hpp:261
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:333
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:369
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:345
orc_reader_options_builder & skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:309
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:277
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:297
orc_reader_options_builder & num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:321
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:285
orc_reader_options && build()
Move the orc_reader_options member once it's built.
Definition: orc.hpp:387
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:357
Settings to use for read_orc().
Definition: orc.hpp:53
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:128
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:238
void set_num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:219
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:121
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:252
void set_skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:204
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:231
void set_columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:176
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:188
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:157
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:114
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:136
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:107
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:150
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:143
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:164
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:245
Builds settings to use for write_orc().
Definition: orc.hpp:839
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:931
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:919
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:883
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:943
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:856
orc_writer_options && build()
Move the orc_writer_options member once it's built.
Definition: orc.hpp:998
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:955
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:967
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:907
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:980
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:866
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:895
Settings to use for write_orc().
Definition: orc.hpp:577
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:748
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:700
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:707
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:717
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:651
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:668
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:773
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:813
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:675
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:693
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:806
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:661
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:823
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:682
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:799
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:736
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:833
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:788
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:644
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:727
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:757
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:637
Metadata for a table.
Definition: io/types.hpp:932
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:42
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:43
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:41
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:96
compression_type
Compression algorithms.
Definition: io/types.hpp:57
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:98
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:97
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:99
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
Destination information for write interfaces.
Definition: io/types.hpp:512
Source information for read interfaces.
Definition: io/types.hpp:337
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.