orc.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/orc.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/export.hpp>
24 
26 #include <rmm/resource_ref.hpp>
27 
28 #include <memory>
29 #include <optional>
30 #include <string>
31 #include <unordered_map>
32 #include <utility>
33 #include <vector>
34 
35 namespace CUDF_EXPORT cudf {
36 namespace io {
/// Default maximum ORC stripe size in bytes: 64 * 1024 * 1024 (64MB).
43 constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024;
/// Default maximum number of rows in an ORC stripe: 1M rows.
44 constexpr size_type default_stripe_size_rows = 1000000;
46 
51 
56  source_info _source;
57 
58  // Names of columns to read; `nullopt` is all
59  std::optional<std::vector<std::string>> _columns;
60 
61  // List of individual stripes to read (ignored if empty)
62  std::vector<std::vector<size_type>> _stripes;
63  // Rows to skip from the start
64  int64_t _skip_rows = 0;
65  // Rows to read; `nullopt` is all
66  std::optional<int64_t> _num_rows;
67 
68  // Whether to use row index to speed-up reading
69  bool _use_index = true;
70 
71  // Whether to use numpy-compatible dtypes
72  bool _use_np_dtypes = true;
73  // Cast timestamp columns to a specific type
74  data_type _timestamp_type{type_id::EMPTY};
75 
76  // Columns that should be read as Decimal128
77  std::vector<std::string> _decimal128_columns;
78 
80 
/// @brief Constructor from source info.
///
/// Declared ahead of the `public:` section; stores `src` (moved) as `_source`.
///
/// @param src source information used to read the ORC file
86  explicit orc_reader_options(source_info src) : _source{std::move(src)} {}
87 
88  public:
/// @brief Default constructor; every member keeps its in-class default initializer.
94  orc_reader_options() = default;
95 
103 
/// @brief Returns source info.
/// @return const reference to the stored `source_info`
109  [[nodiscard]] source_info const& get_source() const { return _source; }
110 
/// @brief Returns names of the columns to read, if set.
/// @return const reference to the optional column-name list; `nullopt` is all columns
116  [[nodiscard]] auto const& get_columns() const { return _columns; }
117 
/// @brief Returns vector of vectors, stripes to read for each input source.
/// @return const reference to the stripe lists (ignored by the reader if empty)
123  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
124 
/// @brief Returns number of rows to skip from the start.
130  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
131 
/// @brief Returns number of rows to read.
/// @return const reference to the optional row count; `nullopt` is all rows
138  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
139 
/// @brief Whether to use row index to speed-up reading.
145  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
146 
/// @brief Whether to use numpy-compatible dtypes.
152  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
153 
/// @brief Returns timestamp type to which timestamp column will be cast.
159  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
160 
/// @brief Returns fully qualified names of columns that should be read as 128-bit Decimal.
/// @return const reference to the stored list of column names
166  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const
167  {
168  return _decimal128_columns;
169  }
170 
171  // Setters
172 
/// @brief Sets names of the columns to read.
/// @param col_names vector of column names; moved into the options object
178  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
179 
/// @brief Sets list of stripes to read for each input source.
///
/// An empty list is always accepted. A non-empty list fails the CUDF_EXPECTS precondition
/// checks (which throw) when `skip_rows` is non-zero or when `num_rows` has been set.
///
/// @param stripes vector of vectors, stripes to read for each input source
190  void set_stripes(std::vector<std::vector<size_type>> stripes)
191  {
192  CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0), "Can't set stripes along with skip_rows");
193  CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
194  "Can't set stripes along with num_rows");
195  _stripes = std::move(stripes);
196  }
197 
/// @brief Sets number of rows to skip from the start.
///
/// The CUDF_EXPECTS precondition checks throw when `rows` is negative, or when `rows` is
/// non-zero while a non-empty stripe list is set. Zero is always accepted.
///
/// @param rows number of rows to skip
206  void set_skip_rows(int64_t rows)
207  {
208  CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
209  CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set skip_rows along with stripes");
210  _skip_rows = rows;
211  }
212 
/// @brief Sets number of rows to read.
///
/// The CUDF_EXPECTS precondition checks throw when `nrows` is negative or when a non-empty
/// stripe list is set.
///
/// @param nrows number of rows to read
221  void set_num_rows(int64_t nrows)
222  {
223  CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
224  CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
225  _num_rows = nrows;
226  }
227 
/// @brief Enable/Disable use of row index to speed-up reading.
/// @param use boolean value to enable/disable row index use
233  void enable_use_index(bool use) { _use_index = use; }
234 
/// @brief Enable/Disable use of numpy-compatible dtypes.
/// @param use boolean value to enable/disable numpy-compatible dtypes
240  void enable_use_np_dtypes(bool use) { _use_np_dtypes = use; }
241 
/// @brief Sets timestamp type to which timestamp column will be cast.
/// @param type type of timestamp
247  void set_timestamp_type(data_type type) { _timestamp_type = type; }
248 
/// @brief Set columns that should be read as 128-bit Decimal.
/// @param val vector of column names; moved into the options object
254  void set_decimal128_columns(std::vector<std::string> val)
255  {
256  _decimal128_columns = std::move(val);
257  }
258 };
259 
264  orc_reader_options options;
265 
266  public:
/// @brief Default constructor.
272  explicit orc_reader_options_builder() = default;
273 
/// @brief Constructor from source info.
/// @param src the source information used to read the ORC file; moved into `options`
279  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
280 
/// @brief Sets names of the columns to read.
/// @param col_names vector of column names; moved into the options being built
/// @return this for chaining
287  orc_reader_options_builder& columns(std::vector<std::string> col_names)
288  {
289  options._columns = std::move(col_names);
290  return *this;
291  }
292 
/// @brief Sets list of individual stripes to read per source.
///
/// Delegates to `orc_reader_options::set_stripes`, so that setter's precondition checks apply.
///
/// @param stripes vector of vectors, stripes to read for each input source
/// @return this for chaining
299  orc_reader_options_builder& stripes(std::vector<std::vector<size_type>> stripes)
300  {
301  options.set_stripes(std::move(stripes));
302  return *this;
303  }
304 
312  {
313  options.set_skip_rows(rows);
314  return *this;
315  }
316 
324  {
325  options.set_num_rows(nrows);
326  return *this;
327  }
328 
336  {
337  options._use_index = use;
338  return *this;
339  }
340 
348  {
349  options._use_np_dtypes = use;
350  return *this;
351  }
352 
360  {
361  options._timestamp_type = type;
362  return *this;
363  }
364 
/// @brief Columns that should be read as 128-bit Decimal.
/// @param val vector of column names; moved into the options being built
/// @return this for chaining
371  orc_reader_options_builder& decimal128_columns(std::vector<std::string> val)
372  {
373  options._decimal128_columns = std::move(val);
374  return *this;
375  }
376 
/// @brief Implicit conversion to an rvalue `orc_reader_options`; the built options are moved
/// out of this builder.
380  operator orc_reader_options&&() { return std::move(options); }
381 
/// @brief Move orc_reader_options member once it's built.
/// @return rvalue reference to the built `orc_reader_options`; the builder's copy is moved from
389  orc_reader_options&& build() { return std::move(options); }
390 };
391 
410  orc_reader_options const& options,
413 
424  public:
431 
477  std::size_t chunk_read_limit,
478  std::size_t pass_read_limit,
479  size_type output_row_granularity,
480  orc_reader_options const& options,
483 
499  std::size_t chunk_read_limit,
500  std::size_t pass_read_limit,
501  orc_reader_options const& options,
504 
518  std::size_t chunk_read_limit,
519  orc_reader_options const& options,
522 
527 
/// @brief Check if there is any data in the given data sources that has not yet been read.
/// @return a boolean; defined out of line, so the exact condition is not visible here
533  [[nodiscard]] bool has_next() const;
534 
/// @brief Read a chunk of rows in the given data sources.
/// @return the chunk as a `table_with_metadata` (table plus its metadata, returned by value)
546  [[nodiscard]] table_with_metadata read_chunk() const;
547 
548  private:
/// Owning pointer to the detail-namespace chunked reader implementation.
549  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
550 };
551  // end of group
563 
/// ORC-named aliases for the generic `statistics_freq` values shared by the parquet/orc writers:
/// the stripe level maps to `STATISTICS_ROWGROUP` and the row-group level to `STATISTICS_PAGE`.
573 static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
574 static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
575 
580  // Specify the sink to use for writer output
581  sink_info _sink;
582  // Specify the compression format to use
583  compression_type _compression = compression_type::AUTO;
584  // Specify frequency of statistics collection
585  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
586  // Maximum size of each stripe (unless smaller than a single row group)
587  size_t _stripe_size_bytes = default_stripe_size_bytes;
588  // Maximum number of rows in stripe (unless smaller than a single row group)
589  size_type _stripe_size_rows = default_stripe_size_rows;
590  // Row index stride (maximum number of rows in each row group)
591  size_type _row_index_stride = default_row_index_stride;
592  // Set of columns to output
593  table_view _table;
594  // Optional associated metadata
595  std::optional<table_input_metadata> _metadata;
596  // Optional footer key_value_metadata
597  std::map<std::string, std::string> _user_data;
598  // Optional compression statistics
599  std::shared_ptr<writer_compression_statistics> _compression_stats;
600  // Specify whether string dictionaries should be alphabetically sorted
601  bool _enable_dictionary_sort = true;
602 
604 
612  : _sink(std::move(sink)), _table(std::move(table))
613  {
614  }
615 
616  public:
/// @brief Default constructor; every member keeps its in-class default initializer.
622  explicit orc_writer_options() = default;
623 
633 
/// @brief Returns sink info.
/// @return const reference to the stored `sink_info`
639  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
640 
/// @brief Returns compression type.
646  [[nodiscard]] compression_type get_compression() const { return _compression; }
647 
/// @brief Whether writing column statistics is enabled/disabled.
/// @return `true` unless the statistics frequency is `statistics_freq::STATISTICS_NONE`
653  [[nodiscard]] bool is_enabled_statistics() const
654  {
655  return _stats_freq != statistics_freq::STATISTICS_NONE;
656  }
657 
/// @brief Returns frequency of statistics collection.
663  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
664 
/// @brief Returns maximum stripe size, in bytes.
670  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
671 
/// @brief Returns maximum stripe size, in rows.
677  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
678 
/// @brief Returns the row index stride.
///
/// The stored stride is first capped at the maximum stripe size in rows, then rounded down to
/// a multiple of 8.
///
/// @return the effective (capped and 8-aligned) row index stride
684  [[nodiscard]] auto get_row_index_stride() const
685  {
686  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
687  return unaligned_stride - unaligned_stride % 8;
688  }
689 
/// @brief Returns table to be written to output.
/// @return the stored `table_view`, by value
695  [[nodiscard]] table_view get_table() const { return _table; }
696 
/// @brief Returns associated metadata.
/// @return const reference to the optional `table_input_metadata`
702  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
703 
/// @brief Returns Key-Value footer metadata information.
/// @return const reference to the stored key-value map
709  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
710  {
711  return _user_data;
712  }
713 
/// @brief Returns a shared pointer to the user-provided compression statistics.
/// @return copy of the stored shared pointer
719  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
720  {
721  return _compression_stats;
722  }
723 
/// @brief Returns whether string dictionaries should be sorted.
729  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
730 
731  // Setters
732 
/// @brief Sets compression type.
/// @param comp the compression type to use
738  void set_compression(compression_type comp) { _compression = comp; }
739 
/// @brief Choose granularity of statistics collection.
/// @param val frequency of statistics collection (a `statistics_freq` value)
750  void enable_statistics(statistics_freq val) { _stats_freq = val; }
751 
/// @brief Sets the maximum stripe size, in bytes.
///
/// The CUDF_EXPECTS precondition check throws when `size_bytes` is below 64 << 10 (64KB).
///
/// @param size_bytes maximum stripe size, in bytes
759  void set_stripe_size_bytes(size_t size_bytes)
760  {
761  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
762  _stripe_size_bytes = size_bytes;
763  }
764 
776  {
777  CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512");
778  _stripe_size_rows = size_rows;
779  }
780 
791  {
792  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
793  _row_index_stride = stride;
794  }
795 
/// @brief Sets table to be written to output.
/// @param tbl view of the table to write
801  void set_table(table_view tbl) { _table = tbl; }
802 
/// @brief Sets associated metadata.
/// @param meta associated metadata; moved into the options object
808  void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); }
809 
/// @brief Sets Key-Value footer metadata.
/// @param metadata key-value pairs; moved into the options object
815  void set_key_value_metadata(std::map<std::string, std::string> metadata)
816  {
817  _user_data = std::move(metadata);
818  }
819 
/// @brief Sets the pointer to the output compression statistics.
/// @param comp_stats shared pointer to the statistics object; moved into the options object
825  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
826  {
827  _compression_stats = std::move(comp_stats);
828  }
829 
/// @brief Sets whether string dictionaries should be sorted.
/// @param val boolean value to enable/disable dictionary sorting
835  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
836 };
837 
842  orc_writer_options options;
843 
844  public:
851 
/// @brief Constructor from sink and table.
/// @param sink the sink used for writer output
/// @param table table to be written to output
858  orc_writer_options_builder(sink_info const& sink, table_view const& table) : options{sink, table}
859  {
860  }
861 
869  {
870  options._compression = comp;
871  return *this;
872  }
873 
886  {
887  options._stats_freq = val;
888  return *this;
889  }
890 
898  {
899  options.set_stripe_size_bytes(val);
900  return *this;
901  }
902 
910  {
911  options.set_stripe_size_rows(val);
912  return *this;
913  }
914 
922  {
923  options.set_row_index_stride(val);
924  return *this;
925  }
926 
934  {
935  options._table = tbl;
936  return *this;
937  }
938 
946  {
947  options._metadata = std::move(meta);
948  return *this;
949  }
950 
/// @brief Sets Key-Value footer metadata.
/// @param metadata key-value pairs; moved into the options being built
/// @return this for chaining
957  orc_writer_options_builder& key_value_metadata(std::map<std::string, std::string> metadata)
958  {
959  options._user_data = std::move(metadata);
960  return *this;
961  }
962 
970  std::shared_ptr<writer_compression_statistics> const& comp_stats)
971  {
972  options._compression_stats = comp_stats;
973  return *this;
974  }
975 
983  {
984  options._enable_dictionary_sort = val;
985  return *this;
986  }
987 
/// @brief Implicit conversion to an rvalue `orc_writer_options`; the built options are moved
/// out of this builder.
991  operator orc_writer_options&&() { return std::move(options); }
992 
/// @brief Move orc_writer_options member once it's built.
/// @return rvalue reference to the built `orc_writer_options`; the builder's copy is moved from
1000  orc_writer_options&& build() { return std::move(options); }
1001 };
1002 
1016 void write_orc(orc_writer_options const& options,
1018 
1023 
1028  // Specify the sink to use for writer output
1029  sink_info _sink;
1030  // Specify the compression format to use
1031  compression_type _compression = compression_type::AUTO;
1032  // Specify granularity of statistics collection
1033  statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
1034  // Maximum size of each stripe (unless smaller than a single row group)
1035  size_t _stripe_size_bytes = default_stripe_size_bytes;
1036  // Maximum number of rows in stripe (unless smaller than a single row group)
1037  size_type _stripe_size_rows = default_stripe_size_rows;
1038  // Row index stride (maximum number of rows in each row group)
1039  size_type _row_index_stride = default_row_index_stride;
1040  // Optional associated metadata
1041  std::optional<table_input_metadata> _metadata;
1042  // Optional footer key_value_metadata
1043  std::map<std::string, std::string> _user_data;
1044  // Optional compression statistics
1045  std::shared_ptr<writer_compression_statistics> _compression_stats;
1046  // Specify whether string dictionaries should be alphabetically sorted
1047  bool _enable_dictionary_sort = true;
1048 
1050 
/// @brief Constructor from sink.
///
/// Declared ahead of the `public:` section. Marked `explicit` so a `sink_info` is never
/// implicitly converted into writer options; direct initialization such as `options{sink}`
/// still works.
///
/// @param sink the sink used for writer output; moved into `_sink`
1056  explicit chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {}
1057 
1058  public:
/// @brief Default constructor; every member keeps its in-class default initializer.
1064  explicit chunked_orc_writer_options() = default;
1065 
1074 
/// @brief Returns sink info.
/// @return const reference to the stored `sink_info`
1080  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1081 
/// @brief Returns compression type.
1087  [[nodiscard]] compression_type get_compression() const { return _compression; }
1088 
/// @brief Returns granularity of statistics collection.
1094  [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
1095 
/// @brief Returns maximum stripe size, in bytes.
1101  [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
1102 
/// @brief Returns maximum stripe size, in rows.
1108  [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; }
1109 
/// @brief Returns the row index stride.
///
/// The stored stride is first capped at the maximum stripe size in rows, then rounded down to
/// a multiple of 8.
///
/// @return the effective (capped and 8-aligned) row index stride
1115  [[nodiscard]] auto get_row_index_stride() const
1116  {
1117  auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
1118  return unaligned_stride - unaligned_stride % 8;
1119  }
1120 
/// @brief Returns associated metadata.
/// @return const reference to the optional `table_input_metadata`
1126  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
1127 
/// @brief Returns Key-Value footer metadata information.
/// @return const reference to the stored key-value map
1133  [[nodiscard]] std::map<std::string, std::string> const& get_key_value_metadata() const
1134  {
1135  return _user_data;
1136  }
1137 
/// @brief Returns a shared pointer to the user-provided compression statistics.
/// @return copy of the stored shared pointer
1143  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
1144  {
1145  return _compression_stats;
1146  }
1147 
/// @brief Returns whether string dictionaries should be sorted.
1153  [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; }
1154 
1155  // Setters
1156 
/// @brief Sets compression type.
/// @param comp the compression type to use
1162  void set_compression(compression_type comp) { _compression = comp; }
1163 
/// @brief Choose granularity of statistics collection.
/// @param val frequency of statistics collection (a `statistics_freq` value)
1174  void enable_statistics(statistics_freq val) { _stats_freq = val; }
1175 
/// @brief Sets the maximum stripe size, in bytes.
///
/// The CUDF_EXPECTS precondition check throws when `size_bytes` is below 64 << 10 (64KB).
///
/// @param size_bytes maximum stripe size, in bytes
1183  void set_stripe_size_bytes(size_t size_bytes)
1184  {
1185  CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size");
1186  _stripe_size_bytes = size_bytes;
1187  }
1188 
1200  {
1201  CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512");
1202  _stripe_size_rows = size_rows;
1203  }
1204 
1215  {
1216  CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512");
1217  _row_index_stride = stride;
1218  }
1219 
/// @brief Sets associated metadata.
///
/// NOTE(review): this setter is named `metadata`, whereas the equivalent setter on
/// `orc_writer_options` is `set_metadata` and every other setter here uses a `set_`/`enable_`
/// prefix; confirm the naming is intentional before relying on it.
///
/// @param meta associated metadata; moved into the options object
1225  void metadata(table_input_metadata meta) { _metadata = std::move(meta); }
1226 
/// @brief Sets Key-Value footer metadata.
/// @param metadata key-value pairs; moved into the options object
1232  void set_key_value_metadata(std::map<std::string, std::string> metadata)
1233  {
1234  _user_data = std::move(metadata);
1235  }
1236 
/// @brief Sets the pointer to the output compression statistics.
/// @param comp_stats shared pointer to the statistics object; moved into the options object
1242  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
1243  {
1244  _compression_stats = std::move(comp_stats);
1245  }
1246 
/// @brief Sets whether string dictionaries should be sorted.
/// @param val boolean value to enable/disable dictionary sorting
1252  void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; }
1253 };
1254 
1260 
1261  public:
1268 
/// @brief Constructor from sink.
/// @param sink the sink used for writer output; used to construct the options being built
1274  explicit chunked_orc_writer_options_builder(sink_info const& sink) : options{sink} {}
1275 
1283  {
1284  options._compression = comp;
1285  return *this;
1286  }
1287 
1300  {
1301  options._stats_freq = val;
1302  return *this;
1303  }
1304 
1312  {
1313  options.set_stripe_size_bytes(val);
1314  return *this;
1315  }
1316 
1324  {
1325  options.set_stripe_size_rows(val);
1326  return *this;
1327  }
1328 
1336  {
1337  options.set_row_index_stride(val);
1338  return *this;
1339  }
1340 
1348  {
1349  options._metadata = std::move(meta);
1350  return *this;
1351  }
1352 
1360  std::map<std::string, std::string> metadata)
1361  {
1362  options._user_data = std::move(metadata);
1363  return *this;
1364  }
1365 
1373  std::shared_ptr<writer_compression_statistics> const& comp_stats)
1374  {
1375  options._compression_stats = comp_stats;
1376  return *this;
1377  }
1378 
1386  {
1387  options._enable_dictionary_sort = val;
1388  return *this;
1389  }
1390 
/// @brief Implicit conversion to an rvalue `chunked_orc_writer_options`; the built options are
/// moved out of this builder.
1394  operator chunked_orc_writer_options&&() { return std::move(options); }
1395 
/// @brief Move chunked_orc_writer_options member once it's built.
/// @return rvalue reference to the built options; the builder's copy is moved from
1403  chunked_orc_writer_options&& build() { return std::move(options); }
1404 };
1405 
1428  public:
1434 
1439 
1448 
1456 
/// @brief Finishes the chunked/streamed write process.
1460  void close();
1461 
/// Unique pointer to impl writer class.
1463  std::unique_ptr<orc::detail::writer> writer;
1464 };
1465  // end of group
1467 } // namespace io
1468 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
The chunked orc reader class to read an ORC file iteratively into a series of tables,...
Definition: orc.hpp:423
chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Construct the reader from output size limits along with other ORC reader options.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Construct the reader from input/output size limits along with other ORC reader options.
bool has_next() const
Check if there is any data in the given data sources that has not yet been read.
~chunked_orc_reader()
Destructor, destroying the internal reader instance.
table_with_metadata read_chunk() const
Read a chunk of rows in the given data sources.
chunked_orc_reader()
Default constructor, this should never be used.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Construct the reader from input/output size limits, output row granularity, along with other ORC read...
Builds settings to use for write_orc_chunked().
Definition: orc.hpp:1258
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1385
chunked_orc_writer_options && build()
Move chunked_orc_writer_options member once it's built.
Definition: orc.hpp:1403
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1311
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:1323
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1372
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1359
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1282
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1347
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
Definition: orc.hpp:1274
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1299
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:1335
Settings to use for write_orc_chunked().
Definition: orc.hpp:1027
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:1183
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:1225
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:1232
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:1242
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:1080
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:1108
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:1115
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:1214
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
Definition: orc.hpp:1094
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:1162
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:1133
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:1252
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:1126
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:1153
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:1087
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:1199
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:1143
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:1101
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:1174
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked orc writer class writes an ORC file in a chunked/stream form.
Definition: orc.hpp:1427
~orc_chunked_writer()
Virtual destructor, added so we don't leak detail types.
orc_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
Definition: orc.hpp:1463
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
Builds settings to use for read_orc().
Definition: orc.hpp:263
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:335
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
Definition: orc.hpp:371
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:347
orc_reader_options_builder & skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:311
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
Definition: orc.hpp:279
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
Definition: orc.hpp:299
orc_reader_options_builder & num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:323
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:287
orc_reader_options && build()
Move orc_reader_options member once it's built.
Definition: orc.hpp:389
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:359
Settings to use for read_orc().
Definition: orc.hpp:55
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: orc.hpp:130
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
Definition: orc.hpp:240
void set_num_rows(int64_t nrows)
Sets number of rows to read.
Definition: orc.hpp:221
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
Definition: orc.hpp:123
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
Definition: orc.hpp:254
void set_skip_rows(int64_t rows)
Sets number of rows to skip from the start.
Definition: orc.hpp:206
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
Definition: orc.hpp:233
void set_columns(std::vector< std::string > col_names)
Sets names of the column to read.
Definition: orc.hpp:178
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
Definition: orc.hpp:190
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
Definition: orc.hpp:159
auto const & get_columns() const
Returns names of the columns to read, if set.
Definition: orc.hpp:116
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
Definition: orc.hpp:138
source_info const & get_source() const
Returns source info.
Definition: orc.hpp:109
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
Definition: orc.hpp:152
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
Definition: orc.hpp:145
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
Definition: orc.hpp:166
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Definition: orc.hpp:247
Builds settings to use for write_orc().
Definition: orc.hpp:841
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:933
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Definition: orc.hpp:921
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
Definition: orc.hpp:885
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:945
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: orc.hpp:858
orc_writer_options && build()
Move orc_writer_options member once it's built.
Definition: orc.hpp:1000
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:957
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:969
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
Definition: orc.hpp:909
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:982
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:868
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:897
Settings to use for write_orc().
Definition: orc.hpp:579
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
Definition: orc.hpp:750
auto const & get_metadata() const
Returns associated metadata.
Definition: orc.hpp:702
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: orc.hpp:709
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: orc.hpp:719
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
Definition: orc.hpp:653
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
Definition: orc.hpp:670
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
Definition: orc.hpp:775
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
Definition: orc.hpp:815
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
Definition: orc.hpp:677
table_view get_table() const
Returns table to be written to output.
Definition: orc.hpp:695
void set_metadata(table_input_metadata meta)
Sets associated metadata.
Definition: orc.hpp:808
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
Definition: orc.hpp:663
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
Definition: orc.hpp:825
auto get_row_index_stride() const
Returns the row index stride.
Definition: orc.hpp:684
void set_table(table_view tbl)
Sets table to be written to output.
Definition: orc.hpp:801
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
Definition: orc.hpp:738
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
Definition: orc.hpp:835
void set_row_index_stride(size_type stride)
Sets the row index stride.
Definition: orc.hpp:790
compression_type get_compression() const
Returns compression type.
Definition: orc.hpp:646
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
Definition: orc.hpp:729
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
Definition: orc.hpp:759
sink_info const & get_sink() const
Returns sink info.
Definition: orc.hpp:639
Metadata for a table.
Definition: io/types.hpp:932
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:41
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
Definition: orc.hpp:44
constexpr size_type default_row_index_stride
10K rows default orc row index stride
Definition: orc.hpp:45
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
Definition: orc.hpp:43
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Reads an ORC dataset into a set of columns.
compression_type
Compression algorithms.
Definition: io/types.hpp:57
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:96
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:98
@ STATISTICS_NONE
No column statistics.
Definition: io/types.hpp:97
@ STATISTICS_PAGE
Per-page column statistics.
Definition: io/types.hpp:99
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
Destination information for write interfaces.
Definition: io/types.hpp:512
Source information for read interfaces.
Definition: io/types.hpp:337
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.