parquet.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
#include <cudf/ast/expressions.hpp>
#include <cudf/io/detail/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/export.hpp>

#include <iostream>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
33 
34 namespace CUDF_EXPORT cudf {
35 namespace io {
42 constexpr size_t default_row_group_size_bytes =
43  std::numeric_limits<size_t>::max();
44 constexpr size_type default_row_group_size_rows = 1'000'000;
45 constexpr size_t default_max_page_size_bytes = 512 * 1024;
47 constexpr int32_t default_column_index_truncate_length = 64;
48 constexpr size_t default_max_dictionary_size = 1024 * 1024;
50 
52 
57  source_info _source;
58 
59  // Path in schema of column to read; `nullopt` is all
60  std::optional<std::vector<std::string>> _columns;
61 
62  // List of individual row groups to read (ignored if empty)
63  std::vector<std::vector<size_type>> _row_groups;
64  // Number of rows to skip from the start; Parquet stores the number of rows as int64_t
65  int64_t _skip_rows = 0;
66  // Number of rows to read; `nullopt` is all
67  std::optional<size_type> _num_rows;
68 
69  // Predicate filter as AST to filter output rows.
70  std::optional<std::reference_wrapper<ast::expression const>> _filter;
71 
72  // Whether to store string data as categorical type
73  bool _convert_strings_to_categories = false;
74  // Whether to use PANDAS metadata to load columns
75  bool _use_pandas_metadata = true;
76  // Whether to read and use ARROW schema
77  bool _use_arrow_schema = true;
78  // Whether to allow reading matching select columns from mismatched Parquet files.
79  bool _allow_mismatched_pq_schemas = false;
80  // Cast timestamp columns to a specific type
81  data_type _timestamp_type{type_id::EMPTY};
82 
83  std::optional<std::vector<reader_column_schema>> _reader_column_schema;
84 
  /**
   * @brief Constructor from source info.
   *
   * @param src Source information used to read the parquet file
   */
  explicit parquet_reader_options(source_info src) : _source{std::move(src)} {}
91 
93 
94  public:
  /**
   * @brief Default constructor.
   */
  explicit parquet_reader_options() = default;
101 
109 
  /**
   * @brief Returns source info.
   *
   * @return Source info
   */
  [[nodiscard]] source_info const& get_source() const { return _source; }
116 
123  [[nodiscard]] bool is_enabled_convert_strings_to_categories() const
124  {
125  return _convert_strings_to_categories;
126  }
127 
  /**
   * @brief Returns true/false depending whether to use pandas metadata or not while reading.
   *
   * @return `true` if pandas metadata will be used
   */
  [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; }
134 
  /**
   * @brief Returns true/false depending whether to use arrow schema while reading.
   *
   * @return `true` if the arrow schema will be read and used
   */
  [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; }
141 
149  [[nodiscard]] bool is_enabled_allow_mismatched_pq_schemas() const
150  {
151  return _allow_mismatched_pq_schemas;
152  }
153 
  /**
   * @brief Returns optional tree of reader column schema metadata.
   *
   * Note: returns by value, so the stored vector is copied on each call.
   *
   * @return Optional tree of metadata
   */
  [[nodiscard]] std::optional<std::vector<reader_column_schema>> get_column_schema() const
  {
  return _reader_column_schema;
  }
163 
  /**
   * @brief Returns number of rows to skip from the start; int64_t because Parquet stores row
   * counts as int64_t.
   *
   * @return Number of rows to skip
   */
  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
170 
  /**
   * @brief Returns number of rows to read; `nullopt` means all rows.
   *
   * @return Number of rows to read, if set
   */
  [[nodiscard]] std::optional<size_type> const& get_num_rows() const { return _num_rows; }
178 
  /**
   * @brief Returns names of columns to be read; `nullopt` means all columns.
   *
   * @return Names of columns to be read, if set
   */
  [[nodiscard]] auto const& get_columns() const { return _columns; }
185 
  /**
   * @brief Returns list of individual row groups to be read (ignored if empty).
   *
   * @return List of row groups, one vector per source
   */
  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
192 
  /**
   * @brief Returns AST based filter for predicate pushdown, if set.
   *
   * @return Optional reference to the filter expression
   */
  [[nodiscard]] auto const& get_filter() const { return _filter; }
199 
  /**
   * @brief Returns timestamp type used to cast timestamp columns; `EMPTY` means no cast.
   *
   * @return Timestamp type used to cast timestamp columns
   */
  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
206 
212  void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
213 
219  void set_row_groups(std::vector<std::vector<size_type>> row_groups);
220 
  /**
   * @brief Sets AST based filter for predicate pushdown.
   *
   * Stored as a `reference_wrapper`, so the expression must outlive these options —
   * NOTE(review): confirm callers keep `filter` alive until the read completes.
   *
   * @param filter AST expression to use as filter
   */
  void set_filter(ast::expression const& filter) { _filter = filter; }
252 
  /**
   * @brief Sets to enable/disable conversion of strings to categories.
   *
   * @param val Boolean value to enable/disable
   */
  void enable_convert_strings_to_categories(bool val) { _convert_strings_to_categories = val; }
259 
  /**
   * @brief Sets to enable/disable use of pandas metadata to read.
   *
   * @param val Boolean value to enable/disable
   */
  void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; }
266 
  /**
   * @brief Sets to enable/disable use of arrow schema to read.
   *
   * @param val Boolean value to enable/disable
   */
  void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; }
273 
  /**
   * @brief Sets to enable/disable reading of matching select columns from mismatched
   * Parquet files.
   *
   * @param val Boolean value to enable/disable
   */
  void enable_allow_mismatched_pq_schemas(bool val) { _allow_mismatched_pq_schemas = val; }
282 
289  void set_column_schema(std::vector<reader_column_schema> val)
290  {
291  _reader_column_schema = std::move(val);
292  }
293 
299  void set_skip_rows(int64_t val);
300 
307 
  /**
   * @brief Sets timestamp_type used to cast timestamp columns.
   *
   * @param type The timestamp data_type to cast to
   */
  void set_timestamp_type(data_type type) { _timestamp_type = type; }
314 };
315 
320  parquet_reader_options options;
321 
322  public:
329 
  /**
   * @brief Constructor from source info.
   *
   * @param src The source information used to read the parquet file
   */
  explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {}
336 
343  parquet_reader_options_builder& columns(std::vector<std::string> col_names)
344  {
345  options._columns = std::move(col_names);
346  return *this;
347  }
348 
  /**
   * @brief Sets vector of individual row groups to read.
   *
   * @param row_groups Row groups to read, one vector per source
   * @return this for chaining
   */
  parquet_reader_options_builder& row_groups(std::vector<std::vector<size_type>> row_groups)
  {
  options.set_row_groups(std::move(row_groups));
  return *this;
  }
360 
366  {
367  options.set_filter(filter);
368  return *this;
369  }
370 
378  {
379  options._convert_strings_to_categories = val;
380  return *this;
381  }
382 
390  {
391  options._use_pandas_metadata = val;
392  return *this;
393  }
394 
402  {
403  options._use_arrow_schema = val;
404  return *this;
405  }
406 
416  {
417  options._allow_mismatched_pq_schemas = val;
418  return *this;
419  }
420 
427  parquet_reader_options_builder& set_column_schema(std::vector<reader_column_schema> val)
428  {
429  options._reader_column_schema = std::move(val);
430  return *this;
431  }
432 
440  {
441  options.set_skip_rows(val);
442  return *this;
443  }
444 
452  {
453  options.set_num_rows(val);
454  return *this;
455  }
456 
464  {
465  options._timestamp_type = type;
466  return *this;
467  }
468 
  /**
   * @brief Implicit conversion that moves the built options out of the builder.
   *
   * The builder must not be used after this conversion.
   */
  operator parquet_reader_options&&() { return std::move(options); }
473 
  /**
   * @brief Moves the parquet_reader_options member once it's built.
   *
   * The builder must not be used after calling `build()`.
   *
   * @return Built parquet_reader_options (moved)
   */
  parquet_reader_options&& build() { return std::move(options); }
482 };
483 
502  parquet_reader_options const& options,
505 
516  public:
524 
539  std::size_t chunk_read_limit,
540  parquet_reader_options const& options,
543 
564  std::size_t chunk_read_limit,
565  std::size_t pass_read_limit,
566  parquet_reader_options const& options,
569 
578 
584  [[nodiscard]] bool has_next() const;
585 
597  [[nodiscard]] table_with_metadata read_chunk() const;
598 
599  private:
600  std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
601 };
602  // end of group
614  int column_idx{};
615  bool is_descending{false};
616  bool is_nulls_first{true};
617 };
618 
623  // Specify the sink to use for writer output
624  sink_info _sink;
625  // Specify the compression format to use
626  compression_type _compression = compression_type::SNAPPY;
627  // Specify the level of statistics in the output file
629  // Optional associated metadata
630  std::optional<table_input_metadata> _metadata;
631  // Optional footer key_value_metadata
632  std::vector<std::map<std::string, std::string>> _user_data;
633  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
634  // If true then overrides any per-column setting in _metadata.
635  bool _write_timestamps_as_int96 = false;
636  // Parquet writer can write timestamps as UTC
637  // Defaults to true because libcudf timestamps are implicitly UTC
638  bool _write_timestamps_as_UTC = true;
639  // Whether to write ARROW schema
640  bool _write_arrow_schema = false;
641  // Maximum size of each row group (unless smaller than a single page)
642  size_t _row_group_size_bytes = default_row_group_size_bytes;
643  // Maximum number of rows in row group (unless smaller than a single page)
644  size_type _row_group_size_rows = default_row_group_size_rows;
645  // Maximum size of each page (uncompressed)
646  size_t _max_page_size_bytes = default_max_page_size_bytes;
647  // Maximum number of rows in a page
648  size_type _max_page_size_rows = default_max_page_size_rows;
649  // Maximum size of min or max values in column index
650  int32_t _column_index_truncate_length = default_column_index_truncate_length;
651  // When to use dictionary encoding for data
652  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
653  // Maximum size of column chunk dictionary (in bytes)
654  size_t _max_dictionary_size = default_max_dictionary_size;
655  // Maximum number of rows in a page fragment
656  std::optional<size_type> _max_page_fragment_size;
657  // Optional compression statistics
658  std::shared_ptr<writer_compression_statistics> _compression_stats;
659  // write V2 page headers?
660  bool _v2_page_headers = false;
661  // Which columns in _table are used for sorting
662  std::optional<std::vector<sorting_column>> _sorting_columns;
663 
664  protected:
  /**
   * @brief Constructor from sink.
   *
   * @param sink The sink used for writer output
   */
  explicit parquet_writer_options_base(sink_info sink) : _sink(std::move(sink)) {}
671 
672  public:
679 
  /**
   * @brief Returns sink info.
   *
   * @return Sink info
   */
  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
686 
  /**
   * @brief Returns compression format used; defaults to SNAPPY.
   *
   * @return Compression format
   */
  [[nodiscard]] compression_type get_compression() const { return _compression; }
693 
  /**
   * @brief Returns level of statistics requested in output file.
   *
   * @return Level of statistics
   */
  [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
700 
  /**
   * @brief Returns associated metadata, if set.
   *
   * @return Optional associated table input metadata
   */
  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
707 
713  [[nodiscard]] std::vector<std::map<std::string, std::string>> const& get_key_value_metadata()
714  const
715  {
716  return _user_data;
717  }
718 
  /**
   * @brief Returns `true` if timestamps will be written as INT96 instead of TIMESTAMP_MICROS.
   *
   * @return `true` if INT96 timestamps are enabled
   */
  [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
725 
  /**
   * @brief Returns `true` if timestamps will be written as UTC (the default, since libcudf
   * timestamps are implicitly UTC).
   *
   * @return `true` if UTC timestamps are enabled
   */
  [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
732 
  /**
   * @brief Returns `true` if the arrow schema will be written.
   *
   * @return `true` if arrow schema writing is enabled
   */
  [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; }
739 
  /**
   * @brief Returns maximum row group size, in bytes.
   *
   * @return Maximum row group size, in bytes
   */
  [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
746 
  /**
   * @brief Returns maximum row group size, in rows.
   *
   * @return Maximum row group size, in rows
   */
  [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; }
753 
  /**
   * @brief Returns the maximum uncompressed page size, in bytes.
   *
   * A page cannot be larger than its row group, so the value is clamped to
   * the row group byte limit.
   *
   * @return Maximum uncompressed page size, in bytes
   */
  [[nodiscard]] auto get_max_page_size_bytes() const
  {
  return std::min(_max_page_size_bytes, get_row_group_size_bytes());
  }
765 
  /**
   * @brief Returns maximum page size, in rows.
   *
   * A page cannot hold more rows than its row group, so the value is clamped
   * to the row group row limit.
   *
   * @return Maximum page size, in rows
   */
  [[nodiscard]] auto get_max_page_size_rows() const
  {
  return std::min(_max_page_size_rows, get_row_group_size_rows());
  }
777 
783  [[nodiscard]] auto get_column_index_truncate_length() const
784  {
785  return _column_index_truncate_length;
786  }
787 
  /**
   * @brief Returns policy for dictionary use; defaults to ADAPTIVE.
   *
   * @return Policy for dictionary use
   */
  [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; }
794 
  /**
   * @brief Returns maximum size of a column chunk dictionary, in bytes.
   *
   * @return Maximum dictionary size, in bytes
   */
  [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; }
801 
  /**
   * @brief Returns maximum page fragment size, in rows, if set.
   *
   * @return Optional maximum page fragment size, in rows
   */
  [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; }
808 
  /**
   * @brief Returns a shared pointer to the user-provided compression statistics.
   *
   * May be null if no statistics sink was set.
   *
   * @return Shared pointer to the compression statistics
   */
  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
  {
  return _compression_stats;
  }
818 
  /**
   * @brief Returns `true` if V2 page headers should be written.
   *
   * @return `true` if V2 page headers are enabled
   */
  [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
825 
  /**
   * @brief Returns the sorting columns, if set.
   *
   * @return Optional vector of sorting columns
   */
  [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
832 
839 
845  void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
846 
859 
866  void enable_int96_timestamps(bool req);
867 
873  void enable_utc_timestamps(bool val);
874 
881 
887  void set_row_group_size_bytes(size_t size_bytes);
888 
895 
901  void set_max_page_size_bytes(size_t size_bytes);
902 
909 
915  void set_column_index_truncate_length(int32_t size_bytes);
916 
923 
929  void set_max_dictionary_size(size_t size_bytes);
930 
937 
943  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats);
944 
950  void enable_write_v2_headers(bool val);
951 
957  void set_sorting_columns(std::vector<sorting_column> sorting_columns);
958 };
959 
963 template <class BuilderT, class OptionsT>
965  OptionsT _options;
966 
967  protected:
973  inline OptionsT& get_options() { return _options; }
974 
980  explicit parquet_writer_options_builder_base(OptionsT options);
981 
982  public:
989 
996  BuilderT& metadata(table_input_metadata metadata);
997 
1004  BuilderT& key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
1005 
1013 
1020  BuilderT& compression(compression_type compression);
1021 
1028  BuilderT& row_group_size_bytes(size_t val);
1029 
1037 
1048  BuilderT& max_page_size_bytes(size_t val);
1049 
1058 
1072  BuilderT& column_index_truncate_length(int32_t val);
1073 
1092 
1104  BuilderT& max_dictionary_size(size_t val);
1105 
1117 
1125  std::shared_ptr<writer_compression_statistics> const& comp_stats);
1126 
1133  BuilderT& int96_timestamps(bool enabled);
1134 
1141  BuilderT& utc_timestamps(bool enabled);
1142 
1149  BuilderT& write_arrow_schema(bool enabled);
1150 
1157  BuilderT& write_v2_headers(bool enabled);
1158 
1165  BuilderT& sorting_columns(std::vector<sorting_column> sorting_columns);
1166 
1170  operator OptionsT&&();
1171 
1179  OptionsT&& build();
1180 };
1181 
1183 
1188  // Sets of columns to output
1189  table_view _table;
1190  // Partitions described as {start_row, num_rows} pairs
1191  std::vector<partition_info> _partitions;
1192  // Column chunks file paths to be set in the raw output metadata. One per output file
1193  std::vector<std::string> _column_chunks_file_paths;
1194 
1196 
1203  explicit parquet_writer_options(sink_info const& sink, table_view const& table);
1204 
1205  public:
1212 
1222 
1229 
  /**
   * @brief Returns table_view of the table to write.
   *
   * @return Table to be written (a non-owning view; copied by value)
   */
  [[nodiscard]] table_view get_table() const { return _table; }
1236 
  /**
   * @brief Returns partitions, described as {start_row, num_rows} pairs.
   *
   * @return Partitions
   */
  [[nodiscard]] std::vector<partition_info> const& get_partitions() const { return _partitions; }
1243 
  /**
   * @brief Returns column chunks file paths to be set in the raw output metadata.
   *
   * One path per output file.
   *
   * @return Column chunks file paths
   */
  [[nodiscard]] std::vector<std::string> const& get_column_chunks_file_paths() const
  {
  return _column_chunks_file_paths;
  }
1253 
1260  void set_partitions(std::vector<partition_info> partitions);
1261 
1268  void set_column_chunks_file_paths(std::vector<std::string> file_paths);
1269 };
1270 
1275  : public parquet_writer_options_builder_base<parquet_writer_options_builder,
1276  parquet_writer_options> {
1277  public:
1283  explicit parquet_writer_options_builder() = default;
1284 
1292 
1300  parquet_writer_options_builder& partitions(std::vector<partition_info> partitions);
1301 
1309  parquet_writer_options_builder& column_chunks_file_paths(std::vector<std::string> file_paths);
1310 };
1311 
1328 std::unique_ptr<std::vector<uint8_t>> write_parquet(
1330 
1340 std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
1341  std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
1342 
1344 
1355 
1357 
1358  public:
1365 
1374 };
1375 
1380  : public parquet_writer_options_builder_base<chunked_parquet_writer_options_builder,
1381  chunked_parquet_writer_options> {
1382  public:
1389 
1396 };
1397 
1418  public:
1425 
1439 
1452  std::vector<partition_info> const& partitions = {});
1453 
1462  std::unique_ptr<std::vector<uint8_t>> close(
1463  std::vector<std::string> const& column_chunks_file_paths = {});
1464 
1466  std::unique_ptr<parquet::detail::writer> writer;
1467 };
1468  // end of group
1470 
1471 } // namespace io
1472 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
The chunked parquet reader class to read Parquet file iteratively in to a series of tables,...
Definition: parquet.hpp:515
table_with_metadata read_chunk() const
Read a chunk of rows in the given Parquet file.
bool has_next() const
Check if there is any data in the given file that has not yet been read.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for chunked reader.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance.
chunked_parquet_reader()
Default constructor, this should never be used.
Class to build chunked_parquet_writer_options.
Definition: parquet.hpp:1381
chunked_parquet_writer_options_builder()=default
Default constructor.
chunked_parquet_writer_options_builder(sink_info const &sink)
Constructor from sink.
Settings for parquet_chunked_writer.
Definition: parquet.hpp:1348
static chunked_parquet_writer_options_builder builder(sink_info const &sink)
creates builder to build chunked_parquet_writer_options.
chunked_parquet_writer_options()=default
Default constructor.
chunked parquet writer class to handle options and write tables in chunks.
Definition: parquet.hpp:1417
std::unique_ptr< std::vector< uint8_t > > close(std::vector< std::string > const &column_chunks_file_paths={})
Finishes the chunked/streamed write process.
parquet_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython....
~parquet_chunked_writer()
Default destructor. This is added to not leak detail API.
std::unique_ptr< parquet::detail::writer > writer
Unique pointer to impl writer class.
Definition: parquet.hpp:1466
parquet_chunked_writer & write(table_view const &table, std::vector< partition_info > const &partitions={})
Writes table to output.
parquet_chunked_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
Builds parquet_reader_options to use for read_parquet().
Definition: parquet.hpp:319
parquet_reader_options_builder & use_arrow_schema(bool val)
Sets to enable/disable use of arrow schema to read.
Definition: parquet.hpp:401
parquet_reader_options_builder(source_info src)
Constructor from source info.
Definition: parquet.hpp:335
parquet_reader_options_builder & skip_rows(int64_t val)
Sets number of rows to skip.
Definition: parquet.hpp:439
parquet_reader_options_builder & allow_mismatched_pq_schemas(bool val)
Sets to enable/disable reading of matching projected and filter columns from mismatched Parquet sources.
Definition: parquet.hpp:415
parquet_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: parquet.hpp:343
parquet_reader_options_builder & timestamp_type(data_type type)
timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:463
parquet_reader_options_builder & use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:389
parquet_reader_options_builder()=default
Default constructor.
parquet_reader_options_builder & row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
Definition: parquet.hpp:355
parquet_reader_options_builder & set_column_schema(std::vector< reader_column_schema > val)
Sets reader metadata.
Definition: parquet.hpp:427
parquet_reader_options && build()
move parquet_reader_options member once it's built.
Definition: parquet.hpp:481
parquet_reader_options_builder & filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:365
parquet_reader_options_builder & num_rows(size_type val)
Sets number of rows to read.
Definition: parquet.hpp:451
parquet_reader_options_builder & convert_strings_to_categories(bool val)
Sets enable/disable conversion of strings to categories.
Definition: parquet.hpp:377
Settings for read_parquet().
Definition: parquet.hpp:56
data_type get_timestamp_type() const
Returns timestamp type used to cast timestamp columns.
Definition: parquet.hpp:205
parquet_reader_options()=default
Default constructor.
static parquet_reader_options_builder builder(source_info src)
Creates a parquet_reader_options_builder which will build parquet_reader_options.
void enable_allow_mismatched_pq_schemas(bool val)
Sets to enable/disable reading of matching projected and filter columns from mismatched Parquet sources.
Definition: parquet.hpp:281
void set_skip_rows(int64_t val)
Sets number of rows to skip.
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: parquet.hpp:212
void enable_convert_strings_to_categories(bool val)
Sets to enable/disable conversion of strings to categories.
Definition: parquet.hpp:258
std::optional< std::vector< reader_column_schema > > get_column_schema() const
Returns optional tree of metadata.
Definition: parquet.hpp:159
source_info const & get_source() const
Returns source info.
Definition: parquet.hpp:115
auto const & get_row_groups() const
Returns list of individual row groups to be read.
Definition: parquet.hpp:191
std::optional< size_type > const & get_num_rows() const
Returns number of rows to read.
Definition: parquet.hpp:177
void set_row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
void set_num_rows(size_type val)
Sets number of rows to read.
auto const & get_columns() const
Returns names of column to be read, if set.
Definition: parquet.hpp:184
void set_timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
Definition: parquet.hpp:313
bool is_enabled_convert_strings_to_categories() const
Returns true/false depending on whether strings should be converted to categories or not.
Definition: parquet.hpp:123
void enable_use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
Definition: parquet.hpp:265
void enable_use_arrow_schema(bool val)
Sets to enable/disable use of arrow schema to read.
Definition: parquet.hpp:272
bool is_enabled_use_pandas_metadata() const
Returns true/false depending whether to use pandas metadata or not while reading.
Definition: parquet.hpp:133
bool is_enabled_allow_mismatched_pq_schemas() const
Returns true/false depending on whether to read matching projected and filter columns from mismatched Parquet sources.
Definition: parquet.hpp:149
void set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
Definition: parquet.hpp:289
bool is_enabled_use_arrow_schema() const
Returns true/false depending whether to use arrow schema while reading.
Definition: parquet.hpp:140
void set_filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
Definition: parquet.hpp:251
auto const & get_filter() const
Returns AST based filter for predicate pushdown.
Definition: parquet.hpp:198
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Definition: parquet.hpp:169
Base settings for write_parquet() and parquet_chunked_writer.
Definition: parquet.hpp:622
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
auto const & get_sorting_columns() const
Returns the sorting_columns.
Definition: parquet.hpp:831
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
Definition: parquet.hpp:745
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
Definition: parquet.hpp:724
void set_metadata(table_input_metadata metadata)
Sets metadata.
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
parquet_writer_options_base(sink_info sink)
Constructor from sink.
Definition: parquet.hpp:670
void set_stats_level(statistics_freq sf)
Sets the level of statistics.
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
Definition: parquet.hpp:752
parquet_writer_options_base()=default
Default constructor.
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
void set_sorting_columns(std::vector< sorting_column > sorting_columns)
Sets sorting columns.
auto is_enabled_write_arrow_schema() const
Returns true if arrow schema will be written.
Definition: parquet.hpp:738
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
Definition: parquet.hpp:824
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
auto get_max_page_size_bytes() const
Returns the maximum uncompressed page size, in bytes.
Definition: parquet.hpp:761
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
compression_type get_compression() const
Returns compression format used.
Definition: parquet.hpp:692
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
Definition: parquet.hpp:800
void set_compression(compression_type compression)
Sets compression type.
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
Definition: parquet.hpp:793
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
Definition: parquet.hpp:814
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
Definition: parquet.hpp:807
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets metadata.
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
void enable_write_arrow_schema(bool val)
Sets preference for writing arrow schema. Write arrow schema if set to true.
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
Definition: parquet.hpp:731
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
Definition: parquet.hpp:699
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
Definition: parquet.hpp:713
auto const & get_metadata() const
Returns associated metadata.
Definition: parquet.hpp:706
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
Definition: parquet.hpp:773
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
Definition: parquet.hpp:783
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
sink_info const & get_sink() const
Returns sink info.
Definition: parquet.hpp:685
Base class for Parquet options builders.
Definition: parquet.hpp:964
BuilderT & compression(compression_type compression)
Sets compression type.
BuilderT & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
OptionsT & get_options()
Return reference to the options object being built.
Definition: parquet.hpp:973
BuilderT & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
BuilderT & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
BuilderT & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
OptionsT && build()
move options member once it's built.
BuilderT & stats_level(statistics_freq sf)
Sets the level of statistics.
BuilderT & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
BuilderT & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
BuilderT & metadata(table_input_metadata metadata)
Sets metadata.
BuilderT & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
parquet_writer_options_builder_base(OptionsT options)
Constructor from options.
BuilderT & int96_timestamps(bool enabled)
Sets whether int96 timestamps are written or not.
BuilderT & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
BuilderT & sorting_columns(std::vector< sorting_column > sorting_columns)
Sets column sorting metadata.
BuilderT & write_arrow_schema(bool enabled)
Set to true if arrow schema is to be written.
parquet_writer_options_builder_base()=default
Default constructor.
BuilderT & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
BuilderT & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
BuilderT & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
BuilderT & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
Class to build parquet_writer_options.
Definition: parquet.hpp:1276
parquet_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
parquet_writer_options_builder()=default
Default constructor.
parquet_writer_options_builder & partitions(std::vector< partition_info > partitions)
Sets partitions in parquet_writer_options.
parquet_writer_options_builder & column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
Settings for write_parquet().
Definition: parquet.hpp:1187
void set_partitions(std::vector< partition_info > partitions)
Sets partitions.
static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create parquet_writer_options.
parquet_writer_options()=default
Default constructor.
std::vector< std::string > const & get_column_chunks_file_paths() const
Returns Column chunks file paths to be set in the raw output metadata.
Definition: parquet.hpp:1249
table_view get_table() const
Returns table_view.
Definition: parquet.hpp:1235
void set_column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
static parquet_writer_options_builder builder()
Create builder to create parquet_writer_options.
std::vector< partition_info > const & get_partitions() const
Returns partitions.
Definition: parquet.hpp:1242
Metadata for a table.
Definition: io/types.hpp:932
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_row_group_size_rows
1 million rows per row group
Definition: parquet.hpp:44
constexpr int32_t default_column_index_truncate_length
truncate to 64 bytes
Definition: parquet.hpp:47
constexpr size_t default_row_group_size_bytes
Infinite bytes per row group.
Definition: parquet.hpp:42
constexpr size_type default_max_page_fragment_size
5000 rows per page fragment
Definition: parquet.hpp:49
constexpr size_t default_max_dictionary_size
1MB dictionary size
Definition: parquet.hpp:48
table_with_metadata read_parquet(parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a Parquet dataset into a set of columns.
constexpr size_t default_max_page_size_bytes
512KB per page
Definition: parquet.hpp:45
constexpr size_type default_max_page_size_rows
20k rows per page
Definition: parquet.hpp:46
statistics_freq
Column statistics granularity type for parquet/orc writers.
Definition: io/types.hpp:96
dictionary_policy
Control use of dictionary encoding for parquet writer.
Definition: io/types.hpp:225
compression_type
Compression algorithms.
Definition: io/types.hpp:57
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
Definition: io/types.hpp:98
@ ADAPTIVE
Use dictionary when it will not impact compression.
Definition: io/types.hpp:227
std::unique_ptr< std::vector< uint8_t > > merge_row_group_metadata(std::vector< std::unique_ptr< std::vector< uint8_t >>> const &metadata_list)
Merges multiple raw metadata blobs that were previously created by write_parquet into a single metadata blob.
std::unique_ptr< std::vector< uint8_t > > write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to parquet format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:46
Destination information for write interfaces.
Definition: io/types.hpp:512
Struct used to describe column sorting metadata.
Definition: parquet.hpp:613
Source information for read interfaces.
Definition: io/types.hpp:337
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.