Io Writers#

group io_writers

Functions

void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#

Writes a set of columns to CSV format.

The following code snippet demonstrates how to write columns to a file:

auto destination = cudf::io::sink_info("dataset.csv");
auto options     = cudf::io::csv_writer_options::builder(destination, table->view())
  .na_rep(na)
  .include_header(include_header)
  .rows_per_chunk(rows_per_chunk);

cudf::io::write_csv(options);

Parameters:
  • options – Settings for controlling writing behavior

  • stream – CUDA stream used for device memory operations and kernel launches

  • mr – Device memory resource to use for device memory allocation

void write_json(json_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#

Writes a set of columns to JSON format.

The following code snippet demonstrates how to write columns to a file:

auto destination = cudf::io::sink_info("dataset.json");
auto options     = cudf::io::json_writer_options::builder(destination, table->view())
  .na_rep(na)
  .lines(lines)
  .rows_per_chunk(rows_per_chunk);

cudf::io::write_json(options);

Parameters:
  • options – Settings for controlling writing behavior

  • stream – CUDA stream used for device memory operations and kernel launches

  • mr – Device memory resource to use for device memory allocation

void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#

Writes a set of columns to ORC format.

The following code snippet demonstrates how to write columns to a file:

auto destination = cudf::io::sink_info("dataset.orc");
auto options     = cudf::io::orc_writer_options::builder(destination, table->view());
cudf::io::write_orc(options);

Parameters:
  • options – Settings for controlling writing behavior

  • stream – CUDA stream used for device memory operations and kernel launches

std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#

Writes a set of columns to parquet format.

The following code snippet demonstrates how to write columns to a file:

auto destination = cudf::io::sink_info("dataset.parquet");
auto options     = cudf::io::parquet_writer_options::builder(destination, table->view());
cudf::io::write_parquet(options);

Parameters:
  • options – Settings for controlling writing behavior

  • stream – CUDA stream used for device memory operations and kernel launches

Returns:

A blob that contains the file metadata (parquet FileMetadata thrift message) if requested in parquet_writer_options (empty blob otherwise).

std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(std::vector<std::unique_ptr<std::vector<uint8_t>>> const &metadata_list)#

Merges multiple raw metadata blobs that were previously created by write_parquet into a single metadata blob.

Parameters:

metadata_list – [in] List of input file metadata

Returns:

A parquet-compatible blob that contains the data for all row groups in the list

Variables

static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP#

Constants to disambiguate statistics terminology for ORC.

ORC refers to its finest granularity of row-grouping as “row group”, which corresponds to Parquet “pages”. Similarly, ORC’s “stripe” corresponds to a Parquet “row group”. The following constants disambiguate the terminology for the statistics collected at each level.

static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE#
class csv_writer_options#
#include <csv.hpp>

Settings to use for write_csv().

Public Functions

explicit csv_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink used for writer output.

Returns:

sink used for writer output

inline table_view const &get_table() const#

Returns table that would be written to output.

Returns:

Table that would be written to output

inline std::vector<std::string> const &get_names() const#

Returns names of the columns.

Returns:

Names of the columns in the output file

inline std::string get_na_rep() const#

Returns string to be used for null entries.

Returns:

String to be used for null entries

inline bool is_enabled_include_header() const#

Whether to write headers to csv.

Returns:

true if writing headers to csv

inline size_type get_rows_per_chunk() const#

Returns maximum number of rows to process for each file write.

Returns:

Maximum number of rows to process for each file write

inline std::string get_line_terminator() const#

Returns character used for separating lines.

Returns:

Character used for separating lines

inline char get_inter_column_delimiter() const#

Returns character used for separating column values.

Returns:

Character used for separating column values.

inline std::string get_true_value() const#

Returns string used for values != 0 in INT8 types.

Returns:

string used for values != 0 in INT8 types

inline std::string get_false_value() const#

Returns string used for values == 0 in INT8 types.

Returns:

string used for values == 0 in INT8 types

inline quote_style get_quoting() const#

Returns the quote style for the writer.

Note: Only MINIMAL and NONE are supported.

  1. MINIMAL: String columns containing special characters like row-delimiters/field-delimiters/quotes will be quoted.

  2. NONE: No quoting is done for any columns.

Returns:

quote_style The quote style for the writer

inline void set_names(std::vector<std::string> names)#

Sets optional associated column names.

Parameters:

names – Associated column names

inline void set_na_rep(std::string val)#

Sets string to be used for null entries.

Parameters:

val – String to represent null value

inline void enable_include_header(bool val)#

Enables/Disables headers being written to csv.

Parameters:

val – Boolean value to enable/disable

inline void set_rows_per_chunk(size_type val)#

Sets maximum number of rows to process for each file write.

Parameters:

val – Number of rows per chunk

inline void set_line_terminator(std::string term)#

Sets character used for separating lines.

Parameters:

term – Character to represent line termination

inline void set_inter_column_delimiter(char delim)#

Sets character used for separating column values.

Parameters:

delim – Character to delimit column values

inline void set_true_value(std::string val)#

Sets string used for values != 0 in INT8 types.

Parameters:

val – String to represent values != 0 in INT8 types

inline void set_false_value(std::string val)#

Sets string used for values == 0 in INT8 types.

Parameters:

val – String to represent values == 0 in INT8 types

inline void set_table(table_view const &table)#

(Re)sets the table being written.

Parameters:

table – Table to be written

inline void set_quoting(quote_style quoting)#

Sets the quote style for the writer.

Note: Only the following quote styles are supported:

  1. MINIMAL: String columns containing special characters like row-delimiters/field-delimiters/quotes will be quoted.

  2. NONE: No quoting is done for any columns.

Parameters:

quoting – The new quote_style for the writer.

Public Static Functions

static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)#

Create builder to create csv_writer_options.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

Returns:

Builder to build csv_writer_options

class csv_writer_options_builder#
#include <csv.hpp>

Builder to build options for write_csv().

Public Functions

explicit csv_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline explicit csv_writer_options_builder(sink_info const &sink, table_view const &table)#

Constructor from sink and table.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

inline csv_writer_options_builder &names(std::vector<std::string> names)#

Sets optional column names.

Parameters:

names – Column names

Returns:

this for chaining

inline csv_writer_options_builder &na_rep(std::string val)#

Sets string to be used for null entries.

Parameters:

val – String to represent null value

Returns:

this for chaining

inline csv_writer_options_builder &include_header(bool val)#

Enables/Disables headers being written to csv.

Parameters:

val – Boolean value to enable/disable

Returns:

this for chaining

inline csv_writer_options_builder &rows_per_chunk(int val)#

Sets maximum number of rows to process for each file write.

Parameters:

val – Number of rows per chunk

Returns:

this for chaining

inline csv_writer_options_builder &line_terminator(std::string term)#

Sets character used for separating lines.

Parameters:

term – Character to represent line termination

Returns:

this for chaining

inline csv_writer_options_builder &inter_column_delimiter(char delim)#

Sets character used for separating column values.

Parameters:

delim – Character to delimit column values

Returns:

this for chaining

inline csv_writer_options_builder &true_value(std::string val)#

Sets string used for values != 0 in INT8 types.

Parameters:

val – String to represent values != 0 in INT8 types

Returns:

this for chaining

inline csv_writer_options_builder &false_value(std::string val)#

Sets string used for values == 0 in INT8 types.

Parameters:

val – String to represent values == 0 in INT8 types

Returns:

this for chaining

inline csv_writer_options_builder &quoting(quote_style quoting)#

Sets the quote style for the writer.

Only MINIMAL and NONE are supported.

Parameters:

quoting – The new quote style for the writer.

Returns:

this for chaining

inline operator csv_writer_options&&()#

move csv_writer_options member once it’s built.

inline csv_writer_options &&build()#

move csv_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built csv_writer_options object’s r-value reference

class json_writer_options#
#include <json.hpp>

Settings to use for write_json().

Public Functions

explicit json_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink used for writer output.

Returns:

sink used for writer output

inline table_view const &get_table() const#

Returns table that would be written to output.

Returns:

Table that would be written to output

inline std::optional<table_metadata> const &get_metadata() const#

Returns metadata information.

Returns:

Metadata information

inline std::string const &get_na_rep() const#

Returns string to be used for null entries.

Returns:

String to be used for null entries

inline bool is_enabled_include_nulls() const#

Whether to output nulls as ‘null’.

Returns:

true if nulls are output as ‘null’

inline bool is_enabled_lines() const#

Whether to use JSON lines for records format.

Returns:

true if JSON lines is used for records format

inline size_type get_rows_per_chunk() const#

Returns maximum number of rows to process for each file write.

Returns:

Maximum number of rows to process for each file write

inline std::string const &get_true_value() const#

Returns string used for values != 0 in INT8 types.

Returns:

string used for values != 0 in INT8 types

inline std::string const &get_false_value() const#

Returns string used for values == 0 in INT8 types.

Returns:

string used for values == 0 in INT8 types

inline void set_table(table_view tbl)#

Sets table to be written to output.

Parameters:

tbl – Table for the output

inline void set_metadata(table_metadata metadata)#

Sets metadata.

Parameters:

metadata – Associated metadata

inline void set_na_rep(std::string val)#

Sets string to be used for null entries.

Parameters:

val – String to represent null value

inline void enable_include_nulls(bool val)#

Enables/Disables output of nulls as ‘null’.

Parameters:

val – Boolean value to enable/disable

inline void enable_lines(bool val)#

Enables/Disables JSON lines for records format.

Parameters:

val – Boolean value to enable/disable JSON lines

inline void set_rows_per_chunk(size_type val)#

Sets maximum number of rows to process for each file write.

Parameters:

val – Number of rows per chunk

inline void set_true_value(std::string val)#

Sets string used for values != 0 in INT8 types.

Parameters:

val – String to represent values != 0 in INT8 types

inline void set_false_value(std::string val)#

Sets string used for values == 0 in INT8 types.

Parameters:

val – String to represent values == 0 in INT8 types

Public Static Functions

static json_writer_options_builder builder(sink_info const &sink, table_view const &table)#

Create builder to create json_writer_options.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

Returns:

Builder to build json_writer_options

class json_writer_options_builder#
#include <json.hpp>

Builder to build options for write_json().

Public Functions

explicit json_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline explicit json_writer_options_builder(sink_info const &sink, table_view const &table)#

Constructor from sink and table.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

inline json_writer_options_builder &table(table_view tbl)#

Sets table to be written to output.

Parameters:

tbl – Table for the output

Returns:

this for chaining

inline json_writer_options_builder &metadata(table_metadata metadata)#

Sets optional metadata (with column names).

Parameters:

metadata – metadata (with column names)

Returns:

this for chaining

inline json_writer_options_builder &na_rep(std::string val)#

Sets string to be used for null entries.

Parameters:

val – String to represent null value

Returns:

this for chaining

inline json_writer_options_builder &include_nulls(bool val)#

Enables/Disables output of nulls as ‘null’.

Parameters:

val – Boolean value to enable/disable

Returns:

this for chaining

inline json_writer_options_builder &lines(bool val)#

Enables/Disables JSON lines for records format.

Parameters:

val – Boolean value to enable/disable

Returns:

this for chaining

inline json_writer_options_builder &rows_per_chunk(int val)#

Sets maximum number of rows to process for each file write.

Parameters:

val – Number of rows per chunk

Returns:

this for chaining

inline json_writer_options_builder &true_value(std::string val)#

Sets string used for values != 0 in INT8 types.

Parameters:

val – String to represent values != 0 in INT8 types

Returns:

this for chaining

inline json_writer_options_builder &false_value(std::string val)#

Sets string used for values == 0 in INT8 types.

Parameters:

val – String to represent values == 0 in INT8 types

Returns:

this for chaining

inline operator json_writer_options&&()#

move json_writer_options member once it’s built.

inline json_writer_options &&build()#

move json_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built json_writer_options object’s r-value reference

class orc_writer_options#
#include <orc.hpp>

Settings to use for write_orc().

Public Functions

explicit orc_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink info.

Returns:

Sink info

inline compression_type get_compression() const#

Returns compression type.

Returns:

Compression type

inline bool is_enabled_statistics() const#

Whether writing column statistics is enabled/disabled.

Returns:

true if writing column statistics is enabled

inline statistics_freq get_statistics_freq() const#

Returns frequency of statistics collection.

Returns:

Frequency of statistics collection

inline auto get_stripe_size_bytes() const#

Returns maximum stripe size, in bytes.

Returns:

Maximum stripe size, in bytes

inline auto get_stripe_size_rows() const#

Returns maximum stripe size, in rows.

Returns:

Maximum stripe size, in rows

inline auto get_row_index_stride() const#

Returns the row index stride.

Returns:

Row index stride

inline table_view get_table() const#

Returns table to be written to output.

Returns:

Table to be written to output

inline auto const &get_metadata() const#

Returns associated metadata.

Returns:

Associated metadata

inline std::map<std::string, std::string> const &get_key_value_metadata() const#

Returns Key-Value footer metadata information.

Returns:

Key-Value footer metadata information

inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#

Returns a shared pointer to the user-provided compression statistics.

Returns:

Compression statistics

inline bool get_enable_dictionary_sort() const#

Returns whether string dictionaries should be sorted.

Returns:

true if string dictionaries should be sorted

inline void set_compression(compression_type comp)#

Sets compression type.

Parameters:

comp – Compression type

inline void enable_statistics(statistics_freq val)#

Choose granularity of statistics collection.

The granularity can be set to:

  • cudf::io::STATISTICS_NONE: No statistics are collected.

  • cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.

  • cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.

Parameters:

val – Frequency of statistics collection

inline void set_stripe_size_bytes(size_t size_bytes)#

Sets the maximum stripe size, in bytes.

Parameters:

size_bytes – Maximum stripe size, in bytes to be set

Throws:

cudf::logic_error – if a value below the minimal size is passed

inline void set_stripe_size_rows(size_type size_rows)#

Sets the maximum stripe size, in rows.

If the stripe size is smaller than the row group size, the row group size will be reduced to match the stripe size.

Parameters:

size_rows – Maximum stripe size, in rows to be set

Throws:

cudf::logic_error – if a value below the minimal number of rows is passed

inline void set_row_index_stride(size_type stride)#

Sets the row index stride.

Rounded down to a multiple of 8.

Parameters:

stride – Row index stride to be set

Throws:

cudf::logic_error – if a value below the minimal row index stride is passed

inline void set_table(table_view tbl)#

Sets table to be written to output.

Parameters:

tbl – Table for the output

inline void set_metadata(table_input_metadata meta)#

Sets associated metadata.

Parameters:

meta – Associated metadata

inline void set_key_value_metadata(std::map<std::string, std::string> metadata)#

Sets metadata.

Parameters:

metadata – Key-Value footer metadata

inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be updated after writing

inline void set_enable_dictionary_sort(bool val)#

Sets whether string dictionaries should be sorted.

Parameters:

val – Boolean value to enable/disable

Public Static Functions

static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)#

Create builder to create orc_writer_options.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

Returns:

Builder to build orc_writer_options

class orc_writer_options_builder#
#include <orc.hpp>

Builds settings to use for write_orc().

Public Functions

orc_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline orc_writer_options_builder(sink_info const &sink, table_view const &table)#

Constructor from sink and table.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

inline orc_writer_options_builder &compression(compression_type comp)#

Sets compression type.

Parameters:

comp – The compression type to use

Returns:

this for chaining

inline orc_writer_options_builder &enable_statistics(statistics_freq val)#

Choose granularity of column statistics to be written.

The granularity can be set to:

  • cudf::io::STATISTICS_NONE: No statistics are collected.

  • cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.

  • cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.

Parameters:

val – Level of statistics collection

Returns:

this for chaining

inline orc_writer_options_builder &stripe_size_bytes(size_t val)#

Sets the maximum stripe size, in bytes.

Parameters:

val – maximum stripe size

Returns:

this for chaining

inline orc_writer_options_builder &stripe_size_rows(size_type val)#

Sets the maximum number of rows in output stripes.

Parameters:

val – maximum number of rows

Returns:

this for chaining

inline orc_writer_options_builder &row_index_stride(size_type val)#

Sets the row index stride.

Parameters:

val – new row index stride

Returns:

this for chaining

inline orc_writer_options_builder &table(table_view tbl)#

Sets table to be written to output.

Parameters:

tbl – Table for the output

Returns:

this for chaining

inline orc_writer_options_builder &metadata(table_input_metadata meta)#

Sets associated metadata.

Parameters:

meta – Associated metadata

Returns:

this for chaining

inline orc_writer_options_builder &key_value_metadata(std::map<std::string, std::string> metadata)#

Sets Key-Value footer metadata.

Parameters:

metadata – Key-Value footer metadata

Returns:

this for chaining

inline orc_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be filled once writer is done

Returns:

this for chaining

inline orc_writer_options_builder &enable_dictionary_sort(bool val)#

Sets whether string dictionaries should be sorted.

Parameters:

val – Boolean value to enable/disable

Returns:

this for chaining

inline operator orc_writer_options&&()#

move orc_writer_options member once it’s built.

inline orc_writer_options &&build()#

move orc_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built orc_writer_options object’s r-value reference

class chunked_orc_writer_options#
#include <orc.hpp>

Settings to use for write_orc_chunked().

Public Functions

explicit chunked_orc_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink info.

Returns:

Sink info

inline compression_type get_compression() const#

Returns compression type.

Returns:

Compression type

inline statistics_freq get_statistics_freq() const#

Returns granularity of statistics collection.

Returns:

Granularity of statistics collection

inline auto get_stripe_size_bytes() const#

Returns maximum stripe size, in bytes.

Returns:

Maximum stripe size, in bytes

inline auto get_stripe_size_rows() const#

Returns maximum stripe size, in rows.

Returns:

Maximum stripe size, in rows

inline auto get_row_index_stride() const#

Returns the row index stride.

Returns:

Row index stride

inline auto const &get_metadata() const#

Returns associated metadata.

Returns:

Associated metadata

inline std::map<std::string, std::string> const &get_key_value_metadata() const#

Returns Key-Value footer metadata information.

Returns:

Key-Value footer metadata information

inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#

Returns a shared pointer to the user-provided compression statistics.

Returns:

Compression statistics

inline bool get_enable_dictionary_sort() const#

Returns whether string dictionaries should be sorted.

Returns:

true if string dictionaries should be sorted

inline void set_compression(compression_type comp)#

Sets compression type.

Parameters:

comp – The compression type to use

inline void enable_statistics(statistics_freq val)#

Choose granularity of statistics collection.

The granularity can be set to:

  • cudf::io::STATISTICS_NONE: No statistics are collected.

  • cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.

  • cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.

Parameters:

val – Frequency of statistics collection

inline void set_stripe_size_bytes(size_t size_bytes)#

Sets the maximum stripe size, in bytes.

Parameters:

size_bytes – Maximum stripe size, in bytes to be set

Throws:

cudf::logic_error – if a value below the minimal stripe size is passed

inline void set_stripe_size_rows(size_type size_rows)#

Sets the maximum stripe size, in rows.

If the stripe size is smaller than the row group size, the row group size will be reduced to match the stripe size.

Parameters:

size_rows – Maximum stripe size, in rows to be set

Throws:

cudf::logic_error – if a value below the minimal number of rows in a stripe is passed

inline void set_row_index_stride(size_type stride)#

Sets the row index stride.

Rounded down to a multiple of 8.

Parameters:

stride – Row index stride to be set

Throws:

cudf::logic_error – if a value below the minimal number of rows in a row group is passed

inline void metadata(table_input_metadata meta)#

Sets associated metadata.

Parameters:

meta – Associated metadata

inline void set_key_value_metadata(std::map<std::string, std::string> metadata)#

Sets Key-Value footer metadata.

Parameters:

metadata – Key-Value footer metadata

inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be updated after writing

inline void set_enable_dictionary_sort(bool val)#

Sets whether string dictionaries should be sorted.

Parameters:

val – Boolean value to enable/disable

Public Static Functions

static chunked_orc_writer_options_builder builder(sink_info const &sink)#

Create builder to create chunked_orc_writer_options.

Parameters:

sink – The sink used for writer output

Returns:

Builder to build chunked_orc_writer_options

class chunked_orc_writer_options_builder#
#include <orc.hpp>

Builds settings to use for write_orc_chunked().

Public Functions

chunked_orc_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline explicit chunked_orc_writer_options_builder(sink_info const &sink)#

Constructor from sink.

Parameters:

sink – The sink used for writer output

inline chunked_orc_writer_options_builder &compression(compression_type comp)#

Sets compression type.

Parameters:

comp – The compression type to use

Returns:

this for chaining

inline chunked_orc_writer_options_builder &enable_statistics(statistics_freq val)#

Choose granularity of statistics collection.

The granularity can be set to:

  • cudf::io::STATISTICS_NONE: No statistics are collected.

  • cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.

  • cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.

Parameters:

val – Frequency of statistics collection

Returns:

this for chaining

inline chunked_orc_writer_options_builder &stripe_size_bytes(size_t val)#

Sets the maximum stripe size, in bytes.

Parameters:

val – maximum stripe size

Returns:

this for chaining

inline chunked_orc_writer_options_builder &stripe_size_rows(size_type val)#

Sets the maximum number of rows in output stripes.

Parameters:

val – maximum number of rows

Returns:

this for chaining

inline chunked_orc_writer_options_builder &row_index_stride(size_type val)#

Sets the row index stride.

Parameters:

val – new row index stride

Returns:

this for chaining

inline chunked_orc_writer_options_builder &metadata(table_input_metadata meta)#

Sets associated metadata.

Parameters:

meta – Associated metadata

Returns:

this for chaining

inline chunked_orc_writer_options_builder &key_value_metadata(std::map<std::string, std::string> metadata)#

Sets Key-Value footer metadata.

Parameters:

metadata – Key-Value footer metadata

Returns:

this for chaining

inline chunked_orc_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be filled once writer is done

Returns:

this for chaining

inline chunked_orc_writer_options_builder &enable_dictionary_sort(bool val)#

Sets whether string dictionaries should be sorted.

Parameters:

val – Boolean value to enable/disable

Returns:

this for chaining

inline operator chunked_orc_writer_options&&()#

move chunked_orc_writer_options member once it’s built.

inline chunked_orc_writer_options &&build()#

move chunked_orc_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built chunked_orc_writer_options object’s r-value reference

class orc_chunked_writer#
#include <orc.hpp>

Chunked orc writer class writes an ORC file in a chunked/stream form.

The intent of the write_orc_chunked_ path is to allow writing of an arbitrarily large / arbitrary number of rows to an ORC file in multiple passes.

The following code snippet demonstrates how to write a single ORC file containing one logical table by writing a series of individual cudf::tables.

 ...
 std::string filepath = "dataset.orc";
 cudf::io::chunked_orc_writer_options options =
   cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info(filepath));
 ...
 orc_chunked_writer writer(options);
 writer.write(table0);
 writer.write(table1);
   ...
 writer.close();

Public Functions

orc_chunked_writer() = default#

Default constructor, this should never be used. This is added just to satisfy cython.

orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#

Constructor with chunked writer options.

Parameters:
  • options – [in] Options used to write table

  • stream – [in] CUDA stream used for device memory operations and kernel launches

orc_chunked_writer &write(table_view const &table)#

Writes table to output.

Parameters:

table – [in] Table that needs to be written

Returns:

returns reference of the class object

void close()#

Finishes the chunked/streamed write process.

Public Members

std::unique_ptr<orc::detail::writer> writer#

Unique pointer to impl writer class.

struct sorting_column#
#include <parquet.hpp>

Struct used to describe column sorting metadata.

Public Members

int column_idx = {}#

leaf column index within the row group

bool is_descending = {false}#

true if sort order is descending

bool is_nulls_first = {true}#

true if nulls come before non-null values

class parquet_writer_options#
#include <parquet.hpp>

Settings for write_parquet().

Public Functions

parquet_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink info.

Returns:

Sink info

inline compression_type get_compression() const#

Returns compression format used.

Returns:

Compression format

inline statistics_freq get_stats_level() const#

Returns level of statistics requested in output file.

Returns:

level of statistics requested in output file

inline table_view get_table() const#

Returns table_view.

Returns:

Table view

inline std::vector<partition_info> const &get_partitions() const#

Returns partitions.

Returns:

Partitions

inline auto const &get_metadata() const#

Returns associated metadata.

Returns:

Associated metadata

inline std::vector<std::map<std::string, std::string>> const &get_key_value_metadata() const#

Returns Key-Value footer metadata information.

Returns:

Key-Value footer metadata information

inline bool is_enabled_int96_timestamps() const#

Returns true if timestamps will be written as INT96.

Returns:

true if timestamps will be written as INT96

inline auto is_enabled_utc_timestamps() const#

Returns true if timestamps will be written as UTC.

Returns:

true if timestamps will be written as UTC

inline std::vector<std::string> const &get_column_chunks_file_paths() const#

Returns Column chunks file paths to be set in the raw output metadata.

Returns:

Column chunks file paths to be set in the raw output metadata

inline auto get_row_group_size_bytes() const#

Returns maximum row group size, in bytes.

Returns:

Maximum row group size, in bytes

inline auto get_row_group_size_rows() const#

Returns maximum row group size, in rows.

Returns:

Maximum row group size, in rows

inline auto get_max_page_size_bytes() const#

Returns the maximum uncompressed page size, in bytes.

If set larger than the row group size, then this will return the row group size.

Returns:

Maximum uncompressed page size, in bytes

inline auto get_max_page_size_rows() const#

Returns maximum page size, in rows.

If set larger than the row group size, then this will return the row group size.

Returns:

Maximum page size, in rows

inline auto get_column_index_truncate_length() const#

Returns maximum length of min or max values in column index, in bytes.

Returns:

length min/max will be truncated to

inline dictionary_policy get_dictionary_policy() const#

Returns policy for dictionary use.

Returns:

policy for dictionary use

inline auto get_max_dictionary_size() const#

Returns maximum dictionary size, in bytes.

Returns:

Maximum dictionary size, in bytes.

inline auto get_max_page_fragment_size() const#

Returns maximum page fragment size, in rows.

Returns:

Maximum page fragment size, in rows.

inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#

Returns a shared pointer to the user-provided compression statistics.

Returns:

Compression statistics

inline auto is_enabled_write_v2_headers() const#

Returns true if V2 page headers should be written.

Returns:

true if V2 page headers should be written.

inline auto const &get_sorting_columns() const#

Returns the sorting_columns.

Returns:

Column sort order metadata

void set_partitions(std::vector<partition_info> partitions)#

Sets partitions.

Parameters:

partitions – Partitions of input table in {start_row, num_rows} pairs. If specified, must be same size as number of sinks in sink_info

inline void set_metadata(table_input_metadata metadata)#

Sets metadata.

Parameters:

metadata – Associated metadata

void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#

Sets Key-Value footer metadata.

Parameters:

metadata – Key-Value footer metadata

inline void set_stats_level(statistics_freq sf)#

Sets the level of statistics.

Parameters:

sf – Level of statistics requested in the output file

inline void set_compression(compression_type compression)#

Sets compression type.

Parameters:

compression – The compression type to use

inline void enable_int96_timestamps(bool req)#

Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.

Parameters:

req – Boolean value to enable/disable writing of INT96 timestamps

inline void enable_utc_timestamps(bool val)#

Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.

Parameters:

val – Boolean value to enable/disable writing of timestamps as UTC.

void set_column_chunks_file_paths(std::vector<std::string> file_paths)#

Sets column chunks file path to be set in the raw output metadata.

Parameters:

file_paths – Vector of Strings which indicates file path. Must be same size as number of data sinks in sink info

void set_row_group_size_bytes(size_t size_bytes)#

Sets the maximum row group size, in bytes.

Parameters:

size_bytes – Maximum row group size, in bytes to set

void set_row_group_size_rows(size_type size_rows)#

Sets the maximum row group size, in rows.

Parameters:

size_rows – Maximum row group size, in rows to set

void set_max_page_size_bytes(size_t size_bytes)#

Sets the maximum uncompressed page size, in bytes.

Parameters:

size_bytes – Maximum uncompressed page size, in bytes to set

void set_max_page_size_rows(size_type size_rows)#

Sets the maximum page size, in rows.

Parameters:

size_rows – Maximum page size, in rows to set

void set_column_index_truncate_length(int32_t size_bytes)#

Sets the maximum length of min or max values in column index, in bytes.

Parameters:

size_bytes – length min/max will be truncated to

void set_dictionary_policy(dictionary_policy policy)#

Sets the policy for dictionary use.

Parameters:

policy – Policy for dictionary use

void set_max_dictionary_size(size_t size_bytes)#

Sets the maximum dictionary size, in bytes.

Parameters:

size_bytes – Maximum dictionary size, in bytes

void set_max_page_fragment_size(size_type size_rows)#

Sets the maximum page fragment size, in rows.

Parameters:

size_rows – Maximum page fragment size, in rows.

inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be updated after writing

inline void enable_write_v2_headers(bool val)#

Sets preference for V2 page headers. Write V2 page headers if set to true.

Parameters:

val – Boolean value to enable/disable writing of V2 page headers.

inline void set_sorting_columns(std::vector<sorting_column> sorting_columns)#

Sets sorting columns.

Parameters:

sorting_columns – Column sort order metadata

Public Static Functions

static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)#

Create builder to create parquet_writer_options.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

Returns:

Builder to build parquet_writer_options

static parquet_writer_options_builder builder()#

Create builder to create parquet_writer_options.

Returns:

parquet_writer_options_builder

class parquet_writer_options_builder#
#include <parquet.hpp>

Class to build parquet_writer_options.

Public Functions

explicit parquet_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline explicit parquet_writer_options_builder(sink_info const &sink, table_view const &table)#

Constructor from sink and table.

Parameters:
  • sink – The sink used for writer output

  • table – Table to be written to output

parquet_writer_options_builder &partitions(std::vector<partition_info> partitions)#

Sets partitions in parquet_writer_options.

Parameters:

partitions – Partitions of input table in {start_row, num_rows} pairs. If specified, must be same size as number of sinks in sink_info

Returns:

this for chaining

inline parquet_writer_options_builder &metadata(table_input_metadata metadata)#

Sets metadata in parquet_writer_options.

Parameters:

metadata – Associated metadata

Returns:

this for chaining

parquet_writer_options_builder &key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#

Sets Key-Value footer metadata in parquet_writer_options.

Parameters:

metadata – Key-Value footer metadata

Returns:

this for chaining

inline parquet_writer_options_builder &stats_level(statistics_freq sf)#

Sets the level of statistics in parquet_writer_options.

Parameters:

sf – Level of statistics requested in the output file

Returns:

this for chaining

inline parquet_writer_options_builder &compression(compression_type compression)#

Sets compression type in parquet_writer_options.

Parameters:

compression – The compression type to use

Returns:

this for chaining

parquet_writer_options_builder &column_chunks_file_paths(std::vector<std::string> file_paths)#

Sets column chunks file path to be set in the raw output metadata.

Parameters:

file_paths – Vector of Strings which indicates file path. Must be same size as number of data sinks

Returns:

this for chaining

inline parquet_writer_options_builder &row_group_size_bytes(size_t val)#

Sets the maximum row group size, in bytes.

Parameters:

val – maximum row group size

Returns:

this for chaining

inline parquet_writer_options_builder &row_group_size_rows(size_type val)#

Sets the maximum number of rows in output row groups.

Parameters:

val – maximum number of rows

Returns:

this for chaining

inline parquet_writer_options_builder &max_page_size_bytes(size_t val)#

Sets the maximum uncompressed page size, in bytes.

Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, and will be adjusted to match if it is.

Parameters:

val – maximum page size

Returns:

this for chaining

inline parquet_writer_options_builder &max_page_size_rows(size_type val)#

Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. Cannot be larger than the row group size in rows, and will be adjusted to match if it is.

Parameters:

val – maximum rows per page

Returns:

this for chaining

inline parquet_writer_options_builder &column_index_truncate_length(int32_t val)#

Sets the desired maximum size in bytes for min and max values in the column index.

Values exceeding this limit will be truncated, but modified such that they will still be valid lower and upper bounds. This only applies to variable length types, such as string. Maximum values will not be truncated if there is no suitable truncation that results in a valid upper bound.

Default value is 64.

Parameters:

val – length min/max will be truncated to, with 0 indicating no truncation

Returns:

this for chaining

parquet_writer_options_builder &dictionary_policy(enum dictionary_policy val)#

Sets the policy for dictionary use.

Certain compression algorithms (e.g. Zstandard) have limits on how large a buffer can be compressed. In some circumstances, the dictionary can grow beyond this limit, which will prevent the column from being compressed. This setting controls how the writer should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable dictionary encoding for columns where the dictionary exceeds the limit. A setting of dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in the disabling of compression for columns that would otherwise be compressed.

The default value is dictionary_policy::ADAPTIVE.

Parameters:

val – policy for dictionary use

Returns:

this for chaining

parquet_writer_options_builder &max_dictionary_size(size_t val)#

Sets the maximum dictionary size, in bytes.

Disables dictionary encoding for any column chunk where the dictionary will exceed this limit. Only used when the dictionary_policy is set to ‘ADAPTIVE’.

Default value is 1048576 (1MiB).

Parameters:

val – maximum dictionary size

Returns:

this for chaining

parquet_writer_options_builder &max_page_fragment_size(size_type val)#

Sets the maximum page fragment size, in rows.

Files with nested schemas or very long strings may need a page fragment size smaller than the default value of 5000 to ensure a single fragment will not exceed the desired maximum page size in bytes.

Parameters:

val – maximum page fragment size

Returns:

this for chaining

inline parquet_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be filled once writer is done

Returns:

this for chaining

inline parquet_writer_options_builder &int96_timestamps(bool enabled)#

Sets whether int96 timestamps are written or not in parquet_writer_options.

Parameters:

enabled – Boolean value to enable/disable int96 timestamps

Returns:

this for chaining

inline parquet_writer_options_builder &utc_timestamps(bool enabled)#

Set to true if timestamps are to be written as UTC.

Parameters:

enabled – Boolean value to enable/disable writing of timestamps as UTC.

Returns:

this for chaining

parquet_writer_options_builder &write_v2_headers(bool enabled)#

Set to true if V2 page headers are to be written.

Parameters:

enabled – Boolean value to enable/disable writing of V2 page headers.

Returns:

this for chaining

parquet_writer_options_builder &sorting_columns(std::vector<sorting_column> sorting_columns)#

Sets column sorting metadata in parquet_writer_options.

Parameters:

sorting_columns – Column sort order metadata

Returns:

this for chaining

inline operator parquet_writer_options&&()#

move parquet_writer_options member once it’s built.

inline parquet_writer_options &&build()#

move parquet_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built parquet_writer_options object’s r-value reference

class chunked_parquet_writer_options#
#include <parquet.hpp>

Settings for write_parquet_chunked().

Public Functions

chunked_parquet_writer_options() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline sink_info const &get_sink() const#

Returns sink info.

Returns:

Sink info

inline compression_type get_compression() const#

Returns compression format used.

Returns:

Compression format

inline statistics_freq get_stats_level() const#

Returns level of statistics requested in output file.

Returns:

Level of statistics requested in output file

inline auto const &get_metadata() const#

Returns metadata information.

Returns:

Metadata information

inline std::vector<std::map<std::string, std::string>> const &get_key_value_metadata() const#

Returns Key-Value footer metadata information.

Returns:

Key-Value footer metadata information

inline bool is_enabled_int96_timestamps() const#

Returns true if timestamps will be written as INT96.

Returns:

true if timestamps will be written as INT96

inline auto is_enabled_utc_timestamps() const#

Returns true if timestamps will be written as UTC.

Returns:

true if timestamps will be written as UTC

inline auto get_row_group_size_bytes() const#

Returns maximum row group size, in bytes.

Returns:

Maximum row group size, in bytes

inline auto get_row_group_size_rows() const#

Returns maximum row group size, in rows.

Returns:

Maximum row group size, in rows

inline auto get_max_page_size_bytes() const#

Returns maximum uncompressed page size, in bytes.

If set larger than the row group size, then this will return the row group size.

Returns:

Maximum uncompressed page size, in bytes

inline auto get_max_page_size_rows() const#

Returns maximum page size, in rows.

If set larger than the row group size, then this will return the row group size.

Returns:

Maximum page size, in rows

inline auto get_column_index_truncate_length() const#

Returns maximum length of min or max values in column index, in bytes.

Returns:

length min/max will be truncated to

inline dictionary_policy get_dictionary_policy() const#

Returns policy for dictionary use.

Returns:

policy for dictionary use

inline auto get_max_dictionary_size() const#

Returns maximum dictionary size, in bytes.

Returns:

Maximum dictionary size, in bytes.

inline auto get_max_page_fragment_size() const#

Returns maximum page fragment size, in rows.

Returns:

Maximum page fragment size, in rows.

inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#

Returns a shared pointer to the user-provided compression statistics.

Returns:

Compression statistics

inline auto is_enabled_write_v2_headers() const#

Returns true if V2 page headers should be written.

Returns:

true if V2 page headers should be written.

inline auto const &get_sorting_columns() const#

Returns the sorting_columns.

Returns:

Column sort order metadata

inline void set_metadata(table_input_metadata metadata)#

Sets metadata.

Parameters:

metadata – Associated metadata

void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#

Sets Key-Value footer metadata.

Parameters:

metadata – Key-Value footer metadata

inline void set_stats_level(statistics_freq sf)#

Sets the level of statistics in chunked_parquet_writer_options.

Parameters:

sf – Level of statistics requested in the output file

inline void set_compression(compression_type compression)#

Sets compression type.

Parameters:

compression – The compression type to use

inline void enable_int96_timestamps(bool req)#

Sets timestamp writing preferences.

INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.

Parameters:

req – Boolean value to enable/disable writing of INT96 timestamps

inline void enable_utc_timestamps(bool val)#

Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.

Parameters:

val – Boolean value to enable/disable writing of timestamps as UTC.

void set_row_group_size_bytes(size_t size_bytes)#

Sets the maximum row group size, in bytes.

Parameters:

size_bytes – Maximum row group size, in bytes to set

void set_row_group_size_rows(size_type size_rows)#

Sets the maximum row group size, in rows.

Parameters:

size_rows – The maximum row group size, in rows to set

void set_max_page_size_bytes(size_t size_bytes)#

Sets the maximum uncompressed page size, in bytes.

Parameters:

size_bytes – Maximum uncompressed page size, in bytes to set

void set_max_page_size_rows(size_type size_rows)#

Sets the maximum page size, in rows.

Parameters:

size_rows – The maximum page size, in rows to set

void set_column_index_truncate_length(int32_t size_bytes)#

Sets the maximum length of min or max values in column index, in bytes.

Parameters:

size_bytes – length min/max will be truncated to

void set_dictionary_policy(dictionary_policy policy)#

Sets the policy for dictionary use.

Parameters:

policy – Policy for dictionary use

void set_max_dictionary_size(size_t size_bytes)#

Sets the maximum dictionary size, in bytes.

Parameters:

size_bytes – Maximum dictionary size, in bytes

void set_max_page_fragment_size(size_type size_rows)#

Sets the maximum page fragment size, in rows.

Parameters:

size_rows – Maximum page fragment size, in rows.

inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be updated after writing

inline void enable_write_v2_headers(bool val)#

Sets preference for V2 page headers. Write V2 page headers if set to true.

Parameters:

val – Boolean value to enable/disable writing of V2 page headers.

inline void set_sorting_columns(std::vector<sorting_column> sorting_columns)#

Sets sorting columns.

Parameters:

sorting_columns – Column sort order metadata

Public Static Functions

static chunked_parquet_writer_options_builder builder(sink_info const &sink)#

creates builder to build chunked_parquet_writer_options.

Parameters:

sink – sink to use for writer output

Returns:

Builder to build chunked_parquet_writer_options

class chunked_parquet_writer_options_builder#
#include <parquet.hpp>

Builds options for chunked_parquet_writer_options.

Public Functions

chunked_parquet_writer_options_builder() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline chunked_parquet_writer_options_builder(sink_info const &sink)#

Constructor from sink.

Parameters:

sink – The sink used for writer output

inline chunked_parquet_writer_options_builder &metadata(table_input_metadata metadata)#

Sets metadata to chunked_parquet_writer_options.

Parameters:

metadata – Associated metadata

Returns:

this for chaining

chunked_parquet_writer_options_builder &key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#

Sets Key-Value footer metadata in chunked_parquet_writer_options.

Parameters:

metadata – Key-Value footer metadata

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &stats_level(statistics_freq sf)#

Sets the level of statistics in chunked_parquet_writer_options.

Parameters:

sf – Level of statistics requested in the output file

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &compression(compression_type compression)#

Sets compression type to chunked_parquet_writer_options.

Parameters:

compression – The compression type to use

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &int96_timestamps(bool enabled)#

Set to true if timestamps should be written as int96 types instead of int64 types. Even though int96 is deprecated and is not an internal type for cudf, it needs to be written for backwards compatibility reasons.

Parameters:

enabled – Boolean value to enable/disable int96 timestamps

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &utc_timestamps(bool enabled)#

Set to true if timestamps are to be written as UTC.

Parameters:

enabled – Boolean value to enable/disable writing of timestamps as UTC.

Returns:

this for chaining

chunked_parquet_writer_options_builder &write_v2_headers(bool enabled)#

Set to true if V2 page headers are to be written.

Parameters:

enabled – Boolean value to enable/disable writing of V2 page headers.

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &row_group_size_bytes(size_t val)#

Sets the maximum row group size, in bytes.

Parameters:

val – maximum row group size

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &row_group_size_rows(size_type val)#

Sets the maximum number of rows in output row groups.

Parameters:

val – maximum number of rows

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &max_page_size_bytes(size_t val)#

Sets the maximum uncompressed page size, in bytes.

Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, and will be adjusted to match if it is.

Parameters:

val – maximum page size

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &max_page_size_rows(size_type val)#

Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. Cannot be larger than the row group size in rows, and will be adjusted to match if it is.

Parameters:

val – maximum rows per page

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &column_index_truncate_length(int32_t val)#

Sets the desired maximum size in bytes for min and max values in the column index.

Values exceeding this limit will be truncated, but modified such that they will still be valid lower and upper bounds. This only applies to variable length types, such as string. Maximum values will not be truncated if there is no suitable truncation that results in a valid upper bound.

Default value is 64.

Parameters:

val – length min/max will be truncated to, with 0 indicating no truncation

Returns:

this for chaining

chunked_parquet_writer_options_builder &dictionary_policy(enum dictionary_policy val)#

Sets the policy for dictionary use.

Certain compression algorithms (e.g. Zstandard) have limits on how large a buffer can be compressed. In some circumstances, the dictionary can grow beyond this limit, which will prevent the column from being compressed. This setting controls how the writer should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable dictionary encoding for columns where the dictionary exceeds the limit. A setting of dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in the disabling of compression for columns that would otherwise be compressed.

The default value is dictionary_policy::ADAPTIVE.

Parameters:

val – policy for dictionary use

Returns:

this for chaining

chunked_parquet_writer_options_builder &max_dictionary_size(size_t val)#

Sets the maximum dictionary size, in bytes.

Disables dictionary encoding for any column chunk where the dictionary will exceed this limit. Only used when the dictionary_policy is set to ‘ADAPTIVE’.

Default value is 1048576 (1MiB).

Parameters:

val – maximum dictionary size

Returns:

this for chaining

chunked_parquet_writer_options_builder &max_page_fragment_size(size_type val)#

Sets the maximum page fragment size, in rows.

Files with nested schemas or very long strings may need a page fragment size smaller than the default value of 5000 to ensure a single fragment will not exceed the desired maximum page size in bytes.

Parameters:

val – maximum page fragment size

Returns:

this for chaining

inline chunked_parquet_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#

Sets the pointer to the output compression statistics.

Parameters:

comp_stats – Pointer to compression statistics to be filled once writer is done

Returns:

this for chaining

chunked_parquet_writer_options_builder &sorting_columns(std::vector<sorting_column> sorting_columns)#

Sets column sorting metadata to chunked_parquet_writer_options.

Parameters:

sorting_columns – Column sort order metadata

Returns:

this for chaining

inline operator chunked_parquet_writer_options&&()#

move chunked_parquet_writer_options member once it’s built.

inline chunked_parquet_writer_options &&build()#

move chunked_parquet_writer_options member once it’s built.

This has been added since Cython does not support overloading of conversion operators.

Returns:

Built chunked_parquet_writer_options object’s r-value reference

class parquet_chunked_writer#
#include <parquet.hpp>

chunked parquet writer class to handle options and write tables in chunks.

The intent of the parquet_chunked_writer is to allow writing of an arbitrarily large / arbitrary number of rows to a parquet file in multiple passes.

The following code snippet demonstrates how to write a single parquet file containing one logical table by writing a series of individual cudf::tables.

auto destination = cudf::io::sink_info("dataset.parquet");
auto options = cudf::io::chunked_parquet_writer_options::builder(destination);
auto writer  = cudf::io::parquet_chunked_writer(options);

writer.write(table0);
writer.write(table1);
writer.close();

Public Functions

parquet_chunked_writer() = default#

Default constructor, this should never be used. This is added just to satisfy cython.

parquet_chunked_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#

Constructor with chunked writer options.

Parameters:
  • options – Options used to write the table

  • stream – CUDA stream used for device memory operations and kernel launches

parquet_chunked_writer &write(table_view const &table, std::vector<partition_info> const &partitions = {})#

Writes table to output.

Parameters:
  • table – Table that needs to be written

  • partitions – Optional partitions to divide the table into. If specified, must be same size as number of sinks.

Throws:
  • cudf::logic_error – If the number of partitions is not the same as number of sinks

  • rmm::bad_alloc – if there is insufficient space for temporary buffers

Returns:

Reference to this writer object

std::unique_ptr<std::vector<uint8_t>> close(std::vector<std::string> const &column_chunks_file_paths = {})#

Finishes the chunked/streamed write process.

Parameters:

column_chunks_file_paths – Column chunks file path to be set in the raw output metadata

Returns:

A parquet-compatible blob that contains the data for all rowgroups in the list only if column_chunks_file_paths is provided, else null.

Public Members

std::unique_ptr<parquet::detail::writer> writer#

Unique pointer to impl writer class.