Io Types#
- group io_types
Typedefs
-
using no_statistics = std::monostate#
Monostate type alias for the statistics variant.
-
using date_statistics = minmax_statistics<int32_t>#
Statistics for date(time) columns.
-
using binary_statistics = sum_statistics<int64_t>#
Statistics for binary columns.
The
sum
is the total number of bytes across all elements.
-
using statistics_type = std::variant<no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics>#
Variant type for ORC type-specific column statistics.
The variant can hold any of the supported column statistics types.
Enums
-
enum CompressionKind#
Identifies a compression algorithm.
Values:
-
enumerator NONE#
-
enumerator ZLIB#
-
enumerator SNAPPY#
-
enumerator LZO#
-
enumerator LZ4#
-
enumerator ZSTD#
-
enumerator NONE#
-
enum TypeKind#
Identifies a data type in an orc file.
Values:
-
enumerator INVALID_TYPE_KIND#
-
enumerator BOOLEAN#
-
enumerator BYTE#
-
enumerator SHORT#
-
enumerator INT#
-
enumerator LONG#
-
enumerator FLOAT#
-
enumerator DOUBLE#
-
enumerator STRING#
-
enumerator BINARY#
-
enumerator TIMESTAMP#
-
enumerator LIST#
-
enumerator MAP#
-
enumerator STRUCT#
-
enumerator UNION#
-
enumerator DECIMAL#
-
enumerator DATE#
-
enumerator VARCHAR#
-
enumerator CHAR#
-
enumerator INVALID_TYPE_KIND#
-
enum StreamKind#
Identifies the type of data stream.
Values:
-
enumerator INVALID_STREAM_KIND#
-
enumerator PRESENT#
-
enumerator DATA#
-
enumerator LENGTH#
-
enumerator DICTIONARY_DATA#
-
enumerator DICTIONARY_COUNT#
-
enumerator SECONDARY#
-
enumerator ROW_INDEX#
-
enumerator BLOOM_FILTER#
-
enumerator BLOOM_FILTER_UTF8#
-
enumerator INVALID_STREAM_KIND#
-
enum ColumnEncodingKind#
Identifies the encoding of columns.
Values:
-
enumerator INVALID_ENCODING_KIND#
-
enumerator DIRECT#
-
enumerator DICTIONARY#
-
enumerator DIRECT_V2#
-
enumerator DICTIONARY_V2#
-
enumerator INVALID_ENCODING_KIND#
-
enum ProtofType#
Identifies the type of encoding in a protocol buffer.
Values:
-
enumerator VARINT#
-
enumerator FIXED64#
-
enumerator FIXEDLEN#
-
enumerator START_GROUP#
-
enumerator END_GROUP#
-
enumerator FIXED32#
-
enumerator INVALID_6#
-
enumerator INVALID_7#
-
enumerator VARINT#
-
enum class compression_type : int32_t#
Compression algorithms.
Values:
-
enumerator NONE#
No compression.
-
enumerator AUTO#
Automatically detect or select compression format.
-
enumerator SNAPPY#
Snappy format, using byte-oriented LZ77.
-
enumerator GZIP#
GZIP format, using DEFLATE algorithm.
-
enumerator BZIP2#
BZIP2 format, using Burrows-Wheeler transform.
-
enumerator BROTLI#
BROTLI format, using LZ77 + Huffman + 2nd order context modeling.
-
enumerator ZIP#
ZIP format, using DEFLATE algorithm.
-
enumerator XZ#
XZ format, using LZMA(2) algorithm.
-
enumerator ZLIB#
ZLIB format, using DEFLATE algorithm.
-
enumerator LZ4#
LZ4 format, using LZ77.
-
enumerator LZO#
Lempel–Ziv–Oberhumer format.
-
enumerator ZSTD#
Zstandard format.
-
enumerator NONE#
-
enum class io_type : int32_t#
Data source or destination types.
Values:
-
enumerator FILEPATH#
Input/output is a file path.
-
enumerator HOST_BUFFER#
Input/output is a buffer in host memory.
-
enumerator DEVICE_BUFFER#
Input/output is a buffer in device memory.
-
enumerator VOID#
Input/output is nothing. No work is done. Useful for benchmarking.
-
enumerator USER_IMPLEMENTED#
Input/output is handled by a custom user class.
-
enumerator FILEPATH#
-
enum class quote_style : int32_t#
Behavior when handling quotations in field data.
Values:
-
enumerator MINIMAL#
Quote only fields which contain special characters.
-
enumerator ALL#
Quote all fields.
-
enumerator NONNUMERIC#
Quote all non-numeric fields.
-
enumerator NONE#
Never quote fields; disable quotation parsing.
-
enumerator MINIMAL#
-
enum statistics_freq#
Column statistics granularity type for parquet/orc writers.
Values:
-
enumerator STATISTICS_NONE#
No column statistics.
-
enumerator STATISTICS_ROWGROUP#
Per-Rowgroup column statistics.
-
enumerator STATISTICS_PAGE#
Per-page column statistics.
-
enumerator STATISTICS_COLUMN#
Full column and offset indices. Implies STATISTICS_ROWGROUP.
-
enumerator STATISTICS_NONE#
-
enum class column_encoding : int32_t#
Valid encodings for use with
column_in_metadata::set_encoding()
Values:
-
enumerator USE_DEFAULT#
No encoding has been requested, use default encoding.
-
enumerator DICTIONARY#
Use dictionary encoding.
-
enumerator PLAIN#
Use plain encoding.
-
enumerator DELTA_BINARY_PACKED#
Use DELTA_BINARY_PACKED encoding (only valid for integer columns)
-
enumerator DELTA_LENGTH_BYTE_ARRAY#
Use DELTA_LENGTH_BYTE_ARRAY encoding (only valid for BYTE_ARRAY columns)
-
enumerator DELTA_BYTE_ARRAY#
Use DELTA_BYTE_ARRAY encoding (only valid for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns)
-
enumerator BYTE_STREAM_SPLIT#
Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)
-
enumerator DIRECT#
Use DIRECT encoding.
-
enumerator DIRECT_V2#
Use DIRECT_V2 encoding.
-
enumerator DICTIONARY_V2#
Use DICTIONARY_V2 encoding.
-
enumerator USE_DEFAULT#
Functions
-
template<typename T>
inline constexpr auto is_byte_like_type()#
Returns
true
if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
- Template Parameters:
T – The representation type
- Returns:
true
if the type is considered a byte-like type
-
struct raw_orc_statistics#
- #include <orc_metadata.hpp>
Holds column names and buffers containing raw file-level and stripe-level statistics.
The buffers can be parsed using a Protobuf parser. Alternatively, use
parsed_orc_statistics
to get the statistics parsed into a libcudf representation.
The
column_names
and
file_stats
members contain one element per column. The
stripes_stats
member contains one element per stripe, where each element contains column statistics for each column.
-
template<typename T>
struct minmax_statistics#
- #include <orc_metadata.hpp>
Base class for column statistics that include optional minimum and maximum.
Includes accessors for the minimum and maximum values.
-
template<typename T>
struct sum_statistics#
- #include <orc_metadata.hpp>
Base class for column statistics that include an optional sum.
Includes accessors for the sum value.
-
struct integer_statistics : public cudf::io::minmax_statistics<int64_t>, public cudf::io::sum_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for integral columns.
-
struct double_statistics : public cudf::io::minmax_statistics<double>, public cudf::io::sum_statistics<double>#
- #include <orc_metadata.hpp>
Statistics for floating point columns.
-
struct string_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for string columns.
The
minimum
and
maximum
are the first and last elements, respectively, in lexicographical order. The
sum
is the total length of elements in the column. Note: According to the ORC specification, the sum should be signed, but pyarrow uses an unsigned value.
-
struct bucket_statistics#
- #include <orc_metadata.hpp>
Statistics for boolean columns.
The
count
array contains the count of
true
values.
Public Members
-
std::vector<uint64_t> count#
count of
true
values
-
std::vector<uint64_t> count#
-
struct decimal_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<std::string>#
- #include <orc_metadata.hpp>
Statistics for decimal columns.
-
struct timestamp_statistics : public cudf::io::minmax_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for timestamp columns.
The
minimum
and
maximum
are the min/max elements in the column, as the number of milliseconds since the UNIX epoch. The
minimum_utc
and
maximum_utc
are the same values adjusted to UTC.
-
struct column_statistics#
- #include <orc_metadata.hpp>
Contains per-column ORC statistics.
All columns can have the
number_of_values
statistics. Depending on the data type, a column can have additional statistics, accessible through the
type_specific_stats
accessor.
Public Functions
Public Members
-
std::optional<uint64_t> number_of_values#
number of statistics
-
std::optional<bool> has_null#
column has any nulls
-
statistics_type type_specific_stats#
type-specific statistics
-
std::optional<uint64_t> number_of_values#
-
struct parsed_orc_statistics#
- #include <orc_metadata.hpp>
Holds column names and parsed file-level and stripe-level statistics.
The
column_names
and
file_stats
members contain one element per column. The
stripes_stats
member contains one element per stripe, where each element contains column statistics for each column.
Public Members
-
std::vector<std::string> column_names#
column names
-
std::vector<column_statistics> file_stats#
file-level statistics
-
std::vector<std::vector<column_statistics>> stripes_stats#
stripe-level statistics
-
std::vector<std::string> column_names#
-
struct orc_column_schema#
- #include <orc_metadata.hpp>
Schema of an ORC column, including the nested columns.
Public Functions
-
inline orc_column_schema(std::string_view name, orc::TypeKind type, std::vector<orc_column_schema> children)#
constructor
- Parameters:
name – column name
type – ORC type
children – child columns (empty for non-nested types)
-
inline auto name() const#
Returns ORC column name; can be empty.
- Returns:
Column name
-
inline auto type_kind() const#
Returns ORC type of the column.
- Returns:
Column ORC type
-
inline auto const &children() const &#
Returns schemas of all child columns.
- Returns:
Children schemas
-
inline auto children() &&#
Returns schemas of all child columns.
- Returns:
Children schemas Children array is moved out of the object (rvalues only).
-
inline auto const &child(int idx) const &#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema
-
inline auto child(int idx) &&#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema Child is moved out of the object (rvalues only).
-
inline auto num_children() const#
Returns the number of child columns.
- Returns:
Children count
-
inline orc_column_schema(std::string_view name, orc::TypeKind type, std::vector<orc_column_schema> children)#
-
struct orc_schema#
- #include <orc_metadata.hpp>
Schema of an ORC file.
Public Functions
-
inline orc_schema(orc_column_schema root_column_schema)#
constructor
- Parameters:
root_column_schema – root column
-
inline auto const &root() const &#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema
-
inline auto root() &&#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema Root column schema is moved out of the object (rvalues only).
-
inline orc_schema(orc_column_schema root_column_schema)#
-
class orc_metadata#
- #include <orc_metadata.hpp>
Information about content of an ORC file.
Public Functions
-
inline orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)#
constructor
- Parameters:
schema – ORC schema
num_rows – number of rows
num_stripes – number of stripes
-
inline auto const &schema() const#
Returns the ORC schema.
- Returns:
ORC schema Number of rows in the root column; can vary for nested columns
-
inline auto num_rows() const#
Returns the number of rows of the root column.
If a file contains list columns, nested columns can have a different number of rows.
- Returns:
Number of rows
-
inline auto num_stripes() const#
Returns the number of stripes in the file.
- Returns:
Number of stripes
-
inline orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)#
-
struct parquet_column_schema#
- #include <parquet_metadata.hpp>
Schema of a parquet column, including the nested columns.
Public Functions
-
explicit parquet_column_schema() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline parquet_column_schema(std::string_view name, parquet::TypeKind type, std::vector<parquet_column_schema> children)#
constructor
- Parameters:
name – column name
type – parquet type
children – child columns (empty for non-nested types)
-
inline auto name() const#
Returns parquet column name; can be empty.
- Returns:
Column name
-
inline auto type_kind() const#
Returns parquet type of the column.
- Returns:
Column parquet type
-
inline auto const &children() const &#
Returns schemas of all child columns.
- Returns:
Children schemas
-
inline auto children() &&#
Returns schemas of all child columns.
- Returns:
Children schemas Children array is moved out of the object (rvalues only).
-
inline auto const &child(int idx) const &#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema
-
inline auto child(int idx) &&#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema Child is moved out of the object (rvalues only).
-
inline auto num_children() const#
Returns the number of child columns.
- Returns:
Children count
-
explicit parquet_column_schema() = default#
-
struct parquet_schema#
- #include <parquet_metadata.hpp>
Schema of a parquet file.
Public Functions
-
explicit parquet_schema() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline parquet_schema(parquet_column_schema root_column_schema)#
constructor
- Parameters:
root_column_schema – root column
-
inline auto const &root() const &#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema
-
inline auto root() &&#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema Root column schema is moved out of the object (rvalues only).
-
explicit parquet_schema() = default#
-
class parquet_metadata#
- #include <parquet_metadata.hpp>
Information about content of a parquet file.
Public Types
-
using key_value_metadata = std::unordered_map<std::string, std::string>#
Key-value metadata in the file footer.
-
using row_group_metadata = std::unordered_map<std::string, int64_t>#
row group metadata from each RowGroup element.
Public Functions
-
explicit parquet_metadata() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, key_value_metadata file_metadata, std::vector<row_group_metadata> rg_metadata)#
constructor
- Parameters:
schema – parquet schema
num_rows – number of rows
num_rowgroups – number of row groups
file_metadata – key-value metadata in the file footer
rg_metadata – vector of maps containing metadata for each row group
-
inline auto const &schema() const#
Returns the parquet schema.
- Returns:
parquet schema
-
inline auto num_rows() const#
Returns the number of rows of the root column.
If a file contains list columns, nested columns can have a different number of rows.
- Returns:
Number of rows
-
inline auto num_rowgroups() const#
Returns the number of rowgroups in the file.
- Returns:
Number of row groups
-
inline auto const &metadata() const#
Returns the Key value metadata in the file footer.
- Returns:
Key value metadata as a map
-
inline auto const &rowgroup_metadata() const#
Returns the row group metadata in the file footer.
- Returns:
vector of row group metadata as maps
-
using key_value_metadata = std::unordered_map<std::string, std::string>#
-
class writer_compression_statistics#
- #include <types.hpp>
Statistics about compression performed by a writer.
Public Functions
-
writer_compression_statistics() = default#
Default constructor.
-
inline writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)#
Constructor with initial values.
- Parameters:
num_compressed_bytes – The number of bytes that were successfully compressed
num_failed_bytes – The number of bytes that failed to compress
num_skipped_bytes – The number of bytes that were skipped during compression
num_compressed_output_bytes – The number of bytes in the compressed output
-
inline writer_compression_statistics &operator+=(writer_compression_statistics const &other) noexcept#
Adds the values from another
writer_compression_statistics
object.
- Parameters:
other – The other writer_compression_statistics object
- Returns:
writer_compression_statistics& Reference to this object
-
inline auto num_compressed_bytes() const noexcept#
Returns the number of bytes in blocks that were successfully compressed.
This is the number of bytes that were actually compressed, not the size of the compressed output.
- Returns:
size_t The number of bytes that were successfully compressed
-
inline auto num_failed_bytes() const noexcept#
Returns the number of bytes in blocks that failed to compress.
- Returns:
size_t The number of bytes that failed to compress
-
inline auto num_skipped_bytes() const noexcept#
Returns the number of bytes in blocks that were skipped during compression.
- Returns:
size_t The number of bytes that were skipped during compression
-
inline auto num_total_input_bytes() const noexcept#
Returns the total size of compression inputs.
- Returns:
size_t The total size of compression inputs
-
inline auto compression_ratio() const noexcept#
Returns the compression ratio for the successfully compressed blocks.
Returns nan if there were no successfully compressed blocks.
- Returns:
double The ratio between the size of the compression inputs and the size of the compressed output.
-
writer_compression_statistics() = default#
-
struct column_name_info#
- #include <types.hpp>
Detailed name (and optionally nullability) information for output columns.
The hierarchy of children matches the hierarchy of children in the output cudf columns.
Public Functions
-
inline column_name_info(std::string _name, std::optional<bool> _is_nullable = std::nullopt, std::optional<bool> _is_binary = std::nullopt)#
Construct a column name info with a name, optional nullability, and no children.
- Parameters:
_name – Column name
_is_nullable – True if column is nullable
_is_binary – True if column is binary data
-
inline bool operator==(column_name_info const &rhs) const#
Compares two column name info structs for equality.
- Parameters:
rhs – column name info struct to compare against
- Returns:
boolean indicating if this and rhs are equal
Public Members
-
std::string name#
Column name.
-
std::optional<bool> is_nullable#
Column nullability.
-
std::optional<bool> is_binary#
Column is binary (i.e. not a list)
-
std::optional<int32_t> type_length#
Byte width of data (for fixed length data)
-
std::vector<column_name_info> children#
Child column names.
-
inline column_name_info(std::string _name, std::optional<bool> _is_nullable = std::nullopt, std::optional<bool> _is_binary = std::nullopt)#
-
struct table_metadata#
- #include <types.hpp>
Table metadata returned by IO readers.
Public Members
-
std::vector<column_name_info> schema_info#
Detailed name information for the entire output hierarchy.
-
std::vector<size_t> num_rows_per_source#
Number of rows read from each data source. Currently only computed for Parquet readers if no AST filters are being used. Empty vector otherwise.
-
std::map<std::string, std::string> user_data#
Format-dependent metadata of the first input file as key-value pairs (deprecated)
-
std::vector<std::unordered_map<std::string, std::string>> per_file_user_data#
Per file format-dependent metadata as key-value pairs.
-
std::vector<column_name_info> schema_info#
-
struct table_with_metadata#
- #include <types.hpp>
Table with table metadata used by io readers to return the metadata by value.
-
struct host_buffer#
- #include <types.hpp>
Non-owning view of a host memory buffer.
- Deprecated:
Since 23.04
Used to describe buffer input in
source_info
objects.
Public Functions
-
inline host_buffer(char const *data, size_t size)#
Construct a new host buffer object.
- Parameters:
data – Pointer to the buffer
size – Size of the buffer
-
struct source_info#
- #include <types.hpp>
Source information for read interfaces.
Public Functions
-
inline explicit source_info(std::vector<std::string> const &file_paths)#
Construct a new source info object for multiple files.
- Parameters:
file_paths – Input file paths
-
inline explicit source_info(std::string const &file_path)#
Construct a new source info object for a single file.
- Parameters:
file_path – Single input file
-
inline explicit source_info(std::vector<host_buffer> const &host_buffers)#
Construct a new source info object for multiple buffers in host memory.
- Deprecated:
Since 23.04
- Parameters:
host_buffers – Input buffers in host memory
-
inline explicit source_info(char const *host_data, size_t size)#
Construct a new source info object for a single buffer.
- Deprecated:
Since 23.04
- Parameters:
host_data – Input buffer in host memory
size – Size of the buffer
-
template<typename T>
inline explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)#
Construct a new source info object for multiple buffers in host memory.
- Parameters:
host_buffers – Input buffers in host memory
-
template<typename T>
inline explicit source_info(cudf::host_span<T> host_data)#
Construct a new source info object for a single buffer.
- Parameters:
host_data – Input buffer in host memory
-
inline explicit source_info(cudf::host_span<cudf::device_span<std::byte const>> device_buffers)#
Construct a new source info object for multiple buffers in device memory.
- Parameters:
device_buffers – Input buffers in device memory
-
inline explicit source_info(cudf::device_span<std::byte const> d_buffer)#
Construct a new source info object from a device buffer.
- Parameters:
d_buffer – Input buffer in device memory
-
inline explicit source_info(std::vector<cudf::io::datasource*> const &sources)#
Construct a new source info object for multiple user-implemented sources.
- Parameters:
sources – User-implemented input sources
-
inline explicit source_info(cudf::io::datasource *source)#
Construct a new source info object for a single user-implemented source.
- Parameters:
source – Single user-implemented Input source
-
inline auto type() const#
Get the type of the input.
- Returns:
The type of the input
-
inline auto const &filepaths() const#
Get the filepaths of the input.
- Returns:
The filepaths of the input
-
inline auto const &host_buffers() const#
Get the host buffers of the input.
- Returns:
The host buffers of the input
-
inline auto const &device_buffers() const#
Get the device buffers of the input.
- Returns:
The device buffers of the input
-
inline auto const &user_sources() const#
Get the user sources of the input.
- Returns:
The user sources of the input
-
inline explicit source_info(std::vector<std::string> const &file_paths)#
-
struct sink_info#
- #include <types.hpp>
Destination information for write interfaces.
Public Functions
-
inline sink_info(size_t num_sinks)#
Construct a new sink info object.
- Parameters:
num_sinks – Number of sinks
-
inline explicit sink_info(std::vector<std::string> const &file_paths)#
Construct a new sink info object for multiple files.
- Parameters:
file_paths – Output file paths
-
inline explicit sink_info(std::string const &file_path)#
Construct a new sink info object for a single file.
- Parameters:
file_path – Single output file path
-
inline explicit sink_info(std::vector<std::vector<char>*> const &buffers)#
Construct a new sink info object for multiple host buffers.
- Parameters:
buffers – Output host buffers
-
inline explicit sink_info(std::vector<char> *buffer)#
Construct a new sink info object for a single host buffer.
- Parameters:
buffer – Single output host buffer
-
inline explicit sink_info(std::vector<cudf::io::data_sink*> const &user_sinks)#
Construct a new sink info object for multiple user-implemented sinks.
- Parameters:
user_sinks – Output user-implemented sinks
-
inline explicit sink_info(class cudf::io::data_sink *user_sink)#
Construct a new sink info object for a single user-implemented sink.
- Parameters:
user_sink – Single output user-implemented sink
-
inline auto type() const#
Get the type of the output.
- Returns:
The type of the output
-
inline auto num_sinks() const#
Get the number of sinks.
- Returns:
The number of sinks
-
inline auto const &filepaths() const#
Get the filepaths of the output.
- Returns:
The filepaths of the output
-
inline auto const &buffers() const#
Get the host buffers of the output.
- Returns:
The host buffers of the output
-
inline auto const &user_sinks() const#
Get the user sinks of the output.
- Returns:
The user sinks of the output
-
inline sink_info(size_t num_sinks)#
-
class column_in_metadata#
- #include <types.hpp>
Metadata for a column.
Public Functions
-
inline column_in_metadata(std::string_view name)#
Construct a new column in metadata object.
- Parameters:
name – Column name
-
inline column_in_metadata &add_child(column_in_metadata const &child)#
Add the children metadata of this column.
- Parameters:
child – The children metadata of this column to add
- Returns:
this for chaining
-
inline column_in_metadata &set_name(std::string const &name) noexcept#
Set the name of this column.
- Parameters:
name – Name of the column
- Returns:
this for chaining
-
inline column_in_metadata &set_nullability(bool nullable) noexcept#
Set the nullability of this column.
- Parameters:
nullable – Whether this column is nullable
- Returns:
this for chaining
-
inline column_in_metadata &set_list_column_as_map() noexcept#
Specify that this list column should be encoded as a map in the written file.
The column must have the structure list<struct<key, value>>. This option is invalid otherwise
- Returns:
this for chaining
-
inline column_in_metadata &set_int96_timestamps(bool req) noexcept#
Specifies whether this timestamp column should be encoded using the deprecated int96 physical type. Only valid for the following column types: timestamp_s, timestamp_ms, timestamp_us, timestamp_ns.
- Parameters:
req – True = use int96 physical type. False = use int64 physical type
- Returns:
this for chaining
-
inline column_in_metadata &set_decimal_precision(uint8_t precision) noexcept#
Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.
- Parameters:
precision – The integer precision to set for this decimal column
- Returns:
this for chaining
-
inline column_in_metadata &set_type_length(int32_t length) noexcept#
Set the data length of the column. Only valid if this column is a fixed-length byte array.
- Parameters:
length – The data length to set for this column
- Returns:
this for chaining
-
inline column_in_metadata &set_parquet_field_id(int32_t field_id) noexcept#
Set the parquet field id of this column.
- Parameters:
field_id – The parquet field id to set
- Returns:
this for chaining
-
inline column_in_metadata &set_output_as_binary(bool binary) noexcept#
Specifies whether this column should be written as binary or string data. Only valid for the following column types: string.
- Parameters:
binary – True = use binary data type. False = use string data type
- Returns:
this for chaining
-
inline column_in_metadata &set_skip_compression(bool skip) noexcept#
Specifies whether this column should not be compressed regardless of the compression codec specified for the file.
- Parameters:
skip – If
true
do not compress this column
- Returns:
this for chaining
-
inline column_in_metadata &set_encoding(column_encoding encoding) noexcept#
Sets the encoding to use for this column.
This is just a request, and the encoder may still choose to use a different encoding depending on resource constraints. Use the values defined in the
column_encoding
enum.
- Parameters:
encoding – The encoding to use
- Returns:
this for chaining
-
inline column_in_metadata &child(size_type i) noexcept#
Get reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Reference to the child at index i
-
inline column_in_metadata const &child(size_type i) const noexcept#
Get const reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Const reference to the child at index i
-
inline std::string get_name() const noexcept#
Get the name of this column.
- Returns:
The name of this column
-
inline bool is_nullability_defined() const noexcept#
Get whether nullability has been explicitly set for this column.
- Returns:
Boolean indicating whether nullability has been explicitly set for this column
-
inline bool nullable() const#
Gets the explicitly set nullability for this column.
- Throws:
std::bad_optional_access – If nullability is not explicitly defined for this column. Check using
is_nullability_defined()
first.
- Returns:
Boolean indicating whether this column is nullable
-
inline bool is_map() const noexcept#
If this is the metadata of a list column, returns whether it is to be encoded as a map.
- Returns:
Boolean indicating whether this column is to be encoded as a map
-
inline bool is_enabled_int96_timestamps() const noexcept#
Get whether to encode this timestamp column using deprecated int96 physical type.
- Returns:
Boolean indicating whether to encode this timestamp column using deprecated int96 physical type
-
inline bool is_decimal_precision_set() const noexcept#
Get whether precision has been set for this decimal column.
- Returns:
Boolean indicating whether precision has been set for this decimal column
-
inline uint8_t get_decimal_precision() const#
Get the decimal precision that was set for this column.
- Throws:
std::bad_optional_access – If decimal precision was not set for this column. Check using
is_decimal_precision_set()
first.
- Returns:
The decimal precision that was set for this column
-
inline bool is_type_length_set() const noexcept#
Get whether type length has been set for this column.
- Returns:
Boolean indicating whether type length has been set for this column
-
inline uint8_t get_type_length() const#
Get the type length that was set for this column.
- Throws:
std::bad_optional_access – If type length was not set for this column. Check using
is_type_length_set()
first.
- Returns:
The type length that was set for this column
-
inline bool is_parquet_field_id_set() const noexcept#
Get whether parquet field id has been set for this column.
- Returns:
Boolean indicating whether parquet field id has been set for this column
-
inline int32_t get_parquet_field_id() const#
Get the parquet field id that was set for this column.
- Throws:
std::bad_optional_access – If parquet field id was not set for this column. Check using
is_parquet_field_id_set()
first.
- Returns:
The parquet field id that was set for this column
-
inline size_type num_children() const noexcept#
Get the number of children of this column.
- Returns:
The number of children of this column
-
inline bool is_enabled_output_as_binary() const noexcept#
Get whether to encode this column as binary or string data.
- Returns:
Boolean indicating whether to encode this column as binary data
-
inline bool is_enabled_skip_compression() const noexcept#
Get whether to skip compressing this column.
- Returns:
Boolean indicating whether to skip compression of this column
-
inline column_encoding get_encoding() const#
Get the encoding that was set for this column.
- Returns:
The encoding that was set for this column
-
inline column_in_metadata(std::string_view name)#
-
class table_input_metadata#
- #include <types.hpp>
Metadata for a table.
Public Functions
-
explicit table_input_metadata(table_view const &table)#
Construct a new table_input_metadata from a table_view.
The constructed table_input_metadata has the same structure as the passed table_view
- Parameters:
table – The table_view to construct metadata for
-
explicit table_input_metadata(table_metadata const &metadata)#
Construct a new table_input_metadata from a table_metadata object.
The constructed table_input_metadata has the same structure, column names and nullability as the passed table_metadata.
- Parameters:
metadata – The table_metadata to construct table_input_metadata for
Public Members
-
std::vector<column_in_metadata> column_metadata#
List of column metadata.
-
explicit table_input_metadata(table_view const &table)#
-
struct partition_info#
- #include <types.hpp>
Information used while writing partitioned datasets.
This information defines the slice of an input table to write to file. In partitioned dataset writing, one partition_info struct defines one partition and corresponds to one output file
Public Functions
-
inline partition_info(size_type start_row, size_type num_rows)#
Construct a new partition_info.
- Parameters:
start_row – The start row of the partition
num_rows – The number of rows in the partition
-
inline partition_info(size_type start_row, size_type num_rows)#
-
class reader_column_schema#
- #include <types.hpp>
schema element for reader
Public Functions
-
inline reader_column_schema(size_type number_of_children)#
Construct a new reader column schema object.
- Parameters:
number_of_children – number of child schema objects to default construct
-
inline reader_column_schema(host_span<reader_column_schema> const &child_span)#
Construct a new reader column schema object with a span defining the children.
- Parameters:
child_span – span of child schema objects
-
inline reader_column_schema &add_child(reader_column_schema const &child)#
Add the children metadata of this column.
- Parameters:
child – The children metadata of this column to add
- Returns:
this for chaining
-
inline reader_column_schema &child(size_type i)#
Get reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Reference to the child at index i
-
inline reader_column_schema const &child(size_type i) const#
Get const reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Const reference to the child at index i
-
inline reader_column_schema &set_convert_binary_to_strings(bool convert_to_string)#
Specifies whether this column should be written as binary or string data. Only valid for the following column types: string, list<int8>
- Parameters:
convert_to_string – True = convert binary to strings. False = return binary
- Returns:
this for chaining
-
inline reader_column_schema &set_type_length(int32_t type_length)#
Sets the length of fixed length data.
- Parameters:
type_length – Size of the data type in bytes
- Returns:
this for chaining
-
inline bool is_enabled_convert_binary_to_strings() const#
Get whether to encode this column as binary or string data.
- Returns:
Boolean indicating whether to encode this column as binary data
-
inline int32_t get_type_length() const#
Get the length in bytes of this fixed length data.
- Returns:
The length in bytes of the data type
-
inline size_t get_num_children() const#
Get the number of child objects.
- Returns:
number of children
-
inline reader_column_schema(size_type number_of_children)#
-
using no_statistics = std::monostate#