Io Types#

group IO Types

Typedefs

using no_statistics = std::monostate#

Monostate type alias for the statistics variant.

using date_statistics = minmax_statistics<int32_t>#

Statistics for date(time) columns.

using binary_statistics = sum_statistics<int64_t>#

Statistics for binary columns.

The sum is the total number of bytes across all elements.

using statistics_type = std::variant<no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics>#

Variant type for ORC type-specific column statistics.

The variant can hold any of the supported column statistics types.

Enums

enum CompressionKind#

Identifies a compression algorithm.

Values:

enumerator NONE#
enumerator ZLIB#
enumerator SNAPPY#
enumerator LZO#
enumerator LZ4#
enumerator ZSTD#
enum TypeKind#

Identifies a data type in an orc file.

Values:

enumerator INVALID_TYPE_KIND#
enumerator BOOLEAN#
enumerator BYTE#
enumerator SHORT#
enumerator INT#
enumerator LONG#
enumerator FLOAT#
enumerator DOUBLE#
enumerator STRING#
enumerator BINARY#
enumerator TIMESTAMP#
enumerator LIST#
enumerator MAP#
enumerator STRUCT#
enumerator UNION#
enumerator DECIMAL#
enumerator DATE#
enumerator VARCHAR#
enumerator CHAR#
enum StreamKind#

Identifies the type of data stream.

Values:

enumerator INVALID_STREAM_KIND#
enumerator PRESENT#
enumerator DATA#
enumerator LENGTH#
enumerator DICTIONARY_DATA#
enumerator DICTIONARY_COUNT#
enumerator SECONDARY#
enumerator ROW_INDEX#
enumerator BLOOM_FILTER#
enumerator BLOOM_FILTER_UTF8#
enum ColumnEncodingKind#

Identifies the encoding of columns.

Values:

enumerator INVALID_ENCODING_KIND#
enumerator DIRECT#
enumerator DICTIONARY#
enumerator DIRECT_V2#
enumerator DICTIONARY_V2#
enum ProtofType#

Identifies the type of encoding in a protocol buffer.

Values:

enumerator VARINT#
enumerator FIXED64#
enumerator FIXEDLEN#
enumerator START_GROUP#
enumerator END_GROUP#
enumerator FIXED32#
enumerator INVALID_6#
enumerator INVALID_7#
enum class Type : int8_t#

Basic data types in Parquet, determines how data is physically stored.

Values:

enumerator UNDEFINED#
enumerator BOOLEAN#
enumerator INT32#
enumerator INT64#
enumerator INT96#
enumerator FLOAT#
enumerator DOUBLE#
enumerator BYTE_ARRAY#
enumerator FIXED_LEN_BYTE_ARRAY#
enum class ConvertedType : int8_t#

High-level data types in Parquet, determines how data is logically interpreted.

Values:

enumerator UNKNOWN#
enumerator UTF8#
enumerator MAP#
enumerator MAP_KEY_VALUE#
enumerator LIST#
enumerator ENUM#
enumerator DECIMAL#
enumerator DATE#
enumerator TIME_MILLIS#
enumerator TIME_MICROS#
enumerator TIMESTAMP_MILLIS#
enumerator TIMESTAMP_MICROS#
enumerator UINT_8#
enumerator UINT_16#
enumerator UINT_32#
enumerator UINT_64#
enumerator INT_8#
enumerator INT_16#
enumerator INT_32#
enumerator INT_64#
enumerator JSON#
enumerator BSON#
enumerator INTERVAL#
enumerator NA#
enum class Encoding : uint8_t#

Encoding types for the actual data stream.

Values:

enumerator PLAIN#
enumerator GROUP_VAR_INT#
enumerator PLAIN_DICTIONARY#
enumerator RLE#
enumerator BIT_PACKED#
enumerator DELTA_BINARY_PACKED#
enumerator DELTA_LENGTH_BYTE_ARRAY#
enumerator DELTA_BYTE_ARRAY#
enumerator RLE_DICTIONARY#
enumerator BYTE_STREAM_SPLIT#
enumerator NUM_ENCODINGS#
enum class Compression : uint8_t#

Compression codec used for compressed data pages.

Values:

enumerator UNCOMPRESSED#
enumerator SNAPPY#
enumerator GZIP#
enumerator LZO#
enumerator BROTLI#
enumerator LZ4#
enumerator ZSTD#
enumerator LZ4_RAW#
enum class FieldRepetitionType : int8_t#

Repetition type of a schema field (required, optional, or repeated).

Values:

enumerator UNSPECIFIED#
enumerator REQUIRED#
enumerator OPTIONAL#
enumerator REPEATED#
enum class PageType : uint8_t#

Types of pages.

Values:

enumerator DATA_PAGE#
enumerator INDEX_PAGE#
enumerator DICTIONARY_PAGE#
enumerator DATA_PAGE_V2#
enum class BoundaryOrder : uint8_t#

Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so, in which direction.

Values:

enumerator UNORDERED#
enumerator ASCENDING#
enumerator DESCENDING#
enum class FieldType : uint8_t#

Thrift compact protocol struct field types.

Values:

enumerator BOOLEAN_TRUE#
enumerator BOOLEAN_FALSE#
enumerator I8#
enumerator I16#
enumerator I32#
enumerator I64#
enumerator DOUBLE#
enumerator BINARY#
enumerator LIST#
enumerator SET#
enumerator MAP#
enumerator STRUCT#
enumerator UUID#
enum class compression_type : int32_t#

Compression algorithms.

Values:

enumerator NONE#

No compression.

enumerator AUTO#

Automatically detect or select compression format.

enumerator SNAPPY#

Snappy format, using byte-oriented LZ77.

enumerator GZIP#

GZIP format, using DEFLATE algorithm.

enumerator BZIP2#

BZIP2 format, using Burrows-Wheeler transform.

enumerator BROTLI#

BROTLI format, using LZ77 + Huffman + 2nd order context modeling.

enumerator ZIP#

ZIP format, using DEFLATE algorithm.

enumerator XZ#

XZ format, using LZMA(2) algorithm.

enumerator ZLIB#

ZLIB format, using DEFLATE algorithm.

enumerator LZ4#

LZ4 format, using LZ77.

enumerator LZO#

Lempel–Ziv–Oberhumer format.

enumerator ZSTD#

Zstandard format.

enum class io_type : int32_t#

Data source or destination types.

Values:

enumerator FILEPATH#

Input/output is a file path.

enumerator HOST_BUFFER#

Input/output is a buffer in host memory.

enumerator DEVICE_BUFFER#

Input/output is a buffer in device memory.

enumerator VOID#

Input/output is nothing. No work is done. Useful for benchmarking.

enumerator USER_IMPLEMENTED#

Input/output is handled by a custom user class.

enum class quote_style : int32_t#

Behavior when handling quotations in field data.

Values:

enumerator MINIMAL#

Quote only fields which contain special characters.

enumerator ALL#

Quote all fields.

enumerator NONNUMERIC#

Quote all non-numeric fields.

enumerator NONE#

Never quote fields; disable quotation parsing.

enum statistics_freq#

Column statistics granularity type for parquet/orc writers.

Values:

enumerator STATISTICS_NONE#

No column statistics.

enumerator STATISTICS_ROWGROUP#

Per-Rowgroup column statistics.

enumerator STATISTICS_PAGE#

Per-page column statistics.

enumerator STATISTICS_COLUMN#

Full column and offset indices. Implies STATISTICS_ROWGROUP.

enum class column_encoding : int32_t#

Valid encodings for use with column_in_metadata::set_encoding()

Values:

enumerator USE_DEFAULT#

No encoding has been requested, use default encoding.

enumerator DICTIONARY#

Use dictionary encoding.

enumerator PLAIN#

Use plain encoding.

enumerator DELTA_BINARY_PACKED#

Use DELTA_BINARY_PACKED encoding (only valid for integer columns)

enumerator DELTA_LENGTH_BYTE_ARRAY#

Use DELTA_LENGTH_BYTE_ARRAY encoding (only valid for BYTE_ARRAY columns)

enumerator DELTA_BYTE_ARRAY#

Use DELTA_BYTE_ARRAY encoding (only valid for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns)

enumerator BYTE_STREAM_SPLIT#

Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)

enumerator DIRECT#

Use DIRECT encoding.

enumerator DIRECT_V2#

Use DIRECT_V2 encoding.

enumerator DICTIONARY_V2#

Use DICTIONARY_V2 encoding.

enum dictionary_policy#

Control use of dictionary encoding for parquet writer.

Values:

enumerator NEVER#

Never use dictionary encoding.

enumerator ADAPTIVE#

Use dictionary when it will not impact compression.

enumerator ALWAYS#

Use dictionary regardless of impact on compression.

Functions

template<typename T>
inline constexpr auto is_byte_like_type()#

Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.

Template Parameters:

T – The representation type

Returns:

true if the type is considered a byte-like type

struct raw_orc_statistics#
#include <orc_metadata.hpp>

Holds column names and buffers containing raw file-level and stripe-level statistics.

The buffers can be parsed using a Protobuf parser. Alternatively, use parsed_orc_statistics to get the statistics parsed into a libcudf representation.

The column_names and file_stats members contain one element per column. The stripes_stats contains one element per stripe, where each element contains column statistics for each column.

Public Members

std::vector<std::string> column_names#

Column names.

std::vector<std::string> file_stats#

File-level statistics for each column.

std::vector<std::vector<std::string>> stripes_stats#

Stripe-level statistics for each column.

template<typename T>
struct minmax_statistics#
#include <orc_metadata.hpp>

Base class for column statistics that include optional minimum and maximum.

Includes accessors for the minimum and maximum values.

Public Members

std::optional<T> minimum#

Minimum value.

std::optional<T> maximum#

Maximum value.

template<typename T>
struct sum_statistics#
#include <orc_metadata.hpp>

Base class for column statistics that include an optional sum.

Includes accessors for the sum value.

Public Members

std::optional<T> sum#

Sum of values in column.

struct integer_statistics : public cudf::io::minmax_statistics<int64_t>, public cudf::io::sum_statistics<int64_t>#
#include <orc_metadata.hpp>

Statistics for integral columns.

struct double_statistics : public cudf::io::minmax_statistics<double>, public cudf::io::sum_statistics<double>#
#include <orc_metadata.hpp>

Statistics for floating point columns.

struct string_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<int64_t>#
#include <orc_metadata.hpp>

Statistics for string columns.

The minimum and maximum are the first and last elements, respectively, in lexicographical order. The sum is the total length of elements in the column. Note: According to the ORC specs, the sum should be signed, but pyarrow uses an unsigned value.

struct bucket_statistics#
#include <orc_metadata.hpp>

Statistics for boolean columns.

The count array contains the count of true values.

Public Members

std::vector<uint64_t> count#

count of true values

struct decimal_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<std::string>#
#include <orc_metadata.hpp>

Statistics for decimal columns.

struct timestamp_statistics : public cudf::io::minmax_statistics<int64_t>#
#include <orc_metadata.hpp>

Statistics for timestamp columns.

The minimum and maximum are the min/max elements in the column, as the number of milliseconds since the UNIX epoch. The minimum_utc and maximum_utc are the same values adjusted to UTC.

Public Members

std::optional<int64_t> minimum_utc#

minimum in milliseconds

std::optional<int64_t> maximum_utc#

maximum in milliseconds

std::optional<uint32_t> minimum_nanos#

nanoseconds part of the minimum

std::optional<uint32_t> maximum_nanos#

nanoseconds part of the maximum

struct column_statistics#
#include <orc_metadata.hpp>

Contains per-column ORC statistics.

All columns can have the number_of_values statistics. Depending on the data type, a column can have additional statistics, accessible through type_specific_stats accessor.

Public Functions

column_statistics(orc::detail::column_statistics &&detail_statistics)#

Construct a new column statistics object.

Parameters:

detail_statistics – The statistics to initialize the object with

Public Members

std::optional<uint64_t> number_of_values#

number of values

std::optional<bool> has_null#

column has any nulls

statistics_type type_specific_stats#

type-specific statistics

struct parsed_orc_statistics#
#include <orc_metadata.hpp>

Holds column names and parsed file-level and stripe-level statistics.

The column_names and file_stats members contain one element per column. The stripes_stats member contains one element per stripe, where each element contains column statistics for each column.

Public Members

std::vector<std::string> column_names#

column names

std::vector<column_statistics> file_stats#

file-level statistics

std::vector<std::vector<column_statistics>> stripes_stats#

stripe-level statistics

struct orc_column_schema#
#include <orc_metadata.hpp>

Schema of an ORC column, including the nested columns.

Public Functions

inline orc_column_schema(std::string_view name, orc::TypeKind type, std::vector<orc_column_schema> children)#

constructor

Parameters:
  • name – column name

  • type – ORC type

  • children – child columns (empty for non-nested types)

inline auto name() const#

Returns ORC column name; can be empty.

Returns:

Column name

inline auto type_kind() const#

Returns ORC type of the column.

Returns:

Column ORC type

inline auto const &children() const &#

Returns schemas of all child columns.

Returns:

Children schemas

inline auto children() &&#

Returns schemas of all child columns.

Returns:

Children schemas. The children array is moved out of the object (rvalues only).

inline auto const &child(int idx) const &#

Returns schema of the child with the given index.

Parameters:

idx – child index

Returns:

Child schema

inline auto child(int idx) &&#

Returns schema of the child with the given index.

Parameters:

idx – child index

Returns:

Child schema. The child is moved out of the object (rvalues only).

inline auto num_children() const#

Returns the number of child columns.

Returns:

Children count

struct orc_schema#
#include <orc_metadata.hpp>

Schema of an ORC file.

Public Functions

inline orc_schema(orc_column_schema root_column_schema)#

constructor

Parameters:

root_column_schema – root column

inline auto const &root() const &#

Returns the schema of the struct column that contains all columns as fields.

Returns:

Root column schema

inline auto root() &&#

Returns the schema of the struct column that contains all columns as fields.

Returns:

Root column schema. The root column schema is moved out of the object (rvalues only).

class orc_metadata#
#include <orc_metadata.hpp>

Information about content of an ORC file.

Public Functions

inline orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)#

constructor

Parameters:
  • schema – ORC schema

  • num_rows – number of rows

  • num_stripes – number of stripes

inline auto const &schema() const#

Returns the ORC schema.

Returns:

ORC schema

inline auto num_rows() const#

Returns the number of rows of the root column.

If a file contains list columns, nested columns can have a different number of rows.

Returns:

Number of rows

inline auto num_stripes() const#

Returns the number of stripes in the file.

Returns:

Number of stripes

struct parquet_column_schema#
#include <parquet_metadata.hpp>

Schema of a parquet column, including the nested columns.

Public Functions

explicit parquet_column_schema() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline parquet_column_schema(std::string_view name, Type type, std::vector<parquet_column_schema> children)#

constructor

Parameters:
  • name – column name

  • type – parquet type

  • children – child columns (empty for non-nested types)

inline auto name() const#

Returns parquet column name; can be empty.

Returns:

Column name

inline auto type() const#

Returns parquet physical type of the column.

Returns:

Column parquet physical type

inline auto const &children() const &#

Returns schemas of all child columns.

Returns:

Children schemas

inline auto children() &&#

Returns schemas of all child columns.

Returns:

Children schemas. The children array is moved out of the object (rvalues only).

inline auto const &child(int idx) const &#

Returns schema of the child with the given index.

Parameters:

idx – child index

Returns:

Child schema

inline auto child(int idx) &&#

Returns schema of the child with the given index.

Parameters:

idx – child index

Returns:

Child schema. The child is moved out of the object (rvalues only).

inline auto num_children() const#

Returns the number of child columns.

Returns:

Children count

struct parquet_schema#
#include <parquet_metadata.hpp>

Schema of a parquet file.

Public Functions

explicit parquet_schema() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack

inline parquet_schema(parquet_column_schema root_column_schema)#

constructor

Parameters:

root_column_schema – root column

inline auto const &root() const &#

Returns the schema of the struct column that contains all columns as fields.

Returns:

Root column schema

inline auto root() &&#

Returns the schema of the struct column that contains all columns as fields.

Returns:

Root column schema. The root column schema is moved out of the object (rvalues only).

class parquet_metadata#
#include <parquet_metadata.hpp>

Information about content of a parquet file.

Public Types

using key_value_metadata = std::unordered_map<std::string, std::string>#

Key-value metadata in the file footer.

using row_group_metadata = std::unordered_map<std::string, int64_t>#

Row group metadata from each RowGroup element.

using column_chunk_metadata = std::unordered_map<std::string, std::vector<int64_t>>#

Column chunk metadata from each ColumnChunkMetaData element.

Public Functions

explicit parquet_metadata() = default#

Default constructor.

This has been added since Cython requires a default constructor to create objects on stack.

inline parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, std::vector<size_type> num_rowgroups_per_file, key_value_metadata file_metadata, std::vector<row_group_metadata> rg_metadata, column_chunk_metadata column_chunk_metadata)#

constructor

Parameters:
  • schema – parquet schema

  • num_rows – number of rows

  • num_rowgroups – total number of row groups

  • num_rowgroups_per_file – number of row groups per file

  • file_metadata – key-value metadata in the file footer

  • rg_metadata – vector of maps containing metadata for each row group

  • column_chunk_metadata – map of column names to vectors of total_uncompressed_size metadata from all their column chunks

inline auto const &schema() const#

Returns the parquet schema.

Returns:

parquet schema

inline auto num_rows() const#

Returns the number of rows of the root column.

If a file contains list columns, nested columns can have a different number of rows.

Returns:

Number of rows

inline auto num_rowgroups() const#

Returns the total number of rowgroups.

Returns:

Total number of row groups

inline auto const &num_rowgroups_per_file() const#

Returns the number of rowgroups in each file.

Returns:

Number of row groups per file

inline auto const &metadata() const#

Returns the key-value metadata in the file footer.

Returns:

Key value metadata as a map

inline auto const &rowgroup_metadata() const#

Returns the row group metadata in the file footer.

Returns:

Vector of row group metadata as maps

inline auto const &columnchunk_metadata() const#

Returns a map of column names to vectors of total_uncompressed_size metadata from all their column chunks.

Returns:

Map of column names to vectors of total_uncompressed_size metadata from all their column chunks

struct file_header_s#
#include <parquet_schema.hpp>

Struct that describes the Parquet file data header.

Public Members

uint32_t magic#

Parquet 4-byte magic number “PAR1”.

struct file_ender_s#
#include <parquet_schema.hpp>

Struct that describes the Parquet file data postscript.

Public Members

uint32_t footer_len#

Length of the footer.

uint32_t magic#

Parquet 4-byte magic number “PAR1”.

struct DecimalType#
#include <parquet_schema.hpp>

Struct that describes the decimal logical type annotation.

Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.

Public Members

int32_t scale = 0#

Scale must be zero or a positive integer less than or equal to the precision.

int32_t precision = 0#

Precision must be a non-zero positive integer.

struct TimeUnit#
#include <parquet_schema.hpp>

Time units for temporal logical types.

Public Types

enum Type#

Available time units.

Values:

enumerator UNDEFINED#
enumerator MILLIS#
enumerator MICROS#
enumerator NANOS#

Public Members

Type type#

Time unit type.

struct TimeType#
#include <parquet_schema.hpp>

Struct that describes the time logical type annotation.

Allowed for physical types: INT32 (millis), INT64 (micros, nanos)

Public Members

bool isAdjustedToUTC = true#

Defaults to true because the timestamps are implicitly in UTC. The writer option overrides this default.

TimeUnit unit = {TimeUnit::Type::MILLIS}#

Time unit type.

struct TimestampType#
#include <parquet_schema.hpp>

Struct that describes the timestamp logical type annotation.

Allowed for physical types: INT64

Public Members

bool isAdjustedToUTC = true#

Defaults to true because the timestamps are implicitly in UTC. The writer option overrides this default.

TimeUnit unit = {TimeUnit::Type::MILLIS}#

Timestamp’s time unit.

struct IntType#
#include <parquet_schema.hpp>

Struct that describes the integer logical type annotation.

Allowed for physical types: INT32, INT64

Public Members

int8_t bitWidth = 0#

bitWidth must be 8, 16, 32, or 64.

bool isSigned = false#

Whether the integer is signed.

struct LogicalType#
#include <parquet_schema.hpp>

Struct that describes the logical type annotation.

Public Types

enum Type#

Logical type annotations to replace ConvertedType.

Values:

enumerator UNDEFINED#
enumerator STRING#
enumerator MAP#
enumerator LIST#
enumerator ENUM#
enumerator DECIMAL#
enumerator DATE#
enumerator TIME#
enumerator TIMESTAMP#
enumerator INTEGER#
enumerator UNKNOWN#
enumerator JSON#
enumerator BSON#

Public Functions

inline LogicalType(Type tp = Type::UNDEFINED)#

Default constructor.

Parameters:

tp – Logical type

inline LogicalType(DecimalType &&dt)#

Constructor for Decimal logical type.

Parameters:

dt – Decimal type

inline LogicalType(TimeType &&tt)#

Constructor for Time logical type.

Parameters:

tt – Time type

inline LogicalType(TimestampType &&tst)#

Constructor for Timestamp logical type.

Parameters:

tst – Timestamp type

inline LogicalType(IntType &&it)#

Constructor for Integer logical type.

Parameters:

it – Integer type

inline constexpr bool is_time_millis() const#

Check if the time is in milliseconds.

Returns:

True if the time is in milliseconds, false otherwise

inline constexpr bool is_time_micros() const#

Check if the time is in microseconds.

Returns:

True if the time is in microseconds, false otherwise

inline constexpr bool is_time_nanos() const#

Check if the time is in nanoseconds.

Returns:

True if the time is in nanoseconds, false otherwise

inline constexpr bool is_timestamp_millis() const#

Check if the timestamp is in milliseconds.

Returns:

True if the timestamp is in milliseconds, false otherwise

inline constexpr bool is_timestamp_micros() const#

Check if the timestamp is in microseconds.

Returns:

True if the timestamp is in microseconds, false otherwise

inline constexpr bool is_timestamp_nanos() const#

Check if the timestamp is in nanoseconds.

Returns:

True if the timestamp is in nanoseconds, false otherwise

inline constexpr int8_t bit_width() const#

Get the bit width of the integer type.

Returns:

The bit width of the integer type, or -1 if the type is not an integer

inline constexpr bool is_signed() const#

Check if the integer is signed.

Returns:

True if the integer is signed, false otherwise

inline constexpr int32_t scale() const#

Get the scale of the decimal type.

Returns:

The scale of the decimal type, or -1 if the type is not a decimal

inline constexpr int32_t precision() const#

Get the precision of the decimal type.

Returns:

The precision of the decimal type, or -1 if the type is not a decimal

Public Members

Type type#

Logical type.

cuda::std::optional<DecimalType> decimal_type#

Decimal type.

cuda::std::optional<TimeType> time_type#

Time type.

cuda::std::optional<TimestampType> timestamp_type#

Timestamp type.

cuda::std::optional<IntType> int_type#

Integer type.

struct ColumnOrder#
#include <parquet_schema.hpp>

Union to specify the order used for the min_value and max_value fields for a column.

Public Types

enum Type#

Available column order types.

Values:

enumerator UNDEFINED#
enumerator TYPE_ORDER#

Public Members

Type type#

Column order type.

struct SchemaElement#
#include <parquet_schema.hpp>

Struct for describing an element/field in the Parquet format schema.

Parquet is a strongly-typed format so the file layout can be interpreted as a schema tree.

Public Functions

inline bool operator==(SchemaElement const &other) const#

Check if two schema elements are equal.

Parameters:

other – The other schema element to compare to

Returns:

True if the two schema elements are equal, false otherwise

inline bool is_stub() const#

Check if the schema element is a stub.

Returns:

True if the schema element is a stub, false otherwise

inline bool is_one_level_list(SchemaElement const &parent) const#

Check if the schema element is a one-level list.

apache/parquet-cpp One-level LIST encoding: Only allows required lists with required cells: repeated value_type name

Parameters:

parent – The parent schema element

Returns:

True if the schema element is a one-level list, false otherwise

inline bool is_list() const#

Check if the schema element is a list.

Returns:

True if the schema element is a list, false otherwise

inline bool is_struct() const#

Check if the schema element is a struct.

In parquet terms, a group is a level of nesting in the schema. A group can be a struct or a list.

Returns:

True if the schema element is a struct, false otherwise

Public Members

Type type = Type::UNDEFINED#

1: parquet physical type for output

int32_t type_length = 0#

2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types

FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED#

3: repetition of the field

std::string name = ""#

4: name of the field

int32_t num_children = 0#

5: nested fields

std::optional<ConvertedType> converted_type#

6: DEPRECATED: record the original type before conversion to parquet type

int32_t decimal_scale = 0#

7: DEPRECATED: record the scale for DECIMAL converted type

int32_t decimal_precision = 0#

8: DEPRECATED: record the precision for DECIMAL converted type

std::optional<int32_t> field_id#

9: save field_id from original schema

std::optional<LogicalType> logical_type#

10: replaces converted type

bool output_as_byte_array = false#

extra cudf specific fields

std::optional<type_id> arrow_type#

cudf type determined from arrow:schema

int max_definition_level = 0#

Maximum definition level.

int max_repetition_level = 0#

Maximum repetition level.

size_type parent_idx = 0#

Parent index.

std::vector<size_type> children_idx#

Children indices.

struct Statistics#
#include <parquet_schema.hpp>

Thrift-derived struct describing column chunk statistics.

Public Members

std::optional<std::vector<uint8_t>> max#

deprecated max value in signed comparison order

std::optional<std::vector<uint8_t>> min#

deprecated min value in signed comparison order

std::optional<int64_t> null_count#

count of null values in the column

std::optional<int64_t> distinct_count#

count of distinct values occurring

std::optional<std::vector<uint8_t>> max_value#

max value for column determined by ColumnOrder

std::optional<std::vector<uint8_t>> min_value#

min value for column determined by ColumnOrder

std::optional<bool> is_max_value_exact#

If true, max_value is the actual maximum value for a column.

std::optional<bool> is_min_value_exact#

If true, min_value is the actual minimum value for a column.

struct SizeStatistics#
#include <parquet_schema.hpp>

Thrift-derived struct containing statistics used to estimate page and column chunk sizes.

Public Members

std::optional<int64_t> unencoded_byte_array_data_bytes#

Number of variable-width bytes stored for the page/chunk. Should not be set for anything but the BYTE_ARRAY physical type.

std::optional<std::vector<int64_t>> repetition_level_histogram#

When present, there is expected to be one element corresponding to each repetition (i.e. size=max repetition_level+1) where each element represents the number of times the repetition level was observed in the data.

This value should not be written if max_repetition_level is 0.

std::optional<std::vector<int64_t>> definition_level_histogram#

Same as repetition_level_histogram except for definition levels.

This value should not be written if max_definition_level is 0 or 1.

struct PageLocation#
#include <parquet_schema.hpp>

Thrift-derived struct describing page location information stored in the offsets index.

Public Members

int64_t offset#

Offset of the page in the file.

int32_t compressed_page_size#

Compressed page size in bytes plus the header length.

int64_t first_row_index#

Index within the column chunk of the first row of the page. Reset to 0 at the beginning of each column chunk.

struct OffsetIndex#
#include <parquet_schema.hpp>

Thrift-derived struct describing the offset index.

Public Members

std::vector<PageLocation> page_locations#

Page locations.

std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes#

Per-page size info. See the description of the same field in SizeStatistics. Only present for columns with a BYTE_ARRAY physical type.

struct ColumnIndex#
#include <parquet_schema.hpp>

Thrift-derived struct describing the column index.

Public Members

std::vector<bool> null_pages#

Boolean used to determine if a page contains only null values.

std::vector<std::vector<uint8_t>> min_values#

Lower bound for values in each page.

std::vector<std::vector<uint8_t>> max_values#

Upper bound for values in each page.

BoundaryOrder boundary_order = BoundaryOrder::UNORDERED#

Indicates if min and max values are ordered.

std::optional<std::vector<int64_t>> null_counts#

Optional count of null values per page.

std::optional<std::vector<int64_t>> repetition_level_histogram#

Repetition level histogram for the column chunk.

std::optional<std::vector<int64_t>> definition_level_histogram#

Definition level histogram for the column chunk.

struct PageEncodingStats#
#include <parquet_schema.hpp>

Thrift-derived struct describing page encoding statistics.

Public Members

PageType page_type#

The page type (data/dic/…)

Encoding encoding#

Encoding of the page.

int32_t count#

Number of pages of this type with this encoding.

struct SortingColumn#
#include <parquet_schema.hpp>

Thrift-derived struct describing column sort order.

Public Members

int32_t column_idx#

The column index (in this row group)

bool descending#

If true, indicates this column is sorted in descending order.

bool nulls_first#

If true, nulls will come before non-null values.

struct ColumnChunkMetaData#
#include <parquet_schema.hpp>

Thrift-derived struct describing a column chunk.

Public Members

Type type = Type::BOOLEAN#

Type of this column.

std::vector<Encoding> encodings#

Set of all encodings used for this column. The purpose is to validate whether we can decode those pages.

std::vector<std::string> path_in_schema#

Path in schema.

Compression codec = Compression::UNCOMPRESSED#

Compression codec.

int64_t num_values = 0#

Number of values in this column.

int64_t total_uncompressed_size = 0#

Total byte size of all uncompressed pages in this column chunk (including the headers)

int64_t total_compressed_size = 0#

Total byte size of all compressed pages in this column chunk (including the headers)

int64_t data_page_offset = 0#

Byte offset from beginning of file to first data page.

int64_t index_page_offset = 0#

Byte offset from beginning of file to root index page.

int64_t dictionary_page_offset = 0#

Byte offset from the beginning of file to first (only) dictionary page.

Statistics statistics#

Optional statistics for this column chunk.

std::optional<std::vector<PageEncodingStats>> encoding_stats#

Set of all encodings used for pages in this column chunk. This information can be used to determine if all data pages are dictionary encoded for example.

std::optional<int64_t> bloom_filter_offset#

Byte offset from beginning of file to Bloom filter data.

std::optional<int32_t> bloom_filter_length#

Size of Bloom filter data including the serialized header, in bytes. Added in 2.10 so readers may not read this field from old files and it can be obtained after the BloomFilterHeader has been deserialized. Writers should write this field so readers can read the bloom filter in a single I/O.

std::optional<SizeStatistics> size_statistics#

Optional statistics to help estimate total memory when converted to in-memory representations. The histograms contained in these statistics can also be useful in some cases for more fine-grained nullability/list length filter pushdown.

struct BloomFilterAlgorithm#
#include <parquet_schema.hpp>

The algorithm used in bloom filter.

Public Types

enum Algorithm#

Available bloom filter algorithms.

Values:

enumerator UNDEFINED#
enumerator SPLIT_BLOCK#

Public Members

Algorithm algorithm = {Algorithm::SPLIT_BLOCK}#

Bloom filter algorithm.

struct BloomFilterHash#
#include <parquet_schema.hpp>

The hash function used in Bloom filter.

Public Types

enum Hash#

Available bloom filter hashers.

Values:

enumerator UNDEFINED#
enumerator XXHASH#

Public Members

Hash hash = {Hash::XXHASH}#

Bloom filter hasher.

struct BloomFilterCompression#
#include <parquet_schema.hpp>

The compression used in the bloom filter.

Public Types

enum Compression#

Available bloom filter compression types.

Values:

enumerator UNDEFINED#
enumerator UNCOMPRESSED#

Public Members

Compression compression = {Compression::UNCOMPRESSED}#

Bloom filter compression type.

struct BloomFilterHeader#
#include <parquet_schema.hpp>

Bloom filter header struct.

The bloom filter data of a column chunk stores this header at the beginning, followed by the filter bitset.

Public Members

int32_t num_bytes#

The size of bitset in bytes.

BloomFilterAlgorithm algorithm#

The algorithm for setting bits.

BloomFilterHash hash#

The hash function used for bloom filter.

BloomFilterCompression compression#

The compression used in the bloom filter.

struct ColumnChunk#
#include <parquet_schema.hpp>

Thrift-derived struct describing a chunk of data for a particular column.

Each column chunk lives in a particular row group and is guaranteed to be contiguous in the file. Any missing or corrupted chunks can be skipped during reading.

Public Members

std::string file_path = ""#

File where column data is stored. If not set, assumed to be same file as metadata. This path is relative to the current file.

int64_t file_offset = 0#

Deprecated: Byte offset in file_path to the ColumnMetaData.

ColumnChunkMetaData meta_data#

Column metadata for this chunk. Some writers may also replicate this at the location pointed to by file_path/file_offset.

int64_t offset_index_offset = 0#

File offset of ColumnChunk’s OffsetIndex.

int32_t offset_index_length = 0#

Size of ColumnChunk’s OffsetIndex, in bytes.

int64_t column_index_offset = 0#

File offset of ColumnChunk’s ColumnIndex.

int32_t column_index_length = 0#

Size of ColumnChunk’s ColumnIndex, in bytes.

int schema_idx = -1#

Index in flattened schema (derived from path_in_schema)

std::optional<OffsetIndex> offset_index#

OffsetIndex for this column chunk

std::optional<ColumnIndex> column_index#

ColumnIndex for this column chunk

struct RowGroup#
#include <parquet_schema.hpp>

Thrift-derived struct describing a group of row data.

There may be one or more row groups within a dataset, with each row group consisting of a column chunk for each column.

Public Members

std::vector<ColumnChunk> columns#

Metadata for each column chunk in this row group.

int64_t total_byte_size = 0#

Total byte size of all the uncompressed column data in this row group.

int64_t num_rows = 0#

Number of rows in this row group.

std::optional<std::vector<SortingColumn>> sorting_columns#

If set, specifies a sort ordering of the rows in this RowGroup.

std::optional<int64_t> file_offset#

Byte offset from beginning of file to first page (data or dictionary) in this row group.

std::optional<int64_t> total_compressed_size#

Total byte size of all compressed (and potentially encrypted) column data in this row group.

std::optional<int16_t> ordinal#

Row group ordinal in the file.

struct KeyValue#
#include <parquet_schema.hpp>

Thrift-derived struct describing a key-value pair, for user metadata.

Public Members

std::string key#

string key

std::string value#

string value

struct FileMetaData#
#include <parquet_schema.hpp>

Thrift-derived struct describing file-level metadata.

The additional information stored in the key_value_metadata can be used during reading to reconstruct the output data to the exact original dataset prior to conversion to Parquet.

Public Members

int32_t version = 0#

Version of this file.

std::vector<SchemaElement> schema#

Parquet schema for this file. This schema contains metadata for all the columns. The schema is represented as a tree with a single root. The nodes of the tree are flattened to a list by doing a depth-first traversal. The column metadata contains the path in the schema for that column which can be used to map columns to nodes in the schema. The first element is the root.

int64_t num_rows = 0#

Number of rows in this file.

std::vector<RowGroup> row_groups#

Row groups in this file.

std::vector<KeyValue> key_value_metadata#

Optional key/value metadata.

std::string created_by = ""#

String for application that wrote this file.

std::optional<std::vector<ColumnOrder>> column_orders#

Sort order used for the min_value and max_value fields in the Statistics objects and the min_values and max_values fields in the ColumnIndex objects of each column in this file.

struct DataPageHeader#
#include <parquet_schema.hpp>

Thrift-derived struct describing the header for a data page.

Public Members

int32_t num_values = 0#

Number of values, including NULLs, in this data page.

Encoding encoding = Encoding::PLAIN#

Encoding used for this data page.

Encoding definition_level_encoding = Encoding::PLAIN#

Encoding used for definition levels.

Encoding repetition_level_encoding = Encoding::PLAIN#

Encoding used for repetition levels.

struct DataPageHeaderV2#
#include <parquet_schema.hpp>

Thrift-derived struct describing the header for a V2 data page.

Public Members

int32_t num_values = 0#

Number of values, including NULLs, in this data page.

int32_t num_nulls = 0#

Number of NULL values, in this data page.

int32_t num_rows = 0#

Number of rows in this data page, which means pages change on record boundaries (r = 0).

Encoding encoding = Encoding::PLAIN#

Encoding used for this data page.

int32_t definition_levels_byte_length = 0#

Length of the definition levels.

int32_t repetition_levels_byte_length = 0#

Length of the repetition levels.

bool is_compressed = true#

Whether the values are compressed.

struct DictionaryPageHeader#
#include <parquet_schema.hpp>

Thrift-derived struct describing the header for a dictionary page.

Public Members

int32_t num_values = 0#

Number of values in the dictionary.

Encoding encoding = Encoding::PLAIN#

Encoding using this dictionary page.

struct PageHeader#
#include <parquet_schema.hpp>

Thrift-derived struct describing the page header.

Column data are divided into individual chunks, which are subdivided into pages. Each page has an associated header, describing the page type. There can be multiple page types interleaved in a column chunk, and each page is individually compressed and encoded. Any missing or corrupted pages can be skipped during reading.

Public Members

PageType type = PageType::DATA_PAGE#

The type of the page: indicates which of the *_header fields is set.

int32_t uncompressed_page_size = 0#

Uncompressed page size in bytes (not including the header)

int32_t compressed_page_size = 0#

Compressed page size in bytes (not including the header)

DataPageHeader data_page_header#

Data page header.

DictionaryPageHeader dictionary_page_header#

Dictionary page header.

DataPageHeaderV2 data_page_header_v2#

V2 data page header.

class writer_compression_statistics#
#include <types.hpp>

Statistics about compression performed by a writer.

Public Functions

writer_compression_statistics() = default#

Default constructor.

inline writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)#

Constructor with initial values.

Parameters:
  • num_compressed_bytes – The number of bytes that were successfully compressed

  • num_failed_bytes – The number of bytes that failed to compress

  • num_skipped_bytes – The number of bytes that were skipped during compression

  • num_compressed_output_bytes – The number of bytes in the compressed output

inline writer_compression_statistics &operator+=(writer_compression_statistics const &other) noexcept#

Adds the values from another writer_compression_statistics object.

Parameters:

other – The other writer_compression_statistics object

Returns:

writer_compression_statistics& Reference to this object

inline auto num_compressed_bytes() const noexcept#

Returns the number of bytes in blocks that were successfully compressed.

This is the number of bytes that were actually compressed, not the size of the compressed output.

Returns:

size_t The number of bytes that were successfully compressed

inline auto num_failed_bytes() const noexcept#

Returns the number of bytes in blocks that failed to compress.

Returns:

size_t The number of bytes that failed to compress

inline auto num_skipped_bytes() const noexcept#

Returns the number of bytes in blocks that were skipped during compression.

Returns:

size_t The number of bytes that were skipped during compression

inline auto num_total_input_bytes() const noexcept#

Returns the total size of compression inputs.

Returns:

size_t The total size of compression inputs

inline auto compression_ratio() const noexcept#

Returns the compression ratio for the successfully compressed blocks.

Returns nan if there were no successfully compressed blocks.

Returns:

double The ratio between the size of the compression inputs and the size of the compressed output.

struct column_name_info#
#include <types.hpp>

Detailed name (and optionally nullability) information for output columns.

The hierarchy of children matches the hierarchy of children in the output cudf columns.

Public Functions

inline column_name_info(std::string _name, std::optional<bool> _is_nullable = std::nullopt, std::optional<bool> _is_binary = std::nullopt)#

Construct a column name info with a name, optional nullability, and no children.

Parameters:
  • _name – Column name

  • _is_nullable – True if column is nullable

  • _is_binary – True if column is binary data

inline bool operator==(column_name_info const &rhs) const#

Compares two column name info structs for equality.

Parameters:

rhs – column name info struct to compare against

Returns:

boolean indicating if this and rhs are equal

Public Members

std::string name#

Column name.

std::optional<bool> is_nullable#

Column nullability.

std::optional<bool> is_binary#

Column is binary (i.e. not a list)

std::optional<int32_t> type_length#

Byte width of data (for fixed length data)

std::vector<column_name_info> children#

Child column names.

struct table_metadata#
#include <types.hpp>

Table metadata returned by IO readers.

Public Members

std::vector<column_name_info> schema_info#

Detailed name information for the entire output hierarchy.

std::vector<size_t> num_rows_per_source#

Number of rows read from each data source. Currently only computed for Parquet readers if no AST filters are being used; empty vector otherwise.

std::map<std::string, std::string> user_data#

Format-dependent metadata of the first input file as key-value pairs (deprecated)

std::vector<std::unordered_map<std::string, std::string>> per_file_user_data#

Per-file format-dependent metadata as key-value pairs.

size_type num_input_row_groups = {0}#

Total number of input row groups across all data sources.

std::optional<size_type> num_row_groups_after_stats_filter#

Number of remaining row groups after stats filter. std::nullopt if no filtering done. Currently only reported by Parquet readers

std::optional<size_type> num_row_groups_after_bloom_filter#

Number of remaining row groups after bloom filter. std::nullopt if no filtering done. Currently only reported by Parquet readers

struct table_with_metadata#
#include <types.hpp>

Table with table metadata used by io readers to return the metadata by value.

Public Members

std::unique_ptr<table> tbl#

Table.

table_metadata metadata#

Table metadata.

struct source_info#
#include <types.hpp>

Source information for read interfaces.

Public Functions

source_info() = default#

Default constructor for the next-gen parquet reader.

inline explicit source_info(std::vector<std::string> file_paths)#

Construct a new source info object for multiple files.

Parameters:

file_paths – Input file paths

inline explicit source_info(std::string file_path)#

Construct a new source info object for a single file.

Parameters:

file_path – Single input file

template<typename T>
inline explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)#

Construct a new source info object for multiple buffers in host memory.

Parameters:

host_buffers – Input buffers in host memory

template<typename T>
inline explicit source_info(cudf::host_span<T> host_data)#

Construct a new source info object for a single buffer.

Parameters:

host_data – Input buffer in host memory

inline explicit source_info(cudf::host_span<cudf::device_span<std::byte const>> device_buffers)#

Construct a new source info object for multiple buffers in device memory.

Parameters:

device_buffers – Input buffers in device memory

inline explicit source_info(cudf::device_span<std::byte const> d_buffer)#

Construct a new source info object from a device buffer.

Parameters:

d_buffer – Input buffer in device memory

inline explicit source_info(std::vector<cudf::io::datasource*> const &sources)#

Construct a new source info object for multiple user-implemented sources.

Parameters:

sources – User-implemented input sources

inline explicit source_info(cudf::io::datasource *source)#

Construct a new source info object for a single user-implemented source.

Parameters:

source – Single user-implemented input source

inline auto type() const#

Get the type of the input.

Returns:

The type of the input

inline auto const &filepaths() const#

Get the filepaths of the input.

Returns:

The filepaths of the input

inline auto const &host_buffers() const#

Get the host buffers of the input.

Returns:

The host buffers of the input

inline auto const &device_buffers() const#

Get the device buffers of the input.

Returns:

The device buffers of the input

inline auto const &user_sources() const#

Get the user sources of the input.

Returns:

The user sources of the input

inline auto num_sources() const#

Get the number of input sources.

Returns:

The number of input sources

struct sink_info#
#include <types.hpp>

Destination information for write interfaces.

Public Functions

inline sink_info(size_t num_sinks)#

Construct a new sink info object.

Parameters:

num_sinks – Number of sinks

inline explicit sink_info(std::vector<std::string> file_paths)#

Construct a new sink info object for multiple files.

Parameters:

file_paths – Output file paths

inline explicit sink_info(std::string file_path)#

Construct a new sink info object for a single file.

Parameters:

file_path – Single output file path

inline explicit sink_info(std::vector<std::vector<char>*> buffers)#

Construct a new sink info object for multiple host buffers.

Parameters:

buffers – Output host buffers

inline explicit sink_info(std::vector<char> *buffer)#

Construct a new sink info object for a single host buffer.

Parameters:

buffer – Single output host buffer

inline explicit sink_info(std::vector<cudf::io::data_sink*> const &user_sinks)#

Construct a new sink info object for multiple user-implemented sinks.

Parameters:

user_sinks – Output user-implemented sinks

inline explicit sink_info(class cudf::io::data_sink *user_sink)#

Construct a new sink info object for a single user-implemented sink.

Parameters:

user_sink – Single output user-implemented sink

inline auto type() const#

Get the type of the output.

Returns:

The type of the output

inline auto num_sinks() const#

Get the number of sinks.

Returns:

The number of sinks

inline auto const &filepaths() const#

Get the filepaths of the output.

Returns:

The filepaths of the output

inline auto const &buffers() const#

Get the host buffers of the output.

Returns:

The host buffers of the output

inline auto const &user_sinks() const#

Get the user sinks of the output.

Returns:

The user sinks of the output

class column_in_metadata#
#include <types.hpp>

Metadata for a column.

Public Functions

inline column_in_metadata(std::string_view name)#

Construct a new column in metadata object.

Parameters:

name – Column name

inline column_in_metadata &add_child(column_in_metadata const &child)#

Add the children metadata of this column.

Parameters:

child – The children metadata of this column to add

Returns:

this for chaining

inline column_in_metadata &set_name(std::string const &name) noexcept#

Set the name of this column.

Parameters:

name – Name of the column

Returns:

this for chaining

inline column_in_metadata &set_nullability(bool nullable) noexcept#

Set the nullability of this column.

Parameters:

nullable – Whether this column is nullable

Returns:

this for chaining

inline column_in_metadata &set_list_column_as_map() noexcept#

Specify that this list column should be encoded as a map in the written file.

The column must have the structure list<struct<key, value>>. This option is invalid otherwise

Returns:

this for chaining

inline column_in_metadata &set_int96_timestamps(bool req) noexcept#

Specifies whether this timestamp column should be encoded using the deprecated int96 physical type. Only valid for the following column types: timestamp_s, timestamp_ms, timestamp_us, timestamp_ns.

Parameters:

req – True = use int96 physical type. False = use int64 physical type

Returns:

this for chaining

inline column_in_metadata &set_decimal_precision(uint8_t precision) noexcept#

Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.

Parameters:

precision – The integer precision to set for this decimal column

Returns:

this for chaining

inline column_in_metadata &set_type_length(int32_t length) noexcept#

Set the data length of the column. Only valid if this column is a fixed-length byte array.

Parameters:

length – The data length to set for this column

Returns:

this for chaining

inline column_in_metadata &set_parquet_field_id(int32_t field_id) noexcept#

Set the parquet field id of this column.

Parameters:

field_id – The parquet field id to set

Returns:

this for chaining

inline column_in_metadata &set_output_as_binary(bool binary) noexcept#

Specifies whether this column should be written as binary or string data. Only valid for the following column types: string.

Parameters:

binary – True = use binary data type. False = use string data type

Returns:

this for chaining

inline column_in_metadata &set_skip_compression(bool skip) noexcept#

Specifies whether this column should not be compressed regardless of the compression codec specified for the file.

Parameters:

skip – If true do not compress this column

Returns:

this for chaining

inline column_in_metadata &set_encoding(column_encoding encoding) noexcept#

Sets the encoding to use for this column.

This is just a request, and the encoder may still choose to use a different encoding depending on resource constraints. Use the constants defined in the parquet_encoding struct.

Parameters:

encoding – The encoding to use

Returns:

this for chaining

inline column_in_metadata &child(size_type i) noexcept#

Get reference to a child of this column.

Parameters:

i – Index of the child to get

Returns:

Reference to the child at index i

inline column_in_metadata const &child(size_type i) const noexcept#

Get const reference to a child of this column.

Parameters:

i – Index of the child to get

Returns:

Const reference to the child at index i

inline std::string const &get_name() const noexcept#

Get the name of this column.

Returns:

The name of this column

inline bool is_nullability_defined() const noexcept#

Get whether nullability has been explicitly set for this column.

Returns:

Boolean indicating whether nullability has been explicitly set for this column

inline bool nullable() const#

Gets the explicitly set nullability for this column.

Throws:

std::bad_optional_access – If nullability is not explicitly defined for this column. Check using is_nullability_defined() first.

Returns:

Boolean indicating whether this column is nullable

inline bool is_map() const noexcept#

If this is the metadata of a list column, returns whether it is to be encoded as a map.

Returns:

Boolean indicating whether this column is to be encoded as a map

inline bool is_enabled_int96_timestamps() const noexcept#

Get whether to encode this timestamp column using deprecated int96 physical type.

Returns:

Boolean indicating whether to encode this timestamp column using deprecated int96 physical type

inline bool is_decimal_precision_set() const noexcept#

Get whether precision has been set for this decimal column.

Returns:

Boolean indicating whether precision has been set for this decimal column

inline uint8_t get_decimal_precision() const#

Get the decimal precision that was set for this column.

Throws:

std::bad_optional_access – If decimal precision was not set for this column. Check using is_decimal_precision_set() first.

Returns:

The decimal precision that was set for this column

inline bool is_type_length_set() const noexcept#

Get whether type length has been set for this column.

Returns:

Boolean indicating whether type length has been set for this column

inline uint8_t get_type_length() const#

Get the type length that was set for this column.

Throws:

std::bad_optional_access – If type length was not set for this column. Check using is_type_length_set() first.

Returns:

The type length that was set for this column

inline bool is_parquet_field_id_set() const noexcept#

Get whether parquet field id has been set for this column.

Returns:

Boolean indicating whether parquet field id has been set for this column

inline int32_t get_parquet_field_id() const#

Get the parquet field id that was set for this column.

Throws:

std::bad_optional_access – If parquet field id was not set for this column. Check using is_parquet_field_id_set() first.

Returns:

The parquet field id that was set for this column

inline size_type num_children() const noexcept#

Get the number of children of this column.

Returns:

The number of children of this column

inline bool is_enabled_output_as_binary() const noexcept#

Get whether to encode this column as binary or string data.

Returns:

Boolean indicating whether to encode this column as binary data

inline bool is_enabled_skip_compression() const noexcept#

Get whether to skip compressing this column.

Returns:

Boolean indicating whether to skip compression of this column

inline column_encoding get_encoding() const#

Get the encoding that was set for this column.

Returns:

The encoding that was set for this column

class table_input_metadata#
#include <types.hpp>

Metadata for a table.

Public Functions

explicit table_input_metadata(table_view const &table)#

Construct a new table_input_metadata from a table_view.

The constructed table_input_metadata has the same structure as the passed table_view

Parameters:

table – The table_view to construct metadata for

explicit table_input_metadata(table_metadata const &metadata)#

Construct a new table_input_metadata from a table_metadata object.

The constructed table_input_metadata has the same structure, column names and nullability as the passed table_metadata.

Parameters:

metadata – The table_metadata to construct table_input_metadata for

Public Members

std::vector<column_in_metadata> column_metadata#

List of column metadata.

struct partition_info#
#include <types.hpp>

Information used while writing partitioned datasets.

This information defines the slice of an input table to write to file. In partitioned dataset writing, one partition_info struct defines one partition and corresponds to one output file

Public Functions

inline partition_info(size_type start_row, size_type num_rows)#

Construct a new partition_info.

Parameters:
  • start_row – The start row of the partition

  • num_rows – The number of rows in the partition

Public Members

size_type start_row#

The start row of the partition.

size_type num_rows#

The number of rows in the partition.

class reader_column_schema#
#include <types.hpp>

Schema element for reader.

Public Functions

inline reader_column_schema(size_type number_of_children)#

Construct a new reader column schema object.

Parameters:

number_of_children – number of child schema objects to default construct

inline reader_column_schema(host_span<reader_column_schema> const &child_span)#

Construct a new reader column schema object with a span defining the children.

Parameters:

child_span – span of child schema objects

inline reader_column_schema &add_child(reader_column_schema const &child)#

Add the children metadata of this column.

Parameters:

child – The children metadata of this column to add

Returns:

this for chaining

inline reader_column_schema &child(size_type i)#

Get reference to a child of this column.

Parameters:

i – Index of the child to get

Returns:

Reference to the child at index i

inline reader_column_schema const &child(size_type i) const#

Get const reference to a child of this column.

Parameters:

i – Index of the child to get

Returns:

Const reference to the child at index i

inline reader_column_schema &set_convert_binary_to_strings(bool convert_to_string)#

Specifies whether this column should be written as binary or string data. Only valid for the following column types: string, list<int8>.

Parameters:

convert_to_string – True = convert binary to strings. False = return binary

Returns:

this for chaining

inline reader_column_schema &set_type_length(int32_t type_length)#

Sets the length of fixed length data.

Parameters:

type_length – Size of the data type in bytes

Returns:

this for chaining

inline bool is_enabled_convert_binary_to_strings() const#

Get whether binary data in this column is converted to strings.

Returns:

Boolean indicating whether binary data is converted to strings

inline int32_t get_type_length() const#

Get the length in bytes of this fixed length data.

Returns:

The length in bytes of the data type

inline size_t get_num_children() const#

Get the number of child objects.

Returns:

number of children