Io Types#
- group IO Types
Typedefs
-
using no_statistics = std::monostate#
Monostate type alias for the statistics variant.
-
using date_statistics = minmax_statistics<int32_t>#
Statistics for date(time) columns.
-
using binary_statistics = sum_statistics<int64_t>#
Statistics for binary columns.
The sum is the total number of bytes across all elements.
-
using statistics_type = std::variant<no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics>#
Variant type for ORC type-specific column statistics.
The variant can hold any of the supported column statistics types.
Enums
-
enum CompressionKind#
Identifies a compression algorithm.
Values:
-
enumerator NONE#
-
enumerator ZLIB#
-
enumerator SNAPPY#
-
enumerator LZO#
-
enumerator LZ4#
-
enumerator ZSTD#
-
enumerator NONE#
-
enum TypeKind#
Identifies a data type in an orc file.
Values:
-
enumerator INVALID_TYPE_KIND#
-
enumerator BOOLEAN#
-
enumerator BYTE#
-
enumerator SHORT#
-
enumerator INT#
-
enumerator LONG#
-
enumerator FLOAT#
-
enumerator DOUBLE#
-
enumerator STRING#
-
enumerator BINARY#
-
enumerator TIMESTAMP#
-
enumerator LIST#
-
enumerator MAP#
-
enumerator STRUCT#
-
enumerator UNION#
-
enumerator DECIMAL#
-
enumerator DATE#
-
enumerator VARCHAR#
-
enumerator CHAR#
-
enumerator INVALID_TYPE_KIND#
-
enum StreamKind#
Identifies the type of data stream.
Values:
-
enumerator INVALID_STREAM_KIND#
-
enumerator PRESENT#
-
enumerator DATA#
-
enumerator LENGTH#
-
enumerator DICTIONARY_DATA#
-
enumerator DICTIONARY_COUNT#
-
enumerator SECONDARY#
-
enumerator ROW_INDEX#
-
enumerator BLOOM_FILTER#
-
enumerator BLOOM_FILTER_UTF8#
-
enumerator INVALID_STREAM_KIND#
-
enum ColumnEncodingKind#
Identifies the encoding of columns.
Values:
-
enumerator INVALID_ENCODING_KIND#
-
enumerator DIRECT#
-
enumerator DICTIONARY#
-
enumerator DIRECT_V2#
-
enumerator DICTIONARY_V2#
-
enumerator INVALID_ENCODING_KIND#
-
enum ProtofType#
Identifies the type of encoding in a protocol buffer.
Values:
-
enumerator VARINT#
-
enumerator FIXED64#
-
enumerator FIXEDLEN#
-
enumerator START_GROUP#
-
enumerator END_GROUP#
-
enumerator FIXED32#
-
enumerator INVALID_6#
-
enumerator INVALID_7#
-
enumerator VARINT#
-
enum class Type : int8_t#
Basic data types in Parquet, determines how data is physically stored.
Values:
-
enumerator UNDEFINED#
-
enumerator BOOLEAN#
-
enumerator INT32#
-
enumerator INT64#
-
enumerator INT96#
-
enumerator FLOAT#
-
enumerator DOUBLE#
-
enumerator BYTE_ARRAY#
-
enumerator FIXED_LEN_BYTE_ARRAY#
-
enumerator UNDEFINED#
-
enum class ConvertedType : int8_t#
High-level data types in Parquet, determines how data is logically interpreted.
Values:
-
enumerator UNKNOWN#
-
enumerator UTF8#
-
enumerator MAP#
-
enumerator MAP_KEY_VALUE#
-
enumerator LIST#
-
enumerator ENUM#
-
enumerator DECIMAL#
-
enumerator DATE#
-
enumerator TIME_MILLIS#
-
enumerator TIME_MICROS#
-
enumerator TIMESTAMP_MILLIS#
-
enumerator TIMESTAMP_MICROS#
-
enumerator UINT_8#
-
enumerator UINT_16#
-
enumerator UINT_32#
-
enumerator UINT_64#
-
enumerator INT_8#
-
enumerator INT_16#
-
enumerator INT_32#
-
enumerator INT_64#
-
enumerator JSON#
-
enumerator BSON#
-
enumerator INTERVAL#
-
enumerator NA#
-
enumerator UNKNOWN#
-
enum class Encoding : uint8_t#
Encoding types for the actual data stream.
Values:
-
enumerator PLAIN#
-
enumerator GROUP_VAR_INT#
-
enumerator PLAIN_DICTIONARY#
-
enumerator RLE#
-
enumerator BIT_PACKED#
-
enumerator DELTA_BINARY_PACKED#
-
enumerator DELTA_LENGTH_BYTE_ARRAY#
-
enumerator DELTA_BYTE_ARRAY#
-
enumerator RLE_DICTIONARY#
-
enumerator BYTE_STREAM_SPLIT#
-
enumerator NUM_ENCODINGS#
-
enumerator PLAIN#
-
enum class Compression : uint8_t#
Compression codec used for compressed data pages.
Values:
-
enumerator UNCOMPRESSED#
-
enumerator SNAPPY#
-
enumerator GZIP#
-
enumerator LZO#
-
enumerator BROTLI#
-
enumerator LZ4#
-
enumerator ZSTD#
-
enumerator LZ4_RAW#
-
enumerator UNCOMPRESSED#
-
enum class FieldRepetitionType : int8_t#
Repetition type of a schema field: required, optional, or repeated.
Values:
-
enumerator UNSPECIFIED#
-
enumerator REQUIRED#
-
enumerator OPTIONAL#
-
enumerator REPEATED#
-
enumerator UNSPECIFIED#
-
enum class PageType : uint8_t#
Types of pages.
Values:
-
enumerator DATA_PAGE#
-
enumerator INDEX_PAGE#
-
enumerator DICTIONARY_PAGE#
-
enumerator DATA_PAGE_V2#
-
enumerator DATA_PAGE#
-
enum class BoundaryOrder : uint8_t#
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so, in which direction.
Values:
-
enumerator UNORDERED#
-
enumerator ASCENDING#
-
enumerator DESCENDING#
-
enumerator UNORDERED#
-
enum class FieldType : uint8_t#
Thrift compact protocol struct field types.
Values:
-
enumerator BOOLEAN_TRUE#
-
enumerator BOOLEAN_FALSE#
-
enumerator I8#
-
enumerator I16#
-
enumerator I32#
-
enumerator I64#
-
enumerator DOUBLE#
-
enumerator BINARY#
-
enumerator LIST#
-
enumerator SET#
-
enumerator MAP#
-
enumerator STRUCT#
-
enumerator UUID#
-
enumerator BOOLEAN_TRUE#
-
enum class compression_type : int32_t#
Compression algorithms.
Values:
-
enumerator NONE#
No compression.
-
enumerator AUTO#
Automatically detect or select compression format.
-
enumerator SNAPPY#
Snappy format, using byte-oriented LZ77.
-
enumerator GZIP#
GZIP format, using DEFLATE algorithm.
-
enumerator BZIP2#
BZIP2 format, using Burrows-Wheeler transform.
-
enumerator BROTLI#
BROTLI format, using LZ77 + Huffman + 2nd order context modeling.
-
enumerator ZIP#
ZIP format, using DEFLATE algorithm.
-
enumerator XZ#
XZ format, using LZMA(2) algorithm.
-
enumerator ZLIB#
ZLIB format, using DEFLATE algorithm.
-
enumerator LZ4#
LZ4 format, using LZ77.
-
enumerator LZO#
Lempel–Ziv–Oberhumer format.
-
enumerator ZSTD#
Zstandard format.
-
enumerator NONE#
-
enum class io_type : int32_t#
Data source or destination types.
Values:
-
enumerator FILEPATH#
Input/output is a file path.
-
enumerator HOST_BUFFER#
Input/output is a buffer in host memory.
-
enumerator DEVICE_BUFFER#
Input/output is a buffer in device memory.
-
enumerator VOID#
Input/output is nothing. No work is done. Useful for benchmarking.
-
enumerator USER_IMPLEMENTED#
Input/output is handled by a custom user class.
-
enumerator FILEPATH#
-
enum class quote_style : int32_t#
Behavior when handling quotations in field data.
Values:
-
enumerator MINIMAL#
Quote only fields which contain special characters.
-
enumerator ALL#
Quote all fields.
-
enumerator NONNUMERIC#
Quote all non-numeric fields.
-
enumerator NONE#
Never quote fields; disable quotation parsing.
-
enumerator MINIMAL#
-
enum statistics_freq#
Column statistics granularity type for parquet/orc writers.
Values:
-
enumerator STATISTICS_NONE#
No column statistics.
-
enumerator STATISTICS_ROWGROUP#
Per-Rowgroup column statistics.
-
enumerator STATISTICS_PAGE#
Per-page column statistics.
-
enumerator STATISTICS_COLUMN#
Full column and offset indices. Implies STATISTICS_ROWGROUP.
-
enumerator STATISTICS_NONE#
-
enum class column_encoding : int32_t#
Valid encodings for use with column_in_metadata::set_encoding().
Values:
-
enumerator USE_DEFAULT#
No encoding has been requested, use default encoding.
-
enumerator DICTIONARY#
Use dictionary encoding.
-
enumerator PLAIN#
Use plain encoding.
-
enumerator DELTA_BINARY_PACKED#
Use DELTA_BINARY_PACKED encoding (only valid for integer columns)
-
enumerator DELTA_LENGTH_BYTE_ARRAY#
Use DELTA_LENGTH_BYTE_ARRAY encoding (only valid for BYTE_ARRAY columns)
-
enumerator DELTA_BYTE_ARRAY#
Use DELTA_BYTE_ARRAY encoding (only valid for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns)
-
enumerator BYTE_STREAM_SPLIT#
Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)
-
enumerator DIRECT#
Use DIRECT encoding.
-
enumerator DIRECT_V2#
Use DIRECT_V2 encoding.
-
enumerator DICTIONARY_V2#
Use DICTIONARY_V2 encoding.
-
enumerator USE_DEFAULT#
Functions
-
template<typename T>
inline constexpr auto is_byte_like_type()#
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
- Template Parameters:
T – The representation type
- Returns:
true if the type is considered a byte-like type
-
struct raw_orc_statistics#
- #include <orc_metadata.hpp>
Holds column names and buffers containing raw file-level and stripe-level statistics.
The buffers can be parsed using a Protobuf parser. Alternatively, use parsed_orc_statistics to get the statistics parsed into a libcudf representation.
The column_names and file_stats members contain one element per column. The stripes_stats member contains one element per stripe, where each element contains column statistics for each column.
-
template<typename T>
struct minmax_statistics# - #include <orc_metadata.hpp>
Base class for column statistics that include optional minimum and maximum.
Includes accessors for the minimum and maximum values.
-
template<typename T>
struct sum_statistics# - #include <orc_metadata.hpp>
Base class for column statistics that include an optional sum.
Includes accessors for the sum value.
-
struct integer_statistics : public cudf::io::minmax_statistics<int64_t>, public cudf::io::sum_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for integral columns.
-
struct double_statistics : public cudf::io::minmax_statistics<double>, public cudf::io::sum_statistics<double>#
- #include <orc_metadata.hpp>
Statistics for floating point columns.
-
struct string_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for string columns.
The minimum and maximum are the first and last elements, respectively, in lexicographical order. The sum is the total length of elements in the column. Note: According to the ORC specification, the sum should be signed, but pyarrow uses an unsigned value.
-
struct bucket_statistics#
- #include <orc_metadata.hpp>
Statistics for boolean columns.
The count array contains the count of true values.
Public Members
-
std::vector<uint64_t> count#
count of true values
-
std::vector<uint64_t> count#
-
struct decimal_statistics : public cudf::io::minmax_statistics<std::string>, public cudf::io::sum_statistics<std::string>#
- #include <orc_metadata.hpp>
Statistics for decimal columns.
-
struct timestamp_statistics : public cudf::io::minmax_statistics<int64_t>#
- #include <orc_metadata.hpp>
Statistics for timestamp columns.
The minimum and maximum are the min/max elements in the column, as the number of milliseconds since the UNIX epoch. The minimum_utc and maximum_utc are the same values adjusted to UTC.
-
struct column_statistics#
- #include <orc_metadata.hpp>
Contains per-column ORC statistics.
All columns can have the number_of_values statistics. Depending on the data type, a column can have additional statistics, accessible through the type_specific_stats accessor.
Public Functions
Public Members
-
std::optional<uint64_t> number_of_values#
number of values
-
std::optional<bool> has_null#
column has any nulls
-
statistics_type type_specific_stats#
type-specific statistics
-
std::optional<uint64_t> number_of_values#
-
struct parsed_orc_statistics#
- #include <orc_metadata.hpp>
Holds column names and parsed file-level and stripe-level statistics.
The column_names and file_stats members contain one element per column. The stripes_stats member contains one element per stripe, where each element contains column statistics for each column.
Public Members
-
std::vector<std::string> column_names#
column names
-
std::vector<column_statistics> file_stats#
file-level statistics
-
std::vector<std::vector<column_statistics>> stripes_stats#
stripe-level statistics
-
std::vector<std::string> column_names#
-
struct orc_column_schema#
- #include <orc_metadata.hpp>
Schema of an ORC column, including the nested columns.
Public Functions
-
inline orc_column_schema(std::string_view name, orc::TypeKind type, std::vector<orc_column_schema> children)#
constructor
- Parameters:
name – column name
type – ORC type
children – child columns (empty for non-nested types)
-
inline auto name() const#
Returns ORC column name; can be empty.
- Returns:
Column name
-
inline auto type_kind() const#
Returns ORC type of the column.
- Returns:
Column ORC type
-
inline auto const &children() const &#
Returns schemas of all child columns.
- Returns:
Children schemas
-
inline auto children() &&#
Returns schemas of all child columns.
- Returns:
Children schemas. The children array is moved out of the object (rvalues only).
-
inline auto const &child(int idx) const &#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema
-
inline auto child(int idx) &&#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema. The child is moved out of the object (rvalues only).
-
inline auto num_children() const#
Returns the number of child columns.
- Returns:
Children count
-
inline orc_column_schema(std::string_view name, orc::TypeKind type, std::vector<orc_column_schema> children)#
-
struct orc_schema#
- #include <orc_metadata.hpp>
Schema of an ORC file.
Public Functions
-
inline orc_schema(orc_column_schema root_column_schema)#
constructor
- Parameters:
root_column_schema – root column
-
inline auto const &root() const &#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema
-
inline auto root() &&#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema. The root column schema is moved out of the object (rvalues only).
-
inline orc_schema(orc_column_schema root_column_schema)#
-
class orc_metadata#
- #include <orc_metadata.hpp>
Information about content of an ORC file.
Public Functions
-
inline orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)#
constructor
- Parameters:
schema – ORC schema
num_rows – number of rows
num_stripes – number of stripes
-
inline auto const &schema() const#
Returns the ORC schema.
- Returns:
ORC schema Number of rows in the root column; can vary for nested columns
-
inline auto num_rows() const#
Returns the number of rows of the root column.
If a file contains list columns, nested columns can have a different number of rows.
- Returns:
Number of rows
-
inline auto num_stripes() const#
Returns the number of stripes in the file.
- Returns:
Number of stripes
-
inline orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)#
-
struct parquet_column_schema#
- #include <parquet_metadata.hpp>
Schema of a parquet column, including the nested columns.
Public Functions
-
explicit parquet_column_schema() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline parquet_column_schema(std::string_view name, Type type, std::vector<parquet_column_schema> children)#
constructor
- Parameters:
name – column name
type – parquet type
children – child columns (empty for non-nested types)
-
inline auto name() const#
Returns parquet column name; can be empty.
- Returns:
Column name
-
inline auto type() const#
Returns parquet physical type of the column.
- Returns:
Column parquet physical type
-
inline auto const &children() const &#
Returns schemas of all child columns.
- Returns:
Children schemas
-
inline auto children() &&#
Returns schemas of all child columns.
- Returns:
Children schemas. The children array is moved out of the object (rvalues only).
-
inline auto const &child(int idx) const &#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema
-
inline auto child(int idx) &&#
Returns schema of the child with the given index.
- Parameters:
idx – child index
- Returns:
Child schema. The child is moved out of the object (rvalues only).
-
inline auto num_children() const#
Returns the number of child columns.
- Returns:
Children count
-
explicit parquet_column_schema() = default#
-
struct parquet_schema#
- #include <parquet_metadata.hpp>
Schema of a parquet file.
Public Functions
-
explicit parquet_schema() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack
-
inline parquet_schema(parquet_column_schema root_column_schema)#
constructor
- Parameters:
root_column_schema – root column
-
inline auto const &root() const &#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema
-
inline auto root() &&#
Returns the schema of the struct column that contains all columns as fields.
- Returns:
Root column schema. The root column schema is moved out of the object (rvalues only).
-
explicit parquet_schema() = default#
-
class parquet_metadata#
- #include <parquet_metadata.hpp>
Information about content of a parquet file.
Public Types
-
using key_value_metadata = std::unordered_map<std::string, std::string>#
Key-value metadata in the file footer.
-
using row_group_metadata = std::unordered_map<std::string, int64_t>#
Row group metadata from each RowGroup element.
-
using column_chunk_metadata = std::unordered_map<std::string, std::vector<int64_t>>#
Column chunk metadata from each ColumnChunkMetaData element.
Public Functions
-
explicit parquet_metadata() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, std::vector<size_type> num_rowgroups_per_file, key_value_metadata file_metadata, std::vector<row_group_metadata> rg_metadata, column_chunk_metadata column_chunk_metadata)#
constructor
- Parameters:
schema – parquet schema
num_rows – number of rows
num_rowgroups – total number of row groups
num_rowgroups_per_file – number of row groups per file
file_metadata – key-value metadata in the file footer
rg_metadata – vector of maps containing metadata for each row group
column_chunk_metadata – map of column names to vectors of total_uncompressed_size metadata from all their column chunks
-
inline auto const &schema() const#
Returns the parquet schema.
- Returns:
parquet schema
-
inline auto num_rows() const#
Returns the number of rows of the root column.
If a file contains list columns, nested columns can have a different number of rows.
- Returns:
Number of rows
-
inline auto num_rowgroups() const#
Returns the total number of rowgroups.
- Returns:
Total number of row groups
-
inline auto const &num_rowgroups_per_file() const#
Returns the number of rowgroups in each file.
- Returns:
Number of row groups per file
-
inline auto const &metadata() const#
Returns the Key value metadata in the file footer.
- Returns:
Key value metadata as a map
-
inline auto const &rowgroup_metadata() const#
Returns the row group metadata in the file footer.
- Returns:
Vector of row group metadata as maps
-
inline auto const &columnchunk_metadata() const#
Returns a map of column names to vectors of total_uncompressed_size metadata from all their column chunks.
- Returns:
Map of column names to vectors of total_uncompressed_size metadata from all their column chunks
-
using key_value_metadata = std::unordered_map<std::string, std::string>#
-
struct file_header_s#
- #include <parquet_schema.hpp>
Struct that describes the Parquet file data header.
Public Members
-
uint32_t magic#
Parquet 4-byte magic number “PAR1”.
-
uint32_t magic#
-
struct file_ender_s#
- #include <parquet_schema.hpp>
Struct that describes the Parquet file data postscript.
-
struct DecimalType#
- #include <parquet_schema.hpp>
Struct that describes the decimal logical type annotation.
Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
-
struct TimeUnit#
- #include <parquet_schema.hpp>
Time units for temporal logical types.
Public Types
-
struct TimeType#
- #include <parquet_schema.hpp>
Struct that describes the time logical type annotation.
Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
-
struct TimestampType#
- #include <parquet_schema.hpp>
Struct that describes the timestamp logical type annotation.
Allowed for physical types: INT64
-
struct IntType#
- #include <parquet_schema.hpp>
Struct that describes the integer logical type annotation.
Allowed for physical types: INT32, INT64
-
struct LogicalType#
- #include <parquet_schema.hpp>
Struct that describes the logical type annotation.
Public Types
-
enum Type#
Logical type annotations to replace ConvertedType.
Values:
-
enumerator UNDEFINED#
-
enumerator STRING#
-
enumerator MAP#
-
enumerator LIST#
-
enumerator ENUM#
-
enumerator DECIMAL#
-
enumerator DATE#
-
enumerator TIME#
-
enumerator TIMESTAMP#
-
enumerator INTEGER#
-
enumerator UNKNOWN#
-
enumerator JSON#
-
enumerator BSON#
-
enumerator UNDEFINED#
Public Functions
-
inline LogicalType(DecimalType &&dt)#
Constructor for Decimal logical type.
- Parameters:
dt – Decimal type
-
inline LogicalType(TimestampType &&tst)#
Constructor for Timestamp logical type.
- Parameters:
tst – Timestamp type
-
inline LogicalType(IntType &&it)#
Constructor for Integer logical type.
- Parameters:
it – Integer type
-
inline constexpr bool is_time_millis() const#
Check if the time is in milliseconds.
- Returns:
True if the time is in milliseconds, false otherwise
-
inline constexpr bool is_time_micros() const#
Check if the time is in microseconds.
- Returns:
True if the time is in microseconds, false otherwise
-
inline constexpr bool is_time_nanos() const#
Check if the time is in nanoseconds.
- Returns:
True if the time is in nanoseconds, false otherwise
-
inline constexpr bool is_timestamp_millis() const#
Check if the timestamp is in milliseconds.
- Returns:
True if the timestamp is in milliseconds, false otherwise
-
inline constexpr bool is_timestamp_micros() const#
Check if the timestamp is in microseconds.
- Returns:
True if the timestamp is in microseconds, false otherwise
-
inline constexpr bool is_timestamp_nanos() const#
Check if the timestamp is in nanoseconds.
- Returns:
True if the timestamp is in nanoseconds, false otherwise
-
inline constexpr int8_t bit_width() const#
Get the bit width of the integer type.
- Returns:
The bit width of the integer type, or -1 if the type is not an integer
-
inline constexpr bool is_signed() const#
Check if the integer is signed.
- Returns:
True if the integer is signed, false otherwise
-
inline constexpr int32_t scale() const#
Get the scale of the decimal type.
- Returns:
The scale of the decimal type, or -1 if the type is not a decimal
-
inline constexpr int32_t precision() const#
Get the precision of the decimal type.
- Returns:
The precision of the decimal type, or -1 if the type is not a decimal
Public Members
-
cuda::std::optional<DecimalType> decimal_type#
Decimal type.
-
cuda::std::optional<TimestampType> timestamp_type#
Timestamp type.
-
enum Type#
-
struct ColumnOrder#
- #include <parquet_schema.hpp>
Union to specify the order used for the min_value and max_value fields for a column.
Public Types
-
struct SchemaElement#
- #include <parquet_schema.hpp>
Struct for describing an element/field in the Parquet format schema.
Parquet is a strongly-typed format so the file layout can be interpreted as a schema tree.
Public Functions
-
inline bool operator==(SchemaElement const &other) const#
Check if two schema elements are equal.
- Parameters:
other – The other schema element to compare to
- Returns:
True if the two schema elements are equal, false otherwise
-
inline bool is_stub() const#
Check if the schema element is a stub.
- Returns:
True if the schema element is a stub, false otherwise
-
inline bool is_one_level_list(SchemaElement const &parent) const#
Check if the schema element is a one-level list.
apache/parquet-cpp One-level LIST encoding: Only allows required lists with required cells: repeated value_type name
- Parameters:
parent – The parent schema element
- Returns:
True if the schema element is a one-level list, false otherwise
-
inline bool is_list() const#
Check if the schema element is a list.
- Returns:
True if the schema element is a list, false otherwise
-
inline bool is_struct() const#
Check if the schema element is a struct.
In parquet terms, a group is a level of nesting in the schema. A group can be a struct or a list.
- Returns:
True if the schema element is a struct, false otherwise
Public Members
-
int32_t type_length = 0#
2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types
-
FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED#
3: repetition of the field
-
std::string name = ""#
4: name of the field
-
int32_t num_children = 0#
5: nested fields
-
std::optional<ConvertedType> converted_type#
6: DEPRECATED: record the original type before conversion to parquet type
-
int32_t decimal_scale = 0#
7: DEPRECATED: record the scale for DECIMAL converted type
-
int32_t decimal_precision = 0#
8: DEPRECATED: record the precision for DECIMAL converted type
-
std::optional<int32_t> field_id#
9: save field_id from original schema
-
std::optional<LogicalType> logical_type#
10: replaces converted type
-
bool output_as_byte_array = false#
extra cudf specific fields
-
int max_definition_level = 0#
Maximum definition level.
-
int max_repetition_level = 0#
Maximum repetition level.
-
inline bool operator==(SchemaElement const &other) const#
-
struct Statistics#
- #include <parquet_schema.hpp>
Thrift-derived struct describing column chunk statistics.
Public Members
-
std::optional<std::vector<uint8_t>> max#
deprecated max value in signed comparison order
-
std::optional<std::vector<uint8_t>> min#
deprecated min value in signed comparison order
-
std::optional<int64_t> null_count#
count of null values in the column
-
std::optional<int64_t> distinct_count#
count of distinct values occurring
-
std::optional<std::vector<uint8_t>> max_value#
max value for column determined by ColumnOrder
-
std::optional<std::vector<uint8_t>> min_value#
min value for column determined by ColumnOrder
-
std::optional<bool> is_max_value_exact#
If true, max_value is the actual maximum value for a column.
-
std::optional<bool> is_min_value_exact#
If true, min_value is the actual minimum value for a column.
-
std::optional<std::vector<uint8_t>> max#
-
struct SizeStatistics#
- #include <parquet_schema.hpp>
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
Public Members
-
std::optional<int64_t> unencoded_byte_array_data_bytes#
Number of variable-width bytes stored for the page/chunk. Should not be set for anything but the BYTE_ARRAY physical type.
-
std::optional<std::vector<int64_t>> repetition_level_histogram#
When present, there is expected to be one element corresponding to each repetition (i.e. size=max repetition_level+1) where each element represents the number of times the repetition level was observed in the data.
This value should not be written if max_repetition_level is 0.
-
std::optional<std::vector<int64_t>> definition_level_histogram#
Same as repetition_level_histogram except for definition levels.
This value should not be written if max_definition_level is 0 or 1.
-
std::optional<int64_t> unencoded_byte_array_data_bytes#
-
struct PageLocation#
- #include <parquet_schema.hpp>
Thrift-derived struct describing page location information stored in the offsets index.
-
struct OffsetIndex#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the offset index.
Public Members
-
std::vector<PageLocation> page_locations#
Page locations.
-
std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes#
Per-page size info. See the description of the same field in SizeStatistics. Only present for columns with a BYTE_ARRAY physical type.
-
std::vector<PageLocation> page_locations#
-
struct ColumnIndex#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the column index.
Public Members
-
std::vector<bool> null_pages#
Boolean used to determine if a page contains only null values.
-
std::vector<std::vector<uint8_t>> min_values#
Lower bound for values in each page.
-
std::vector<std::vector<uint8_t>> max_values#
Upper bound for values in each page.
-
BoundaryOrder boundary_order = BoundaryOrder::UNORDERED#
Indicates if min and max values are ordered.
-
std::optional<std::vector<int64_t>> null_counts#
Optional count of null values per page.
-
std::optional<std::vector<int64_t>> repetition_level_histogram#
Repetition level histogram for the column chunk.
-
std::optional<std::vector<int64_t>> definition_level_histogram#
Definition level histogram for the column chunk.
-
std::vector<bool> null_pages#
-
struct PageEncodingStats#
- #include <parquet_schema.hpp>
Thrift-derived struct describing page encoding statistics.
-
struct SortingColumn#
- #include <parquet_schema.hpp>
Thrift-derived struct describing column sort order.
-
struct ColumnChunkMetaData#
- #include <parquet_schema.hpp>
Thrift-derived struct describing a column chunk.
Public Members
-
std::vector<Encoding> encodings#
Set of all encodings used for this column. The purpose is to validate whether we can decode those pages.
-
std::vector<std::string> path_in_schema#
Path in schema.
-
Compression codec = Compression::UNCOMPRESSED#
Compression codec.
-
int64_t num_values = 0#
Number of values in this column.
-
int64_t total_uncompressed_size = 0#
Total byte size of all uncompressed pages in this column chunk (including the headers)
-
int64_t total_compressed_size = 0#
Total byte size of all compressed pages in this column chunk (including the headers)
-
int64_t data_page_offset = 0#
Byte offset from beginning of file to first data page.
-
int64_t index_page_offset = 0#
Byte offset from beginning of file to root index page.
-
int64_t dictionary_page_offset = 0#
Byte offset from the beginning of file to first (only) dictionary page.
-
Statistics statistics#
Optional statistics for this column chunk.
-
std::optional<std::vector<PageEncodingStats>> encoding_stats#
Set of all encodings used for pages in this column chunk. This information can be used to determine if all data pages are dictionary encoded for example.
-
std::optional<int64_t> bloom_filter_offset#
Byte offset from beginning of file to Bloom filter data.
-
std::optional<int32_t> bloom_filter_length#
Size of Bloom filter data including the serialized header, in bytes. Added in 2.10 so readers may not read this field from old files and it can be obtained after the BloomFilterHeader has been deserialized. Writers should write this field so readers can read the bloom filter in a single I/O.
-
std::optional<SizeStatistics> size_statistics#
Optional statistics to help estimate total memory when converted to in-memory representations. The histograms contained in these statistics can also be useful in some cases for more fine-grained nullability/list length filter pushdown.
-
std::vector<Encoding> encodings#
-
struct BloomFilterAlgorithm#
- #include <parquet_schema.hpp>
The algorithm used in bloom filter.
Public Types
Public Members
-
Algorithm algorithm = {Algorithm::SPLIT_BLOCK}#
Bloom filter algorithm.
-
Algorithm algorithm = {Algorithm::SPLIT_BLOCK}#
-
struct BloomFilterHash#
- #include <parquet_schema.hpp>
The hash function used in Bloom filter.
Public Types
-
struct BloomFilterCompression#
- #include <parquet_schema.hpp>
The compression used in the bloom filter.
Public Types
Public Members
-
Compression compression = {Compression::UNCOMPRESSED}#
Bloom filter compression type.
-
Compression compression = {Compression::UNCOMPRESSED}#
-
struct BloomFilterHeader#
- #include <parquet_schema.hpp>
Bloom filter header struct.
The bloom filter data of a column chunk stores this header at the beginning, followed by the filter bitset.
Public Members
-
int32_t num_bytes#
The size of bitset in bytes.
-
BloomFilterAlgorithm algorithm#
The algorithm for setting bits.
-
BloomFilterHash hash#
The hash function used for bloom filter.
-
BloomFilterCompression compression#
The compression used in the bloom filter.
-
int32_t num_bytes#
-
struct ColumnChunk#
- #include <parquet_schema.hpp>
Thrift-derived struct describing a chunk of data for a particular column.
Each column chunk lives in a particular row group and is guaranteed to be contiguous in the file. Any missing or corrupted chunks can be skipped during reading.
Public Members
-
std::string file_path = ""#
File where column data is stored. If not set, assumed to be same file as metadata. This path is relative to the current file.
-
int64_t file_offset = 0#
Deprecated: Byte offset in file_path to the ColumnMetaData.
-
ColumnChunkMetaData meta_data#
Column metadata for this chunk. Some writers may also replicate this at the location pointed to by file_path/file_offset.
-
int64_t offset_index_offset = 0#
File offset of ColumnChunk’s OffsetIndex.
-
int32_t offset_index_length = 0#
Size of ColumnChunk’s OffsetIndex, in bytes.
-
int64_t column_index_offset = 0#
File offset of ColumnChunk’s ColumnIndex.
-
int32_t column_index_length = 0#
Size of ColumnChunk’s ColumnIndex, in bytes.
-
int schema_idx = -1#
Index in flattened schema (derived from path_in_schema)
-
std::optional<OffsetIndex> offset_index#
OffsetIndex for this column chunk
-
std::optional<ColumnIndex> column_index#
ColumnIndex for this column chunk
-
std::string file_path = ""#
-
struct RowGroup#
- #include <parquet_schema.hpp>
Thrift-derived struct describing a group of row data.
There may be one or more row groups within a dataset, with each row group consisting of a column chunk for each column.
Public Members
-
std::vector<ColumnChunk> columns#
Metadata for each column chunk in this row group.
-
int64_t total_byte_size = 0#
Total byte size of all the uncompressed column data in this row group.
-
int64_t num_rows = 0#
Number of rows in this row group.
-
std::optional<std::vector<SortingColumn>> sorting_columns#
If set, specifies a sort ordering of the rows in this RowGroup.
-
std::optional<int64_t> file_offset#
Byte offset from beginning of file to first page (data or dictionary) in this row group.
-
std::optional<int64_t> total_compressed_size#
Total byte size of all compressed (and potentially encrypted) column data in this row group.
-
std::optional<int16_t> ordinal#
Row group ordinal in the file.
-
std::vector<ColumnChunk> columns#
-
struct KeyValue#
- #include <parquet_schema.hpp>
Thrift-derived struct describing a key-value pair, for user metadata.
-
struct FileMetaData#
- #include <parquet_schema.hpp>
Thrift-derived struct describing file-level metadata.
The additional information stored in the key_value_metadata can be used during reading to reconstruct the output data to the exact original dataset prior to conversion to Parquet.
Public Members
-
int32_t version = 0#
Version of this file.
-
std::vector<SchemaElement> schema#
Parquet schema for this file. This schema contains metadata for all the columns. The schema is represented as a tree with a single root. The nodes of the tree are flattened to a list by doing a depth-first traversal. The column metadata contains the path in the schema for that column which can be used to map columns to nodes in the schema. The first element is the root
-
int64_t num_rows = 0#
Number of rows in this file.
-
std::string created_by = ""#
String for application that wrote this file.
-
std::optional<std::vector<ColumnOrder>> column_orders#
Sort order used for the min_value and max_value fields in the Statistics objects and the min_values and max_values fields in the ColumnIndex objects of each column in this file.
-
int32_t version = 0#
-
struct DataPageHeader#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the header for a data page.
Public Members
-
int32_t num_values = 0#
Number of values, including NULLs, in this data page.
-
int32_t num_values = 0#
-
struct DataPageHeaderV2#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the header for a V2 data page.
Public Members
-
int32_t num_values = 0#
Number of values, including NULLs, in this data page.
-
int32_t num_nulls = 0#
Number of NULL values, in this data page.
-
int32_t num_rows = 0#
Number of rows in this data page, which means pages change on record boundaries (r = 0).
-
int32_t definition_levels_byte_length = 0#
Length of the definition levels.
-
int32_t repetition_levels_byte_length = 0#
Length of the repetition levels.
-
bool is_compressed = true#
Whether the values are compressed.
-
int32_t num_values = 0#
-
struct DictionaryPageHeader#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the header for a dictionary page.
-
struct PageHeader#
- #include <parquet_schema.hpp>
Thrift-derived struct describing the page header.
Column data are divided into individual chunks, which are subdivided into pages. Each page has an associated header, describing the page type. There can be multiple page types interleaved in a column chunk, and each page is individually compressed and encoded. Any missing or corrupted pages can be skipped during reading.
Public Members
-
PageType type = PageType::DATA_PAGE#
The type of the page: indicates which of the *_header fields is set.
-
int32_t uncompressed_page_size = 0#
Uncompressed page size in bytes (not including the header)
-
int32_t compressed_page_size = 0#
Compressed page size in bytes (not including the header)
-
DataPageHeader data_page_header#
Data page header.
-
DictionaryPageHeader dictionary_page_header#
Dictionary page header.
-
DataPageHeaderV2 data_page_header_v2#
V2 data page header.
-
PageType type = PageType::DATA_PAGE#
-
class writer_compression_statistics#
- #include <types.hpp>
Statistics about compression performed by a writer.
Public Functions
-
writer_compression_statistics() = default#
Default constructor.
-
inline writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)#
Constructor with initial values.
- Parameters:
num_compressed_bytes – The number of bytes that were successfully compressed
num_failed_bytes – The number of bytes that failed to compress
num_skipped_bytes – The number of bytes that were skipped during compression
num_compressed_output_bytes – The number of bytes in the compressed output
-
inline writer_compression_statistics &operator+=(writer_compression_statistics const &other) noexcept#
Adds the values from another writer_compression_statistics object.
- Parameters:
other – The other writer_compression_statistics object
- Returns:
writer_compression_statistics& Reference to this object
-
inline auto num_compressed_bytes() const noexcept#
Returns the number of bytes in blocks that were successfully compressed.
This is the number of bytes that were actually compressed, not the size of the compressed output.
- Returns:
size_t The number of bytes that were successfully compressed
-
inline auto num_failed_bytes() const noexcept#
Returns the number of bytes in blocks that failed to compress.
- Returns:
size_t The number of bytes that failed to compress
-
inline auto num_skipped_bytes() const noexcept#
Returns the number of bytes in blocks that were skipped during compression.
- Returns:
size_t The number of bytes that were skipped during compression
-
inline auto num_total_input_bytes() const noexcept#
Returns the total size of compression inputs.
- Returns:
size_t The total size of compression inputs
-
inline auto compression_ratio() const noexcept#
Returns the compression ratio for the successfully compressed blocks.
Returns nan if there were no successfully compressed blocks.
- Returns:
double The ratio between the size of the compression inputs and the size of the compressed output.
-
writer_compression_statistics() = default#
-
struct column_name_info#
- #include <types.hpp>
Detailed name (and optionally nullability) information for output columns.
The hierarchy of children matches the hierarchy of children in the output cudf columns.
Public Functions
-
inline column_name_info(std::string _name, std::optional<bool> _is_nullable = std::nullopt, std::optional<bool> _is_binary = std::nullopt)#
Construct a column name info with a name, optional nullability, and no children.
- Parameters:
_name – Column name
_is_nullable – True if column is nullable
_is_binary – True if column is binary data
-
inline bool operator==(column_name_info const &rhs) const#
Compares two column name info structs for equality.
- Parameters:
rhs – column name info struct to compare against
- Returns:
boolean indicating if this and rhs are equal
Public Members
-
std::string name#
Column name.
-
std::optional<bool> is_nullable#
Column nullability.
-
std::optional<bool> is_binary#
Column is binary (i.e. not a list)
-
std::optional<int32_t> type_length#
Byte width of data (for fixed length data)
-
std::vector<column_name_info> children#
Child column names.
-
inline column_name_info(std::string _name, std::optional<bool> _is_nullable = std::nullopt, std::optional<bool> _is_binary = std::nullopt)#
-
struct table_metadata#
- #include <types.hpp>
Table metadata returned by IO readers.
Public Members
-
std::vector<column_name_info> schema_info#
Detailed name information for the entire output hierarchy.
-
std::vector<size_t> num_rows_per_source#
Number of rows read from each data source. Currently only computed for Parquet readers if no AST filters are being used. Empty vector otherwise.
-
std::map<std::string, std::string> user_data#
Format-dependent metadata of the first input file as key-value pairs (deprecated)
-
std::vector<std::unordered_map<std::string, std::string>> per_file_user_data#
Per-file format-dependent metadata as key-value pairs.
-
std::vector<column_name_info> schema_info#
-
struct table_with_metadata#
- #include <types.hpp>
Table with table metadata used by io readers to return the metadata by value.
-
struct source_info#
- #include <types.hpp>
Source information for read interfaces.
Public Functions
-
source_info() = default#
Default constructor for the next-gen parquet reader.
-
inline explicit source_info(std::vector<std::string> file_paths)#
Construct a new source info object for multiple files.
- Parameters:
file_paths – Input file paths
-
inline explicit source_info(std::string file_path)#
Construct a new source info object for a single file.
- Parameters:
file_path – Single input file
-
template<typename T>
inline explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)#
Construct a new source info object for multiple buffers in host memory.
- Parameters:
host_buffers – Input buffers in host memory
-
template<typename T>
inline explicit source_info(cudf::host_span<T> host_data)#
Construct a new source info object for a single buffer.
- Parameters:
host_data – Input buffer in host memory
-
inline explicit source_info(cudf::host_span<cudf::device_span<std::byte const>> device_buffers)#
Construct a new source info object for multiple buffers in device memory.
- Parameters:
device_buffers – Input buffers in device memory
-
inline explicit source_info(cudf::device_span<std::byte const> d_buffer)#
Construct a new source info object from a device buffer.
- Parameters:
d_buffer – Input buffer in device memory
-
inline explicit source_info(std::vector<cudf::io::datasource*> const &sources)#
Construct a new source info object for multiple user-implemented sources.
- Parameters:
sources – User-implemented input sources
-
inline explicit source_info(cudf::io::datasource *source)#
Construct a new source info object for a single user-implemented source.
- Parameters:
source – Single user-implemented input source
-
inline auto type() const#
Get the type of the input.
- Returns:
The type of the input
-
inline auto const &filepaths() const#
Get the filepaths of the input.
- Returns:
The filepaths of the input
-
inline auto const &host_buffers() const#
Get the host buffers of the input.
- Returns:
The host buffers of the input
-
inline auto const &device_buffers() const#
Get the device buffers of the input.
- Returns:
The device buffers of the input
-
inline auto const &user_sources() const#
Get the user sources of the input.
- Returns:
The user sources of the input
-
inline auto num_sources() const#
Get the number of input sources.
- Returns:
The number of input sources
-
source_info() = default#
-
struct sink_info#
- #include <types.hpp>
Destination information for write interfaces.
Public Functions
-
inline sink_info(size_t num_sinks)#
Construct a new sink info object.
- Parameters:
num_sinks – Number of sinks
-
inline explicit sink_info(std::vector<std::string> file_paths)#
Construct a new sink info object for multiple files.
- Parameters:
file_paths – Output file paths
-
inline explicit sink_info(std::string file_path)#
Construct a new sink info object for a single file.
- Parameters:
file_path – Single output file path
-
inline explicit sink_info(std::vector<std::vector<char>*> buffers)#
Construct a new sink info object for multiple host buffers.
- Parameters:
buffers – Output host buffers
-
inline explicit sink_info(std::vector<char> *buffer)#
Construct a new sink info object for a single host buffer.
- Parameters:
buffer – Single output host buffer
-
inline explicit sink_info(std::vector<cudf::io::data_sink*> const &user_sinks)#
Construct a new sink info object for multiple user-implemented sinks.
- Parameters:
user_sinks – Output user-implemented sinks
-
inline explicit sink_info(class cudf::io::data_sink *user_sink)#
Construct a new sink info object for a single user-implemented sink.
- Parameters:
user_sink – Single output user-implemented sink
-
inline auto type() const#
Get the type of the sink.
- Returns:
The type of the sink
-
inline auto num_sinks() const#
Get the number of sinks.
- Returns:
The number of sinks
-
inline auto const &filepaths() const#
Get the filepaths of the output.
- Returns:
The filepaths of the output
-
inline auto const &buffers() const#
Get the host buffers of the output.
- Returns:
The host buffers of the output
-
inline auto const &user_sinks() const#
Get the user sinks of the output.
- Returns:
The user sinks of the output
-
inline sink_info(size_t num_sinks)#
-
class column_in_metadata#
- #include <types.hpp>
Metadata for a column.
Public Functions
-
inline column_in_metadata(std::string_view name)#
Construct a new column in metadata object.
- Parameters:
name – Column name
-
inline column_in_metadata &add_child(column_in_metadata const &child)#
Add the children metadata of this column.
- Parameters:
child – The children metadata of this column to add
- Returns:
this for chaining
-
inline column_in_metadata &set_name(std::string const &name) noexcept#
Set the name of this column.
- Parameters:
name – Name of the column
- Returns:
this for chaining
-
inline column_in_metadata &set_nullability(bool nullable) noexcept#
Set the nullability of this column.
- Parameters:
nullable – Whether this column is nullable
- Returns:
this for chaining
-
inline column_in_metadata &set_list_column_as_map() noexcept#
Specify that this list column should be encoded as a map in the written file.
The column must have the structure list<struct<key, value>>. This option is invalid otherwise
- Returns:
this for chaining
-
inline column_in_metadata &set_int96_timestamps(bool req) noexcept#
Specifies whether this timestamp column should be encoded using the deprecated int96 physical type. Only valid for the following column types: timestamp_s, timestamp_ms, timestamp_us, timestamp_ns.
- Parameters:
req – True = use int96 physical type. False = use int64 physical type
- Returns:
this for chaining
-
inline column_in_metadata &set_decimal_precision(uint8_t precision) noexcept#
Set the decimal precision of this column. Only valid if this column is a decimal (fixed-point) type.
- Parameters:
precision – The integer precision to set for this decimal column
- Returns:
this for chaining
-
inline column_in_metadata &set_type_length(int32_t length) noexcept#
Set the data length of the column. Only valid if this column is a fixed-length byte array.
- Parameters:
length – The data length to set for this column
- Returns:
this for chaining
-
inline column_in_metadata &set_parquet_field_id(int32_t field_id) noexcept#
Set the parquet field id of this column.
- Parameters:
field_id – The parquet field id to set
- Returns:
this for chaining
-
inline column_in_metadata &set_output_as_binary(bool binary) noexcept#
Specifies whether this column should be written as binary or string data. Only valid for the following column types: string.
- Parameters:
binary – True = use binary data type. False = use string data type
- Returns:
this for chaining
-
inline column_in_metadata &set_skip_compression(bool skip) noexcept#
Specifies whether this column should not be compressed regardless of the compression codec specified for the file.
- Parameters:
skip – If true, do not compress this column
- Returns:
this for chaining
-
inline column_in_metadata &set_encoding(column_encoding encoding) noexcept#
Sets the encoding to use for this column.
This is just a request, and the encoder may still choose to use a different encoding depending on resource constraints. Use the constants defined in the parquet_encoding struct.
- Parameters:
encoding – The encoding to use
- Returns:
this for chaining
-
inline column_in_metadata &child(size_type i) noexcept#
Get reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Reference to the child at index i
-
inline column_in_metadata const &child(size_type i) const noexcept#
Get const reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Const reference to the child at index i
-
inline std::string const &get_name() const noexcept#
Get the name of this column.
- Returns:
The name of this column
-
inline bool is_nullability_defined() const noexcept#
Get whether nullability has been explicitly set for this column.
- Returns:
Boolean indicating whether nullability has been explicitly set for this column
-
inline bool nullable() const#
Gets the explicitly set nullability for this column.
- Throws:
std::bad_optional_access – If nullability is not explicitly defined for this column. Check using is_nullability_defined() first.
- Returns:
Boolean indicating whether this column is nullable
-
inline bool is_map() const noexcept#
If this is the metadata of a list column, returns whether it is to be encoded as a map.
- Returns:
Boolean indicating whether this column is to be encoded as a map
-
inline bool is_enabled_int96_timestamps() const noexcept#
Get whether to encode this timestamp column using deprecated int96 physical type.
- Returns:
Boolean indicating whether to encode this timestamp column using deprecated int96 physical type
-
inline bool is_decimal_precision_set() const noexcept#
Get whether precision has been set for this decimal column.
- Returns:
Boolean indicating whether precision has been set for this decimal column
-
inline uint8_t get_decimal_precision() const#
Get the decimal precision that was set for this column.
- Throws:
std::bad_optional_access – If decimal precision was not set for this column. Check using is_decimal_precision_set() first.
- Returns:
The decimal precision that was set for this column
-
inline bool is_type_length_set() const noexcept#
Get whether type length has been set for this column.
- Returns:
Boolean indicating whether type length has been set for this column
-
inline uint8_t get_type_length() const#
Get the type length that was set for this column.
- Throws:
std::bad_optional_access – If type length was not set for this column. Check using is_type_length_set() first.
- Returns:
The type length that was set for this column
-
inline bool is_parquet_field_id_set() const noexcept#
Get whether parquet field id has been set for this column.
- Returns:
Boolean indicating whether parquet field id has been set for this column
-
inline int32_t get_parquet_field_id() const#
Get the parquet field id that was set for this column.
- Throws:
std::bad_optional_access – If parquet field id was not set for this column. Check using is_parquet_field_id_set() first.
- Returns:
The parquet field id that was set for this column
-
inline size_type num_children() const noexcept#
Get the number of children of this column.
- Returns:
The number of children of this column
-
inline bool is_enabled_output_as_binary() const noexcept#
Get whether to encode this column as binary or string data.
- Returns:
Boolean indicating whether to encode this column as binary data
-
inline bool is_enabled_skip_compression() const noexcept#
Get whether to skip compressing this column.
- Returns:
Boolean indicating whether to skip compression of this column
-
inline column_encoding get_encoding() const#
Get the encoding that was set for this column.
- Returns:
The encoding that was set for this column
-
inline column_in_metadata(std::string_view name)#
-
class table_input_metadata#
- #include <types.hpp>
Metadata for a table.
Public Functions
-
explicit table_input_metadata(table_view const &table)#
Construct a new table_input_metadata from a table_view.
The constructed table_input_metadata has the same structure as the passed table_view
- Parameters:
table – The table_view to construct metadata for
-
explicit table_input_metadata(table_metadata const &metadata)#
Construct a new table_input_metadata from a table_metadata object.
The constructed table_input_metadata has the same structure, column names and nullability as the passed table_metadata.
- Parameters:
metadata – The table_metadata to construct table_input_metadata for
Public Members
-
std::vector<column_in_metadata> column_metadata#
List of column metadata.
-
explicit table_input_metadata(table_view const &table)#
-
struct partition_info#
- #include <types.hpp>
Information used while writing partitioned datasets.
This information defines the slice of an input table to write to file. In partitioned dataset writing, one partition_info struct defines one partition and corresponds to one output file
Public Functions
-
inline partition_info(size_type start_row, size_type num_rows)#
Construct a new partition_info.
- Parameters:
start_row – The start row of the partition
num_rows – The number of rows in the partition
-
inline partition_info(size_type start_row, size_type num_rows)#
-
class reader_column_schema#
- #include <types.hpp>
schema element for reader
Public Functions
-
inline reader_column_schema(size_type number_of_children)#
Construct a new reader column schema object.
- Parameters:
number_of_children – number of child schema objects to default construct
-
inline reader_column_schema(host_span<reader_column_schema> const &child_span)#
Construct a new reader column schema object with a span defining the children.
- Parameters:
child_span – span of child schema objects
-
inline reader_column_schema &add_child(reader_column_schema const &child)#
Add the children metadata of this column.
- Parameters:
child – The children metadata of this column to add
- Returns:
this for chaining
-
inline reader_column_schema &child(size_type i)#
Get reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Reference to the child at index i
-
inline reader_column_schema const &child(size_type i) const#
Get const reference to a child of this column.
- Parameters:
i – Index of the child to get
- Returns:
Const reference to the child at index i
-
inline reader_column_schema &set_convert_binary_to_strings(bool convert_to_string)#
Specifies whether this column should be read as binary or string data. Only valid for the following column types: string, list<int8>
- Parameters:
convert_to_string – True = convert binary to strings; False = return binary
- Returns:
this for chaining
-
inline reader_column_schema &set_type_length(int32_t type_length)#
Sets the length of fixed length data.
- Parameters:
type_length – Size of the data type in bytes
- Returns:
this for chaining
-
inline bool is_enabled_convert_binary_to_strings() const#
Get whether to encode this column as binary or string data.
- Returns:
Boolean indicating whether to encode this column as binary data
-
inline int32_t get_type_length() const#
Get the length in bytes of this fixed length data.
- Returns:
The length in bytes of the data type
-
inline size_t get_num_children() const#
Get the number of child objects.
- Returns:
number of children
-
inline reader_column_schema(size_type number_of_children)#
-
using no_statistics = std::monostate#