26 #include <cuda/std/optional>
33 namespace CUDF_EXPORT
cudf {
34 namespace io::parquet {
53 FIXED_LEN_BYTE_ARRAY = 7,
76 TIMESTAMP_MICROS = 10,
101 DELTA_BINARY_PACKED = 5,
102 DELTA_LENGTH_BYTE_ARRAY = 6,
103 DELTA_BYTE_ARRAY = 7,
105 BYTE_STREAM_SPLIT = 9,
199 int32_t precision = 0;
/// Available time units for temporal logical types; the enumerators are stored as uint8_t.
207 enum Type : uint8_t { UNDEFINED, MILLIS, MICROS, NANOS };
220 bool isAdjustedToUTC =
true;
233 bool isAdjustedToUTC =
true;
247 bool isSigned =
false;
325 return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
335 return type == TIME and time_type->unit.type == TimeUnit::MICROS;
345 return type == TIME and time_type->unit.type == TimeUnit::NANOS;
355 return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
365 return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
375 return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
385 return type == INTEGER ? int_type->bitWidth : -1;
/**
 * @brief Check if the integer is signed
 *
 * @return True only when the annotation is INTEGER and its integer type is marked signed
 */
393 [[nodiscard]] constexpr bool is_signed() const
{
  // Any annotation other than INTEGER is reported as not signed without touching int_type
  if (type != INTEGER) { return false; }
  return int_type->isSigned;
}
400 [[nodiscard]] constexpr int32_t
scale()
const
402 return type == DECIMAL ? decimal_type->scale : -1;
412 return type == DECIMAL ? decimal_type->precision : -1;
/// Available column order types; the enumerators are stored as uint8_t.
421 enum Type : uint8_t { UNDEFINED, TYPE_ORDER };
436 int32_t type_length = 0;
440 std::string name =
"";
442 int32_t num_children = 0;
446 int32_t decimal_scale = 0;
448 int32_t decimal_precision = 0;
455 bool output_as_byte_array =
false;
463 int max_definition_level = 0;
465 int max_repetition_level = 0;
515 return repetition_type == FieldRepetitionType::REPEATED && num_children == 1;
530 return repetition_type == FieldRepetitionType::REPEATED and num_children == 0 and
/**
 * @brief Check if the schema element is a list
 *
 * @return True if the converted type of this element equals ConvertedType::LIST
 */
539 [[nodiscard]] bool is_list() const
{
  bool const has_list_annotation = (converted_type == ConvertedType::LIST);
  return has_list_annotation;
}
551 return type == Type::UNDEFINED &&
553 ((repetition_type != FieldRepetitionType::REPEATED) ||
554 (repetition_type == FieldRepetitionType::REPEATED && num_children > 1));
563 std::optional<std::vector<uint8_t>>
max;
565 std::optional<std::vector<uint8_t>>
min;
688 int64_t num_values = 0;
690 int64_t total_uncompressed_size = 0;
692 int64_t total_compressed_size = 0;
694 int64_t data_page_offset = 0;
696 int64_t index_page_offset = 0;
698 int64_t dictionary_page_offset = 0;
/// Available bloom filter hashers; the enumerators are stored as uint8_t.
732 enum Hash : uint8_t { UNDEFINED, XXHASH };
775 std::string file_path =
"";
777 int64_t file_offset = 0;
782 int64_t offset_index_offset = 0;
784 int32_t offset_index_length = 0;
786 int64_t column_index_offset = 0;
788 int32_t column_index_length = 0;
813 int64_t total_byte_size = 0;
815 int64_t num_rows = 0;
852 int64_t num_rows = 0;
858 std::string created_by =
"";
869 int32_t num_values = 0;
873 Encoding definition_level_encoding = Encoding::PLAIN;
875 Encoding repetition_level_encoding = Encoding::PLAIN;
883 int32_t num_values = 0;
885 int32_t num_nulls = 0;
888 int32_t num_rows = 0;
892 int32_t definition_levels_byte_length = 0;
894 int32_t repetition_levels_byte_length = 0;
896 bool is_compressed =
true;
904 int32_t num_values = 0;
922 int32_t uncompressed_page_size = 0;
924 int32_t compressed_page_size = 0;
ConvertedType
High-level data types in Parquet, determines how data is logically interpreted.
FieldRepetitionType
Repetition type of a field in the Parquet schema.
Encoding
Encoding types for the actual data stream.
Type
Basic data types in Parquet, determines how data is physically stored.
FieldType
Thrift compact protocol struct field types.
Compression
Compression codec used for compressed data pages.
BoundaryOrder
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so, in which direction.
int32_t size_type
Row index type for columns and tables.
The algorithm used in bloom filter.
Algorithm
Available bloom filter algorithms.
The compression used in the bloom filter.
Compression
Available bloom filter compression types.
The hash function used in Bloom filter.
Hash
Available bloom filter hashers.
Thrift-derived struct describing a chunk of data for a particular column.
std::optional< ColumnIndex > column_index
ColumnIndex for this column chunk
std::optional< OffsetIndex > offset_index
OffsetIndex for this column chunk
ColumnChunkMetaData meta_data
Thrift-derived struct describing the column index.
std::optional< std::vector< int64_t > > definition_level_histogram
Definition level histogram for the column chunk.
std::optional< std::vector< int64_t > > null_counts
Optional count of null values per page.
std::vector< std::vector< uint8_t > > max_values
Upper bound for values in each page.
std::optional< std::vector< int64_t > > repetition_level_histogram
Repetition level histogram for the column chunk.
std::vector< bool > null_pages
Boolean used to determine if a page contains only null values.
std::vector< std::vector< uint8_t > > min_values
Lower bound for values in each page.
Union to specify the order used for the min_value and max_value fields for a column.
Type
Available column order types.
Type type
Column order type.
Struct that describes the decimal logical type annotation.
Struct that describes the integer logical type annotation.
Thrift-derived struct describing a key-value pair, for user metadata.
std::string key
string key
std::string value
string value
Struct that describes the logical type annotation.
constexpr CUDF_HOST_DEVICE bool is_time_millis() const
Check if the time is in milliseconds.
LogicalType(Type tp=Type::UNDEFINED)
Default constructor.
cuda::std::optional< IntType > int_type
Integer type.
cuda::std::optional< TimeType > time_type
Time type.
constexpr CUDF_HOST_DEVICE bool is_timestamp_millis() const
Check if the timestamp is in milliseconds.
constexpr CUDF_HOST_DEVICE bool is_timestamp_micros() const
Check if the timestamp is in microseconds.
constexpr int32_t scale() const
Get the scale of the decimal type.
constexpr CUDF_HOST_DEVICE bool is_time_nanos() const
Check if the time is in nanoseconds.
LogicalType(IntType &&it)
Constructor for Integer logical type.
Type
Logical type annotations to replace ConvertedType.
constexpr bool is_signed() const
Check if the integer is signed.
constexpr CUDF_HOST_DEVICE bool is_timestamp_nanos() const
Check if the timestamp is in nanoseconds.
cuda::std::optional< DecimalType > decimal_type
Decimal type.
LogicalType(TimeType &&tt)
Constructor for Time logical type.
cuda::std::optional< TimestampType > timestamp_type
Timestamp type.
LogicalType(TimestampType &&tst)
Constructor for Timestamp logical type.
constexpr CUDF_HOST_DEVICE int32_t precision() const
Get the precision of the decimal type.
constexpr CUDF_HOST_DEVICE int8_t bit_width() const
Get the bit width of the integer type.
constexpr CUDF_HOST_DEVICE bool is_time_micros() const
Check if the time is in microseconds.
LogicalType(DecimalType &&dt)
Constructor for Decimal logical type.
Thrift-derived struct describing the offset index.
std::vector< PageLocation > page_locations
Page locations.
std::optional< std::vector< int64_t > > unencoded_byte_array_data_bytes
Thrift-derived struct describing page encoding statistics.
Encoding encoding
Encoding of the page.
int32_t count
Number of pages of this type with this encoding.
PageType page_type
The page type (data/dictionary/...)
Thrift-derived struct describing page location information stored in the offsets index.
int32_t compressed_page_size
Compressed page size in bytes plus the header length.
int64_t offset
Offset of the page in the file.
Thrift-derived struct describing a group of row data.
std::optional< int16_t > ordinal
Row group ordinal in the file.
std::optional< int64_t > file_offset
Byte offset from beginning of file to first page (data or dictionary) in this row group.
std::optional< std::vector< SortingColumn > > sorting_columns
If set, specifies a sort ordering of the rows in this RowGroup.
std::vector< ColumnChunk > columns
Metadata for each column chunk in this row group.
std::optional< int64_t > total_compressed_size
Total byte size of all compressed (and potentially encrypted) column data in this row group.
Struct for describing an element/field in the Parquet format schema.
Type type
1: parquet physical type for output
std::optional< type_id > arrow_type
cudf type determined from arrow:schema
int32_t decimal_precision
8: DEPRECATED: record the precision for DECIMAL converted type
std::optional< LogicalType > logical_type
10: replaces converted type
std::optional< int32_t > field_id
9: save field_id from original schema
bool is_struct() const
Check if the schema element is a struct.
std::string name
4: name of the field
int32_t decimal_scale
7: DEPRECATED: record the scale for DECIMAL converted type
bool is_stub() const
Check if the schema element is a stub.
bool is_one_level_list(SchemaElement const &parent) const
Check if the schema element is a one-level list.
int32_t num_children
5: nested fields
int32_t type_length
2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types
std::vector< size_type > children_idx
Children indices.
std::optional< ConvertedType > converted_type
6: DEPRECATED: record the original type before conversion to parquet type
bool operator==(SchemaElement const &other) const
Check if two schema elements are equal.
bool is_list() const
Check if the schema element is a list.
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
std::optional< std::vector< int64_t > > repetition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
std::optional< int64_t > unencoded_byte_array_data_bytes
Thrift-derived struct describing column sort order.
bool nulls_first
If true, nulls will come before non-null values.
bool descending
If true, indicates this column is sorted in descending order.
int32_t column_idx
The column index (in this row group)
Thrift-derived struct describing column chunk statistics.
std::optional< bool > is_min_value_exact
If true, min_value is the actual minimum value for a column.
std::optional< std::vector< uint8_t > > max_value
max value for column determined by ColumnOrder
std::optional< int64_t > null_count
count of null values in the column
std::optional< std::vector< uint8_t > > max
deprecated max value in signed comparison order
std::optional< std::vector< uint8_t > > min_value
min value for column determined by ColumnOrder
std::optional< bool > is_max_value_exact
If true, max_value is the actual maximum value for a column.
std::optional< std::vector< uint8_t > > min
deprecated min value in signed comparison order
std::optional< int64_t > distinct_count
count of distinct values occurring
Struct that describes the time logical type annotation.
Time units for temporal logical types.
Type
Available time units.
Struct that describes the timestamp logical type annotation.
Struct that describes the Parquet file data postscript.
uint32_t footer_len
Length of the footer.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Type declarations for libcudf.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.