parquet_schema.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
22 #pragma once
23 
24 #include <cudf/types.hpp>
25 
26 #include <cuda/std/optional>
27 
28 #include <cstdint>
29 #include <optional>
30 #include <string>
31 #include <vector>
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io::parquet {
/**
 * @brief Basic data types in Parquet, determines how data is physically stored.
 *
 * The underlying type is int8_t so that the UNDEFINED sentinel (-1) is representable.
 */
44 enum class Type : int8_t {
45  UNDEFINED = -1, // Undefined for non-leaf nodes
46  BOOLEAN = 0,
47  INT32 = 1,
48  INT64 = 2,
49  INT96 = 3, // Deprecated
50  FLOAT = 4,
51  DOUBLE = 5,
52  BYTE_ARRAY = 6,
53  FIXED_LEN_BYTE_ARRAY = 7,
54 };
55 
/**
 * @brief High-level data types in Parquet, determines how data is logically interpreted.
 *
 * The underlying type is int8_t so that the UNKNOWN sentinel (-1) is representable.
 * Values 22 through 24 are not assigned in this enum.
 */
59 enum class ConvertedType : int8_t {
60  UNKNOWN = -1, // No type information present
61  UTF8 = 0, // a BYTE_ARRAY may contain UTF8 encoded chars
62  MAP = 1, // a map is converted as an optional field containing a repeated key/value pair
63  MAP_KEY_VALUE = 2, // a key/value pair is converted into a group of two fields
64  LIST =
65  3, // a list is converted into an optional field containing a repeated field for its values
66  ENUM = 4, // an enum is converted into a binary field
67  DECIMAL = 5, // A decimal value. 10^(-scale) encoded as 2's complement big endian
68  // (precision=number of digits, scale=location of decimal point)
69  DATE = 6, // A Date, stored as days since Unix epoch, encoded as the INT32 physical type.
70  TIME_MILLIS = 7, // A time. The total number of milliseconds since midnight. The value is stored
71  // as an INT32 physical type.
72  TIME_MICROS = 8, // A time. The total number of microseconds since midnight. The value is stored
73  // as an INT64 physical type.
74  TIMESTAMP_MILLIS = 9, // A date/time combination, recorded as milliseconds since the Unix epoch
75  // using physical type of INT64.
76  TIMESTAMP_MICROS = 10, // A date/time combination, microseconds since the Unix epoch as INT64
77  UINT_8 = 11, // An unsigned integer 8-bit value as INT32
78  UINT_16 = 12, // An unsigned integer 16-bit value as INT32
79  UINT_32 = 13, // An unsigned integer 32-bit value as INT32
80  UINT_64 = 14, // An unsigned integer 64-bit value as INT64
81  INT_8 = 15, // A signed integer 8-bit value as INT32
82  INT_16 = 16, // A signed integer 16-bit value as INT32
83  INT_32 = 17, // A signed integer 32-bit value as INT32
84  INT_64 = 18, // A signed integer 64-bit value as INT64
85  JSON = 19, // A JSON document embedded within a single UTF8 column.
86  BSON = 20, // A BSON document embedded within a single BINARY column.
87  INTERVAL = 21, // This type annotates a time interval stored as a FIXED_LEN_BYTE_ARRAY of length
88  // 12 for 3 integers {months,days,milliseconds}
89  NA = 25, // No Type information, For eg, all-nulls.
90 };
91 
/**
 * @brief Encoding types for the actual data stream.
 */
95 enum class Encoding : uint8_t {
96  PLAIN = 0,
97  GROUP_VAR_INT = 1, // Deprecated, never used
98  PLAIN_DICTIONARY = 2,
99  RLE = 3,
100  BIT_PACKED = 4, // Deprecated by parquet-format in 2013, superseded by RLE
101  DELTA_BINARY_PACKED = 5,
102  DELTA_LENGTH_BYTE_ARRAY = 6,
103  DELTA_BYTE_ARRAY = 7,
104  RLE_DICTIONARY = 8,
105  BYTE_STREAM_SPLIT = 9,
// NOTE(review): one past the last encoding above; presumably a count sentinel rather than a
// real encoding -- confirm against users of this value.
106  NUM_ENCODINGS = 10,
107 };
108 
/**
 * @brief Compression codec used for compressed data pages.
 */
112 enum class Compression : uint8_t {
113  UNCOMPRESSED = 0,
114  SNAPPY = 1,
115  GZIP = 2,
116  LZO = 3,
117  BROTLI = 4, // Added in 2.3.2
118  LZ4 = 5, // deprecated; based on LZ4, but with an additional undocumented framing scheme
119  ZSTD = 6, // Added in 2.3.2
120  LZ4_RAW = 7, // "standard" LZ4 block format
121 };
122 
/**
 * @brief Repetition type of a schema field: whether the field is required, optional, or
 * repeated.
 *
 * The underlying type is int8_t so that the UNSPECIFIED sentinel (-1) is representable.
 */
126 enum class FieldRepetitionType : int8_t {
127  UNSPECIFIED = -1,
128  REQUIRED = 0, // This field is required (can not be null) and each record has exactly 1 value.
129  OPTIONAL = 1, // The field is optional (can be null) and each record has 0 or 1 values.
130  REPEATED = 2, // The field is repeated and can contain 0 or more values
131 };
132 
/**
 * @brief Types of pages.
 */
136 enum class PageType : uint8_t {
137  DATA_PAGE = 0,
138  INDEX_PAGE = 1,
139  DICTIONARY_PAGE = 2,
140  DATA_PAGE_V2 = 3,
141 };
142 
/**
 * @brief Enum to annotate whether the lists of min/max elements inside ColumnIndex are
 * ordered and, if so, whether that order is ascending or descending.
 */
147 enum class BoundaryOrder : uint8_t {
148  UNORDERED = 0,
149  ASCENDING = 1,
150  DESCENDING = 2,
151 };
152 
/**
 * @brief Thrift compact protocol struct field types.
 *
 * Numbering starts at 1; no enumerator is defined for 0.
 */
156 enum class FieldType : uint8_t {
157  BOOLEAN_TRUE = 1,
158  BOOLEAN_FALSE = 2,
159  I8 = 3,
160  I16 = 4,
161  I32 = 5,
162  I64 = 6,
163  DOUBLE = 7,
164  BINARY = 8,
165  LIST = 9,
166  SET = 10,
167  MAP = 11,
168  STRUCT = 12,
169  UUID = 13,
170 };
171 
177  uint32_t magic;
178 };
179 
/**
 * @brief Struct that describes the Parquet file data postscript.
 */
183 struct file_ender_s {
  /// Length of the footer.
185  uint32_t footer_len;
  /// Parquet 4-byte magic number "PAR1".
187  uint32_t magic;
188 };
189 
/**
 * @brief Struct that describes the decimal logical type annotation.
 */
195 struct DecimalType {
  /// Scale of the decimal (location of the decimal point); defaults to 0.
197  int32_t scale = 0;
  /// Precision of the decimal (number of digits); defaults to 0.
199  int32_t precision = 0;
200 };
201 
/**
 * @brief Time units for temporal logical types.
 */
205 struct TimeUnit {
  /// Available time units. No explicit values are given, so UNDEFINED is 0.
207  enum Type : uint8_t { UNDEFINED, MILLIS, MICROS, NANOS };
210 };
211 
/**
 * @brief Struct that describes the time logical type annotation.
 */
217 struct TimeType {
  /// Defaults to true.
  /// NOTE(review): presumably flags values as normalized to UTC -- confirm against the
  /// Parquet logical type specification.
220  bool isAdjustedToUTC = true;
  /// Time unit; defaults to MILLIS.
222  TimeUnit unit = {TimeUnit::Type::MILLIS};
223 };
224 
233  bool isAdjustedToUTC = true;
235  TimeUnit unit = {TimeUnit::Type::MILLIS};
236 };
237 
/**
 * @brief Struct that describes the integer logical type annotation.
 */
243 struct IntType {
  /// Bit width of the integer type; defaults to 0.
245  int8_t bitWidth = 0;
  /// Whether the integer is signed; defaults to false.
247  bool isSigned = false;
248 };
249 
253 struct LogicalType {
  /**
   * @brief Logical type annotations to replace ConvertedType.
   *
   * Enumerators are numbered implicitly from 0 (UNDEFINED) through 8 (TIMESTAMP); value 9
   * is skipped, INTEGER is pinned to 10 and the remaining enumerators continue from there
   * (UNKNOWN = 11, JSON = 12, BSON = 13).
   */
255  enum Type : uint8_t {
256  UNDEFINED,
257  STRING,
258  MAP,
259  LIST,
260  ENUM,
261  DECIMAL,
262  DATE,
263  TIME,
264  TIMESTAMP,
265  // 9 is reserved
266  INTEGER = 10,
267  UNKNOWN,
268  JSON,
269  BSON
270  };
271 
275  cuda::std::optional<DecimalType> decimal_type;
277  cuda::std::optional<TimeType> time_type;
279  cuda::std::optional<TimestampType> timestamp_type;
281  cuda::std::optional<IntType> int_type;
282 
288  LogicalType(Type tp = Type::UNDEFINED) : type(tp) {}
289 
295  LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
296 
302  LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}
303 
309  LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
310 
316  LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
317 
323  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const
324  {
325  return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
326  }
327 
333  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const
334  {
335  return type == TIME and time_type->unit.type == TimeUnit::MICROS;
336  }
337 
343  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const
344  {
345  return type == TIME and time_type->unit.type == TimeUnit::NANOS;
346  }
347 
353  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const
354  {
355  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
356  }
357 
363  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const
364  {
365  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
366  }
367 
373  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const
374  {
375  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
376  }
377 
383  [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const
384  {
385  return type == INTEGER ? int_type->bitWidth : -1;
386  }
387 
393  [[nodiscard]] constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
394 
400  [[nodiscard]] constexpr int32_t scale() const
401  {
402  return type == DECIMAL ? decimal_type->scale : -1;
403  }
404 
410  [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const
411  {
412  return type == DECIMAL ? decimal_type->precision : -1;
413  }
414 };
415 
/**
 * @brief Union to specify the order used for the min_value and max_value fields for a column.
 */
419 struct ColumnOrder {
  /// Available column order types.
421  enum Type : uint8_t { UNDEFINED, TYPE_ORDER };
424 };
425 
434  Type type = Type::UNDEFINED;
436  int32_t type_length = 0;
438  FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED;
440  std::string name = "";
442  int32_t num_children = 0;
444  std::optional<ConvertedType> converted_type;
446  int32_t decimal_scale = 0;
448  int32_t decimal_precision = 0;
450  std::optional<int32_t> field_id;
452  std::optional<LogicalType> logical_type;
453 
455  bool output_as_byte_array = false;
456 
458  std::optional<type_id> arrow_type;
459 
460  // The following fields are filled in later during schema initialization
461 
463  int max_definition_level = 0;
465  int max_repetition_level = 0;
467  size_type parent_idx = 0;
469  std::vector<size_type> children_idx;
470 
477  bool operator==(SchemaElement const& other) const
478  {
479  return type == other.type && converted_type == other.converted_type &&
480  type_length == other.type_length && name == other.name &&
481  num_children == other.num_children && decimal_scale == other.decimal_scale &&
482  decimal_precision == other.decimal_precision && field_id == other.field_id;
483  }
484 
485  // the parquet format is a little squishy when it comes to interpreting
486  // repeated fields. sometimes repeated fields act as "stubs" in the schema
487  // that don't represent a true nesting level.
488  //
489  // this is the case with plain lists:
490  //
491  // optional group my_list (LIST) {
492  // repeated group element { <-- not part of the output hierarchy
493  // required binary str (UTF8);
494  // };
495  // }
496  //
497  // However, for backwards compatibility reasons, there are a few special cases, namely
498  // List<Struct<>> (which also corresponds to how the map type is specified), where
499  // this does not hold true
500  //
501  // optional group my_list (LIST) {
502  // repeated group element { <-- part of the hierarchy because it represents a struct
503  // required binary str (UTF8);
504  // required int32 num;
505  // };
506  // }
507 
513  [[nodiscard]] bool is_stub() const
514  {
515  return repetition_type == FieldRepetitionType::REPEATED && num_children == 1;
516  }
517 
528  [[nodiscard]] bool is_one_level_list(SchemaElement const& parent) const
529  {
530  return repetition_type == FieldRepetitionType::REPEATED and num_children == 0 and
531  not parent.is_list();
532  }
533 
539  [[nodiscard]] bool is_list() const { return converted_type == ConvertedType::LIST; }
540 
549  [[nodiscard]] bool is_struct() const
550  {
551  return type == Type::UNDEFINED &&
552  // this assumption might be a little weak.
553  ((repetition_type != FieldRepetitionType::REPEATED) ||
554  (repetition_type == FieldRepetitionType::REPEATED && num_children > 1));
555  }
556 };
557 
/**
 * @brief Thrift-derived struct describing column chunk statistics.
 */
561 struct Statistics {
  /// deprecated max value in signed comparison order
563  std::optional<std::vector<uint8_t>> max;
  /// deprecated min value in signed comparison order
565  std::optional<std::vector<uint8_t>> min;
  /// count of null values in the column
567  std::optional<int64_t> null_count;
  /// count of distinct values occurring
569  std::optional<int64_t> distinct_count;
  /// max value for column determined by ColumnOrder
571  std::optional<std::vector<uint8_t>> max_value;
  /// min value for column determined by ColumnOrder
573  std::optional<std::vector<uint8_t>> min_value;
  /// If true, max_value is the actual maximum value for a column.
575  std::optional<bool> is_max_value_exact;
  /// If true, min_value is the actual minimum value for a column.
577  std::optional<bool> is_min_value_exact;
578 };
579 
586  std::optional<int64_t> unencoded_byte_array_data_bytes;
595  std::optional<std::vector<int64_t>> repetition_level_histogram;
596 
602  std::optional<std::vector<int64_t>> definition_level_histogram;
603 };
604 
/**
 * @brief Thrift-derived struct describing page location information stored in the offsets
 * index.
 */
609 struct PageLocation {
  /// Offset of the page in the file.
611  int64_t offset;
617 };
618 
/**
 * @brief Thrift-derived struct describing the offset index.
 */
622 struct OffsetIndex {
  /// Page locations.
624  std::vector<PageLocation> page_locations;
  /// Optional list of 64-bit values.
  /// NOTE(review): presumably one unencoded byte-array size per entry of `page_locations` --
  /// confirm against the Parquet format specification.
627  std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
628 };
629 
/**
 * @brief Thrift-derived struct describing the column index.
 */
633 struct ColumnIndex {
  /// Boolean used to determine if a page contains only null values.
635  std::vector<bool> null_pages;
  /// Lower bound for values in each page.
637  std::vector<std::vector<uint8_t>> min_values;
  /// Upper bound for values in each page.
639  std::vector<std::vector<uint8_t>> max_values;
  /// Ordering of the min/max lists above; defaults to UNORDERED.
641  BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
  /// Optional count of null values per page.
643  std::optional<std::vector<int64_t>> null_counts;
  /// Repetition level histogram for the column chunk.
645  std::optional<std::vector<int64_t>> repetition_level_histogram;
  /// Definition level histogram for the column chunk.
647  std::optional<std::vector<int64_t>> definition_level_histogram;
648 };
649 
659  int32_t count;
660 };
661 
667  int32_t column_idx;
672 };
673 
679  Type type = Type::BOOLEAN;
682  std::vector<Encoding> encodings;
684  std::vector<std::string> path_in_schema;
686  Compression codec = Compression::UNCOMPRESSED;
688  int64_t num_values = 0;
690  int64_t total_uncompressed_size = 0;
692  int64_t total_compressed_size = 0;
694  int64_t data_page_offset = 0;
696  int64_t index_page_offset = 0;
698  int64_t dictionary_page_offset = 0;
703  std::optional<std::vector<PageEncodingStats>> encoding_stats;
705  std::optional<int64_t> bloom_filter_offset;
710  std::optional<int32_t> bloom_filter_length;
714  std::optional<SizeStatistics> size_statistics;
715 };
716 
722  enum Algorithm : uint8_t { UNDEFINED, SPLIT_BLOCK };
724  Algorithm algorithm{Algorithm::SPLIT_BLOCK};
725 };
726 
732  enum Hash : uint8_t { UNDEFINED, XXHASH };
734  Hash hash{Hash::XXHASH};
735 };
736 
742  enum Compression : uint8_t { UNDEFINED, UNCOMPRESSED };
744  Compression compression{Compression::UNCOMPRESSED};
745 };
746 
755  int32_t num_bytes;
762 };
763 
/**
 * @brief Thrift-derived struct describing a chunk of data for a particular column.
 */
772 struct ColumnChunk {
  // The path, offset and index offset/length members below default to an empty string or
  // zero when not set.
775  std::string file_path = "";
777  int64_t file_offset = 0;
782  int64_t offset_index_offset = 0;
784  int32_t offset_index_length = 0;
786  int64_t column_index_offset = 0;
788  int32_t column_index_length = 0;
789 
790  // Following fields are derived from other fields
791 
  /// Defaults to -1 until it is derived.
  /// NOTE(review): presumably an index into the file's schema vector -- confirm at the use
  /// site.
793  int schema_idx = -1;
794 
795  // The indexes don't really live here, but it's a convenient place to hang them.
796 
  /// OffsetIndex for this column chunk
798  std::optional<OffsetIndex> offset_index;
  /// ColumnIndex for this column chunk
800  std::optional<ColumnIndex> column_index;
801 };
802 
/**
 * @brief Thrift-derived struct describing a group of row data.
 */
809 struct RowGroup {
  /// Metadata for each column chunk in this row group.
811  std::vector<ColumnChunk> columns;
  /// Defaults to 0.
  /// NOTE(review): presumably the total byte size of the column data in this row group --
  /// confirm against the Parquet format specification.
813  int64_t total_byte_size = 0;
  /// Defaults to 0. NOTE(review): presumably the number of rows in this row group -- confirm.
815  int64_t num_rows = 0;
  /// If set, specifies a sort ordering of the rows in this RowGroup.
817  std::optional<std::vector<SortingColumn>> sorting_columns;
  /// Byte offset from beginning of file to first page (data or dictionary) in this row group.
819  std::optional<int64_t> file_offset;
  /// Total byte size of all compressed (and potentially encrypted) column data in this row
  /// group.
821  std::optional<int64_t> total_compressed_size;
  /// Row group ordinal in the file.
823  std::optional<int16_t> ordinal;
824 };
825 
/**
 * @brief Thrift-derived struct describing a key-value pair, for user metadata.
 */
829 struct KeyValue {
  /// string key
831  std::string key;
  /// string value
833  std::string value;
834 };
835 
/**
 * @brief Thrift-derived struct describing file-level metadata.
 */
843 struct FileMetaData {
  /// Defaults to 0. NOTE(review): presumably the Parquet format version -- confirm.
845  int32_t version = 0;
  /// Schema elements of the file.
  /// NOTE(review): presumably a flattened tree linked through each element's
  /// `num_children` -- confirm against the reader.
850  std::vector<SchemaElement> schema;
  /// Defaults to 0. NOTE(review): presumably the total number of rows in the file -- confirm.
852  int64_t num_rows = 0;
  /// Row groups in this file.
854  std::vector<RowGroup> row_groups;
  /// Optional key/value metadata.
856  std::vector<KeyValue> key_value_metadata;
  /// Defaults to an empty string.
  /// NOTE(review): presumably identifies the application that wrote the file -- confirm.
858  std::string created_by = "";
  /// Optional list of ColumnOrder entries.
  /// NOTE(review): presumably one per leaf column of the schema -- confirm.
861  std::optional<std::vector<ColumnOrder>> column_orders;
862 };
863 
869  int32_t num_values = 0;
871  Encoding encoding = Encoding::PLAIN;
873  Encoding definition_level_encoding = Encoding::PLAIN;
875  Encoding repetition_level_encoding = Encoding::PLAIN;
876 };
877 
883  int32_t num_values = 0;
885  int32_t num_nulls = 0;
888  int32_t num_rows = 0;
890  Encoding encoding = Encoding::PLAIN;
892  int32_t definition_levels_byte_length = 0;
894  int32_t repetition_levels_byte_length = 0;
896  bool is_compressed = true;
897 };
898 
904  int32_t num_values = 0;
906  Encoding encoding = Encoding::PLAIN;
907 };
908 
/**
 * @brief Thrift-derived struct describing the page header.
 */
918 struct PageHeader {
  /// Kind of page this header describes; defaults to DATA_PAGE.
920  PageType type = PageType::DATA_PAGE;
  /// Defaults to 0. NOTE(review): presumably the page size in bytes before compression --
  /// confirm.
922  int32_t uncompressed_page_size = 0;
  /// Defaults to 0. NOTE(review): presumably the page size in bytes after compression --
  /// confirm.
924  int32_t compressed_page_size = 0;
925 
926  // Headers for page specific data. One only will be set.
927 
934 };
935  // end of group
937 } // namespace io::parquet
938 } // namespace CUDF_EXPORT cudf
ConvertedType
High-level data types in Parquet, determines how data is logically interpreted.
FieldRepetitionType
Repetition type of a schema field: required, optional, or repeated.
PageType
Types of pages.
Encoding
Encoding types for the actual data stream.
Type
Basic data types in Parquet, determines how data is physically stored.
FieldType
Thrift compact protocol struct field types.
Compression
Compression codec used for compressed data pages.
BoundaryOrder
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and, if so, in which direction.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF interfaces
Definition: host_udf.hpp:37
The algorithm used in bloom filter.
Algorithm
Available bloom filter algorithms.
The compression used in the bloom filter.
Compression
Available bloom filter compression types.
The hash function used in Bloom filter.
Hash
Available bloom filter hashers.
Bloom filter header struct.
BloomFilterCompression compression
The compression used in the bloom filter.
BloomFilterHash hash
The hash function used for bloom filter.
BloomFilterAlgorithm algorithm
The algorithm for setting bits.
int32_t num_bytes
The size of bitset in bytes.
Thrift-derived struct describing a column chunk.
std::optional< std::vector< PageEncodingStats > > encoding_stats
Statistics statistics
Optional statistics for this column chunk.
std::optional< int64_t > bloom_filter_offset
Byte offset from beginning of file to Bloom filter data.
std::optional< int32_t > bloom_filter_length
std::optional< SizeStatistics > size_statistics
std::vector< std::string > path_in_schema
Path in schema.
Thrift-derived struct describing a chunk of data for a particular column.
std::optional< ColumnIndex > column_index
ColumnIndex for this column chunk
std::optional< OffsetIndex > offset_index
OffsetIndex for this column chunk
Thrift-derived struct describing the column index.
std::optional< std::vector< int64_t > > definition_level_histogram
Definition level histogram for the column chunk.
std::optional< std::vector< int64_t > > null_counts
Optional count of null values per page.
std::vector< std::vector< uint8_t > > max_values
Upper bound for values in each page.
std::optional< std::vector< int64_t > > repetition_level_histogram
Repetition level histogram for the column chunk.
std::vector< bool > null_pages
Boolean used to determine if a page contains only null values.
std::vector< std::vector< uint8_t > > min_values
Lower bound for values in each page.
Union to specify the order used for the min_value and max_value fields for a column.
Type
Available column order types.
Type type
Column order type.
Thrift-derived struct describing the header for a V2 data page.
Thrift-derived struct describing the header for a data page.
Struct that describes the decimal logical type annotation.
Thrift-derived struct describing the header for a dictionary page.
Thrift-derived struct describing file-level metadata.
std::optional< std::vector< ColumnOrder > > column_orders
std::vector< RowGroup > row_groups
Row groups in this file.
std::vector< KeyValue > key_value_metadata
Optional key/value metadata.
std::vector< SchemaElement > schema
Struct that describes the integer logical type annotation.
Thrift-derived struct describing a key-value pair, for user metadata.
std::string key
string key
std::string value
string value
Struct that describes the logical type annotation.
constexpr CUDF_HOST_DEVICE bool is_time_millis() const
Check if the time is in milliseconds.
LogicalType(Type tp=Type::UNDEFINED)
Default constructor.
cuda::std::optional< IntType > int_type
Integer type.
cuda::std::optional< TimeType > time_type
Time type.
constexpr CUDF_HOST_DEVICE bool is_timestamp_millis() const
Check if the timestamp is in milliseconds.
constexpr CUDF_HOST_DEVICE bool is_timestamp_micros() const
Check if the timestamp is in microseconds.
constexpr int32_t scale() const
Get the scale of the decimal type.
constexpr CUDF_HOST_DEVICE bool is_time_nanos() const
Check if the time is in nanoseconds.
LogicalType(IntType &&it)
Constructor for Integer logical type.
Type
Logical type annotations to replace ConvertedType.
constexpr bool is_signed() const
Check if the integer is signed.
constexpr CUDF_HOST_DEVICE bool is_timestamp_nanos() const
Check if the timestamp is in nanoseconds.
cuda::std::optional< DecimalType > decimal_type
Decimal type.
LogicalType(TimeType &&tt)
Constructor for Time logical type.
cuda::std::optional< TimestampType > timestamp_type
Timestamp type.
LogicalType(TimestampType &&tst)
Constructor for Timestamp logical type.
constexpr CUDF_HOST_DEVICE int32_t precision() const
Get the precision of the decimal type.
constexpr CUDF_HOST_DEVICE int8_t bit_width() const
Get the bit width of the integer type.
constexpr CUDF_HOST_DEVICE bool is_time_micros() const
Check if the time is in microseconds.
LogicalType(DecimalType &&dt)
Constructor for Decimal logical type.
Thrift-derived struct describing the offset index.
std::vector< PageLocation > page_locations
Page locations.
std::optional< std::vector< int64_t > > unencoded_byte_array_data_bytes
Thrift-derived struct describing page encoding statistics.
Encoding encoding
Encoding of the page.
int32_t count
Number of pages of this type with this encoding.
PageType page_type
The page type (data/dic/...)
Thrift-derived struct describing the page header.
DataPageHeader data_page_header
Data page header.
DictionaryPageHeader dictionary_page_header
Dictionary page header.
DataPageHeaderV2 data_page_header_v2
V2 data page header.
Thrift-derived struct describing page location information stored in the offsets index.
int32_t compressed_page_size
Compressed page size in bytes plus the header length.
int64_t offset
Offset of the page in the file.
Thrift-derived struct describing a group of row data.
std::optional< int16_t > ordinal
Row group ordinal in the file.
std::optional< int64_t > file_offset
Byte offset from beginning of file to first page (data or dictionary) in this row group.
std::optional< std::vector< SortingColumn > > sorting_columns
If set, specifies a sort ordering of the rows in this RowGroup.
std::vector< ColumnChunk > columns
Metadata for each column chunk in this row group.
std::optional< int64_t > total_compressed_size
Total byte size of all compressed (and potentially encrypted) column data in this row group.
Struct for describing an element/field in the Parquet format schema.
Type type
1: parquet physical type for output
std::optional< type_id > arrow_type
cudf type determined from arrow:schema
int32_t decimal_precision
8: DEPRECATED: record the precision for DECIMAL converted type
std::optional< LogicalType > logical_type
10: replaces converted type
std::optional< int32_t > field_id
9: save field_id from original schema
bool is_struct() const
Check if the schema element is a struct.
std::string name
4: name of the field
int32_t decimal_scale
7: DEPRECATED: record the scale for DECIMAL converted type
bool is_stub() const
Check if the schema element is a stub.
bool is_one_level_list(SchemaElement const &parent) const
Check if the schema element is a one-level list.
int32_t num_children
5: nested fields
int32_t type_length
2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types
std::vector< size_type > children_idx
Children indices.
std::optional< ConvertedType > converted_type
6: DEPRECATED: record the original type before conversion to parquet type
bool operator==(SchemaElement const &other) const
Check if two schema elements are equal.
bool is_list() const
Check if the schema element is a list.
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
std::optional< std::vector< int64_t > > repetition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
std::optional< int64_t > unencoded_byte_array_data_bytes
Thrift-derived struct describing column sort order.
bool nulls_first
If true, nulls will come before non-null values.
bool descending
If true, indicates this column is sorted in descending order.
int32_t column_idx
The column index (in this row group)
Thrift-derived struct describing column chunk statistics.
std::optional< bool > is_min_value_exact
If true, min_value is the actual minimum value for a column.
std::optional< std::vector< uint8_t > > max_value
max value for column determined by ColumnOrder
std::optional< int64_t > null_count
count of null values in the column
std::optional< std::vector< uint8_t > > max
deprecated max value in signed comparison order
std::optional< std::vector< uint8_t > > min_value
min value for column determined by ColumnOrder
std::optional< bool > is_max_value_exact
If true, max_value is the actual maximum value for a column.
std::optional< std::vector< uint8_t > > min
deprecated min value in signed comparison order
std::optional< int64_t > distinct_count
count of distinct values occurring
Struct that describes the time logical type annotation.
Time units for temporal logical types.
Type
Available time units.
Struct that describes the timestamp logical type annotation.
Struct that describes the Parquet file data postscript.
uint32_t footer_len
Length of the footer.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Struct that describes the Parquet file data header.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Type declarations for libcudf.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32