parquet_schema.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
11 #pragma once
12 
13 #include <cudf/types.hpp>
14 
15 #include <cuda/std/optional>
16 
17 #include <cstdint>
18 #include <optional>
19 #include <string>
20 #include <vector>
21 
22 namespace CUDF_EXPORT cudf {
23 namespace io::parquet {
/**
 * @brief Basic data types in Parquet, determines how data is physically stored.
 *
 * Stored in a signed 8-bit integer so that the negative `UNDEFINED` sentinel is representable.
 */
33 enum class Type : int8_t {
34  UNDEFINED = -1, // Undefined for non-leaf nodes
35  BOOLEAN = 0,
36  INT32 = 1,
37  INT64 = 2,
38  INT96 = 3, // Deprecated
39  FLOAT = 4,
40  DOUBLE = 5,
41  BYTE_ARRAY = 6,
42  FIXED_LEN_BYTE_ARRAY = 7,
43 };
44 
/**
 * @brief High-level data types in Parquet, determines how data is logically interpreted.
 *
 * Note that the numbering jumps from `INTERVAL` (21) to `NA` (25).
 */
48 enum class ConvertedType : int8_t {
49  UNKNOWN = -1, // No type information present
50  UTF8 = 0, // a BYTE_ARRAY may contain UTF8 encoded chars
51  MAP = 1, // a map is converted as an optional field containing a repeated key/value pair
52  MAP_KEY_VALUE = 2, // a key/value pair is converted into a group of two fields
53  LIST =
54  3, // a list is converted into an optional field containing a repeated field for its values
55  ENUM = 4, // an enum is converted into a binary field
56  DECIMAL = 5, // A decimal value. 10^(-scale) encoded as 2's complement big endian
57  // (precision=number of digits, scale=location of decimal point)
58  DATE = 6, // A Date, stored as days since Unix epoch, encoded as the INT32 physical type.
59  TIME_MILLIS = 7, // A time. The total number of milliseconds since midnight. The value is stored
60  // as an INT32 physical type.
61  TIME_MICROS = 8, // A time. The total number of microseconds since midnight. The value is stored
62  // as an INT64 physical type.
63  TIMESTAMP_MILLIS = 9, // A date/time combination, recorded as milliseconds since the Unix epoch
64  // using physical type of INT64.
65  TIMESTAMP_MICROS = 10, // A date/time combination, microseconds since the Unix epoch as INT64
66  UINT_8 = 11, // An unsigned integer 8-bit value as INT32
67  UINT_16 = 12, // An unsigned integer 16-bit value as INT32
68  UINT_32 = 13, // An unsigned integer 32-bit value as INT32
69  UINT_64 = 14, // An unsigned integer 64-bit value as INT64
70  INT_8 = 15, // A signed integer 8-bit value as INT32
71  INT_16 = 16, // A signed integer 16-bit value as INT32
72  INT_32 = 17, // A signed integer 32-bit value as INT32
73  INT_64 = 18, // A signed integer 64-bit value as INT64
74  JSON = 19, // A JSON document embedded within a single UTF8 column.
75  BSON = 20, // A BSON document embedded within a single BINARY column.
76  INTERVAL = 21, // This type annotates a time interval stored as a FIXED_LEN_BYTE_ARRAY of length
77  // 12 for 3 integers {months,days,milliseconds}
78  NA = 25, // No type information, e.g. for all-null columns.
79 };
80 
/**
 * @brief Encoding types for the actual data stream.
 */
84 enum class Encoding : uint8_t {
85  PLAIN = 0,
86  GROUP_VAR_INT = 1, // Deprecated, never used
87  PLAIN_DICTIONARY = 2,
88  RLE = 3,
89  BIT_PACKED = 4, // Deprecated by parquet-format in 2013, superseded by RLE
90  DELTA_BINARY_PACKED = 5,
91  DELTA_LENGTH_BYTE_ARRAY = 6,
92  DELTA_BYTE_ARRAY = 7,
93  RLE_DICTIONARY = 8,
94  BYTE_STREAM_SPLIT = 9,
// Equals the number of enumerators above (0-9); presumably a count sentinel rather than a real
// encoding - confirm against users of this value.
95  NUM_ENCODINGS = 10,
96 };
97 
/**
 * @brief Compression codec used for compressed data pages.
 */
101 enum class Compression : uint8_t {
102  UNCOMPRESSED = 0,
103  SNAPPY = 1,
104  GZIP = 2,
105  LZO = 3,
106  BROTLI = 4, // Added in 2.3.2
107  LZ4 = 5, // deprecated; based on LZ4, but with an additional undocumented framing scheme
108  ZSTD = 6, // Added in 2.3.2
109  LZ4_RAW = 7, // "standard" LZ4 block format
110 };
111 
/**
 * @brief Repetition type of a schema field: required, optional, or repeated.
 *
 * Stored in a signed 8-bit integer so that the negative `UNSPECIFIED` sentinel is representable.
 */
115 enum class FieldRepetitionType : int8_t {
116  UNSPECIFIED = -1, // No repetition type has been set
117  REQUIRED = 0, // This field is required (can not be null) and each record has exactly 1 value.
118  OPTIONAL = 1, // The field is optional (can be null) and each record has 0 or 1 values.
119  REPEATED = 2, // The field is repeated and can contain 0 or more values
120 };
121 
/**
 * @brief Types of pages.
 */
125 enum class PageType : uint8_t {
126  DATA_PAGE = 0,
127  INDEX_PAGE = 1,
128  DICTIONARY_PAGE = 2,
129  DATA_PAGE_V2 = 3,
130 };
131 
/**
 * @brief Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and,
 * if so, in which direction.
 */
136 enum class BoundaryOrder : uint8_t {
137  UNORDERED = 0,
138  ASCENDING = 1,
139  DESCENDING = 2,
140 };
141 
/**
 * @brief Thrift compact protocol struct field types.
 *
 * Numbering starts at 1; no enumerator is assigned the value 0.
 */
145 enum class FieldType : uint8_t {
146  BOOLEAN_TRUE = 1,
147  BOOLEAN_FALSE = 2,
148  I8 = 3,
149  I16 = 4,
150  I32 = 5,
151  I64 = 6,
152  DOUBLE = 7,
153  BINARY = 8,
154  LIST = 9,
155  SET = 10,
156  MAP = 11,
157  STRUCT = 12,
158  UUID = 13,
159 };
160 
166  uint32_t magic;
167 };
168 
/**
 * @brief Struct that describes the Parquet file data postscript.
 */
172 struct file_ender_s {
/// Length of the footer.
174  uint32_t footer_len;
/// Parquet 4-byte magic number "PAR1".
176  uint32_t magic;
177 };
178 
/**
 * @brief Struct that describes the decimal logical type annotation.
 */
184 struct DecimalType {
/// Scale of the decimal value; defaults to 0.
186  int32_t scale = 0;
/// Precision of the decimal value; defaults to 0.
188  int32_t precision = 0;
189 };
190 
/**
 * @brief Time units for temporal logical types.
 */
194 struct TimeUnit {
/// Available time units. Enumerators are implicitly numbered 0 (UNDEFINED) through 3 (NANOS).
196  enum Type : uint8_t { UNDEFINED, MILLIS, MICROS, NANOS };
199 };
200 
/**
 * @brief Struct that describes the time logical type annotation.
 */
206 struct TimeType {
/// Whether the time is adjusted to UTC; defaults to true.
209  bool isAdjustedToUTC = true;
/// Unit of the time value; defaults to milliseconds.
211  TimeUnit unit = {TimeUnit::Type::MILLIS};
212 };
213 
222  bool isAdjustedToUTC = true;
224  TimeUnit unit = {TimeUnit::Type::MILLIS};
225 };
226 
/**
 * @brief Struct that describes the integer logical type annotation.
 */
232 struct IntType {
/// Bit width of the integer; defaults to 0.
234  int8_t bitWidth = 0;
/// Whether the integer is signed; defaults to false (unsigned).
236  bool isSigned = false;
237 };
238 
242 struct LogicalType {
/// Logical type annotations to replace ConvertedType.
/// Enumerators without an explicit value continue from the previous one, so UNDEFINED..TIMESTAMP
/// are 0..8, INTEGER..BSON are 10..13, and VARIANT is 16.
244  enum Type : uint8_t {
245  UNDEFINED,
246  STRING,
247  MAP,
248  LIST,
249  ENUM,
250  DECIMAL,
251  DATE,
252  TIME,
253  TIMESTAMP,
254  // 9 is reserved
255  INTEGER = 10,
256  UNKNOWN,
257  JSON,
258  BSON,
259  VARIANT = 16,
260  };
261 
265  cuda::std::optional<DecimalType> decimal_type;
267  cuda::std::optional<TimeType> time_type;
269  cuda::std::optional<TimestampType> timestamp_type;
271  cuda::std::optional<IntType> int_type;
272 
/// Default constructor. Sets only the annotation kind (UNDEFINED unless `tp` is given); all
/// optional payloads (decimal/time/timestamp/int) are left empty, even for kinds that use one.
278  LogicalType(Type tp = Type::UNDEFINED) : type(tp) {}
279 
/// Constructor for Decimal logical type: sets the kind to DECIMAL and stores `dt` as the payload.
285  LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
286 
/// Constructor for Time logical type: sets the kind to TIME and stores `tt` as the payload.
292  LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}
293 
/// Constructor for Timestamp logical type: sets the kind to TIMESTAMP and stores `tst` as the
/// payload.
299  LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
300 
/// Constructor for Integer logical type: sets the kind to INTEGER and stores `it` as the payload.
306  LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
307 
313  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const
314  {
315  return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
316  }
317 
323  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const
324  {
325  return type == TIME and time_type->unit.type == TimeUnit::MICROS;
326  }
327 
333  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const
334  {
335  return type == TIME and time_type->unit.type == TimeUnit::NANOS;
336  }
337 
343  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const
344  {
345  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
346  }
347 
353  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const
354  {
355  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
356  }
357 
363  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const
364  {
365  return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
366  }
367 
373  [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const
374  {
375  return type == INTEGER ? int_type->bitWidth : -1;
376  }
377 
383  [[nodiscard]] constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
384 
390  [[nodiscard]] constexpr int32_t scale() const
391  {
392  return type == DECIMAL ? decimal_type->scale : -1;
393  }
394 
400  [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const
401  {
402  return type == DECIMAL ? decimal_type->precision : -1;
403  }
404 };
405 
/**
 * @brief Union to specify the order used for the min_value and max_value fields for a column.
 */
409 struct ColumnOrder {
/// Available column order types.
411  enum Type : uint8_t { UNDEFINED, TYPE_ORDER };
414 };
415 
424  Type type = Type::UNDEFINED;
426  int32_t type_length = 0;
428  FieldRepetitionType repetition_type = FieldRepetitionType::REQUIRED;
430  std::string name = "";
432  int32_t num_children = 0;
434  std::optional<ConvertedType> converted_type;
436  int32_t decimal_scale = 0;
438  int32_t decimal_precision = 0;
440  std::optional<int32_t> field_id;
442  cuda::std::optional<LogicalType> logical_type;
443 
445  bool output_as_byte_array = false;
446 
448  std::optional<type_id> arrow_type;
449 
450  // The following fields are filled in later during schema initialization
451 
453  int max_definition_level = 0;
455  int max_repetition_level = 0;
457  size_type parent_idx = 0;
459  std::vector<size_type> children_idx;
460 
467  bool operator==(SchemaElement const& other) const
468  {
469  return type == other.type && converted_type == other.converted_type &&
470  type_length == other.type_length && name == other.name &&
471  num_children == other.num_children && decimal_scale == other.decimal_scale &&
472  decimal_precision == other.decimal_precision && field_id == other.field_id;
473  }
474 
475  // the parquet format is a little squishy when it comes to interpreting
476  // repeated fields. sometimes repeated fields act as "stubs" in the schema
477  // that don't represent a true nesting level.
478  //
479  // this is the case with plain lists:
480  //
481  // optional group my_list (LIST) {
482  // repeated group element { <-- not part of the output hierarchy
483  // required binary str (UTF8);
484  // };
485  // }
486  //
487  // However, for backwards compatibility reasons, there are a few special cases, namely
488  // List<Struct<>> (which also corresponds to how the map type is specified), where
489  // this does not hold true
490  //
491  // optional group my_list (LIST) {
492  // repeated group element { <-- part of the hierarchy because it represents a struct
493  // required binary str (UTF8);
494  // required int32 num;
495  // };
496  // }
497 
527  [[nodiscard]] bool is_stub() const
528  {
529  auto const is_list_or_map_logical_type =
530  logical_type.has_value() &&
531  (logical_type->type == LogicalType::LIST || logical_type->type == LogicalType::MAP);
532  return repetition_type == FieldRepetitionType::REPEATED && num_children == 1 &&
533  converted_type != ConvertedType::LIST && converted_type != ConvertedType::MAP &&
534  !is_list_or_map_logical_type;
535  }
536 
547  [[nodiscard]] bool is_one_level_list(SchemaElement const& parent) const
548  {
549  return repetition_type == FieldRepetitionType::REPEATED and num_children == 0 and
550  not parent.is_list();
551  }
552 
558  [[nodiscard]] bool is_list() const { return converted_type == ConvertedType::LIST; }
559 
568  [[nodiscard]] bool is_struct() const
569  {
570  return type == Type::UNDEFINED &&
571  // this assumption might be a little weak.
572  ((repetition_type != FieldRepetitionType::REPEATED) ||
573  (repetition_type == FieldRepetitionType::REPEATED && num_children > 1));
574  }
575 };
576 
/**
 * @brief Thrift-derived struct describing column chunk statistics.
 */
580 struct Statistics {
/// Deprecated max value in signed comparison order.
582  std::optional<std::vector<uint8_t>> max;
/// Deprecated min value in signed comparison order.
584  std::optional<std::vector<uint8_t>> min;
/// Count of null values in the column.
586  std::optional<int64_t> null_count;
/// Count of distinct values occurring.
588  std::optional<int64_t> distinct_count;
/// Max value for column determined by ColumnOrder.
590  std::optional<std::vector<uint8_t>> max_value;
/// Min value for column determined by ColumnOrder.
592  std::optional<std::vector<uint8_t>> min_value;
/// If true, max_value is the actual maximum value for a column.
594  std::optional<bool> is_max_value_exact;
/// If true, min_value is the actual minimum value for a column.
596  std::optional<bool> is_min_value_exact;
597 };
598 
605  std::optional<int64_t> unencoded_byte_array_data_bytes;
614  std::optional<std::vector<int64_t>> repetition_level_histogram;
615 
621  std::optional<std::vector<int64_t>> definition_level_histogram;
622 };
623 
/**
 * @brief Thrift-derived struct describing page location information stored in the offsets index.
 */
628 struct PageLocation {
/// Offset of the page in the file.
630  int64_t offset;
636 };
637 
/**
 * @brief Thrift-derived struct describing the offset index.
 */
641 struct OffsetIndex {
/// Page locations.
643  std::vector<PageLocation> page_locations;
/// Optional; presumably one unencoded byte-array size per entry of `page_locations` - TODO
/// confirm against the Parquet format specification.
646  std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
647 };
648 
/**
 * @brief Thrift-derived struct describing the column index.
 */
652 struct ColumnIndex {
/// Boolean used to determine if a page contains only null values.
654  std::vector<bool> null_pages;
/// Lower bound for values in each page.
656  std::vector<std::vector<uint8_t>> min_values;
/// Upper bound for values in each page.
658  std::vector<std::vector<uint8_t>> max_values;
/// Ordering of the min/max lists above; defaults to UNORDERED.
660  BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
/// Optional count of null values per page.
662  std::optional<std::vector<int64_t>> null_counts;
/// Repetition level histogram for the column chunk.
664  std::optional<std::vector<int64_t>> repetition_level_histogram;
/// Definition level histogram for the column chunk.
666  std::optional<std::vector<int64_t>> definition_level_histogram;
667 };
668 
678  int32_t count;
679 };
680 
686  int32_t column_idx;
691 };
692 
698  Type type = Type::BOOLEAN;
701  std::vector<Encoding> encodings;
703  std::vector<std::string> path_in_schema;
705  Compression codec = Compression::UNCOMPRESSED;
707  int64_t num_values = 0;
709  int64_t total_uncompressed_size = 0;
711  int64_t total_compressed_size = 0;
713  int64_t data_page_offset = 0;
715  int64_t index_page_offset = 0;
717  int64_t dictionary_page_offset = 0;
722  std::optional<std::vector<PageEncodingStats>> encoding_stats;
724  std::optional<int64_t> bloom_filter_offset;
729  std::optional<int32_t> bloom_filter_length;
733  std::optional<SizeStatistics> size_statistics;
734 };
735 
741  enum Algorithm : uint8_t { UNDEFINED, SPLIT_BLOCK };
743  Algorithm algorithm{Algorithm::SPLIT_BLOCK};
744 };
745 
751  enum Hash : uint8_t { UNDEFINED, XXHASH };
753  Hash hash{Hash::XXHASH};
754 };
755 
761  enum Compression : uint8_t { UNDEFINED, UNCOMPRESSED };
763  Compression compression{Compression::UNCOMPRESSED};
764 };
765 
774  int32_t num_bytes;
781 };
782 
/**
 * @brief Thrift-derived struct describing a chunk of data for a particular column.
 */
791 struct ColumnChunk {
/// File path associated with this chunk; defaults to the empty string.
794  std::string file_path = "";
/// File offset associated with this chunk; defaults to 0.
796  int64_t file_offset = 0;
/// Presumably the file offset of this chunk's serialized OffsetIndex - TODO confirm.
801  int64_t offset_index_offset = 0;
/// Presumably the byte length of this chunk's serialized OffsetIndex - TODO confirm.
803  int32_t offset_index_length = 0;
/// Presumably the file offset of this chunk's serialized ColumnIndex - TODO confirm.
805  int64_t column_index_offset = 0;
/// Presumably the byte length of this chunk's serialized ColumnIndex - TODO confirm.
807  int32_t column_index_length = 0;
808 
809  // Following fields are derived from other fields
810 
/// Schema index for this chunk; -1 until it has been derived.
812  int schema_idx = -1;
813 
814  // The indexes don't really live here, but it's a convenient place to hang them.
815 
/// OffsetIndex for this column chunk.
817  std::optional<OffsetIndex> offset_index;
/// ColumnIndex for this column chunk.
819  std::optional<ColumnIndex> column_index;
820 };
821 
/**
 * @brief Thrift-derived struct describing a group of row data.
 */
828 struct RowGroup {
/// Metadata for each column chunk in this row group.
830  std::vector<ColumnChunk> columns;
/// Total byte size for this row group; defaults to 0.
832  int64_t total_byte_size = 0;
/// Number of rows in this row group; defaults to 0.
834  int64_t num_rows = 0;
/// If set, specifies a sort ordering of the rows in this RowGroup.
836  std::optional<std::vector<SortingColumn>> sorting_columns;
/// Byte offset from beginning of file to first page (data or dictionary) in this row group.
838  std::optional<int64_t> file_offset;
/// Total byte size of all compressed (and potentially encrypted) column data in this row group.
840  std::optional<int64_t> total_compressed_size;
/// Row group ordinal in the file.
842  std::optional<int16_t> ordinal;
843 };
844 
/**
 * @brief Thrift-derived struct describing a key-value pair, for user metadata.
 */
848 struct KeyValue {
/// String key.
850  std::string key;
/// String value.
852  std::string value;
853 };
854 
/**
 * @brief Thrift-derived struct describing file-level metadata.
 */
862 struct FileMetaData {
/// Version number; defaults to 0.
864  int32_t version = 0;
/// Schema elements of this file.
869  std::vector<SchemaElement> schema;
/// Number of rows in the file; defaults to 0.
871  int64_t num_rows = 0;
/// Row groups in this file.
873  std::vector<RowGroup> row_groups;
/// Optional key/value metadata.
875  std::vector<KeyValue> key_value_metadata;
/// Identifier of the creator of the file; defaults to the empty string.
877  std::string created_by = "";
/// Optional column orders; presumably one entry per column - TODO confirm.
880  std::optional<std::vector<ColumnOrder>> column_orders;
881 };
882 
888  int32_t num_values = 0;
890  Encoding encoding = Encoding::PLAIN;
892  Encoding definition_level_encoding = Encoding::PLAIN;
894  Encoding repetition_level_encoding = Encoding::PLAIN;
895 };
896 
902  int32_t num_values = 0;
904  int32_t num_nulls = 0;
907  int32_t num_rows = 0;
909  Encoding encoding = Encoding::PLAIN;
911  int32_t definition_levels_byte_length = 0;
913  int32_t repetition_levels_byte_length = 0;
915  bool is_compressed = true;
916 };
917 
923  int32_t num_values = 0;
925  Encoding encoding = Encoding::PLAIN;
926 };
927 
/**
 * @brief Thrift-derived struct describing the page header.
 */
937 struct PageHeader {
/// Type of this page; defaults to DATA_PAGE.
939  PageType type = PageType::DATA_PAGE;
/// Uncompressed size of the page; defaults to 0.
941  int32_t uncompressed_page_size = 0;
/// Compressed size of the page; defaults to 0.
943  int32_t compressed_page_size = 0;
944 
945  // Headers for page-specific data. Only one will be set.
946 
953 };
954  // end of group
956 } // namespace io::parquet
957 } // namespace CUDF_EXPORT cudf
ConvertedType
High-level data types in Parquet, determines how data is logically interpreted.
FieldRepetitionType
Repetition type of a schema field: required, optional, or repeated.
PageType
Types of pages.
Encoding
Encoding types for the actual data stream.
Type
Basic data types in Parquet, determines how data is physically stored.
FieldType
Thrift compact protocol struct field types.
Compression
Compression codec used for compressed data pages.
BoundaryOrder
Enum to annotate whether lists of min/max elements inside ColumnIndex are ordered and if so,...
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
cuDF interfaces
Definition: host_udf.hpp:26
The algorithm used in bloom filter.
Algorithm
Available bloom filter algorithms.
The compression used in the bloom filter.
Compression
Available bloom filter compression types.
The hash function used in Bloom filter.
Hash
Available bloom filter hashers.
Bloom filter header struct.
BloomFilterCompression compression
The compression used in the bloom filter.
BloomFilterHash hash
The hash function used for bloom filter.
BloomFilterAlgorithm algorithm
The algorithm for setting bits.
int32_t num_bytes
The size of bitset in bytes.
Thrift-derived struct describing a column chunk.
std::optional< std::vector< PageEncodingStats > > encoding_stats
Statistics statistics
Optional statistics for this column chunk.
std::optional< int64_t > bloom_filter_offset
Byte offset from beginning of file to Bloom filter data.
std::optional< int32_t > bloom_filter_length
std::optional< SizeStatistics > size_statistics
std::vector< std::string > path_in_schema
Path in schema.
Thrift-derived struct describing a chunk of data for a particular column.
std::optional< ColumnIndex > column_index
ColumnIndex for this column chunk
std::optional< OffsetIndex > offset_index
OffsetIndex for this column chunk
Thrift-derived struct describing the column index.
std::optional< std::vector< int64_t > > definition_level_histogram
Definition level histogram for the column chunk.
std::optional< std::vector< int64_t > > null_counts
Optional count of null values per page.
std::vector< std::vector< uint8_t > > max_values
Upper bound for values in each page.
std::optional< std::vector< int64_t > > repetition_level_histogram
Repetition level histogram for the column chunk.
std::vector< bool > null_pages
Boolean used to determine if a page contains only null values.
std::vector< std::vector< uint8_t > > min_values
Lower bound for values in each page.
Union to specify the order used for the min_value and max_value fields for a column.
Type
Available column order types.
Type type
Column order type.
Thrift-derived struct describing the header for a V2 data page.
Thrift-derived struct describing the header for a data page.
Struct that describes the decimal logical type annotation.
Thrift-derived struct describing the header for a dictionary page.
Thrift-derived struct describing file-level metadata.
std::optional< std::vector< ColumnOrder > > column_orders
std::vector< RowGroup > row_groups
Row groups in this file.
std::vector< KeyValue > key_value_metadata
Optional key/value metadata.
std::vector< SchemaElement > schema
Struct that describes the integer logical type annotation.
Thrift-derived struct describing a key-value pair, for user metadata.
std::string key
string key
std::string value
string value
Struct that describes the logical type annotation.
constexpr CUDF_HOST_DEVICE bool is_time_millis() const
Check if the time is in milliseconds.
LogicalType(Type tp=Type::UNDEFINED)
Default constructor.
cuda::std::optional< IntType > int_type
Integer type.
cuda::std::optional< TimeType > time_type
Time type.
constexpr CUDF_HOST_DEVICE bool is_timestamp_millis() const
Check if the timestamp is in milliseconds.
constexpr CUDF_HOST_DEVICE bool is_timestamp_micros() const
Check if the timestamp is in microseconds.
constexpr int32_t scale() const
Get the scale of the decimal type.
constexpr CUDF_HOST_DEVICE bool is_time_nanos() const
Check if the time is in nanoseconds.
LogicalType(IntType &&it)
Constructor for Integer logical type.
Type
Logical type annotations to replace ConvertedType.
constexpr bool is_signed() const
Check if the integer is signed.
constexpr CUDF_HOST_DEVICE bool is_timestamp_nanos() const
Check if the timestamp is in nanoseconds.
cuda::std::optional< DecimalType > decimal_type
Decimal type.
LogicalType(TimeType &&tt)
Constructor for Time logical type.
cuda::std::optional< TimestampType > timestamp_type
Timestamp type.
LogicalType(TimestampType &&tst)
Constructor for Timestamp logical type.
constexpr CUDF_HOST_DEVICE int32_t precision() const
Get the precision of the decimal type.
constexpr CUDF_HOST_DEVICE int8_t bit_width() const
Get the bit width of the integer type.
constexpr CUDF_HOST_DEVICE bool is_time_micros() const
Check if the time is in microseconds.
LogicalType(DecimalType &&dt)
Constructor for Decimal logical type.
Thrift-derived struct describing the offset index.
std::vector< PageLocation > page_locations
Page locations.
std::optional< std::vector< int64_t > > unencoded_byte_array_data_bytes
Thrift-derived struct describing page encoding statistics.
Encoding encoding
Encoding of the page.
int32_t count
Number of pages of this type with this encoding.
PageType page_type
The page type (data/dic/...)
Thrift-derived struct describing the page header.
DataPageHeader data_page_header
Data page header.
DictionaryPageHeader dictionary_page_header
Dictionary page header.
DataPageHeaderV2 data_page_header_v2
V2 data page header.
Thrift-derived struct describing page location information stored in the offsets index.
int32_t compressed_page_size
Compressed page size in bytes plus the header length.
int64_t offset
Offset of the page in the file.
Thrift-derived struct describing a group of row data.
std::optional< int16_t > ordinal
Row group ordinal in the file.
std::optional< int64_t > file_offset
Byte offset from beginning of file to first page (data or dictionary) in this row group.
std::optional< std::vector< SortingColumn > > sorting_columns
If set, specifies a sort ordering of the rows in this RowGroup.
std::vector< ColumnChunk > columns
Metadata for each column chunk in this row group.
std::optional< int64_t > total_compressed_size
Total byte size of all compressed (and potentially encrypted) column data in this row group.
Struct for describing an element/field in the Parquet format schema.
Type type
1: parquet physical type for output
std::optional< type_id > arrow_type
cudf type determined from arrow:schema
int32_t decimal_precision
8: DEPRECATED: record the precision for DECIMAL converted type
std::optional< int32_t > field_id
9: save field_id from original schema
bool is_struct() const
Check if the schema element is a struct.
cuda::std::optional< LogicalType > logical_type
10: replaces converted type
std::string name
4: name of the field
int32_t decimal_scale
7: DEPRECATED: record the scale for DECIMAL converted type
bool is_stub() const
Check if the schema element is a stub.
bool is_one_level_list(SchemaElement const &parent) const
Check if the schema element is a one-level list.
int32_t num_children
5: nested fields
int32_t type_length
2: byte length of FIXED_LEN_BYTE_ARRAY elements, or maximum bit length for other types
std::vector< size_type > children_idx
Children indices.
std::optional< ConvertedType > converted_type
6: DEPRECATED: record the original type before conversion to parquet type
bool operator==(SchemaElement const &other) const
Check if two schema elements are equal.
bool is_list() const
Check if the schema element is a list.
Thrift-derived struct containing statistics used to estimate page and column chunk sizes.
std::optional< std::vector< int64_t > > repetition_level_histogram
std::optional< std::vector< int64_t > > definition_level_histogram
std::optional< int64_t > unencoded_byte_array_data_bytes
Thrift-derived struct describing column sort order.
bool nulls_first
If true, nulls will come before non-null values.
bool descending
If true, indicates this column is sorted in descending order.
int32_t column_idx
The column index (in this row group)
Thrift-derived struct describing column chunk statistics.
std::optional< bool > is_min_value_exact
If true, min_value is the actual minimum value for a column.
std::optional< std::vector< uint8_t > > max_value
max value for column determined by ColumnOrder
std::optional< int64_t > null_count
count of null values in the column
std::optional< std::vector< uint8_t > > max
deprecated max value in signed comparison order
std::optional< std::vector< uint8_t > > min_value
min value for column determined by ColumnOrder
std::optional< bool > is_max_value_exact
If true, max_value is the actual maximum value for a column.
std::optional< std::vector< uint8_t > > min
deprecated min value in signed comparison order
std::optional< int64_t > distinct_count
count of distinct values occurring
Struct that describes the time logical type annotation.
Time units for temporal logical types.
Type
Available time units.
Struct that describes the timestamp logical type annotation.
Struct that describes the Parquet file data postscript.
uint32_t footer_len
Length of the footer.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Struct that describes the Parquet file data header.
uint32_t magic
Parquet 4-byte magic number "PAR1".
Type declarations for libcudf.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:21