parquet_metadata.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
11 #pragma once
12 
13 #include <cudf/io/datasource.hpp>
15 #include <cudf/io/types.hpp>
16 #include <cudf/utilities/export.hpp>
17 
18 #include <string_view>
19 #include <vector>
20 
21 namespace CUDF_EXPORT cudf {
22 namespace io {
31 
36  public:
42  explicit parquet_column_schema() : _cudf_type{data_type{type_id::EMPTY}} {}
43 
52  parquet_column_schema(std::string_view name,
53  Type type,
54  std::vector<parquet_column_schema>&& children,
55  data_type cudf_type)
56  : _name{name}, _type{type}, _children{std::move(children)}, _cudf_type{cudf_type}
57  {
58  }
59 
65  [[nodiscard]] auto name() const { return _name; }
66 
72  [[nodiscard]] auto type() const { return _type; }
73 
79  [[nodiscard]] auto const& children() const& { return _children; }
80 
85  [[nodiscard]] auto children() && { return std::move(_children); }
86 
94  [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); }
95 
100  [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); }
101 
107  [[nodiscard]] auto num_children() const { return children().size(); }
108 
116  [[nodiscard]] auto cudf_type() const { return _cudf_type; }
117 
118  private:
119  std::string _name;
120  // 3 types available: Physical, Converted, Logical
121  Type _type; // Physical type
122  std::vector<parquet_column_schema> _children;
123  data_type _cudf_type;
124 };
125 
130  public:
136  explicit parquet_schema() = default;
137 
143  parquet_schema(parquet_column_schema root_column_schema) : _root{std::move(root_column_schema)} {}
144 
150  [[nodiscard]] auto const& root() const& { return _root; }
151 
156  [[nodiscard]] auto root() && { return std::move(_root); }
157 
158  private:
159  parquet_column_schema _root;
160 };
161 
166  public:
/// Key-value metadata in the file footer.
using key_value_metadata = std::unordered_map<std::string, std::string>;

/// Row group metadata from each RowGroup element.
using row_group_metadata = std::unordered_map<std::string, int64_t>;

/// Column chunk metadata from each ColumnChunkMetaData element.
using column_chunk_metadata = std::unordered_map<std::string, std::vector<int64_t>>;
173 
179  explicit parquet_metadata() = default;
180 
194  int64_t num_rows,
195  size_type num_rowgroups,
196  std::vector<size_type> num_rowgroups_per_file,
197  key_value_metadata file_metadata,
198  std::vector<row_group_metadata> rg_metadata,
200  : _schema{std::move(schema)},
201  _num_rows{num_rows},
202  _num_rowgroups{num_rowgroups},
203  _num_rowgroups_per_file{std::move(num_rowgroups_per_file)},
204  _file_metadata{std::move(file_metadata)},
205  _rowgroup_metadata{std::move(rg_metadata)},
206  _column_chunk_metadata{std::move(column_chunk_metadata)}
207  {
208  }
209 
215  [[nodiscard]] auto const& schema() const { return _schema; }
216 
224  [[nodiscard]] auto num_rows() const { return _num_rows; }
225 
231  [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; }
232 
238  [[nodiscard]] auto const& num_rowgroups_per_file() const { return _num_rowgroups_per_file; }
239 
245  [[nodiscard]] auto const& metadata() const { return _file_metadata; }
246 
252  [[nodiscard]] auto const& rowgroup_metadata() const { return _rowgroup_metadata; }
253 
261  [[nodiscard]] auto const& columnchunk_metadata() const { return _column_chunk_metadata; }
262 
263  private:
264  parquet_schema _schema;
265  int64_t _num_rows;
266  size_type _num_rowgroups;
267  std::vector<size_type> _num_rowgroups_per_file;
268  key_value_metadata _file_metadata;
269  std::vector<row_group_metadata> _rowgroup_metadata;
270  column_chunk_metadata _column_chunk_metadata;
271 };
272 
284 
294 std::vector<parquet::FileMetaData> read_parquet_footers(
295  cudf::host_span<std::unique_ptr<cudf::io::datasource> const> sources);
296  // end of group
298 } // namespace io
299 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:277
Information about content of a parquet file.
auto const & schema() const
Returns the parquet schema.
auto const & rowgroup_metadata() const
Returns the row group metadata in the file footer.
auto const & metadata() const
Returns the key-value metadata in the file footer.
parquet_metadata()=default
Default constructor.
parquet_metadata(parquet_schema schema, int64_t num_rows, size_type num_rowgroups, std::vector< size_type > num_rowgroups_per_file, key_value_metadata file_metadata, std::vector< row_group_metadata > rg_metadata, column_chunk_metadata column_chunk_metadata)
constructor
auto num_rowgroups() const
Returns the total number of rowgroups.
std::unordered_map< std::string, std::vector< int64_t > > column_chunk_metadata
Column chunk metadata from each ColumnChunkMetaData element.
std::unordered_map< std::string, int64_t > row_group_metadata
Row group metadata from each RowGroup element.
auto const & columnchunk_metadata() const
Returns a map of column names to vectors of total_uncompressed_size metadata from all their column chunks.
auto num_rows() const
Returns the number of rows of the root column.
auto const & num_rowgroups_per_file() const
Returns the number of rowgroups in each file.
std::unordered_map< std::string, std::string > key_value_metadata
Key-value metadata in the file footer.
std::vector< parquet::FileMetaData > read_parquet_footers(cudf::host_span< std::unique_ptr< cudf::io::datasource > const > sources)
Constructs FileMetaData objects from a parquet dataset.
parquet_metadata read_parquet_metadata(source_info const &src_info)
Reads metadata of a parquet dataset.
Type
Basic data types in Parquet, determines how data is physically stored.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
type_id
Identifies a column's logical element type.
Definition: types.hpp:192
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Schema of a parquet column, including the nested columns.
auto const & child(int idx) const &
Returns schema of the child with the given index.
auto name() const
Returns parquet column name; can be empty.
auto const & children() const &
Returns schemas of all child columns.
auto type() const
Returns parquet physical type of the column.
auto children() &&
Returns schemas of all child columns.
auto num_children() const
Returns the number of child columns.
parquet_column_schema(std::string_view name, Type type, std::vector< parquet_column_schema > &&children, data_type cudf_type)
constructor
parquet_column_schema()
Default constructor.
auto child(int idx) &&
Returns schema of the child with the given index.
auto cudf_type() const
Returns the cudf data type for this column.
Schema of a parquet file.
parquet_schema()=default
Default constructor.
auto root() &&
Returns the schema of the struct column that contains all columns as fields.
auto const & root() const &
Returns the schema of the struct column that contains all columns as fields.
parquet_schema(parquet_column_schema root_column_schema)
constructor
Source information for read interfaces.
Definition: io/types.hpp:316