orc_metadata.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
22 #pragma once
23 
24 #include <cudf/io/orc_types.hpp>
25 #include <cudf/io/types.hpp>
26 
27 #include <optional>
28 #include <variant>
29 #include <vector>
30 
31 namespace cudf {
32 namespace io {
49  std::vector<std::string> column_names;
50  std::vector<std::string> file_stats;
51  std::vector<std::vector<std::string>> stripes_stats;
52 };
53 
71  source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream());
72 
76 using no_statistics = std::monostate;
77 
83 template <typename T>
85  std::optional<T> minimum;
86  std::optional<T> maximum;
87 };
88 
94 template <typename T>
96  std::optional<T> sum;
97 };
98 
102 struct integer_statistics : minmax_statistics<int64_t>, sum_statistics<int64_t> {};
103 
108 
116 struct string_statistics : minmax_statistics<std::string>, sum_statistics<int64_t> {};
117 
124  std::vector<uint64_t> count;
125 };
126 
130 struct decimal_statistics : minmax_statistics<std::string>, sum_statistics<std::string> {};
131 
136 
143 
151  std::optional<int64_t> minimum_utc;
152  std::optional<int64_t> maximum_utc;
153  std::optional<uint32_t> minimum_nanos;
154  std::optional<uint32_t> maximum_nanos;
155 };
156 
namespace orc {
/*
 * Forward declaration of the type that `ProtobufReader` uses.
 *
 * The `cudf::io::column_statistics` objects returned from `read_parsed_orc_statistics` are
 * constructed from `cudf::io::orc::column_statistics` objects that `ProtobufReader`
 * initializes.
 */
struct column_statistics;
}  // namespace orc
164 
172  std::optional<uint64_t> number_of_values;
173  std::optional<bool> has_null;
174  std::variant<no_statistics,
184 
190  column_statistics(orc::column_statistics&& detail_statistics);
191 };
192 
201  std::vector<std::string> column_names;
202  std::vector<column_statistics> file_stats;
203  std::vector<std::vector<column_statistics>> stripes_stats;
204 };
205 
217  source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream());
218 
223  public:
231  orc_column_schema(std::string_view name,
232  orc::TypeKind type,
233  std::vector<orc_column_schema> children)
234  : _name{name}, _type_kind{type}, _children{std::move(children)}
235  {
236  }
237 
243  [[nodiscard]] auto name() const { return _name; }
244 
250  [[nodiscard]] auto type_kind() const { return _type_kind; }
251 
257  [[nodiscard]] auto const& children() const& { return _children; }
258 
263  [[nodiscard]] auto children() && { return std::move(_children); }
264 
272  [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); }
273 
278  [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); }
279 
285  [[nodiscard]] auto num_children() const { return children().size(); }
286 
287  private:
288  std::string _name;
289  orc::TypeKind _type_kind;
290  std::vector<orc_column_schema> _children;
291 };
292 
296 struct orc_schema {
297  public:
303  orc_schema(orc_column_schema root_column_schema) : _root{std::move(root_column_schema)} {}
304 
310  [[nodiscard]] auto const& root() const& { return _root; }
311 
316  [[nodiscard]] auto root() && { return std::move(_root); }
317 
318  private:
319  orc_column_schema _root;
320 };
321 
326  public:
335  : _schema{std::move(schema)}, _num_rows{num_rows}, _num_stripes{num_stripes}
336  {
337  }
338 
344  [[nodiscard]] auto const& schema() const { return _schema; }
345 
347 
354  [[nodiscard]] auto num_rows() const { return _num_rows; }
355 
361  [[nodiscard]] auto num_stripes() const { return _num_stripes; }
362 
363  private:
364  orc_schema _schema;
365  uint64_t _num_rows;
366  size_type _num_stripes;
367 };
368 
380  rmm::cuda_stream_view stream = cudf::get_default_stream());
381  // end of group
383 } // namespace io
384 } // namespace cudf
Information about content of an ORC file.
auto num_rows() const
Returns the number of rows of the root column.
orc_metadata(orc_schema schema, uint64_t num_rows, size_type num_stripes)
constructor
auto const & schema() const
Returns the ORC schema.
auto num_stripes() const
Returns the number of stripes in the file.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
raw_orc_statistics read_raw_orc_statistics(source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
Reads file-level and stripe-level statistics of ORC dataset.
parsed_orc_statistics read_parsed_orc_statistics(source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
Reads file-level and stripe-level statistics of ORC dataset.
orc_metadata read_orc_metadata(source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
Reads metadata of ORC dataset.
sum_statistics< int64_t > binary_statistics
Statistics for binary columns.
minmax_statistics< int32_t > date_statistics
Statistics for date(time) columns.
std::monostate no_statistics
Monostate type alias for the statistics variant.
TypeKind
Identifies a data type in an orc file.
Definition: orc_types.hpp:43
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:34
Statistics for boolean columns.
std::vector< uint64_t > count
count of true values
Contains per-column ORC statistics.
std::optional< uint64_t > number_of_values
number of values
std::optional< bool > has_null
column has any nulls
column_statistics(orc::column_statistics &&detail_statistics)
Construct a new column statistics object.
std::variant< no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics > type_specific_stats
type-specific statistics
Statistics for decimal columns.
Statistics for floating point columns.
Statistics for integral columns.
Base class for column statistics that include optional minimum and maximum.
std::optional< T > minimum
Minimum value.
std::optional< T > maximum
Maximum value.
Schema of an ORC column, including the nested columns.
auto const & children() const &
Returns schemas of all child columns.
orc_column_schema(std::string_view name, orc::TypeKind type, std::vector< orc_column_schema > children)
constructor
auto child(int idx) &&
Returns schema of the child with the given index.
auto type_kind() const
Returns ORC type of the column.
auto const & child(int idx) const &
Returns schema of the child with the given index.
auto name() const
Returns ORC column name; can be empty.
auto num_children() const
Returns the number of child columns.
auto children() &&
Returns schemas of all child columns.
Schema of an ORC file.
auto root() &&
Returns the schema of the struct column that contains all columns as fields.
auto const & root() const &
Returns the schema of the struct column that contains all columns as fields.
orc_schema(orc_column_schema root_column_schema)
constructor
Holds column names and parsed file-level and stripe-level statistics.
std::vector< std::vector< column_statistics > > stripes_stats
stripe-level statistics
std::vector< std::string > column_names
column names
std::vector< column_statistics > file_stats
file-level statistics
Holds column names and buffers containing raw file-level and stripe-level statistics.
std::vector< std::vector< std::string > > stripes_stats
Stripe-level statistics for each column.
std::vector< std::string > column_names
Column names.
std::vector< std::string > file_stats
File-level statistics for each column.
Source information for read interfaces.
Definition: io/types.hpp:314
Statistics for string columns.
Base class for column statistics that include an optional sum.
std::optional< T > sum
Sum of values in column.
Statistics for timestamp columns.
std::optional< uint32_t > minimum_nanos
nanoseconds part of the minimum
std::optional< uint32_t > maximum_nanos
nanoseconds part of the maximum
std::optional< int64_t > minimum_utc
minimum in milliseconds
std::optional< int64_t > maximum_utc
maximum in milliseconds