hybrid_scan_multifile.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/io/parquet.hpp>
12 #include <cudf/io/types.hpp>
13 #include <cudf/types.hpp>
14 #include <cudf/utilities/export.hpp>
15 
16 #include <rmm/cuda_stream_view.hpp>
17 #include <rmm/resource_ref.hpp>
18 
19 #include <memory>
20 #include <vector>
21 
// Forward declaration only: names the reader implementation class so this
// header can refer to it without its definition, which is not part of this file.
22 namespace cudf::io::parquet::experimental::detail {
27 class hybrid_scan_reader_impl;
28 } // namespace cudf::io::parquet::experimental::detail
29 
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io::parquet::experimental {
53  public:
61  parquet_reader_options const& options);
62 
70  parquet_reader_options const& options);
71 
76 
/// @brief Get parquet metadatas for all sources.
///
/// @return Vector of `FileMetaData` objects (presumably one per source — confirm).
82  [[nodiscard]] std::vector<FileMetaData> parquet_metadatas() const;
83 
/// @brief Get byte ranges of the page index for all sources.
///
/// @return Vector of byte ranges (offset and size) of the page index
///         (presumably one entry per source — confirm).
89  [[nodiscard]] std::vector<byte_range_info> page_index_byte_ranges() const;
90 
97  cudf::host_span<cudf::host_span<uint8_t const> const> page_index_bytes) const;
98 
/// @brief Get all available per-source row group indices from the parquet files.
///
/// @param options Parquet reader options.
/// @return Row group indices grouped per source (outer vector: sources,
///         inner vector: row group indices for that source).
105  [[nodiscard]] std::vector<std::vector<size_type>> all_row_groups(
106  parquet_reader_options const& options) const;
107 
115  cudf::host_span<std::vector<size_type> const> row_group_indices) const;
116 
125 
/// @brief Filter the row groups using a byte range that starts at `bytes_to_skip`
///        and covers `bytes_to_read` bytes.
///
/// NOTE(review): `bytes_to_skip` / `bytes_to_read` are presumably taken from
/// `options` — confirm against the implementation.
///
/// @param row_group_indices Input row group indices (presumably one vector per source — confirm).
/// @param options Parquet reader options.
/// @return Filtered row group indices, in the same nested-vector layout as the input type.
137  [[nodiscard]] std::vector<std::vector<size_type>> filter_row_groups_with_byte_range(
138  cudf::host_span<std::vector<size_type> const> row_group_indices,
139  parquet_reader_options const& options) const;
140 
/// @brief Filter the input row groups using column chunk statistics.
///
/// @param row_group_indices Input row group indices (presumably one vector per source — confirm).
/// @param options Parquet reader options.
/// @param stream CUDA stream view (presumably used for any device work and
///               allocations performed by the filter — confirm).
/// @return Filtered row group indices, in the same nested-vector layout as the input type.
149  [[nodiscard]] std::vector<std::vector<size_type>> filter_row_groups_with_stats(
150  cudf::host_span<std::vector<size_type> const> row_group_indices,
151  parquet_reader_options const& options,
152  rmm::cuda_stream_view stream) const;
153 
/// @brief Get byte ranges of bloom filters and dictionary pages (secondary filters)
///        for row group pruning.
///
/// @param row_group_indices Input row group indices (presumably one vector per source — confirm).
/// @param options Parquet reader options.
/// @return Pair of byte-range vectors (presumably bloom filter ranges first and
///         dictionary page ranges second — confirm the order).
166  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<byte_range_info>>
167  secondary_filters_byte_ranges(cudf::host_span<std::vector<size_type> const> row_group_indices,
168  parquet_reader_options const& options) const;
169 
170  private:
// Uniquely-owned pointer to the implementation object; its type is only
// forward-declared in this header.
171  std::unique_ptr<detail::hybrid_scan_reader_impl> _impl;
172 };
173  // end of group
175 
176 } // namespace io::parquet::experimental
177 } // namespace CUDF_EXPORT cudf
Multi-file variant of the experimental Hybrid Scan Parquet reader.
hybrid_scan_multifile(cudf::host_span< FileMetaData const > parquet_metadata, parquet_reader_options const &options)
Constructor for the multi-file experimental Parquet reader.
std::vector< FileMetaData > parquet_metadatas() const
Get parquet metadatas for all sources.
std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > secondary_filters_byte_ranges(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
size_type total_rows_in_row_groups(cudf::host_span< std::vector< size_type > const > row_group_indices) const
Get the total number of top-level rows in the per-source row groups.
void reset_column_selection() const
Resets the current column selection.
std::vector< std::vector< size_type > > filter_row_groups_with_stats(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the input row groups using column chunk statistics.
std::vector< std::vector< size_type > > all_row_groups(parquet_reader_options const &options) const
Get all available per-source row group indices from the parquet files.
std::vector< std::vector< size_type > > filter_row_groups_with_byte_range(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read).
std::vector< byte_range_info > page_index_byte_ranges() const
Get byte ranges of the page index for all sources.
void setup_page_indexes(cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes) const
Set up the per-source page index within each Parquet file metadata.
~hybrid_scan_multifile()
Destructor for the multi-file experimental Parquet reader.
hybrid_scan_multifile(cudf::host_span< cudf::host_span< uint8_t const > const > footer_bytes, parquet_reader_options const &options)
Constructor for the multi-file experimental Parquet reader.
Information about content of a parquet file.
Settings for read_parquet().
Definition: parquet.hpp:66
Stores offset and size used to indicate a byte range.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Type declarations for libcudf.