hybrid_scan_multifile.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/io/parquet.hpp>
12 #include <cudf/io/types.hpp>
13 #include <cudf/types.hpp>
14 #include <cudf/utilities/export.hpp>
15 
16 #include <rmm/cuda_stream_view.hpp>
17 #include <rmm/resource_ref.hpp>
18 
19 #include <memory>
20 #include <vector>
21 
22 namespace cudf::io::parquet::experimental::detail {
27 class hybrid_scan_reader_impl;
28 } // namespace cudf::io::parquet::experimental::detail
29 
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io::parquet::experimental {
53  public:
61  parquet_reader_options const& options);
62 
70  parquet_reader_options const& options);
71 
76 
82  [[nodiscard]] std::vector<FileMetaData> parquet_metadatas() const;
83 
89  [[nodiscard]] std::vector<byte_range_info> page_index_byte_ranges() const;
90 
97  cudf::host_span<cudf::host_span<uint8_t const> const> page_index_bytes) const;
98 
105  [[nodiscard]] std::vector<std::vector<size_type>> all_row_groups(
106  parquet_reader_options const& options) const;
107 
115  cudf::host_span<std::vector<size_type> const> row_group_indices) const;
116 
125 
137  [[nodiscard]] std::vector<std::vector<size_type>> filter_row_groups_with_byte_range(
138  cudf::host_span<std::vector<size_type> const> row_group_indices,
139  parquet_reader_options const& options) const;
140 
149  [[nodiscard]] std::vector<std::vector<size_type>> filter_row_groups_with_stats(
150  cudf::host_span<std::vector<size_type> const> row_group_indices,
151  parquet_reader_options const& options,
152  rmm::cuda_stream_view stream) const;
153 
166  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<byte_range_info>>
167  secondary_filters_byte_ranges(cudf::host_span<std::vector<size_type> const> row_group_indices,
168  parquet_reader_options const& options) const;
169 
179  [[nodiscard]] std::unique_ptr<cudf::column> build_all_true_row_mask(
180  cudf::host_span<std::vector<size_type> const> row_group_indices,
181  rmm::cuda_stream_view stream,
183 
195  [[nodiscard]] std::unique_ptr<cudf::column> build_row_mask_with_page_index_stats(
196  cudf::host_span<std::vector<size_type> const> row_group_indices,
197  parquet_reader_options const& options,
198  rmm::cuda_stream_view stream,
200 
214  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<size_type>>
215  filter_column_chunks_byte_ranges(cudf::host_span<std::vector<size_type> const> row_group_indices,
216  parquet_reader_options const& options) const;
217 
234  cudf::host_span<std::vector<size_type> const> row_group_indices,
235  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
236  cudf::mutable_column_view& row_mask,
237  use_data_page_mask mask_data_pages,
238  parquet_reader_options const& options,
239  rmm::cuda_stream_view stream,
241 
255  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<size_type>>
256  payload_column_chunks_byte_ranges(cudf::host_span<std::vector<size_type> const> row_group_indices,
257  parquet_reader_options const& options) const;
258 
274  cudf::host_span<std::vector<size_type> const> row_group_indices,
275  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
276  cudf::column_view const& row_mask,
277  use_data_page_mask mask_data_pages,
278  parquet_reader_options const& options,
279  rmm::cuda_stream_view stream,
281 
290  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<size_type>>
291  all_column_chunks_byte_ranges(cudf::host_span<std::vector<size_type> const> row_group_indices,
292  parquet_reader_options const& options) const;
293 
306  cudf::host_span<std::vector<size_type> const> row_group_indices,
307  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
308  parquet_reader_options const& options,
309  rmm::cuda_stream_view stream,
311 
330  std::size_t chunk_read_limit,
331  std::size_t pass_read_limit,
332  cudf::host_span<std::vector<size_type> const> row_group_indices,
333  cudf::column_view const& row_mask,
334  use_data_page_mask mask_data_pages,
335  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
336  parquet_reader_options const& options,
337  rmm::cuda_stream_view stream,
339 
352  cudf::mutable_column_view& row_mask) const;
353 
372  std::size_t chunk_read_limit,
373  std::size_t pass_read_limit,
374  cudf::host_span<std::vector<size_type> const> row_group_indices,
375  cudf::column_view const& row_mask,
376  use_data_page_mask mask_data_pages,
377  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
378  parquet_reader_options const& options,
379  rmm::cuda_stream_view stream,
381 
392  cudf::column_view const& row_mask) const;
393 
410  std::size_t chunk_read_limit,
411  std::size_t pass_read_limit,
412  cudf::host_span<std::vector<size_type> const> row_group_indices,
413  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
414  parquet_reader_options const& options,
415  rmm::cuda_stream_view stream,
417 
424 
442  [[nodiscard]] std::vector<std::vector<std::vector<size_type>>> construct_row_group_passes(
443  cudf::host_span<std::vector<size_type> const> row_group_indices,
444  std::size_t pass_read_limit) const;
445 
451  [[nodiscard]] bool has_next_table_chunk() const;
452 
453  private:
454  std::unique_ptr<detail::hybrid_scan_reader_impl> _impl;
455 };
456  // end of group
458 
459 } // namespace io::parquet::experimental
460 } // namespace CUDF_EXPORT cudf
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
Multi-file variant of the experimental Hybrid Scan Parquet reader.
table_with_metadata materialize_payload_columns(cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes payload columns and applies the row mask to the output table.
std::pair< std::vector< byte_range_info >, std::vector< size_type > > all_column_chunks_byte_ranges(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of all (or selected) columns.
hybrid_scan_multifile(cudf::host_span< FileMetaData const > parquet_metadata, parquet_reader_options const &options)
Constructor for the multi-file experimental Parquet reader.
std::vector< FileMetaData > parquet_metadatas() const
Get the Parquet file metadata for all sources.
void setup_chunking_for_filter_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Setup chunking information for filter columns and preprocess the input data pages.
std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > secondary_filters_byte_ranges(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
bool has_next_table_chunk() const
Check if there is any parquet data left to read for the current chunked setup.
size_type total_rows_in_row_groups(cudf::host_span< std::vector< size_type > const > row_group_indices) const
Get the total number of top-level rows in the per-source row groups.
void reset_column_selection() const
Resets the current column selection.
std::vector< std::vector< size_type > > filter_row_groups_with_stats(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the input row groups using column chunk statistics.
std::vector< std::vector< size_type > > all_row_groups(parquet_reader_options const &options) const
Get all available per-source row group indices from the parquet files.
std::unique_ptr< cudf::column > build_row_mask_with_page_index_stats(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean column indicating surviving rows using page-level statistics in the page index.
std::vector< std::vector< size_type > > filter_row_groups_with_byte_range(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read...
std::vector< byte_range_info > page_index_byte_ranges() const
Get byte ranges of the page index for all sources.
std::vector< std::vector< std::vector< size_type > > > construct_row_group_passes(cudf::host_span< std::vector< size_type > const > row_group_indices, std::size_t pass_read_limit) const
Partition row groups into passes such that the amount of GPU memory required to read,...
void setup_chunking_for_payload_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Setup chunking information for payload columns and preprocess the input data pages.
table_with_metadata materialize_filter_columns_chunk(cudf::mutable_column_view &row_mask) const
Materializes a chunk of filter columns and updates the corresponding range of input row mask to only ...
void setup_chunking_for_all_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Setup chunking information for all (or selected) columns and preprocess the input data pages.
void setup_page_indexes(cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes) const
Setup the per-source page index within each Parquet file metadata.
table_with_metadata materialize_payload_columns_chunk(cudf::column_view const &row_mask) const
Materializes a chunk of payload columns and applies the corresponding range of input row mask to the ...
table_with_metadata materialize_filter_columns(cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes filter columns and updates the input row mask to only the rows that exist in the output ...
table_with_metadata materialize_all_columns(cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes all (or selected) columns and returns the final output table.
~hybrid_scan_multifile()
Destructor for the multi-file experimental Parquet reader.
table_with_metadata materialize_all_columns_chunk() const
Materializes a chunk of all (or selected) columns and returns the output table chunk.
std::pair< std::vector< byte_range_info >, std::vector< size_type > > payload_column_chunks_byte_ranges(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of payload columns.
std::unique_ptr< cudf::column > build_all_true_row_mask(cudf::host_span< std::vector< size_type > const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean survival column of size equal to the total number of rows in the row groups containi...
hybrid_scan_multifile(cudf::host_span< cudf::host_span< uint8_t const > const > footer_bytes, parquet_reader_options const &options)
Constructor for the multi-file experimental Parquet reader.
std::pair< std::vector< byte_range_info >, std::vector< size_type > > filter_column_chunks_byte_ranges(cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of filter columns.
Information about content of a parquet file.
Settings for read_parquet().
Definition: parquet.hpp:66
Stores the offset and size used to indicate a byte range.
A non-owning, mutable view of device data as a column of elements, some of which may be null as indic...
use_data_page_mask
Whether to compute and use a page mask using the row mask to skip decompression and decoding of the m...
Definition: hybrid_scan.hpp:46
cuda::mr::resource_ref< cuda::mr::device_accessible > device_async_resource_ref
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
Definition: span.hpp:320
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:293
Type declarations for libcudf.