hybrid_scan.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/parquet.hpp>
11 #include <cudf/io/types.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
14 
15 #include <rmm/cuda_stream_view.hpp>
16 #include <rmm/resource_ref.hpp>
17 
18 #include <memory>
19 #include <utility>
20 #include <vector>
21 
22 namespace cudf::io::parquet::experimental::detail {
27 class hybrid_scan_reader_impl;  // Forward declaration only; the reader class in this header holds it through a private std::unique_ptr member (_impl).
28 } // namespace cudf::io::parquet::experimental::detail
29 
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io::parquet::experimental {
45 enum class use_data_page_mask : bool {  // Whether to compute and use a page mask, built from the row mask, to skip decompression and decoding.
46  YES = true,  ///< Compute and use a data page mask.
47  NO = false  ///< Do not compute or use a data page mask.
48 };
49 
277  public:
286  parquet_reader_options const& options);
287 
296  parquet_reader_options const& options);
297 
302 
311  [[nodiscard]] FileMetaData parquet_metadata() const;  // Get the Parquet file footer metadata.
312 
318  [[nodiscard]] byte_range_info page_index_byte_range() const;  // Get the byte range of the page index in the Parquet file.
319 
329  void setup_page_index(cudf::host_span<uint8_t const> page_index_bytes) const;  // Set up the page index within the Parquet file metadata (FileMetaData) from the given page index bytes.
330 
337  [[nodiscard]] std::vector<size_type> all_row_groups(parquet_reader_options const& options) const;  // Get all available row groups from the parquet file.
338 
345  [[nodiscard]] std::size_t total_rows_in_row_groups(  // Get the total number of top-level rows in the given row groups.
346  cudf::host_span<size_type const> row_group_indices) const;
347 
356 
368  [[nodiscard]] std::vector<size_type> filter_row_groups_with_byte_range(  // Filter the input row groups using a byte range that starts at bytes_to_skip.
369  cudf::host_span<size_type const> row_group_indices,
370  parquet_reader_options const& options) const;  // NOTE(review): the byte range is presumably read from `options` - confirm against the implementation.
371 
380  [[nodiscard]] std::vector<size_type> filter_row_groups_with_stats(  // Filter the input row groups using column chunk statistics.
381  cudf::host_span<size_type const> row_group_indices,
382  parquet_reader_options const& options,
383  rmm::cuda_stream_view stream) const;
384 
397  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<byte_range_info>>
399  parquet_reader_options const& options) const;
400 
411  [[nodiscard]] std::vector<size_type> filter_row_groups_with_dictionary_pages(  // Filter the row groups using column chunk dictionary pages.
412  cudf::host_span<cudf::device_span<uint8_t const> const> dictionary_page_data,  // One device span per dictionary page buffer supplied by the caller.
413  cudf::host_span<size_type const> row_group_indices,
414  parquet_reader_options const& options,
415  rmm::cuda_stream_view stream) const;
416 
429  [[nodiscard]] std::vector<size_type> filter_row_groups_with_bloom_filters(  // Filter the row groups using column chunk bloom filters.
430  cudf::host_span<cudf::device_span<uint8_t const> const> bloom_filter_data,  // One device span per bloom filter buffer supplied by the caller.
431  cudf::host_span<size_type const> row_group_indices,
432  parquet_reader_options const& options,
433  rmm::cuda_stream_view stream) const;
434 
445  [[nodiscard]] std::unique_ptr<cudf::column> build_all_true_row_mask(
446  cudf::host_span<size_type const> row_group_indices,
447  rmm::cuda_stream_view stream,
449 
461  [[nodiscard]] std::unique_ptr<cudf::column> build_row_mask_with_page_index_stats(
462  cudf::host_span<size_type const> row_group_indices,
463  parquet_reader_options const& options,
464  rmm::cuda_stream_view stream,
466 
474  [[nodiscard]] std::vector<byte_range_info> filter_column_chunks_byte_ranges(  // Get byte ranges of column chunks of filter columns.
475  cudf::host_span<size_type const> row_group_indices,
476  parquet_reader_options const& options) const;
477 
492  cudf::host_span<size_type const> row_group_indices,
493  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
494  cudf::mutable_column_view& row_mask,
495  use_data_page_mask mask_data_pages,
496  parquet_reader_options const& options,
497  rmm::cuda_stream_view stream,
499 
507  [[nodiscard]] std::vector<byte_range_info> payload_column_chunks_byte_ranges(  // Get byte ranges of column chunks of payload columns.
508  cudf::host_span<size_type const> row_group_indices,
509  parquet_reader_options const& options) const;
510 
524  cudf::host_span<size_type const> row_group_indices,
525  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
526  cudf::column_view const& row_mask,
527  use_data_page_mask mask_data_pages,
528  parquet_reader_options const& options,
529  rmm::cuda_stream_view stream,
531 
539  [[nodiscard]] std::vector<byte_range_info> all_column_chunks_byte_ranges(  // Get byte ranges of column chunks of all (or selected) columns.
540  cudf::host_span<size_type const> row_group_indices,
541  parquet_reader_options const& options) const;
542 
554  cudf::host_span<size_type const> row_group_indices,
555  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
556  parquet_reader_options const& options,
557  rmm::cuda_stream_view stream,
575  std::size_t chunk_read_limit,
576  std::size_t pass_read_limit,
577  cudf::host_span<size_type const> row_group_indices,
578  cudf::column_view const& row_mask,
579  use_data_page_mask mask_data_pages,
580  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
581  parquet_reader_options const& options,
582  rmm::cuda_stream_view stream,
584 
594  cudf::mutable_column_view& row_mask) const;
595 
612  std::size_t chunk_read_limit,
613  std::size_t pass_read_limit,
614  cudf::host_span<size_type const> row_group_indices,
615  cudf::column_view const& row_mask,
616  use_data_page_mask mask_data_pages,
617  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
618  parquet_reader_options const& options,
619  rmm::cuda_stream_view stream,
621 
631  cudf::column_view const& row_mask) const;
632 
648  std::size_t chunk_read_limit,
649  std::size_t pass_read_limit,
650  cudf::host_span<size_type const> row_group_indices,
651  cudf::host_span<cudf::device_span<uint8_t const> const> column_chunk_data,
652  parquet_reader_options const& options,
653  rmm::cuda_stream_view stream,
655 
662 
680  [[nodiscard]] std::vector<std::vector<cudf::size_type>> construct_row_group_passes(  // Partition row groups into passes; each inner vector holds the row group indices of one pass.
681  cudf::host_span<cudf::size_type const> row_group_indices, std::size_t pass_read_limit) const;  // NOTE(review): pass_read_limit presumably bounds the GPU memory needed per pass - confirm exact semantics.
682 
688  [[nodiscard]] bool has_next_table_chunk() const;  // Check if there is any parquet data left to read for the current setup.
689 
690  private:
691  std::unique_ptr<detail::hybrid_scan_reader_impl> _impl;
692 };
693  // end of group
695 
696 } // namespace io::parquet::experimental
697 } // namespace CUDF_EXPORT cudf
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
The experimental parquet reader class to optimally read parquet files subject to highly selective fil...
std::unique_ptr< cudf::column > build_row_mask_with_page_index_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean column indicating surviving rows using page-level statistics in the page index.
table_with_metadata materialize_filter_columns(cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes filter columns and updates the input row mask to only the rows that exist in the output ...
std::unique_ptr< cudf::column > build_all_true_row_mask(cudf::host_span< size_type const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean (survival) column of size equal to the total number of rows in the row groups contai...
std::vector< size_type > filter_row_groups_with_dictionary_pages(cudf::host_span< cudf::device_span< uint8_t const > const > dictionary_page_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk dictionary pages.
void setup_page_index(cudf::host_span< uint8_t const > page_index_bytes) const
Set up the page index within the Parquet file metadata (FileMetaData)
std::vector< byte_range_info > payload_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of payload columns.
byte_range_info page_index_byte_range() const
Get the byte range of the page index in the Parquet file.
std::vector< std::vector< cudf::size_type > > construct_row_group_passes(cudf::host_span< cudf::size_type const > row_group_indices, std::size_t pass_read_limit) const
Partition row groups into passes such that the amount of GPU memory required to read,...
std::size_t total_rows_in_row_groups(cudf::host_span< size_type const > row_group_indices) const
Get the total number of top-level rows in the row groups.
std::vector< byte_range_info > all_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of all (or selected) columns.
std::vector< size_type > filter_row_groups_with_bloom_filters(cudf::host_span< cudf::device_span< uint8_t const > const > bloom_filter_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk bloom filters.
hybrid_scan_reader(cudf::host_span< uint8_t const > footer_bytes, parquet_reader_options const &options)
Constructor for the experimental parquet reader class to optimally read Parquet files subject to high...
table_with_metadata materialize_filter_columns_chunk(cudf::mutable_column_view &row_mask) const
Materializes a chunk of filter columns and updates the corresponding range of input row mask to only ...
std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > secondary_filters_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
table_with_metadata materialize_payload_columns_chunk(cudf::column_view const &row_mask) const
Materializes a chunk of payload columns and applies the corresponding range of input row mask to the ...
std::vector< size_type > filter_row_groups_with_byte_range(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + byt...
hybrid_scan_reader(FileMetaData const &parquet_metadata, parquet_reader_options const &options)
Constructor for the experimental parquet reader class to optimally read Parquet files subject to high...
table_with_metadata materialize_payload_columns(cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes payload columns and applies the row mask to the output table.
void reset_column_selection() const
Resets the current column selection.
FileMetaData parquet_metadata() const
Get the Parquet file footer metadata.
void setup_chunking_for_all_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Set up chunking information for all (or selected) columns and preprocess the input data pages.
std::vector< size_type > filter_row_groups_with_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the input row groups using column chunk statistics.
std::vector< size_type > all_row_groups(parquet_reader_options const &options) const
Get all available row groups from the parquet file.
void setup_chunking_for_filter_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Set up chunking information for filter columns and preprocess the input data pages.
table_with_metadata materialize_all_columns(cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Materializes all (or selected) columns and returns the final output table.
void setup_chunking_for_payload_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Set up chunking information for payload columns and preprocess the input data pages.
bool has_next_table_chunk() const
Check if there is any parquet data left to read for the current setup.
std::vector< byte_range_info > filter_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of filter columns.
~hybrid_scan_reader()
Destructor for the experimental parquet reader class.
table_with_metadata materialize_all_columns_chunk() const
Materializes all (or selected) columns and returns the final output table.
Information about content of a parquet file.
Settings for read_parquet().
Definition: parquet.hpp:66
stores offset and size used to indicate a byte range
A non-owning, mutable view of device data as a column of elements, some of which may be null as indic...
use_data_page_mask
Whether to compute and use a page mask using the row mask to skip decompression and decoding of the m...
Definition: hybrid_scan.hpp:45
@ YES
Compute and use a data page mask.
@ NO
Do not compute or use a data page mask.
cuda::mr::resource_ref< cuda::mr::device_accessible > device_async_resource_ref
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
Definition: span.hpp:320
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Thrift-derived struct describing file-level metadata.
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Type declarations for libcudf.