Multi-file variant of the experimental Hybrid Scan Parquet reader. More...
#include <hybrid_scan_multifile.hpp>
Public Member Functions | |
| hybrid_scan_multifile (cudf::host_span< cudf::host_span< uint8_t const > const > footer_bytes, parquet_reader_options const &options) | |
| Constructor for the multi-file experimental Parquet reader. More... | |
| hybrid_scan_multifile (cudf::host_span< FileMetaData const > parquet_metadata, parquet_reader_options const &options) | |
| Constructor for the multi-file experimental Parquet reader. More... | |
| ~hybrid_scan_multifile () | |
| Destructor for the multi-file experimental Parquet reader. | |
| std::vector< FileMetaData > | parquet_metadatas () const |
| Get parquet metadatas for all sources. More... | |
| std::vector< byte_range_info > | page_index_byte_ranges () const |
| Get byte ranges of the page index for all sources. More... | |
| void | setup_page_indexes (cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes) const |
| Set up the per-source page index within each Parquet file metadata. More... | |
| std::vector< std::vector< size_type > > | all_row_groups (parquet_reader_options const &options) const |
| Get all available per-source row group indices from the parquet files. More... | |
| size_type | total_rows_in_row_groups (cudf::host_span< std::vector< size_type > const > row_group_indices) const |
| Get the total number of top-level rows in the per-source row groups. More... | |
| void | reset_column_selection () const |
| Resets the current column selection. More... | |
| std::vector< std::vector< size_type > > | filter_row_groups_with_byte_range (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const |
| Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read). More... | |
| std::vector< std::vector< size_type > > | filter_row_groups_with_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const |
| Filter the input row groups using column chunk statistics. More... | |
| std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > | secondary_filters_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning. More... | |
| std::unique_ptr< cudf::column > | build_all_true_row_mask (cudf::host_span< std::vector< size_type > const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Builds a boolean survival column containing all true values, with size equal to the total number of rows in the row groups. More... | |
| std::unique_ptr< cudf::column > | build_row_mask_with_page_index_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Builds a boolean column indicating surviving rows using page-level statistics in the page index. More... | |
| std::pair< std::vector< byte_range_info >, std::vector< size_type > > | filter_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of filter columns. More... | |
| table_with_metadata | materialize_filter_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes filter columns and updates the input row mask to only the rows that exist in the output table. More... | |
| std::pair< std::vector< byte_range_info >, std::vector< size_type > > | payload_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of payload columns. More... | |
| table_with_metadata | materialize_payload_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes payload columns and applies the row mask to the output table. More... | |
| std::pair< std::vector< byte_range_info >, std::vector< size_type > > | all_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of all (or selected) columns. More... | |
| table_with_metadata | materialize_all_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes all (or selected) columns and returns the final output table. More... | |
| void | setup_chunking_for_filter_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Set up chunking information for filter columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_filter_columns_chunk (cudf::mutable_column_view &row_mask) const |
| Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table. More... | |
| void | setup_chunking_for_payload_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Set up chunking information for payload columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_payload_columns_chunk (cudf::column_view const &row_mask) const |
| Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk. More... | |
| void | setup_chunking_for_all_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Set up chunking information for all (or selected) columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_all_columns_chunk () const |
| Materializes a chunk of all (or selected) columns and returns the output table chunk. More... | |
| std::vector< std::vector< std::vector< size_type > > > | construct_row_group_passes (cudf::host_span< std::vector< size_type > const > row_group_indices, std::size_t pass_read_limit) const |
| Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit. More... | |
| bool | has_next_table_chunk () const |
| Check if there is any parquet data left to read for the current chunked setup. More... | |
Multi-file variant of the experimental Hybrid Scan Parquet reader.
Vectorizes hybrid_scan_reader APIs to support multiple Parquet sources. Inputs and outputs are indexed by source order, except for the row mask, which is a single BOOL8 column spanning all rows from all sources, concatenated in source order and then in row-group order within each source.
Note: this class is expected to eventually move into hybrid_scan.hpp, and the existing single-file reader (hybrid_scan_reader) will become its subclass. It is kept separate here for now only to reduce noise. Definition at line 52 of file hybrid_scan_multifile.hpp.
|
explicit |
Constructor for the multi-file experimental Parquet reader.
| footer_bytes | Host span of Parquet file footer byte spans, one per source |
| options | Parquet reader options |
|
explicit |
Constructor for the multi-file experimental Parquet reader.
| parquet_metadata | Host span of pre-populated Parquet file metadata, one per source |
| options | Parquet reader options |
| std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_column_chunks_byte_ranges | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of all (or selected) columns.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_row_groups | ( | parquet_reader_options const & | options | ) | const |
Get all available per-source row group indices from the parquet files.
| options | Parquet reader options |
| std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_all_true_row_mask | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Builds a boolean survival column containing all true values, with size equal to the total number of rows in the row groups.
| row_group_indices | Span of vectors of input row group indices, one per source |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the returned column's device memory |
| std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_row_mask_with_page_index_stats | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Builds a boolean column indicating surviving rows using page-level statistics in the page index.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the returned column's device memory |
| std::vector<std::vector<std::vector<size_type> > > cudf::io::parquet::experimental::hybrid_scan_multifile::construct_row_group_passes | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| std::size_t | pass_read_limit | ||
| ) | const |
Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit.
Note that the pass_read_limit is a hint, not an absolute limit - if a single row group cannot fit within the limit given, it will still constitute a pass. The compressed row group size is estimated over all columns in each row group (not just the columns selected for reading), for conservative estimates.
| std::invalid_argument | if there are no row group indices in the input |
| row_group_indices | Span of vectors of input row group indices, one per source |
| pass_read_limit | Memory limit to read and decompress row group data, 0 if there is no limit (single pass) |
| std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_column_chunks_byte_ranges | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of filter columns.
Byte ranges are flattened in source order. Within each source, byte ranges follow the selected row group and column chunk order used by row_group_indices and options. The returned source map has one source index per byte range and can be used to regroup byte ranges by datasource before fetching.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_byte_range | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read).
Filters the row groups such that only the row groups that start within the byte range are selected. Note that the last selected row group may end beyond the byte range.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_stats | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream | ||
| ) | const |
Filter the input row groups using column chunk statistics.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| bool cudf::io::parquet::experimental::hybrid_scan_multifile::has_next_table_chunk | ( | ) | const |
Check if there is any parquet data left to read for the current chunked setup.
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_all_columns | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes all (or selected) columns and returns the final output table.
| row_group_indices | Span of vectors of input row group indices, one per source |
| column_chunk_data | Flattened device spans of column chunk data returned in the same order as all_column_chunks_byte_ranges |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_all_columns_chunk | ( | ) | const |
Materializes a chunk of all (or selected) columns and returns the output table chunk.
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_filter_columns | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| cudf::mutable_column_view & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes filter columns and updates the input row mask to only the rows that exist in the output table.
| row_group_indices | Span of vectors of input row group indices, one per source | |
| column_chunk_data | Flattened device spans of filter column chunk data returned in the same order as filter_column_chunks_byte_ranges | |
| [in,out] | row_mask | Mutable boolean column spanning all selected rows across all sources and indicating surviving rows from page pruning |
| mask_data_pages | Whether to build and use a data page mask using the row mask | |
| options | Parquet reader options | |
| stream | CUDA stream used for device memory operations and kernel launches | |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_filter_columns_chunk | ( | cudf::mutable_column_view & | row_mask | ) | const |
Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table.
| [in,out] | row_mask | Mutable boolean column spanning all selected rows across all sources and indicating surviving rows from page pruning. The row mask size must equal the total number of rows in the input row groups, and is empty only when there are no such rows (yielding an empty output table) |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_payload_columns | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes payload columns and applies the row mask to the output table.
| row_group_indices | Span of vectors of input row group indices, one per source |
| column_chunk_data | Flattened device spans of payload column chunk data returned in the same order as payload_column_chunks_byte_ranges |
| row_mask | Boolean column spanning all selected rows across all sources and indicating which rows need to be read |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_payload_columns_chunk | ( | cudf::column_view const & | row_mask | ) | const |
Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk.
| row_mask | Boolean column spanning all selected rows across all sources and indicating which rows need to be read |
| std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_multifile::page_index_byte_ranges | ( | ) | const |
Get byte ranges of the page index for all sources.
| std::vector<FileMetaData> cudf::io::parquet::experimental::hybrid_scan_multifile::parquet_metadatas | ( | ) | const |
Get parquet metadatas for all sources.
| std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::payload_column_chunks_byte_ranges | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of payload columns.
Byte ranges are flattened in source order. Within each source, byte ranges follow the selected row group and column chunk order used by row_group_indices and options. The returned source map has one source index per byte range and can be used to regroup byte ranges by datasource before fetching.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| void cudf::io::parquet::experimental::hybrid_scan_multifile::reset_column_selection | ( | ) | const |
Resets the current column selection.
Resets the current column selection state forcing column re-selection in subsequent filter, byte range, setup chunking and materialization APIs. This is useful if the filter expression has been cascaded (and-ed) to include new columns.
| std::pair<std::vector<byte_range_info>, std::vector<byte_range_info> > cudf::io::parquet::experimental::hybrid_scan_multifile::secondary_filters_byte_ranges | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
| row_group_indices | Span of vectors of input row group indices, one per source |
| options | Parquet reader options |
| void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_all_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< std::vector< size_type > const > | row_group_indices, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Set up chunking information for all (or selected) columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Span of vectors of input row group indices, one per source |
| column_chunk_data | Flattened device spans of column chunk data returned in the same order as all_column_chunks_byte_ranges |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_filter_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< std::vector< size_type > const > | row_group_indices, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Set up chunking information for filter columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Span of vectors of input row group indices, one per source |
| row_mask | Boolean column spanning all selected rows across all sources and indicating which rows need to be read |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| column_chunk_data | Flattened device spans of filter column chunk data returned in the same order as filter_column_chunks_byte_ranges |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_payload_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< std::vector< size_type > const > | row_group_indices, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Set up chunking information for payload columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Span of vectors of input row group indices, one per source |
| row_mask | Boolean column spanning all selected rows across all sources and indicating which rows need to be read |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| column_chunk_data | Flattened device spans of payload column chunk data returned in the same order as payload_column_chunks_byte_ranges |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_page_indexes | ( | cudf::host_span< cudf::host_span< uint8_t const > const > | page_index_bytes | ) | const |
Set up the per-source page index within each Parquet file metadata.
| page_index_bytes | Host span of Parquet page index buffer bytes, one per source |
| size_type cudf::io::parquet::experimental::hybrid_scan_multifile::total_rows_in_row_groups | ( | cudf::host_span< std::vector< size_type > const > | row_group_indices | ) | const |
Get the total number of top-level rows in the per-source row groups.
| row_group_indices | Span of vectors of input row group indices, one per source |