Multi-file variant of the experimental Hybrid Scan Parquet reader. More...

#include <cudf/io/experimental/hybrid_scan_multifile.hpp>

Public Member Functions
	hybrid_scan_multifile (cudf::host_span< cudf::host_span< uint8_t const > const > footer_bytes, parquet_reader_options const &options)
	Constructor for the multi-file experimental Parquet reader. More...

	hybrid_scan_multifile (cudf::host_span< FileMetaData const > parquet_metadata, parquet_reader_options const &options)
	Constructor for the multi-file experimental Parquet reader. More...

	~hybrid_scan_multifile ()
	Destructor for the multi-file experimental Parquet reader.

std::vector< FileMetaData >	parquet_metadatas () const
	Get the Parquet file metadata for all sources. More...

std::vector< byte_range_info >	page_index_byte_ranges () const
	Get byte ranges of the page index for all sources. More...

void	setup_page_indexes (cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes) const
	Setup the per-source page index within each Parquet file metadata. More...

std::vector< std::vector< size_type > >	all_row_groups (parquet_reader_options const &options) const
	Get all available per-source row group indices from the parquet files. More...

size_type	total_rows_in_row_groups (cudf::host_span< std::vector< size_type > const > row_group_indices) const
	Get the total number of top-level rows in the per-source row groups. More...

void	reset_column_selection () const
	Resets the current column selection. More...

std::vector< std::vector< size_type > >	filter_row_groups_with_byte_range (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Filter the row groups using the byte range specified by [`bytes_to_skip`, `bytes_to_skip + bytes_to_read`) More...

std::vector< std::vector< size_type > >	filter_row_groups_with_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
	Filter the input row groups using column chunk statistics. More...

std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > >	secondary_filters_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning. More...

std::unique_ptr< cudf::column >	build_all_true_row_mask (cudf::host_span< std::vector< size_type > const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Builds an all-`true` boolean survival column whose size equals the total number of rows in the row groups. More...

std::unique_ptr< cudf::column >	build_row_mask_with_page_index_stats (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Builds a boolean column indicating surviving rows using page-level statistics in the page index. More...

std::pair< std::vector< byte_range_info >, std::vector< size_type > >	filter_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of column chunks of filter columns. More...

table_with_metadata	materialize_filter_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Materializes filter columns and updates the input row mask to only the rows that exist in the output table. More...

std::pair< std::vector< byte_range_info >, std::vector< size_type > >	payload_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of column chunks of payload columns. More...

table_with_metadata	materialize_payload_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Materializes payload columns and applies the row mask to the output table. More...

std::pair< std::vector< byte_range_info >, std::vector< size_type > >	all_column_chunks_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of column chunks of all (or selected) columns. More...

table_with_metadata	materialize_all_columns (cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Materializes all (or selected) columns and returns the final output table. More...

void	setup_chunking_for_filter_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Setup chunking information for filter columns and preprocess the input data pages. More...

table_with_metadata	materialize_filter_columns_chunk (cudf::mutable_column_view &row_mask) const
	Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table. More...

void	setup_chunking_for_payload_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Setup chunking information for payload columns and preprocess the input data pages. More...

table_with_metadata	materialize_payload_columns_chunk (cudf::column_view const &row_mask) const
	Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk. More...

void	setup_chunking_for_all_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< std::vector< size_type > const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
	Setup chunking information for all (or selected) columns and preprocess the input data pages. More...

table_with_metadata	materialize_all_columns_chunk () const
	Materializes a chunk of all (or selected) columns and returns the output table chunk. More...

std::vector< std::vector< std::vector< size_type > > >	construct_row_group_passes (cudf::host_span< std::vector< size_type > const > row_group_indices, std::size_t pass_read_limit) const
	Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit. More...

bool	has_next_table_chunk () const
	Check if there is any parquet data left to read for the current chunked setup. More...

std::pair< std::vector< byte_range_info >, std::vector< size_type > >	dictionary_pages_byte_ranges (cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options) const
	Get byte ranges of column chunk dictionary pages for row group pruning. More...

std::vector< std::vector< size_type > >	filter_row_groups_with_dictionary_pages (cudf::host_span< cudf::device_span< uint8_t const > const > dictionary_page_data, cudf::host_span< std::vector< size_type > const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
	Filter the row groups using column chunk dictionary pages. More...

Detailed Description

Multi-file variant of the experimental Hybrid Scan Parquet reader.

Vectorizes hybrid_scan_reader APIs to support multiple Parquet sources. Inputs and outputs are indexed by source order, except for the row mask, which is a single BOOL8 column spanning all rows from all sources, concatenated in source order and then in row-group order within each source.

Note: Detailed usage documentation will be added once all APIs are in place. This reader will eventually move to hybrid_scan.hpp and the existing single-file reader (hybrid_scan_reader) will become its subclass. It is kept separate for now only to reduce noise.

Definition at line 56 of file hybrid_scan_multifile.hpp.

Constructor & Destructor Documentation

◆ hybrid_scan_multifile() [1/2]

cudf::io::parquet::experimental::hybrid_scan_multifile::hybrid_scan_multifile	(	cudf::host_span< cudf::host_span< uint8_t const > const >	footer_bytes,
		parquet_reader_options const &	options
	)

explicit

Constructor for the multi-file experimental Parquet reader.

Parameters

footer_bytes	Host span of Parquet file footer byte spans, one per source
options	Parquet reader options

◆ hybrid_scan_multifile() [2/2]

cudf::io::parquet::experimental::hybrid_scan_multifile::hybrid_scan_multifile	(	cudf::host_span< FileMetaData const >	parquet_metadata,
		parquet_reader_options const &	options
	)

explicit

Constructor for the multi-file experimental Parquet reader.

Parameters

parquet_metadata	Host span of pre-populated Parquet file metadata, one per source
options	Parquet reader options

Member Function Documentation

◆ all_column_chunks_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_column_chunks_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of column chunks of all (or selected) columns.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Pair of flattened byte ranges to column chunks of all (or selected) columns and their corresponding source indices

◆ all_row_groups()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::all_row_groups ( parquet_reader_options const & options ) const

Get all available per-source row group indices from the parquet files.

Parameters

options Parquet reader options

Returns: Vector of vectors of row group indices, one per source

◆ build_all_true_row_mask()

std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_all_true_row_mask	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Builds an all-true boolean survival column whose size equals the total number of rows in the row groups.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: An all-true boolean (survival) column spanning all selected rows across all sources

◆ build_row_mask_with_page_index_stats()

std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_multifile::build_row_mask_with_page_index_stats	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Builds a boolean column indicating surviving rows using page-level statistics in the page index.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: A boolean column spanning all selected rows across all sources and indicating which filter column rows survive the statistics in the page index

◆ construct_row_group_passes()

std::vector<std::vector<std::vector<size_type> > > cudf::io::parquet::experimental::hybrid_scan_multifile::construct_row_group_passes	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		std::size_t	pass_read_limit
	)		const

Partition row groups into passes such that the amount of GPU memory required to read, decompress and decode a pass is bounded by the specified limit.

Note that the pass_read_limit is a hint, not an absolute limit - if a single row group cannot fit within the limit given, it will still constitute a pass. The compressed row group size is estimated over all columns in each row group (not just the columns selected for reading), for conservative estimates.

Exceptions

std::invalid_argument if there are no row group indices in the input

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
pass_read_limit	Memory limit to read and decompress row group data, `0` if there is no limit (single pass)

Returns: Vector of per-source row group indices, one per constructed pass

◆ dictionary_pages_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::dictionary_pages_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of column chunk dictionary pages for row group pruning.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Pair of flattened byte ranges to column chunk dictionary pages subject to the filter predicate and their corresponding source indices

◆ filter_column_chunks_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_column_chunks_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of column chunks of filter columns.

Byte ranges are flattened in source order. Within each source, byte ranges follow the selected row group and column chunk order used by row_group_indices and options. The returned source map has one source index per byte range and can be used to regroup byte ranges by datasource before fetching.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Pair of flattened byte ranges to column chunks of filter columns and their corresponding source indices

◆ filter_row_groups_with_byte_range()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_byte_range	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read)

Filters the row groups such that only the row groups that start within the byte range are selected. Note that the last selected row group may end beyond the byte range.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Vector of vectors of filtered row group indices, one per source

◆ filter_row_groups_with_dictionary_pages()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_dictionary_pages	(	cudf::host_span< cudf::device_span< uint8_t const > const >	dictionary_page_data,
		cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream
	)		const

Filter the row groups using column chunk dictionary pages.

Parameters

dictionary_page_data	Device spans of dictionary page data of column chunks with an (in)equality predicate, ordered to match the dictionary page byte ranges returned by `dictionary_pages_byte_ranges`
row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches

Returns: Vector of vectors of filtered row group indices, one per source

◆ filter_row_groups_with_stats()

std::vector<std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::filter_row_groups_with_stats	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream
	)		const

Filter the input row groups using column chunk statistics.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches

Returns: Vector of vectors of filtered row group indices, one per source

◆ has_next_table_chunk()

bool cudf::io::parquet::experimental::hybrid_scan_multifile::has_next_table_chunk ( ) const

Check if there is any parquet data left to read for the current chunked setup.

Returns: Boolean indicating if there is any data left to read

◆ materialize_all_columns()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_all_columns	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Materializes all (or selected) columns and returns the final output table.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
column_chunk_data	Flattened device spans of column chunk data returned in the same order as `all_column_chunks_byte_ranges`
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table

Returns: Table of all materialized columns and metadata

◆ materialize_all_columns_chunk()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_all_columns_chunk ( ) const

Materializes a chunk of all (or selected) columns and returns the output table chunk.

Returns: Table chunk of all (or selected) materialized columns and metadata

◆ materialize_filter_columns()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_filter_columns	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		cudf::mutable_column_view &	row_mask,
		use_data_page_mask	mask_data_pages,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Materializes filter columns and updates the input row mask to only the rows that exist in the output table.

Parameters

	row_group_indices	Span of vectors of input row group indices, one per source
	column_chunk_data	Flattened device spans of filter column chunk data returned in the same order as `filter_column_chunks_byte_ranges`
[in,out]	row_mask	Mutable boolean column spanning all selected rows across all sources and indicating surviving rows from page pruning
	mask_data_pages	Whether to build and use a data page mask using the row mask
	options	Parquet reader options
	stream	CUDA stream used for device memory operations and kernel launches
	mr	Device memory resource used to allocate the device memory for the output table

Returns: Table of materialized filter columns and metadata

◆ materialize_filter_columns_chunk()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_filter_columns_chunk ( cudf::mutable_column_view & row_mask ) const

Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table.

Parameters

[in,out] row_mask Mutable boolean column spanning all selected rows across all sources and indicating surviving rows from page pruning. The row mask size must equal the total number of rows in the input row groups, and is empty only when there are no such rows (yielding an empty output table)

Returns: Table chunk of materialized filter columns and metadata

◆ materialize_payload_columns()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_payload_columns	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		cudf::column_view const &	row_mask,
		use_data_page_mask	mask_data_pages,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Materializes payload columns and applies the row mask to the output table.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
column_chunk_data	Flattened device spans of payload column chunk data returned in the same order as `payload_column_chunks_byte_ranges`
row_mask	Boolean column spanning all selected rows across all sources and indicating which rows need to be read
mask_data_pages	Whether to build and use a data page mask using the row mask
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table

Returns: Table of materialized payload columns and metadata

◆ materialize_payload_columns_chunk()

table_with_metadata cudf::io::parquet::experimental::hybrid_scan_multifile::materialize_payload_columns_chunk ( cudf::column_view const & row_mask ) const

Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk.

Parameters

row_mask Boolean column spanning all selected rows across all sources and indicating which rows need to be read

Returns: Table chunk of materialized payload columns and metadata

◆ page_index_byte_ranges()

std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_multifile::page_index_byte_ranges ( ) const

Get byte ranges of the page index for all sources.

Returns: Vector of page index byte ranges, one per source

◆ parquet_metadatas()

std::vector<FileMetaData> cudf::io::parquet::experimental::hybrid_scan_multifile::parquet_metadatas ( ) const

Get the Parquet file metadata for all sources.

Returns: Vector of parquet metadata, one per source

◆ payload_column_chunks_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<size_type> > cudf::io::parquet::experimental::hybrid_scan_multifile::payload_column_chunks_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of column chunks of payload columns.

Byte ranges are flattened in source order. Within each source, byte ranges follow the selected row group and column chunk order used by row_group_indices and options. The returned source map has one source index per byte range and can be used to regroup byte ranges by datasource before fetching.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Pair of flattened byte ranges to column chunks of payload columns and their corresponding source indices

◆ reset_column_selection()

void cudf::io::parquet::experimental::hybrid_scan_multifile::reset_column_selection ( ) const

Resets the current column selection.

Resets the current column selection state forcing column re-selection in subsequent filter, byte range, setup chunking and materialization APIs. This is useful if the filter expression has been cascaded (and-ed) to include new columns.

◆ secondary_filters_byte_ranges()

std::pair<std::vector<byte_range_info>, std::vector<byte_range_info> > cudf::io::parquet::experimental::hybrid_scan_multifile::secondary_filters_byte_ranges	(	cudf::host_span< std::vector< size_type > const >	row_group_indices,
		parquet_reader_options const &	options
	)		const

Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.

Note: Device buffers for bloom filter byte ranges must be allocated using a 32-byte-aligned memory resource.

Parameters

row_group_indices	Span of vectors of input row group indices, one per source
options	Parquet reader options

Returns: Pair of vectors of byte ranges of column chunk bloom filters and dictionary pages subject to the filter predicate

◆ setup_chunking_for_all_columns()

void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_all_columns	(	std::size_t	chunk_read_limit,
		std::size_t	pass_read_limit,
		cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Setup chunking information for all (or selected) columns and preprocess the input data pages.

Parameters

chunk_read_limit	Limit on total number of bytes to be returned per table chunk. `0` if there is no limit
pass_read_limit	Limit on the memory used for reading and decompressing data. `0` if there is no limit
row_group_indices	Span of vectors of input row group indices, one per source
column_chunk_data	Flattened device spans of column chunk data returned in the same order as `all_column_chunks_byte_ranges`
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table chunks

◆ setup_chunking_for_filter_columns()

void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_filter_columns	(	std::size_t	chunk_read_limit,
		std::size_t	pass_read_limit,
		cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::column_view const &	row_mask,
		use_data_page_mask	mask_data_pages,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Setup chunking information for filter columns and preprocess the input data pages.

Parameters

chunk_read_limit	Limit on total number of bytes to be returned per table chunk. `0` if there is no limit
pass_read_limit	Limit on the memory used for reading and decompressing data. `0` if there is no limit
row_group_indices	Span of vectors of input row group indices, one per source
row_mask	Boolean column spanning all selected rows across all sources and indicating which rows need to be read
mask_data_pages	Whether to build and use a data page mask using the row mask
column_chunk_data	Flattened device spans of filter column chunk data returned in the same order as `filter_column_chunks_byte_ranges`
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table chunks

◆ setup_chunking_for_payload_columns()

void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_chunking_for_payload_columns	(	std::size_t	chunk_read_limit,
		std::size_t	pass_read_limit,
		cudf::host_span< std::vector< size_type > const >	row_group_indices,
		cudf::column_view const &	row_mask,
		use_data_page_mask	mask_data_pages,
		cudf::host_span< cudf::device_span< uint8_t const > const >	column_chunk_data,
		parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream,
		rmm::device_async_resource_ref	mr
	)		const

Setup chunking information for payload columns and preprocess the input data pages.

Parameters

chunk_read_limit	Limit on total number of bytes to be returned per table chunk. `0` if there is no limit
pass_read_limit	Limit on the memory used for reading and decompressing data. `0` if there is no limit
row_group_indices	Span of vectors of input row group indices, one per source
row_mask	Boolean column spanning all selected rows across all sources and indicating which rows need to be read
mask_data_pages	Whether to build and use a data page mask using the row mask
column_chunk_data	Flattened device spans of payload column chunk data returned in the same order as `payload_column_chunks_byte_ranges`
options	Parquet reader options
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the device memory for the output table chunks

◆ setup_page_indexes()

void cudf::io::parquet::experimental::hybrid_scan_multifile::setup_page_indexes ( cudf::host_span< cudf::host_span< uint8_t const > const > page_index_bytes ) const

Setup the per-source page index within each Parquet file metadata.

Parameters

page_index_bytes Host span of Parquet page index buffer bytes, one per source

◆ total_rows_in_row_groups()

size_type cudf::io::parquet::experimental::hybrid_scan_multifile::total_rows_in_row_groups ( cudf::host_span< std::vector< size_type > const > row_group_indices ) const

Get the total number of top-level rows in the per-source row groups.

Parameters

row_group_indices Span of vectors of input row group indices, one per source

Returns: Total number of top-level rows across all sources

The documentation for this class was generated from the following file:

cudf/io/experimental/hybrid_scan_multifile.hpp

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

◆ hybrid_scan_multifile() [1/2]

◆ hybrid_scan_multifile() [2/2]

Member Function Documentation

◆ all_column_chunks_byte_ranges()

◆ all_row_groups()

◆ build_all_true_row_mask()

◆ build_row_mask_with_page_index_stats()

◆ construct_row_group_passes()

◆ dictionary_pages_byte_ranges()

◆ filter_column_chunks_byte_ranges()

◆ filter_row_groups_with_byte_range()

◆ filter_row_groups_with_dictionary_pages()

◆ filter_row_groups_with_stats()

◆ has_next_table_chunk()

◆ materialize_all_columns()

◆ materialize_all_columns_chunk()

◆ materialize_filter_columns()

◆ materialize_filter_columns_chunk()

◆ materialize_payload_columns()

◆ materialize_payload_columns_chunk()

◆ page_index_byte_ranges()

◆ parquet_metadatas()

◆ payload_column_chunks_byte_ranges()

◆ reset_column_selection()

◆ secondary_filters_byte_ranges()

◆ setup_chunking_for_all_columns()

◆ setup_chunking_for_filter_columns()

◆ setup_chunking_for_payload_columns()

◆ setup_page_indexes()

◆ total_rows_in_row_groups()