The experimental parquet reader class to optimally read parquet files subject to highly selective filters, called a Hybrid Scan operation. More...
#include <hybrid_scan.hpp>
Public Member Functions | |
| hybrid_scan_reader (cudf::host_span< uint8_t const > footer_bytes, parquet_reader_options const &options) | |
| Constructor for the experimental parquet reader class to optimally read Parquet files subject to highly selective filters. More... | |
| hybrid_scan_reader (FileMetaData const &parquet_metadata, parquet_reader_options const &options) | |
| Constructor for the experimental parquet reader class to optimally read Parquet files subject to highly selective filters. More... | |
| ~hybrid_scan_reader () | |
| Destructor for the experimental parquet reader class. | |
| FileMetaData | parquet_metadata () const |
| Get the Parquet file footer metadata. More... | |
| byte_range_info | page_index_byte_range () const |
| Get the byte range of the page index in the Parquet file. More... | |
| void | setup_page_index (cudf::host_span< uint8_t const > page_index_bytes) const |
| Set up the page index within the Parquet file metadata (FileMetaData) More... | |
| std::vector< size_type > | all_row_groups (parquet_reader_options const &options) const |
| Get all available row groups from the parquet file. More... | |
| size_type | total_rows_in_row_groups (cudf::host_span< size_type const > row_group_indices) const |
| Get the total number of top-level rows in the row groups. More... | |
| void | reset_column_selection () const |
| Resets the current column selection. More... | |
| std::vector< size_type > | filter_row_groups_with_byte_range (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const |
| Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read) More... | |
| std::vector< size_type > | filter_row_groups_with_stats (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const |
| Filter the input row groups using column chunk statistics. More... | |
| std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > | secondary_filters_byte_ranges (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning. More... | |
| std::vector< size_type > | filter_row_groups_with_dictionary_pages (cudf::host_span< cudf::device_span< uint8_t const > const > dictionary_page_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const |
| Filter the row groups using column chunk dictionary pages. More... | |
| std::vector< size_type > | filter_row_groups_with_bloom_filters (cudf::host_span< cudf::device_span< uint8_t const > const > bloom_filter_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const |
| Filter the row groups using column chunk bloom filters. More... | |
| std::unique_ptr< cudf::column > | build_all_true_row_mask (cudf::host_span< size_type const > row_group_indices, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Builds a boolean (survival) column of size equal to the total number of rows in the row groups containing all true values. More... | |
| std::unique_ptr< cudf::column > | build_row_mask_with_page_index_stats (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Builds a boolean column indicating surviving rows using page-level statistics in the page index. More... | |
| std::vector< byte_range_info > | filter_column_chunks_byte_ranges (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of filter columns. More... | |
| table_with_metadata | materialize_filter_columns (cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes filter columns and updates the input row mask to only the rows that exist in the output table. More... | |
| std::vector< byte_range_info > | payload_column_chunks_byte_ranges (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of payload columns. More... | |
| table_with_metadata | materialize_payload_columns (cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes payload columns and applies the row mask to the output table. More... | |
| std::vector< byte_range_info > | all_column_chunks_byte_ranges (cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const |
| Get byte ranges of column chunks of all (or selected) columns. More... | |
| table_with_metadata | materialize_all_columns (cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Materializes all (or selected) columns and returns the final output table. More... | |
| void | setup_chunking_for_filter_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Setup chunking information for filter columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_filter_columns_chunk (cudf::mutable_column_view &row_mask) const |
| Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table. More... | |
| void | setup_chunking_for_payload_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Setup chunking information for payload columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_payload_columns_chunk (cudf::column_view const &row_mask) const |
| Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk. More... | |
| void | setup_chunking_for_all_columns (std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::host_span< cudf::device_span< uint8_t const > const > column_chunk_data, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const |
| Setup chunking information for all (or selected) columns and preprocess the input data pages. More... | |
| table_with_metadata | materialize_all_columns_chunk () const |
| Materializes all (or selected) columns and returns the final output table. More... | |
| bool | has_next_table_chunk () const |
| Check if there is any parquet data left to read for the current setup. More... | |
The experimental parquet reader class to optimally read parquet files subject to highly selective filters, called a Hybrid Scan operation.
This class is designed to best exploit reductive optimization techniques to speed up reading Parquet files subject to highly selective filters. The parquet file contents are read in two passes. In the first pass, only the filter columns (i.e. columns that appear in the filter expression) are read allowing pruning of row groups and filter column data pages using the filter expression. In the second pass, only the payload columns (i.e. columns that do not appear in the filter expression) are optimally read by applying the surviving row mask from the first pass to prune payload column data pages.
The following code snippets demonstrate how to use the experimental parquet reader.
Start with an instance of the experimental reader with a span of parquet file footer bytes and parquet reader options.
Metadata handling (OPTIONAL): Get a materialized parquet file footer metadata struct (FileMetaData) from the reader to get insights into the parquet data as needed. Optionally, set up the page index to materialize page level stats used for data page pruning.
Row group pruning (OPTIONAL): Start with either a list of custom or all row group indices in the parquet file and optionally filter it using a byte range and/or the filter expression using column chunk statistics, dictionaries and bloom filters. Byte ranges for column chunk dictionary pages and bloom filters within the parquet file may be obtained via the secondary_filters_byte_ranges() function. The byte ranges may be read into device buffers and their device spans may be passed to the row group filtration functions.
Build an initial row mask: Once the row groups are filtered, the next step is to build an initial BOOL8 row mask column indicating which rows in the current span of row groups survive in the final table. This row mask column may contain all true values built using the build_all_true_row_mask() function or it may contain a true value for only the rows that survive the page-level statistics from the page index subject to the same filter as row groups (needs page index to be set up using the setup_page_index() function). The size of this row mask column must be equal to the total number of rows in the current span of row groups.
Materialize filter columns: Once we are done with pruning row groups and constructing the row mask, the next step is to materialize filter columns into a table (first reader pass). This is done using the materialize_filter_columns() function. This function requires a span of device spans of column chunk data for the current list of row groups, and a mutable view of the current row mask. The function optionally builds a mask for the current data pages using the input row mask to skip decompression and decoding of the pruned pages based on the mask_data_pages argument. The filter columns are then read into a table and filtered based on the filter expression and the row mask is updated to only indicate the rows that survive in the read table. The final table is returned. The byte ranges for the required column chunk data may be obtained using the filter_column_chunks_byte_ranges() function and read into device buffers with corresponding device spans.
Materialize payload columns: Once the filter columns are materialized, the final step is to materialize the payload columns into another table (second reader pass). This is done using the materialize_payload_columns() function, which is identical to materialize_filter_columns() in terms of functionality except that it accepts an immutable view of the row mask and uses it to filter the read output table before returning it. The byte ranges for the required column chunk data may be obtained using the payload_column_chunks_byte_ranges() function and read into device buffers with corresponding device spans.
Once both reader passes are complete, the filter and payload column tables may be trivially combined by releasing the columns from both tables and moving them into a new cudf table.
cudf::io::read_parquet() function. Definition at line 278 of file hybrid_scan.hpp.
|
explicit |
Constructor for the experimental parquet reader class to optimally read Parquet files subject to highly selective filters.
| footer_bytes | Host span of parquet file footer bytes |
| options | Parquet reader options |
|
explicit |
Constructor for the experimental parquet reader class to optimally read Parquet files subject to highly selective filters.
| parquet_metadata | Pre-populated Parquet file metadata |
| options | Parquet reader options |
| std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_reader::all_column_chunks_byte_ranges | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of all (or selected) columns.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| std::vector<size_type> cudf::io::parquet::experimental::hybrid_scan_reader::all_row_groups | ( | parquet_reader_options const & | options | ) | const |
Get all available row groups from the parquet file.
| options | Parquet reader options |
| std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_reader::build_all_true_row_mask | ( | cudf::host_span< size_type const > | row_group_indices, |
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Builds a boolean (survival) column of size equal to the total number of rows in the row groups containing all true values.
| row_group_indices | Input row groups indices |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the returned column's device memory |
| std::unique_ptr<cudf::column> cudf::io::parquet::experimental::hybrid_scan_reader::build_row_mask_with_page_index_stats | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Builds a boolean column indicating surviving rows using page-level statistics in the page index.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the returned column's device memory |
| std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_reader::filter_column_chunks_byte_ranges | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of filter columns.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| std::vector<size_type> cudf::io::parquet::experimental::hybrid_scan_reader::filter_row_groups_with_bloom_filters | ( | cudf::host_span< cudf::device_span< uint8_t const > const > | bloom_filter_data, |
| cudf::host_span< size_type const > | row_group_indices, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream | ||
| ) | const |
Filter the row groups using column chunk bloom filters.
Note: The bloom_filter_data device spans must point to 32-byte aligned addresses.
| bloom_filter_data | Device spans of bloom filter data of column chunks with an equality predicate |
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| std::vector<size_type> cudf::io::parquet::experimental::hybrid_scan_reader::filter_row_groups_with_byte_range | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Filter the row groups using the byte range specified by [bytes_to_skip, bytes_to_skip + bytes_to_read)
Filters the row groups such that only the row groups that start within the byte range are selected. Note that the last selected row group may end beyond the byte range.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| std::vector<size_type> cudf::io::parquet::experimental::hybrid_scan_reader::filter_row_groups_with_dictionary_pages | ( | cudf::host_span< cudf::device_span< uint8_t const > const > | dictionary_page_data, |
| cudf::host_span< size_type const > | row_group_indices, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream | ||
| ) | const |
Filter the row groups using column chunk dictionary pages.
| dictionary_page_data | Device spans of dictionary page data of column chunks with an (in)equality predicate |
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| std::vector<size_type> cudf::io::parquet::experimental::hybrid_scan_reader::filter_row_groups_with_stats | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream | ||
| ) | const |
Filter the input row groups using column chunk statistics.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| bool cudf::io::parquet::experimental::hybrid_scan_reader::has_next_table_chunk | ( | ) | const |
Check if there is any parquet data left to read for the current setup.
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_all_columns | ( | cudf::host_span< size_type const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes all (or selected) columns and returns the final output table.
| row_group_indices | Input row groups indices |
| column_chunk_data | Device spans of column chunk data of all columns |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_all_columns_chunk | ( | ) | const |
Materializes all (or selected) columns and returns the final output table.
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_filter_columns | ( | cudf::host_span< size_type const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| cudf::mutable_column_view & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes filter columns and updates the input row mask to only the rows that exist in the output table.
| row_group_indices | Input row groups indices | |
| column_chunk_data | Device spans of column chunk data of filter columns | |
| [in,out] | row_mask | Mutable boolean column indicating surviving rows from page pruning |
| mask_data_pages | Whether to build and use a data page mask using the row mask | |
| options | Parquet reader options | |
| stream | CUDA stream used for device memory operations and kernel launches | |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_filter_columns_chunk | ( | cudf::mutable_column_view & | row_mask | ) | const |
Materializes a chunk of filter columns and updates the corresponding range of input row mask to only the rows that exist in the output table.
| [in,out] | row_mask | Mutable boolean column indicating surviving rows from page pruning |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_payload_columns | ( | cudf::host_span< size_type const > | row_group_indices, |
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Materializes payload columns and applies the row mask to the output table.
| row_group_indices | Input row groups indices |
| column_chunk_data | Device spans of column chunk data of payload columns |
| row_mask | Boolean column indicating which rows need to be read. All rows read if empty |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table |
| table_with_metadata cudf::io::parquet::experimental::hybrid_scan_reader::materialize_payload_columns_chunk | ( | cudf::column_view const & | row_mask | ) | const |
Materializes a chunk of payload columns and applies the corresponding range of input row mask to the output table chunk.
| row_mask | Boolean column indicating which rows need to be read. All rows read if empty |
| byte_range_info cudf::io::parquet::experimental::hybrid_scan_reader::page_index_byte_range | ( | ) | const |
Get the byte range of the page index in the Parquet file.
| FileMetaData cudf::io::parquet::experimental::hybrid_scan_reader::parquet_metadata | ( | ) | const |
Get the Parquet file footer metadata.
Returns the materialized Parquet file footer metadata struct. The footer will contain the materialized page index if called after setup_page_index().
| std::vector<byte_range_info> cudf::io::parquet::experimental::hybrid_scan_reader::payload_column_chunks_byte_ranges | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of column chunks of payload columns.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| void cudf::io::parquet::experimental::hybrid_scan_reader::reset_column_selection | ( | ) | const |
Resets the current column selection.
Resets the current column selection state, forcing column re-selection in subsequent filter, byte range, setup chunking and materialization APIs. This is useful if the filter expression has been cascaded (and-ed) to include new columns.
| std::pair<std::vector<byte_range_info>, std::vector<byte_range_info> > cudf::io::parquet::experimental::hybrid_scan_reader::secondary_filters_byte_ranges | ( | cudf::host_span< size_type const > | row_group_indices, |
| parquet_reader_options const & | options | ||
| ) | const |
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
| row_group_indices | Input row groups indices |
| options | Parquet reader options |
| void cudf::io::parquet::experimental::hybrid_scan_reader::setup_chunking_for_all_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< size_type const > | row_group_indices, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Setup chunking information for all (or selected) columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Input row groups indices |
| column_chunk_data | Device spans of column chunk data of all columns |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| void cudf::io::parquet::experimental::hybrid_scan_reader::setup_chunking_for_filter_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< size_type const > | row_group_indices, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Setup chunking information for filter columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Input row groups indices |
| row_mask | Boolean column indicating which rows need to be read. All rows read if empty |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| column_chunk_data | Device spans of column chunk data of filter columns |
| options | Parquet reader options |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| stream | CUDA stream used for device memory operations and kernel launches |
| void cudf::io::parquet::experimental::hybrid_scan_reader::setup_chunking_for_payload_columns | ( | std::size_t | chunk_read_limit, |
| std::size_t | pass_read_limit, | ||
| cudf::host_span< size_type const > | row_group_indices, | ||
| cudf::column_view const & | row_mask, | ||
| use_data_page_mask | mask_data_pages, | ||
| cudf::host_span< cudf::device_span< uint8_t const > const > | column_chunk_data, | ||
| parquet_reader_options const & | options, | ||
| rmm::cuda_stream_view | stream, | ||
| rmm::device_async_resource_ref | mr | ||
| ) | const |
Setup chunking information for payload columns and preprocess the input data pages.
| chunk_read_limit | Limit on total number of bytes to be returned per table chunk. 0 if there is no limit |
| pass_read_limit | Limit on the memory used for reading and decompressing data. 0 if there is no limit |
| row_group_indices | Input row groups indices |
| row_mask | Boolean column indicating which rows need to be read. All rows read if empty |
| mask_data_pages | Whether to build and use a data page mask using the row mask |
| column_chunk_data | Device spans of column chunk data of payload columns |
| options | Parquet reader options |
| stream | CUDA stream used for device memory operations and kernel launches |
| mr | Device memory resource used to allocate the device memory for the output table chunks |
| void cudf::io::parquet::experimental::hybrid_scan_reader::setup_page_index | ( | cudf::host_span< uint8_t const > | page_index_bytes | ) | const |
Setup the page index within the Parquet file metadata (FileMetaData)
Materialize the ColumnIndex and OffsetIndex structs (collectively called the page index) within the Parquet file metadata struct (returned by parquet_metadata()). The statistics contained in the page index can be used to prune data pages before decoding.
| page_index_bytes | Host span of Parquet page index buffer bytes |
| size_type cudf::io::parquet::experimental::hybrid_scan_reader::total_rows_in_row_groups | ( | cudf::host_span< size_type const > | row_group_indices | ) | const |
Get the total number of top-level rows in the row groups.
| row_group_indices | Input row groups indices |