hybrid_scan.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/parquet.hpp>
22 #include <cudf/io/types.hpp>
23 #include <cudf/types.hpp>
24 #include <cudf/utilities/export.hpp>
25 
26 #include <thrust/host_vector.h>
27 
28 #include <memory>
29 #include <utility>
30 #include <vector>
31 
32 namespace CUDF_EXPORT cudf {
33 namespace io::parquet::experimental::detail {
/// Forward declaration of the implementation class; only a `std::unique_ptr` to it is
/// stored by the reader declared later in this header, so its definition is not needed here.
38 class hybrid_scan_reader_impl;
39 } // namespace io::parquet::experimental::detail
40 } // namespace CUDF_EXPORT cudf
41 
44 
45 namespace CUDF_EXPORT cudf {
46 namespace io::parquet::experimental {
/**
 * @brief Whether to compute and use a data page mask, built from the row mask, so that
 * decompression and decoding can be skipped for data pages.
 *
 * A strongly typed boolean flag: the underlying type is `bool`.
 */
57 enum class use_data_page_mask : bool {
58  YES = true,  ///< Compute and use a data page mask
59  NO = false   ///< Do not compute or use a data page mask
60 };
61 
/**
 * @brief The experimental Parquet reader class for optimally reading Parquet files.
 *
 * NOTE(review): the class-head line was missing from this listing and has been restored from
 * the constructor/destructor names; confirm there is no base class or `final` specifier.
 */
277 class hybrid_scan_reader {
278  public:
/**
 * @brief Constructs the experimental Parquet reader from footer bytes.
 *
 * @param footer_bytes Host span of bytes; per the parameter name, the Parquet file footer
 * @param options Parquet reader options
 */
286  hybrid_scan_reader(cudf::host_span<uint8_t const> footer_bytes,
287  parquet_reader_options const& options);
288 
/**
 * @brief Constructs the experimental Parquet reader from already-available file metadata.
 *
 * @param parquet_metadata Parquet file-level metadata (`FileMetaData`)
 * @param options Parquet reader options
 */
296  hybrid_scan_reader(FileMetaData const& parquet_metadata,
297  parquet_reader_options const& options);
298 
/**
 * @brief Destructor for the experimental Parquet reader class.
 *
 * Declared here rather than defaulted implicitly because the class stores a
 * `std::unique_ptr` to a type that is only forward-declared in this header.
 */
302  ~hybrid_scan_reader();
303 
/**
 * @brief Get the Parquet file footer metadata.
 *
 * @return The file-level metadata as a `FileMetaData` object, returned by value
 */
312  [[nodiscard]] FileMetaData parquet_metadata() const;
313 
/**
 * @brief Get the byte range of the page index in the Parquet file.
 *
 * @return A `byte_range_info` (offset and size) describing the page index location
 */
319  [[nodiscard]] byte_range_info page_index_byte_range() const;
320 
/**
 * @brief Set up the page index within the Parquet file metadata (`FileMetaData`).
 *
 * @param page_index_bytes Host span of page index bytes; presumably the bytes located at
 *                         `page_index_byte_range()` — confirm against the implementation
 */
330  void setup_page_index(cudf::host_span<uint8_t const> page_index_bytes) const;
331 
/**
 * @brief Get all available row groups from the Parquet file.
 *
 * @param options Parquet reader options
 * @return Vector of `size_type` values, presumably the row group indices
 */
338  [[nodiscard]] std::vector<size_type> all_row_groups(parquet_reader_options const& options) const;
339 
/**
 * @brief Get the total number of top-level rows in the row groups.
 *
 * @param row_group_indices Host span of input row group indices
 * @return Total number of top-level rows in the given row groups
 */
346  [[nodiscard]] size_type total_rows_in_row_groups(
347  cudf::host_span<size_type const> row_group_indices) const;
348 
/**
 * @brief Filter the input row groups using column chunk statistics.
 *
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @return Vector of `size_type` values, presumably the row group indices that remain
 */
357  [[nodiscard]] std::vector<size_type> filter_row_groups_with_stats(
358  cudf::host_span<size_type const> row_group_indices,
359  parquet_reader_options const& options,
360  rmm::cuda_stream_view stream) const;
361 
/**
 * @brief Get byte ranges of bloom filters and dictionary pages (secondary filters) for row
 * group pruning.
 *
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @return Pair of byte-range vectors; presumably bloom filters first and dictionary pages
 *         second, following the order in the brief — confirm against the implementation
 */
374  [[nodiscard]] std::pair<std::vector<byte_range_info>, std::vector<byte_range_info>>
375  secondary_filters_byte_ranges(cudf::host_span<size_type const> row_group_indices,
376  parquet_reader_options const& options) const;
377 
/**
 * @brief Filter the row groups using column chunk dictionary pages.
 *
 * @param dictionary_page_data Host span of device buffers holding dictionary page data
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @return Vector of `size_type` values, presumably the row group indices that remain
 */
388  [[nodiscard]] std::vector<size_type> filter_row_groups_with_dictionary_pages(
389  cudf::host_span<rmm::device_buffer> dictionary_page_data,
390  cudf::host_span<size_type const> row_group_indices,
391  parquet_reader_options const& options,
392  rmm::cuda_stream_view stream) const;
393 
/**
 * @brief Filter the row groups using column chunk bloom filters.
 *
 * @param bloom_filter_data Host span of device buffers holding bloom filter data
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @return Vector of `size_type` values, presumably the row group indices that remain
 */
407  [[nodiscard]] std::vector<size_type> filter_row_groups_with_bloom_filters(
408  cudf::host_span<rmm::device_buffer> bloom_filter_data,
409  cudf::host_span<size_type const> row_group_indices,
410  parquet_reader_options const& options,
411  rmm::cuda_stream_view stream) const;
412 
/**
 * @brief Builds a boolean column indicating which rows survive the page statistics in the
 * page index.
 *
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @param mr Device memory resource reference; presumably used to allocate the returned
 *           column — confirm against the implementation
 * @return Owning pointer to the boolean row-mask column
 */
424  [[nodiscard]] std::unique_ptr<cudf::column> build_row_mask_with_page_index_stats(
425  cudf::host_span<size_type const> row_group_indices,
426  parquet_reader_options const& options,
427  rmm::cuda_stream_view stream,
428  rmm::device_async_resource_ref mr) const;
429 
/**
 * @brief Get byte ranges of column chunks of filter columns.
 *
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @return Vector of `byte_range_info` (offset and size) entries for the column chunks
 */
437  [[nodiscard]] std::vector<byte_range_info> filter_column_chunks_byte_ranges(
438  cudf::host_span<size_type const> row_group_indices,
439  parquet_reader_options const& options) const;
440 
/**
 * @brief Materializes filter columns and updates the input row mask to only the rows that
 * exist in the output.
 *
 * @param row_group_indices Host span of input row group indices
 * @param column_chunk_buffers Device buffers of column chunk data; taken by rvalue reference,
 *                             so the caller must not rely on them afterwards
 * @param row_mask Mutable column view of the row mask; updated by this call
 * @param mask_data_pages Whether to compute and use a data page mask
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @return Table with its metadata, returned by value
 */
453  [[nodiscard]] table_with_metadata materialize_filter_columns(
454  cudf::host_span<size_type const> row_group_indices,
455  std::vector<rmm::device_buffer>&& column_chunk_buffers,
456  cudf::mutable_column_view& row_mask,
457  use_data_page_mask mask_data_pages,
458  parquet_reader_options const& options,
459  rmm::cuda_stream_view stream) const;
460 
/**
 * @brief Get byte ranges of column chunks of payload columns.
 *
 * @param row_group_indices Host span of input row group indices
 * @param options Parquet reader options
 * @return Vector of `byte_range_info` (offset and size) entries for the column chunks
 */
468  [[nodiscard]] std::vector<byte_range_info> payload_column_chunks_byte_ranges(
469  cudf::host_span<size_type const> row_group_indices,
470  parquet_reader_options const& options) const;
471 
/**
 * @brief Materializes payload columns and applies the row mask to the output table.
 *
 * @param row_group_indices Host span of input row group indices
 * @param column_chunk_buffers Device buffers of column chunk data; taken by rvalue reference,
 *                             so the caller must not rely on them afterwards
 * @param row_mask Read-only column view of the row mask
 * @param mask_data_pages Whether to compute and use a data page mask
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 * @return Table with its metadata, returned by value
 */
483  [[nodiscard]] table_with_metadata materialize_payload_columns(
484  cudf::host_span<size_type const> row_group_indices,
485  std::vector<rmm::device_buffer>&& column_chunk_buffers,
486  cudf::column_view const& row_mask,
487  use_data_page_mask mask_data_pages,
488  parquet_reader_options const& options,
489  rmm::cuda_stream_view stream) const;
490 
/**
 * @brief Set up chunking information for filter columns and preprocess the input data pages.
 *
 * @param chunk_read_limit Limit applied to each chunk read (presumably in bytes — confirm)
 * @param pass_read_limit Limit applied to each read pass (presumably in bytes — confirm)
 * @param row_group_indices Host span of input row group indices
 * @param row_mask Read-only column view of the row mask
 * @param mask_data_pages Whether to compute and use a data page mask
 * @param column_chunk_buffers Device buffers of column chunk data; taken by rvalue reference,
 *                             so the caller must not rely on them afterwards
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 */
505  void setup_chunking_for_filter_columns(std::size_t chunk_read_limit,
506  std::size_t pass_read_limit,
507  cudf::host_span<size_type const> row_group_indices,
508  cudf::column_view const& row_mask,
509  use_data_page_mask mask_data_pages,
510  std::vector<rmm::device_buffer>&& column_chunk_buffers,
511  parquet_reader_options const& options,
512  rmm::cuda_stream_view stream) const;
513 
/**
 * @brief Materializes a chunk of filter columns and updates the corresponding range of the
 * input row mask.
 *
 * @param row_mask Mutable column view of the row mask; updated by this call
 * @param stream CUDA stream passed through to the operation
 * @return Table with its metadata for the chunk, returned by value
 */
523  [[nodiscard]] table_with_metadata materialize_filter_columns_chunk(
524  cudf::mutable_column_view& row_mask, rmm::cuda_stream_view stream) const;
525 
/**
 * @brief Set up chunking information for payload columns and preprocess the input data pages.
 *
 * @param chunk_read_limit Limit applied to each chunk read (presumably in bytes — confirm)
 * @param pass_read_limit Limit applied to each read pass (presumably in bytes — confirm)
 * @param row_group_indices Host span of input row group indices
 * @param row_mask Read-only column view of the row mask
 * @param mask_data_pages Whether to compute and use a data page mask
 * @param column_chunk_buffers Device buffers of column chunk data; taken by rvalue reference,
 *                             so the caller must not rely on them afterwards
 * @param options Parquet reader options
 * @param stream CUDA stream passed through to the operation
 */
540  void setup_chunking_for_payload_columns(std::size_t chunk_read_limit,
541  std::size_t pass_read_limit,
542  cudf::host_span<size_type const> row_group_indices,
543  cudf::column_view const& row_mask,
544  use_data_page_mask mask_data_pages,
545  std::vector<rmm::device_buffer>&& column_chunk_buffers,
546  parquet_reader_options const& options,
547  rmm::cuda_stream_view stream) const;
548 
/**
 * @brief Materializes a chunk of payload columns and applies the corresponding range of the
 * input row mask.
 *
 * @param row_mask Read-only column view of the row mask
 * @param stream CUDA stream passed through to the operation
 * @return Table with its metadata for the chunk, returned by value
 */
558  [[nodiscard]] table_with_metadata materialize_payload_columns_chunk(
559  cudf::column_view const& row_mask, rmm::cuda_stream_view stream) const;
560 
/**
 * @brief Check if there is any Parquet data left to read for the current setup.
 *
 * @return Boolean indicating whether data remains to be read
 */
566  [[nodiscard]] bool has_next_table_chunk() const;
567 
568  private:
/// Owning pointer to the implementation object; its type is only forward-declared in this header.
569  std::unique_ptr<detail::hybrid_scan_reader_impl> _impl;
570 };
571  // end of group
573 
574 } // namespace io::parquet::experimental
575 } // namespace CUDF_EXPORT cudf
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
The experimental parquet reader class to optimally read parquet files subject to highly selective fil...
std::unique_ptr< cudf::column > build_row_mask_with_page_index_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
Builds a boolean column indicating which rows survive the page statistics in the page index.
std::vector< size_type > filter_row_groups_with_dictionary_pages(cudf::host_span< rmm::device_buffer > dictionary_page_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk dictionary pages.
void setup_page_index(cudf::host_span< uint8_t const > page_index_bytes) const
Set up the page index within the Parquet file metadata (FileMetaData).
std::vector< byte_range_info > payload_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of payload columns.
byte_range_info page_index_byte_range() const
Get the byte range of the page index in the Parquet file.
hybrid_scan_reader(cudf::host_span< uint8_t const > footer_bytes, parquet_reader_options const &options)
Constructor for the experimental parquet reader class to optimally read Parquet files subject to high...
std::pair< std::vector< byte_range_info >, std::vector< byte_range_info > > secondary_filters_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of bloom filters and dictionary pages (secondary filters) for row group pruning.
hybrid_scan_reader(FileMetaData const &parquet_metadata, parquet_reader_options const &options)
Constructor for the experimental parquet reader class to optimally read Parquet files subject to high...
FileMetaData parquet_metadata() const
Get the Parquet file footer metadata.
std::vector< size_type > filter_row_groups_with_bloom_filters(cudf::host_span< rmm::device_buffer > bloom_filter_data, cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the row groups using column chunk bloom filters.
table_with_metadata materialize_payload_columns(cudf::host_span< size_type const > row_group_indices, std::vector< rmm::device_buffer > &&column_chunk_buffers, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Materializes payload columns and applies the row mask to the output table.
table_with_metadata materialize_filter_columns_chunk(cudf::mutable_column_view &row_mask, rmm::cuda_stream_view stream) const
Materializes a chunk of filter columns and updates the corresponding range of input row mask to only ...
void setup_chunking_for_payload_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, std::vector< rmm::device_buffer > &&column_chunk_buffers, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Set up chunking information for payload columns and preprocess the input data pages.
std::vector< size_type > filter_row_groups_with_stats(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Filter the input row groups using column chunk statistics.
table_with_metadata materialize_payload_columns_chunk(cudf::column_view const &row_mask, rmm::cuda_stream_view stream) const
Materializes a chunk of payload columns and applies the corresponding range of input row mask to the ...
std::vector< size_type > all_row_groups(parquet_reader_options const &options) const
Get all available row groups from the parquet file.
size_type total_rows_in_row_groups(cudf::host_span< size_type const > row_group_indices) const
Get the total number of top-level rows in the row groups.
bool has_next_table_chunk() const
Check if there is any parquet data left to read for the current setup.
std::vector< byte_range_info > filter_column_chunks_byte_ranges(cudf::host_span< size_type const > row_group_indices, parquet_reader_options const &options) const
Get byte ranges of column chunks of filter columns.
table_with_metadata materialize_filter_columns(cudf::host_span< size_type const > row_group_indices, std::vector< rmm::device_buffer > &&column_chunk_buffers, cudf::mutable_column_view &row_mask, use_data_page_mask mask_data_pages, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Materializes filter columns and updates the input row mask to only the rows that exist in the output ...
void setup_chunking_for_filter_columns(std::size_t chunk_read_limit, std::size_t pass_read_limit, cudf::host_span< size_type const > row_group_indices, cudf::column_view const &row_mask, use_data_page_mask mask_data_pages, std::vector< rmm::device_buffer > &&column_chunk_buffers, parquet_reader_options const &options, rmm::cuda_stream_view stream) const
Set up chunking information for filter columns and preprocess the input data pages.
~hybrid_scan_reader()
Destructor for the experimental parquet reader class.
Information about content of a parquet file.
Settings for read_parquet().
Definition: parquet.hpp:78
Stores the offset and size used to indicate a byte range.
A non-owning, mutable view of device data as a column of elements, some of which may be null as indic...
use_data_page_mask
Whether to compute and use a page mask using the row mask to skip decompression and decoding of the m...
Definition: hybrid_scan.hpp:57
@ YES
Compute and use a data page mask.
@ NO
Do not compute or use a data page mask.
detail::cccl_async_resource_ref< cuda::mr::async_resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:37
Parquet footer schema structs.
C++20 std::span with reduced feature set.
Definition: span.hpp:194
Thrift-derived struct describing file-level metadata.
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:303
Type declarations for libcudf.