deletion_vectors.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/parquet.hpp>
9 #include <cudf/io/types.hpp>
10 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
13 
14 #include <queue>
15 
16 namespace CUDF_EXPORT cudf {
17 namespace io::parquet::experimental {
18 
30  // The following vectors specify the data spans of the input deletion vectors and the number of
31  // rows spanned by each deletion vector, in order. Deletion vectors are applied in the order in
32  // which they appear in the vectors. If these vectors are empty, the table is not filtered.
33 
35  std::vector<cudf::host_span<cuda::std::byte const>> serialized_roaring_bitmaps;
37  std::vector<size_type> deletion_vector_row_counts;
38 
39  // The following vectors customize the row index column that is prepended to the table read from
40  // the Parquet source(s). If these vectors are empty, the index column is a sequence that starts
41  // at 0 and spans the total number of rows in the table.
42 
44  std::vector<size_t> row_group_offsets;
46  std::vector<size_type> row_group_num_rows;
47 };
48 
63  public:
77  std::size_t chunk_read_limit,
78  parquet_reader_options const& options,
82 
104  std::size_t chunk_read_limit,
105  std::size_t pass_read_limit,
106  parquet_reader_options const& options,
110 
116 
// Checks whether there is any data in the given source that has not yet been read.
// Marked [[nodiscard]] so that a caller ignoring the returned flag gets a compiler warning.
122  [[nodiscard]] bool has_next() const;
123 
137 
138  private:
139  std::unique_ptr<cudf::io::chunked_parquet_reader> _reader;
140  std::queue<size_t> _row_group_row_offsets;
141  std::queue<size_type> _row_group_row_counts;
142  std::queue<cudf::roaring_bitmap> _deletion_vectors;
143  std::queue<size_type> _deletion_vector_row_counts;
144  size_t _start_row;
145  bool _is_unspecified_row_group_data;
146  rmm::cuda_stream_view _stream;
149 };
150 
174  parquet_reader_options const& options,
178 
188 [[nodiscard]] size_t compute_num_deleted_rows(
190  cudf::size_type max_chunk_rows = std::numeric_limits<size_type>::max(),
192  // end of group
194 
195 } // namespace io::parquet::experimental
196 } // namespace CUDF_EXPORT cudf
The chunked parquet reader class to read a Parquet source iteratively in a series of tables,...
table_with_metadata read_chunk()
Reads a chunk of the table from the Parquet source, prepends an index column to it, and filters the resulta...
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance and the roaring bitmap deletion vector.
bool has_next() const
Check if there is any data in the given source that has not yet been read.
Settings for read_parquet().
Definition: parquet.hpp:66
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
size_t compute_num_deleted_rows(deletion_vector_info const &deletion_vector_info, cudf::size_type max_chunk_rows=std::numeric_limits< size_type >::max(), rmm::cuda_stream_view stream=cudf::get_default_stream())
Computes the number of rows deleted by the serialized 64-bit roaring bitmap deletion vectors.
table_with_metadata read_parquet(parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a table from a Parquet source, prepends an index column to it, deserializes the specified 64-bit ...
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::resource_ref< cuda::mr::device_accessible > device_async_resource_ref
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Roaring bitmap APIs.
Struct used to specify information about deletion vectors and the index column to the experimental pa...
std::vector< cudf::host_span< cuda::std::byte const > > serialized_roaring_bitmaps
Host spans of 64-bit roaring bitmaps serialized in portable format.
std::vector< size_type > deletion_vector_row_counts
Number of rows spanned by each deletion vector.
std::vector< size_t > row_group_offsets
Row index offset for each row group to be read from the Parquet source(s).
std::vector< size_type > row_group_num_rows
Number of rows in each row group to be read from the Parquet source(s).
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Type declarations for libcudf.