deletion_vectors.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/parquet.hpp>
9 #include <cudf/io/types.hpp>
10 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
12 
13 #include <queue>
14 
15 namespace CUDF_EXPORT cudf {
16 namespace io::parquet::experimental {
17 
29  // The following vectors specify the data spans of the input deletion vectors and the number of
30  // rows spanned by each deletion vector, in order. Deletion vectors are applied in the order in
31  // which they appear in the vectors. If these vectors are empty, the table is not filtered.
32 
34  std::vector<cudf::host_span<cuda::std::byte const>> serialized_roaring_bitmaps;
36  std::vector<size_type> deletion_vector_row_counts;
37 
38  // The following vectors customize the row index column prepended to the table read from the
39  // Parquet source(s). If these vectors are empty, the index column is a sequence from 0 to the
40  // total number of rows in the table.
41 
43  std::vector<size_t> row_group_offsets;
45  std::vector<size_type> row_group_num_rows;
46 };
47 
62  public:
64  struct roaring_bitmap_impl;
65 
79  std::size_t chunk_read_limit,
80  parquet_reader_options const& options,
84 
106  std::size_t chunk_read_limit,
107  std::size_t pass_read_limit,
108  parquet_reader_options const& options,
112 
118 
124  [[nodiscard]] bool has_next() const;
125 
139 
140  private:
141  std::unique_ptr<cudf::io::chunked_parquet_reader> _reader;
142  std::queue<size_t> _row_group_row_offsets;
143  std::queue<size_type> _row_group_row_counts;
144  std::queue<roaring_bitmap_impl> _deletion_vectors;
145  std::queue<size_type> _deletion_vector_row_counts;
146  size_t _start_row;
147  bool _is_unspecified_row_group_data;
148  rmm::cuda_stream_view _stream;
151 };
152 
176  parquet_reader_options const& options,
180  // end of group
182 
183 } // namespace io::parquet::experimental
184 } // namespace CUDF_EXPORT cudf
The chunked parquet reader class to read a Parquet source iteratively in a series of tables,...
table_with_metadata read_chunk()
Reads a chunk of the table from the Parquet source, prepends an index column to it, and filters the resulta...
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance and the roaring bitmap deletion vector.
bool has_next() const
Check if there is any data in the given source that has not yet been read.
Settings for read_parquet().
Definition: parquet.hpp:66
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_parquet(parquet_reader_options const &options, deletion_vector_info const &deletion_vector_info, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource_ref())
Reads a table from parquet source, prepends an index column to it, deserializes the specified 64-bit ...
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
device_async_resource_ref get_current_device_resource_ref()
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Struct used to specify information about deletion vectors and the index column to the experimental pa...
std::vector< cudf::host_span< cuda::std::byte const > > serialized_roaring_bitmaps
Host spans of 64-bit roaring bitmaps serialized in portable format.
std::vector< size_type > deletion_vector_row_counts
Number of rows spanned by each deletion vector.
std::vector< size_t > row_group_offsets
Row index offset for each row group to be read from the Parquet source(s)
std::vector< size_type > row_group_num_rows
Number of rows in each row group to be read from the Parquet source(s)
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Type declarations for libcudf.