deletion_vectors.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/parquet.hpp>
9 #include <cudf/io/types.hpp>
10 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
12 
13 #include <queue>
14 
15 namespace CUDF_EXPORT cudf {
16 namespace io::parquet::experimental {
17 
38  public:
40  struct roaring_bitmap_impl;
41 
57  std::size_t chunk_read_limit,
58  parquet_reader_options const& options,
59  cudf::host_span<cuda::std::byte const> serialized_roaring_bitmap,
60  cudf::host_span<size_t const> row_group_offsets,
61  cudf::host_span<size_type const> row_group_num_rows,
64 
88  std::size_t chunk_read_limit,
89  std::size_t pass_read_limit,
90  parquet_reader_options const& options,
91  cudf::host_span<cuda::std::byte const> serialized_roaring_bitmap,
92  cudf::host_span<size_t const> row_group_offsets,
93  cudf::host_span<size_type const> row_group_num_rows,
96 
114  std::size_t chunk_read_limit,
115  parquet_reader_options const& options,
116  cudf::host_span<cudf::host_span<cuda::std::byte const> const> serialized_roaring_bitmaps,
117  cudf::host_span<size_type const> deletion_vector_row_counts,
118  cudf::host_span<size_t const> row_group_offsets,
119  cudf::host_span<size_type const> row_group_num_rows,
122 
148  std::size_t chunk_read_limit,
149  std::size_t pass_read_limit,
150  parquet_reader_options const& options,
151  cudf::host_span<cudf::host_span<cuda::std::byte const> const> serialized_roaring_bitmaps,
152  cudf::host_span<size_type const> deletion_vector_row_counts,
153  cudf::host_span<size_t const> row_group_offsets,
154  cudf::host_span<size_type const> row_group_num_rows,
157 
163 
169  [[nodiscard]] bool has_next() const;
170 
184 
185  private:
186  std::unique_ptr<cudf::io::chunked_parquet_reader> _reader;
187  std::queue<size_t> _row_group_row_offsets;
188  std::queue<size_type> _row_group_row_counts;
189  std::queue<roaring_bitmap_impl> _deletion_vectors;
190  std::queue<size_type> _deletion_vector_row_counts;
191  size_t _start_row;
192  bool _is_unspecified_row_group_data;
193  rmm::cuda_stream_view _stream;
196 };
197 
222  parquet_reader_options const& options,
223  cudf::host_span<cuda::std::byte const> serialized_roaring_bitmap,
224  cudf::host_span<size_t const> row_group_offsets,
225  cudf::host_span<size_type const> row_group_num_rows,
228 
256  parquet_reader_options const& options,
257  cudf::host_span<cudf::host_span<cuda::std::byte const> const> serialized_roaring_bitmaps,
258  cudf::host_span<size_type const> deletion_vector_row_counts,
259  cudf::host_span<size_t const> row_group_offsets,
260  cudf::host_span<size_type const> row_group_num_rows,
263  // end of group
265 
266 } // namespace io::parquet::experimental
267 } // namespace CUDF_EXPORT cudf
The chunked parquet reader class to read a Parquet source iteratively in a series of tables,...
table_with_metadata read_chunk()
Read a chunk of table from the Parquet source, prepend an index column to it, and filter the resulta...
~chunked_parquet_reader()
Destructor, destroying the internal reader instance and the roaring bitmap deletion vector.
bool has_next() const
Check if there is any data in the given source that has not yet been read.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, cudf::host_span< cudf::host_span< cuda::std::byte const > const > serialized_roaring_bitmaps, cudf::host_span< size_type const > deletion_vector_row_counts, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, cudf::host_span< cudf::host_span< cuda::std::byte const > const > serialized_roaring_bitmaps, cudf::host_span< size_type const > deletion_vector_row_counts, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, cudf::host_span< cuda::std::byte const > serialized_roaring_bitmap, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, cudf::host_span< cuda::std::byte const > serialized_roaring_bitmap, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Constructor for the chunked reader.
Settings for read_parquet().
Definition: parquet.hpp:66
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_parquet(parquet_reader_options const &options, cudf::host_span< cudf::host_span< cuda::std::byte const > const > serialized_roaring_bitmaps, cudf::host_span< size_type const > deletion_vector_row_counts, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource_ref())
Reads a table from a Parquet source, prepends an index column to it, deserializes the specified 64-bit ...
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
device_async_resource_ref get_current_device_resource_ref()
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
C++20 std::span with reduced feature set.
Definition: span.hpp:182
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Type declarations for libcudf.