IO Utilities#
- group IO Utilities
Functions
-
std::size_t metadata_size_hint()#
Returns the Parquet reader’s footer speculative read size in bytes.
Controlled by the
LIBCUDF_PARQUET_METADATA_SIZE_HINT environment variable. Defaults to 64 KiB.
When the footer is smaller than the speculative read size, the footer metadata is loaded in a single read, which is especially useful for high-latency, remote storage systems. When the footer is larger than the speculative read size, the footer metadata will be loaded in two reads.
Set
LIBCUDF_PARQUET_METADATA_SIZE_HINT=0 to disable speculative reads.
- Returns:
Number of bytes to speculatively read from the end of the source.
Fetches a host buffer of Parquet footer bytes from the input data source.
- Parameters:
datasource – Input data source
- Returns:
Host buffer containing footer bytes
Fetches host buffers of Parquet footer bytes from multiple input data sources.
- Parameters:
datasources – Input data sources
- Throws:
cudf::logic_error – if any datasource contains a corrupted Parquet magic number, header or footer, or has an invalid footer length.
- Returns:
Vector of host buffers containing footer bytes, one per datasource
-
std::unique_ptr<cudf::io::datasource::buffer> fetch_page_index_to_host(cudf::io::datasource &datasource, byte_range_info const page_index_bytes)#
Fetches a host buffer of Parquet page index from the input data source.
- Parameters:
datasource – Input datasource
page_index_bytes – Byte range of page index
- Returns:
Host buffer containing page index bytes
-
std::vector<std::unique_ptr<cudf::io::datasource::buffer>> fetch_page_indexes_to_host(cudf::host_span<std::reference_wrapper<cudf::io::datasource> const> datasources, cudf::host_span<byte_range_info const> page_index_bytes_per_source)#
Fetches host buffers of Parquet page index bytes from multiple input data sources.
- Parameters:
datasources – Input datasources
page_index_bytes_per_source – Byte ranges of page index, one per datasource
- Throws:
cudf::logic_error – if the number of datasources does not match the number of page index byte ranges
std::out_of_range – if any page index byte range is out of range for its datasource
- Returns:
Vector of host buffers containing page index bytes, one per datasource
-
std::tuple<std::vector<rmm::device_buffer>, std::vector<cudf::device_span<uint8_t const>>, std::future<void>> fetch_byte_ranges_to_device_async(cudf::io::datasource &datasource, cudf::host_span<byte_range_info const> byte_ranges, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)#
Fetches a list of byte ranges from a datasource into device buffers.
- Parameters:
datasource – Input datasource
byte_ranges – Byte ranges to fetch
stream – CUDA stream
mr – Device memory resource
- Returns:
A tuple containing the device buffers, the device spans of the fetched data, and a future to wait on the read tasks
-
std::tuple<std::vector<rmm::device_buffer>, std::vector<std::vector<cudf::device_span<uint8_t const>>>, std::future<void>> fetch_byte_ranges_to_device_async(cudf::host_span<std::reference_wrapper<cudf::io::datasource> const> datasources, cudf::host_span<std::vector<byte_range_info> const> byte_ranges_per_source, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)#
Fetches lists of byte ranges from multiple datasources into device buffers.
- Parameters:
datasources – Input datasources
byte_ranges_per_source – Vector of byte ranges to fetch, one per datasource
stream – CUDA stream
mr – Device memory resource
- Returns:
A tuple containing a vector of device buffers, a vector of vectors of device spans (one per byte range per datasource), and a future to wait on the read tasks
-
std::size_t metadata_size_hint()#