Files
file	avro.hpp

file	csv.hpp

file	deletion_vectors.hpp

file	hybrid_scan.hpp

file	io/json.hpp

file	orc.hpp

file	parquet.hpp

file	byte_range_info.hpp

file	data_chunk_source.hpp

file	multibyte_split.hpp

Classes
class	cudf::io::avro_reader_options
	Settings to use for `read_avro()`. More...

class	cudf::io::avro_reader_options_builder
	Builder to build options for `read_avro()`. More...

class	cudf::io::csv_reader_options
	Settings to use for `read_csv()`. More...

class	cudf::io::csv_reader_options_builder
	Builder to build options for `read_csv()`. More...

class	cudf::io::parquet::experimental::chunked_parquet_reader
	The chunked parquet reader class to read a Parquet source iteratively in a series of tables, chunk by chunk. Each chunk is prepended with a row index column built using the specified row group offsets and row counts. The resultant table chunk is filtered using the supplied serialized roaring64 bitmap deletion vector and returned. More...

class	cudf::io::parquet::experimental::hybrid_scan_reader
	The experimental parquet reader class to optimally read parquet files subject to highly selective filters, called a Hybrid Scan operation. More...

struct	cudf::io::schema_element
	Allows specifying the target types for nested JSON data via json_reader_options' `set_dtypes` method. More...

class	cudf::io::json_reader_options
	Input arguments to the `read_json` interface. More...

class	cudf::io::json_reader_options_builder
	Builds settings to use for `read_json()`. More...

class	cudf::io::orc_reader_options
	Settings to use for `read_orc()`. More...

class	cudf::io::orc_reader_options_builder
	Builds settings to use for `read_orc()`. More...

class	cudf::io::chunked_orc_reader
	The chunked orc reader class to read an ORC file iteratively into a series of tables, chunk by chunk. More...

class	cudf::io::parquet_reader_options
	Settings for `read_parquet()`. More...

class	cudf::io::parquet_reader_options_builder
	Builds parquet_reader_options to use for `read_parquet()`. More...

class	cudf::io::chunked_parquet_reader
	The chunked parquet reader class to read a Parquet file iteratively into a series of tables, chunk by chunk. More...

class	cudf::io::text::byte_range_info
	stores offset and size used to indicate a byte range More...

class	cudf::io::text::device_data_chunk
	A contract guaranteeing stream-ordered memory access to the underlying device data. More...

class	cudf::io::text::data_chunk_reader
	a reader capable of producing views over device memory. More...

class	cudf::io::text::data_chunk_source
	a data source capable of creating a reader which can produce views of the data source in device memory. More...

struct	cudf::io::text::parse_options
	Parsing options for multibyte_split. More...

Enumerations
enum class	cudf::io::parquet::experimental::use_data_page_mask : bool { YES = true , NO = false }
	Whether to compute and use a page mask using the row mask to skip decompression and decoding of the masked pages. More...

enum class	cudf::io::json_recovery_mode_t { cudf::io::FAIL , cudf::io::RECOVER_WITH_NULL }
	Control the error recovery behavior of the json parser. More...

Functions
table_with_metadata	cudf::io::read_avro (avro_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Reads an Avro dataset into a set of columns. More...

table_with_metadata	cudf::io::read_csv (csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Reads a CSV dataset into a set of columns. More...

table_with_metadata	cudf::io::parquet::experimental::read_parquet (parquet_reader_options const &options, cudf::host_span< cuda::std::byte const > serialized_roaring64, cudf::host_span< size_t const > row_group_offsets, cudf::host_span< size_type const > row_group_num_rows, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource_ref())
	Reads a table from parquet source, prepends an index column to it, deserializes the roaring64 deletion vector and applies it to the read table. More...

table_with_metadata	cudf::io::read_json (json_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Reads a JSON dataset into a set of columns. More...

bool	cudf::io::is_supported_read_orc (compression_type compression)
	Check if the compression type is supported for reading ORC files. More...

bool	cudf::io::is_supported_write_orc (compression_type compression)
	Check if the compression type is supported for writing ORC files. More...

table_with_metadata	cudf::io::read_orc (orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Reads an ORC dataset into a set of columns. More...

raw_orc_statistics	cudf::io::read_raw_orc_statistics (source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Reads file-level and stripe-level statistics of ORC dataset. More...

parsed_orc_statistics	cudf::io::read_parsed_orc_statistics (source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Reads file-level and stripe-level statistics of ORC dataset. More...

orc_metadata	cudf::io::read_orc_metadata (source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Reads metadata of ORC dataset. More...

bool	cudf::io::is_supported_read_parquet (compression_type compression)
	Check if the compression type is supported for reading Parquet files. More...

bool	cudf::io::is_supported_write_parquet (compression_type compression)
	Check if the compression type is supported for writing Parquet files. More...

table_with_metadata	cudf::io::read_parquet (parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Reads a Parquet dataset into a set of columns. More...

parquet_metadata	cudf::io::read_parquet_metadata (source_info const &src_info)
	Reads metadata of parquet dataset. More...

std::vector< byte_range_info >	cudf::io::text::create_byte_range_infos_consecutive (int64_t total_bytes, int64_t range_count)
	Create a collection of consecutive ranges between [0, total_bytes). More...

byte_range_info	cudf::io::text::create_byte_range_info_max ()
	Create a byte_range_info which represents as much of a file as possible. Specifically, `[0, numeric_limits<int64_t>::max())`. More...

std::unique_ptr< cudf::column >	cudf::io::text::multibyte_split (data_chunk_source const &source, std::string_view delimiter, parse_options options={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits the source text into a strings column using a multiple byte delimiter. More...

Variables
constexpr size_t	cudf::io::default_stripe_size_bytes = 64 * 1024 * 1024
	64MB default orc stripe size

constexpr size_type	cudf::io::default_stripe_size_rows = 1000000
	1M rows default orc stripe rows

constexpr size_type	cudf::io::default_row_index_stride = 10000
	10K rows default orc row index stride

constexpr size_t	cudf::io::default_row_group_size_bytes
	Infinite bytes per row group. More...

constexpr size_type	cudf::io::default_row_group_size_rows = 1'000'000
	1 million rows per row group

constexpr size_t	cudf::io::default_max_page_size_bytes = 512 * 1024
	512KB per page

constexpr size_type	cudf::io::default_max_page_size_rows = 20000
	20k rows per page

constexpr int32_t	cudf::io::default_column_index_truncate_length = 64
	truncate to 64 bytes

constexpr size_t	cudf::io::default_max_dictionary_size = 1024 * 1024
	1MB dictionary size

constexpr size_type	cudf::io::default_max_page_fragment_size = 5000
	5000 rows per page fragment

Detailed Description

Enumeration Type Documentation

◆ json_recovery_mode_t

enum cudf::io::json_recovery_mode_t

strong

Control the error recovery behavior of the json parser.

Enumerator
FAIL	Does not recover from an error when encountering an invalid format.
RECOVER_WITH_NULL	Recovers from an error, replacing invalid records with null.

Definition at line 57 of file io/json.hpp.

◆ use_data_page_mask

enum cudf::io::parquet::experimental::use_data_page_mask : bool

strong

Whether to compute and use a page mask using the row mask to skip decompression and decoding of the masked pages.

Enumerator
YES	Compute and use a data page mask.
NO	Do not compute or use a data page mask.

Definition at line 46 of file hybrid_scan.hpp.

Function Documentation

◆ create_byte_range_info_max()

byte_range_info cudf::io::text::create_byte_range_info_max ( )

Create a byte_range_info which represents as much of a file as possible. Specifically, [0, numeric_limits<int64_t>::max()).

Returns: Byte range info of size [0, numeric_limits<int64_t>::max())

◆ create_byte_range_infos_consecutive()

std::vector<byte_range_info> cudf::io::text::create_byte_range_infos_consecutive	(	int64_t	total_bytes,
		int64_t	range_count
	)

Create a collection of consecutive ranges between [0, total_bytes).

Each range will be the same size except if total_bytes is not evenly divisible by range_count, in which case the last range size will be the remainder.

Parameters

total_bytes	total number of bytes in all ranges
range_count	total number of ranges in which to divide bytes

Returns: Vector of range objects

◆ is_supported_read_orc()

bool cudf::io::is_supported_read_orc ( compression_type compression )

Check if the compression type is supported for reading ORC files.

Note: This is a runtime check. Some compression types may not be supported because of the current system configuration.

Parameters

compression Compression type

Returns: Boolean indicating if the compression type is supported

◆ is_supported_read_parquet()

bool cudf::io::is_supported_read_parquet ( compression_type compression )

Check if the compression type is supported for reading Parquet files.

Note: This is a runtime check. Some compression types may not be supported because of the current system configuration.

Parameters

compression Compression type

Returns: Boolean indicating if the compression type is supported

◆ is_supported_write_orc()

bool cudf::io::is_supported_write_orc ( compression_type compression )

Check if the compression type is supported for writing ORC files.

Note: This is a runtime check. Some compression types may not be supported because of the current system configuration.

Parameters

compression Compression type

Returns: Boolean indicating if the compression type is supported

◆ is_supported_write_parquet()

bool cudf::io::is_supported_write_parquet ( compression_type compression )

Check if the compression type is supported for writing Parquet files.

Note: This is a runtime check. Some compression types may not be supported because of the current system configuration.

Parameters

compression Compression type

Returns: Boolean indicating if the compression type is supported

◆ multibyte_split()

std::unique_ptr<cudf::column> cudf::io::text::multibyte_split	(	data_chunk_source const &	source,
		std::string_view	delimiter,
		parse_options	options = `{}`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits the source text into a strings column using a multiple byte delimiter.

Providing a byte range allows multibyte_split to read a file partially, only returning the offsets of delimiters which begin within the range. If thinking in terms of "records", where each delimiter dictates the end of a record, all records which begin within the byte range provided will be returned, including any record which may begin in the range but end outside of the range. Records which begin outside of the range will be ignored, even if those records end inside the range.

Examples:
 source:     "abc..def..ghi..jkl.."
 delimiter:  ".."
 
 byte_range: nullopt
 return:     ["abc..", "def..", "ghi..", "jkl..", ""]
 
 byte_range: [0, 2)
 return:     ["abc.."]
 
 byte_range: [2, 9)
 return:     ["def..", "ghi.."]
 
 byte_range: [11, 2)
 return:     []
 
 byte_range: [13, 7)
 return:     ["jkl..", ""]

Parameters

source	The source string
delimiter	UTF-8 encoded string for which to find offsets in the source
options	the parsing options to use (including byte range)
stream	CUDA stream used for device memory operations and kernel launches
mr	Memory resource to use for the device memory allocation

Returns: The strings found by splitting the source by the delimiter within the relevant byte range.

◆ read_avro()

table_with_metadata cudf::io::read_avro	(	avro_reader_options const &	options,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Reads an Avro dataset into a set of columns.

The following code snippet demonstrates how to read a dataset from a file:

auto source  = cudf::io::source_info("dataset.avro");
auto options = cudf::io::avro_reader_options::builder(source);
auto result  = cudf::io::read_avro(options);

Parameters

options	Settings for controlling reading behavior
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the table in the returned table_with_metadata

Returns: The set of columns along with metadata

◆ read_csv()

table_with_metadata cudf::io::read_csv	(	csv_reader_options	options,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Reads a CSV dataset into a set of columns.

The following code snippet demonstrates how to read a dataset from a file:

auto source  = cudf::io::source_info("dataset.csv");
auto options = cudf::io::csv_reader_options::builder(source);
auto result  = cudf::io::read_csv(options);

Parameters

options	Settings for controlling reading behavior
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the table in the returned table_with_metadata

Returns: The set of columns along with metadata

◆ read_json()

table_with_metadata cudf::io::read_json	(	json_reader_options	options,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Reads a JSON dataset into a set of columns.

The following code snippet demonstrates how to read a dataset from a file:

auto source  = cudf::io::source_info("dataset.json");
auto options = cudf::io::json_reader_options::builder(source);
auto result  = cudf::io::read_json(options);

Parameters

options	Settings for controlling reading behavior
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the table in the returned table_with_metadata.

Returns: The set of columns along with metadata

◆ read_orc()

table_with_metadata cudf::io::read_orc	(	orc_reader_options const &	options,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Reads an ORC dataset into a set of columns.

The following code snippet demonstrates how to read a dataset from a file:

auto source  = cudf::io::source_info("dataset.orc");
auto options = cudf::io::orc_reader_options::builder(source);
auto result  = cudf::io::read_orc(options);

Parameters

options	Settings for controlling reading behavior
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the table in the returned table_with_metadata.

Returns: The set of columns

◆ read_orc_metadata()

orc_metadata cudf::io::read_orc_metadata	(	source_info const &	src_info,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Reads metadata of ORC dataset.

Parameters

src_info	Dataset source
stream	CUDA stream used for device memory operations and kernel launches

Returns: orc_metadata with ORC schema, number of rows and number of stripes.

◆ read_parquet() [1/2]

table_with_metadata cudf::io::parquet::experimental::read_parquet	(	parquet_reader_options const &	options,
		cudf::host_span< cuda::std::byte const >	serialized_roaring64,
		cudf::host_span< size_t const >	row_group_offsets,
		cudf::host_span< size_type const >	row_group_num_rows,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `rmm::mr::get_current_device_resource_ref()`
	)

Reads a table from parquet source, prepends an index column to it, deserializes the roaring64 deletion vector and applies it to the read table.

Reads a table from a parquet source, builds a row index column for the table using the specified row group offsets and row counts, prepends it to the table, then deserializes the specified roaring64 deletion vector and applies it to the read table. If the row group offsets and row counts are empty, the index column is simply a sequence of UINT64 from 0 to the total number of rows in the table. If the serialized roaring64 bitmap span is empty, the read table (prepended with the index column) is returned as is.

Parameters

options	Parquet reader options
serialized_roaring64	Host span of `portable` serialized 64-bit roaring bitmap
row_group_offsets	Host span of row index offsets for each row group
row_group_num_rows	Host span of number of rows in each row group
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the returned table

Returns: Read table with a prepended index column filtered using the deletion vector, along with its metadata

◆ read_parquet() [2/2]

table_with_metadata cudf::io::read_parquet	(	parquet_reader_options const &	options,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Reads a Parquet dataset into a set of columns.

The following code snippet demonstrates how to read a dataset from a file:

auto source  = cudf::io::source_info("dataset.parquet");
auto options = cudf::io::parquet_reader_options::builder(source);
auto result  = cudf::io::read_parquet(options);

Parameters

options	Settings for controlling reading behavior
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate device memory of the table in the returned table_with_metadata

Returns: The set of columns along with metadata

◆ read_parquet_metadata()

parquet_metadata cudf::io::read_parquet_metadata ( source_info const & src_info )

Reads metadata of parquet dataset.

Parameters

src_info Dataset source

Returns: parquet_metadata with parquet schema, number of rows, number of row groups and key-value metadata

◆ read_parsed_orc_statistics()

parsed_orc_statistics cudf::io::read_parsed_orc_statistics	(	source_info const &	src_info,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Reads file-level and stripe-level statistics of ORC dataset.

Parameters

src_info	Dataset source
stream	CUDA stream used for device memory operations and kernel launches

Returns: Column names and decoded ORC statistics

◆ read_raw_orc_statistics()

raw_orc_statistics cudf::io::read_raw_orc_statistics	(	source_info const &	src_info,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Reads file-level and stripe-level statistics of ORC dataset.

The following code snippet demonstrates how to read statistics of a dataset from a file:

auto result = cudf::io::read_raw_orc_statistics(cudf::io::source_info("dataset.orc"));

cudf::io::read_raw_orc_statistics

raw_orc_statistics read_raw_orc_statistics(source_info const &src_info, rmm::cuda_stream_view stream=cudf::get_default_stream())

Reads file-level and stripe-level statistics of ORC dataset.

Parameters

src_info	Dataset source
stream	CUDA stream used for device memory operations and kernel launches

Returns: Column names and encoded ORC statistics

Variable Documentation

◆ default_row_group_size_bytes

constexpr size_t cudf::io::default_row_group_size_bytes

constexpr

Initial value:

=

std::numeric_limits<size_t>::max()

Infinite bytes per row group.

Definition at line 30 of file parquet.hpp.

Files

Classes

Enumerations

Functions

Variables

Detailed Description

Enumeration Type Documentation

◆ json_recovery_mode_t

◆ use_data_page_mask

Function Documentation

◆ create_byte_range_info_max()

◆ create_byte_range_infos_consecutive()

◆ is_supported_read_orc()

◆ is_supported_read_parquet()

◆ is_supported_write_orc()

◆ is_supported_write_parquet()

◆ multibyte_split()

◆ read_avro()

◆ read_csv()

◆ read_json()

◆ read_orc()

◆ read_orc_metadata()

◆ read_parquet() [1/2]

◆ read_parquet() [2/2]

◆ read_parquet_metadata()

◆ read_parsed_orc_statistics()

◆ read_raw_orc_statistics()

Variable Documentation

◆ default_row_group_size_bytes