Io Readers#
- group io_readers
Enums
Functions
-
table_with_metadata read_avro(avro_reader_options const &options, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Reads an Avro dataset into a set of columns.
The following code snippet demonstrates how to read a dataset from a file:
auto source = cudf::io::source_info("dataset.avro"); auto options = cudf::io::avro_reader_options::builder(source); auto result = cudf::io::read_avro(options);
- Parameters:
options – Settings for controlling reading behavior
mr – Device memory resource used to allocate device memory of the table in the returned table_with_metadata
- Returns:
The set of columns along with metadata
-
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Reads a CSV dataset into a set of columns.
The following code snippet demonstrates how to read a dataset from a file:
auto source = cudf::io::source_info("dataset.csv"); auto options = cudf::io::csv_reader_options::builder(source); auto result = cudf::io::read_csv(options);
- Parameters:
options – Settings for controlling reading behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource used to allocate device memory of the table in the returned table_with_metadata
- Returns:
The set of columns along with metadata
-
table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Reads a JSON dataset into a set of columns.
The following code snippet demonstrates how to read a dataset from a file:
auto source = cudf::io::source_info("dataset.json"); auto options = cudf::io::json_reader_options::builder(source); auto result = cudf::io::read_json(options);
- Parameters:
options – Settings for controlling reading behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource used to allocate device memory of the table in the returned table_with_metadata.
- Returns:
The set of columns along with metadata
-
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Reads an ORC dataset into a set of columns.
The following code snippet demonstrates how to read a dataset from a file:
auto source = cudf::io::source_info("dataset.orc"); auto options = cudf::io::orc_reader_options::builder(source); auto result = cudf::io::read_orc(options);
- Parameters:
options – Settings for controlling reading behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource used to allocate device memory of the table in the returned table_with_metadata.
- Returns:
The set of columns
-
raw_orc_statistics read_raw_orc_statistics(source_info const &src_info, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Reads file-level and stripe-level statistics of ORC dataset.
The following code snippet demonstrates how to read statistics of a dataset from a file:
auto result = cudf::io::read_raw_orc_statistics(cudf::io::source_info("dataset.orc"));
- Parameters:
src_info – Dataset source
stream – CUDA stream used for device memory operations and kernel launches
- Returns:
Column names and encoded ORC statistics
-
parsed_orc_statistics read_parsed_orc_statistics(source_info const &src_info, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Reads file-level and stripe-level statistics of ORC dataset.
- Parameters:
src_info – Dataset source
stream – CUDA stream used for device memory operations and kernel launches
- Returns:
Column names and decoded ORC statistics
-
orc_metadata read_orc_metadata(source_info const &src_info, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Reads metadata of ORC dataset.
- Parameters:
src_info – Dataset source
stream – CUDA stream used for device memory operations and kernel launches
- Returns:
orc_metadata with ORC schema, number of rows and number of stripes.
-
table_with_metadata read_parquet(parquet_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Reads a Parquet dataset into a set of columns.
The following code snippet demonstrates how to read a dataset from a file:
auto source = cudf::io::source_info("dataset.parquet"); auto options = cudf::io::parquet_reader_options::builder(source); auto result = cudf::io::read_parquet(options);
- Parameters:
options – Settings for controlling reading behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource used to allocate device memory of the table in the returned table_with_metadata
- Returns:
The set of columns along with metadata
-
parquet_metadata read_parquet_metadata(source_info const &src_info)#
Reads metadata of parquet dataset.
- Parameters:
src_info – Dataset source
- Returns:
parquet_metadata with parquet schema, number of rows, number of row groups and key-value metadata.
-
std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes, int64_t range_count)#
Create a collection of consecutive ranges between [0, total_bytes).
Each range will be the same size except if total_bytes is not evenly divisible by range_count, in which case the last range size will be the remainder.
- Parameters:
total_bytes – total number of bytes in all ranges
range_count – total number of ranges in which to divide bytes
- Returns:
Vector of range objects
-
byte_range_info create_byte_range_info_max()#
Create a byte_range_info which represents as much of a file as possible. Specifically, [0, numeric_limits<int64_t>::max()).
- Returns:
Byte range info of size [0, numeric_limits<int64_t>::max())
-
std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const &source, std::string const &delimiter, parse_options options = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Splits the source text into a strings column using a multiple byte delimiter.
Providing a byte range allows multibyte_split to read a file partially, only returning the offsets of delimiters which begin within the range. If thinking in terms of “records”, where each delimiter dictates the end of a record, all records which begin within the byte range provided will be returned, including any record which may begin in the range but end outside of the range. Records which begin outside of the range will be ignored, even if those records end inside the range.
Examples: source: "abc..def..ghi..jkl.." delimiter: ".." byte_range: nullopt return: ["abc..", "def..", "ghi..", "jkl..", ""] byte_range: [0, 2) return: ["abc.."] byte_range: [2, 9) return: ["def..", "ghi.."] byte_range: [11, 2) return: [] byte_range: [13, 7) return: ["jkl..", ""]
- Parameters:
source – The source string
delimiter – UTF-8 encoded string for which to find offsets in the source
options – the parsing options to use (including byte range)
stream – CUDA stream used for device memory operations and kernel launches
mr – Memory resource to use for the device memory allocation
- Returns:
The strings found by splitting the source by the delimiter within the relevant byte range.
-
std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const &source, std::string const &delimiter, std::optional<byte_range_info> byte_range, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Splits the source text into a strings column using a multiple byte delimiter.
- Deprecated:
Since 24.08
- Parameters:
source – The source input data encoded in UTF-8
delimiter – UTF-8 encoded string for which to find offsets in the source
byte_range – The position and size within source to produce the column from
stream – CUDA stream used for device memory operations and kernel launches
mr – Memory resource to use for the device memory allocation
- Returns:
The strings found by splitting the source by the delimiter within the relevant byte range.
Variables
-
constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024#
64MB default orc stripe size
-
constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024#
128MB per row group
-
constexpr size_t default_max_page_size_bytes = 512 * 1024#
512KB per page
-
constexpr int32_t default_column_index_truncate_length = 64#
truncate to 64 bytes
-
constexpr size_t default_max_dictionary_size = 1024 * 1024#
1MB dictionary size
-
class avro_reader_options#
- #include <avro.hpp>
Settings to use for read_avro().
Public Functions
-
avro_reader_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline source_info const &get_source() const#
Returns source info.
- Returns:
Source info
-
inline std::vector<std::string> get_columns() const#
Returns names of the columns to be read.
- Returns:
Names of the columns to be read
-
inline size_type get_skip_rows() const#
Returns number of rows to skip from the start.
- Returns:
Number of rows to skip from the start
-
inline size_type get_num_rows() const#
Returns number of rows to read.
- Returns:
Number of rows to read
-
inline void set_columns(std::vector<std::string> col_names)#
Set names of the columns to be read.
- Parameters:
col_names – Vector of column names
Public Static Functions
-
static avro_reader_options_builder builder(source_info src)#
create avro_reader_options_builder which will build avro_reader_options.
- Parameters:
src – source information used to read avro file
- Returns:
builder to build reader options
-
avro_reader_options() = default#
-
class avro_reader_options_builder#
- #include <avro.hpp>
Builder to build options for read_avro().
Public Functions
-
avro_reader_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit avro_reader_options_builder(source_info src)#
Constructor from source info.
- Parameters:
src – The source information used to read avro file
-
inline avro_reader_options_builder &columns(std::vector<std::string> col_names)#
Set names of the columns to be read.
- Parameters:
col_names – Vector of column names
- Returns:
this for chaining
-
inline avro_reader_options_builder &skip_rows(size_type val)#
Sets number of rows to skip.
- Parameters:
val – Number of rows to skip from start
- Returns:
this for chaining
-
inline avro_reader_options_builder &num_rows(size_type val)#
Sets number of rows to read.
- Parameters:
val – Number of rows to read after skip
- Returns:
this for chaining
-
inline operator avro_reader_options&&()#
move avro_reader_options member once it’s built.
-
inline avro_reader_options &&build()#
move avro_reader_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
avro_reader_options
object’s r-value reference
-
avro_reader_options_builder() = default#
-
class csv_reader_options#
- #include <csv.hpp>
Settings to use for read_csv().
Public Functions
-
csv_reader_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline source_info const &get_source() const#
Returns source info.
- Returns:
Source info
-
inline compression_type get_compression() const#
Returns compression format of the source.
- Returns:
Compression format of the source
-
inline std::size_t get_byte_range_offset() const#
Returns number of bytes to skip from source start.
- Returns:
Number of bytes to skip from source start
-
inline std::size_t get_byte_range_size() const#
Returns number of bytes to read.
- Returns:
Number of bytes to read
-
inline std::size_t get_byte_range_size_with_padding() const#
Returns number of bytes to read with padding.
- Returns:
Number of bytes to read with padding
-
inline std::size_t get_byte_range_padding() const#
Returns number of bytes to pad when reading.
- Returns:
Number of bytes to pad when reading
-
inline std::vector<std::string> const &get_names() const#
Returns names of the columns.
- Returns:
Names of the columns
-
inline std::string get_prefix() const#
Returns prefix to be used for column ID.
- Returns:
Prefix to be used for column ID
-
inline bool is_enabled_mangle_dupe_cols() const#
Whether to rename duplicate column names.
- Returns:
true
if duplicate column names are renamed
-
inline std::vector<std::string> const &get_use_cols_names() const#
Returns names of the columns to be read.
- Returns:
Names of the columns to be read
-
inline std::vector<int> const &get_use_cols_indexes() const#
Returns indexes of columns to read.
- Returns:
Indexes of columns to read
-
inline size_type get_skiprows() const#
Returns number of rows to skip from start.
- Returns:
Number of rows to skip from start
-
inline size_type get_skipfooter() const#
Returns number of rows to skip from end.
- Returns:
Number of rows to skip from end
-
inline char get_lineterminator() const#
Returns line terminator.
- Returns:
Line terminator
-
inline char get_delimiter() const#
Returns field delimiter.
- Returns:
Field delimiter
-
inline char get_thousands() const#
Returns numeric data thousands separator.
- Returns:
Numeric data thousands separator
-
inline char get_decimal() const#
Returns decimal point character.
- Returns:
Decimal point character
-
inline char get_comment() const#
Returns comment line start character.
- Returns:
Comment line start character
-
inline bool is_enabled_windowslinetermination() const#
Whether to treat \r\n as line terminator.
- Returns:
true if \r\n is treated as line terminator
-
inline bool is_enabled_delim_whitespace() const#
Whether to treat whitespace as field delimiter.
- Returns:
true
if whitespace is treated as field delimiter
-
inline bool is_enabled_skipinitialspace() const#
Whether to skip whitespace after the delimiter.
- Returns:
true
if whitespace is skipped after the delimiter
-
inline bool is_enabled_skip_blank_lines() const#
Whether to ignore empty lines or parse line values as invalid.
- Returns:
true
if empty lines or parse line values are ignored as invalid
-
inline quote_style get_quoting() const#
Returns quoting style.
- Returns:
Quoting style
-
inline char get_quotechar() const#
Returns quoting character.
- Returns:
Quoting character
-
inline bool is_enabled_doublequote() const#
Whether a quote inside a value is double-quoted.
- Returns:
true
if a quote inside a value is double-quoted
-
inline bool is_enabled_detect_whitespace_around_quotes() const#
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote is true.
- Returns:
true
if detect_whitespace_around_quotes is enabled
-
inline std::vector<std::string> const &get_parse_dates_names() const#
Returns names of columns to read as datetime.
- Returns:
Names of columns to read as datetime
-
inline std::vector<int> const &get_parse_dates_indexes() const#
Returns indexes of columns to read as datetime.
- Returns:
Indexes of columns to read as datetime
-
inline std::vector<std::string> const &get_parse_hex_names() const#
Returns names of columns to read as hexadecimal.
- Returns:
Names of columns to read as hexadecimal
-
inline std::vector<int> const &get_parse_hex_indexes() const#
Returns indexes of columns to read as hexadecimal.
- Returns:
Indexes of columns to read as hexadecimal
-
inline std::variant<std::vector<data_type>, std::map<std::string, data_type>> const &get_dtypes() const#
Returns per-column types.
- Returns:
Per-column types
-
inline std::vector<std::string> const &get_true_values() const#
Returns additional values to recognize as boolean true values.
- Returns:
Additional values to recognize as boolean true values
-
inline std::vector<std::string> const &get_false_values() const#
Returns additional values to recognize as boolean false values.
- Returns:
Additional values to recognize as boolean false values
-
inline std::vector<std::string> const &get_na_values() const#
Returns additional values to recognize as null values.
- Returns:
Additional values to recognize as null values
-
inline bool is_enabled_keep_default_na() const#
Whether to keep the built-in default NA values.
- Returns:
true
if the built-in default NA values are kept
-
inline bool is_enabled_na_filter() const#
Whether the null filter is enabled.
- Returns:
true
if null filter is enabled
-
inline bool is_enabled_dayfirst() const#
Whether to parse dates as DD/MM versus MM/DD.
- Returns:
True if dates are parsed as DD/MM, false if MM/DD
-
inline data_type get_timestamp_type() const#
Returns timestamp_type to which all timestamp columns will be cast.
- Returns:
timestamp_type to which all timestamp columns will be cast
-
inline void set_compression(compression_type comp)#
Sets compression format of the source.
- Parameters:
comp – Compression type
-
inline void set_byte_range_offset(std::size_t offset)#
Sets number of bytes to skip from source start.
- Parameters:
offset – Number of bytes of offset
-
inline void set_byte_range_size(std::size_t size)#
Sets number of bytes to read.
- Parameters:
size – Number of bytes to read
-
inline void set_names(std::vector<std::string> col_names)#
Sets names of the columns.
- Parameters:
col_names – Vector of column names
-
inline void set_prefix(std::string pfx)#
Sets prefix to be used for column ID.
- Parameters:
pfx – String used as prefix for each column name
-
inline void enable_mangle_dupe_cols(bool val)#
Sets whether to rename duplicate column names.
- Parameters:
val – Boolean value to enable/disable
-
inline void set_use_cols_names(std::vector<std::string> col_names)#
Sets names of the columns to be read.
- Parameters:
col_names – Vector of column names that are needed
-
inline void set_use_cols_indexes(std::vector<int> col_indices)#
Sets indexes of columns to read.
- Parameters:
col_indices – Vector of column indices that are needed
-
inline void set_nrows(size_type nrows)#
Sets number of rows to read.
- Parameters:
nrows – Number of rows to read
-
inline void set_skiprows(size_type skiprows)#
Sets number of rows to skip from start.
- Parameters:
skiprows – Number of rows to skip
-
inline void set_skipfooter(size_type skipfooter)#
Sets number of rows to skip from end.
- Parameters:
skipfooter – Number of rows to skip
-
inline void set_header(size_type hdr)#
Sets header row index.
- Parameters:
hdr – Index where header row is located
-
inline void set_lineterminator(char term)#
Sets line terminator.
- Parameters:
term – A character to indicate line termination
-
inline void set_delimiter(char delim)#
Sets field delimiter.
- Parameters:
delim – A character to indicate delimiter
-
inline void set_thousands(char val)#
Sets numeric data thousands separator.
- Parameters:
val – A character that separates thousands
-
inline void set_decimal(char val)#
Sets decimal point character.
- Parameters:
val – A character that indicates decimal values
-
inline void set_comment(char val)#
Sets comment line start character.
- Parameters:
val – A character that indicates comment
-
inline void enable_windowslinetermination(bool val)#
Sets whether to treat \r\n as line terminator.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_delim_whitespace(bool val)#
Sets whether to treat whitespace as field delimiter.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_skipinitialspace(bool val)#
Sets whether to skip whitespace after the delimiter.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_skip_blank_lines(bool val)#
Sets whether to ignore empty lines or parse line values as invalid.
- Parameters:
val – Boolean value to enable/disable
-
inline void set_quoting(quote_style quoting)#
Sets the expected quoting style used in the input CSV data.
Note: Only the following quoting styles are supported:
MINIMAL: String columns containing special characters like row-delimiters/ field-delimiter/quotes will be quoted.
NONE: No quoting is done for any columns.
- Parameters:
quoting – Quoting style used
-
inline void set_quotechar(char ch)#
Sets quoting character.
- Parameters:
ch – A character to indicate quoting
-
inline void enable_doublequote(bool val)#
Sets whether a quote inside a value is double-quoted.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_detect_whitespace_around_quotes(bool val)#
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote is true.
- Parameters:
val – Boolean value to enable/disable
-
inline void set_parse_dates(std::vector<std::string> col_names)#
Sets names of columns to read as datetime.
- Parameters:
col_names – Vector of column names to infer as datetime
-
inline void set_parse_dates(std::vector<int> col_indices)#
Sets indexes of columns to read as datetime.
- Parameters:
col_indices – Vector of column indices to infer as datetime
-
inline void set_parse_hex(std::vector<std::string> col_names)#
Sets names of columns to parse as hexadecimal.
- Parameters:
col_names – Vector of column names to parse as hexadecimal
-
inline void set_parse_hex(std::vector<int> col_indices)#
Sets indexes of columns to parse as hexadecimal.
- Parameters:
col_indices – Vector of column indices to parse as hexadecimal
-
inline void set_dtypes(std::map<std::string, data_type> types)#
Sets per-column types.
- Parameters:
types – Column name -> data type map specifying the columns’ target data types
-
inline void set_dtypes(std::vector<data_type> types)#
Sets per-column types.
- Parameters:
types – Vector specifying the columns’ target data types
-
inline void set_true_values(std::vector<std::string> vals)#
Sets additional values to recognize as boolean true values.
- Parameters:
vals – Vector of values to be considered to be
true
-
inline void set_false_values(std::vector<std::string> vals)#
Sets additional values to recognize as boolean false values.
- Parameters:
vals – Vector of values to be considered to be
false
-
inline void set_na_values(std::vector<std::string> vals)#
Sets additional values to recognize as null values.
- Parameters:
vals – Vector of values to be considered to be null
-
inline void enable_keep_default_na(bool val)#
Sets whether to keep the built-in default NA values.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_na_filter(bool val)#
Sets whether to enable the null filter.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_dayfirst(bool val)#
Sets whether to parse dates as DD/MM versus MM/DD.
- Parameters:
val – Boolean value to enable/disable
Public Static Functions
-
static csv_reader_options_builder builder(source_info src)#
Creates a csv_reader_options_builder which will build csv_reader_options.
- Parameters:
src – Source information to read csv file
- Returns:
Builder to build reader options
-
csv_reader_options() = default#
-
class csv_reader_options_builder#
- #include <csv.hpp>
Builder to build options for read_csv().
Public Functions
-
csv_reader_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline csv_reader_options_builder(source_info src)#
Constructor from source info.
- Parameters:
src – The source information used to read csv file
-
inline csv_reader_options_builder &compression(compression_type comp)#
Sets compression format of the source.
- Parameters:
comp – Compression type
- Returns:
this for chaining
-
inline csv_reader_options_builder &byte_range_offset(std::size_t offset)#
Sets number of bytes to skip from source start.
- Parameters:
offset – Number of bytes of offset
- Returns:
this for chaining
-
inline csv_reader_options_builder &byte_range_size(std::size_t size)#
Sets number of bytes to read.
- Parameters:
size – Number of bytes to read
- Returns:
this for chaining
-
inline csv_reader_options_builder &names(std::vector<std::string> col_names)#
Sets names of the columns.
- Parameters:
col_names – Vector of column names
- Returns:
this for chaining
-
inline csv_reader_options_builder &prefix(std::string pfx)#
Sets prefix to be used for column ID.
- Parameters:
pfx – String used as prefix for each column name
- Returns:
this for chaining
-
inline csv_reader_options_builder &mangle_dupe_cols(bool val)#
Sets whether to rename duplicate column names.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &use_cols_names(std::vector<std::string> col_names)#
Sets names of the columns to be read.
- Parameters:
col_names – Vector of column names that are needed
- Returns:
this for chaining
-
inline csv_reader_options_builder &use_cols_indexes(std::vector<int> col_indices)#
Sets indexes of columns to read.
- Parameters:
col_indices – Vector of column indices that are needed
- Returns:
this for chaining
-
inline csv_reader_options_builder &nrows(size_type rows)#
Sets number of rows to read.
- Parameters:
rows – Number of rows to read
- Returns:
this for chaining
-
inline csv_reader_options_builder &skiprows(size_type skip)#
Sets number of rows to skip from start.
- Parameters:
skip – Number of rows to skip
- Returns:
this for chaining
-
inline csv_reader_options_builder &skipfooter(size_type skip)#
Sets number of rows to skip from end.
- Parameters:
skip – Number of rows to skip
- Returns:
this for chaining
-
inline csv_reader_options_builder &header(size_type hdr)#
Sets header row index.
- Parameters:
hdr – Index where header row is located
- Returns:
this for chaining
-
inline csv_reader_options_builder &lineterminator(char term)#
Sets line terminator.
- Parameters:
term – A character to indicate line termination
- Returns:
this for chaining
-
inline csv_reader_options_builder &delimiter(char delim)#
Sets field delimiter.
- Parameters:
delim – A character to indicate delimiter
- Returns:
this for chaining
-
inline csv_reader_options_builder &thousands(char val)#
Sets numeric data thousands separator.
- Parameters:
val – A character that separates thousands
- Returns:
this for chaining
-
inline csv_reader_options_builder &decimal(char val)#
Sets decimal point character.
- Parameters:
val – A character that indicates decimal values
- Returns:
this for chaining
-
inline csv_reader_options_builder &comment(char val)#
Sets comment line start character.
- Parameters:
val – A character that indicates comment
- Returns:
this for chaining
-
inline csv_reader_options_builder &windowslinetermination(bool val)#
Sets whether to treat \r\n as line terminator.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &delim_whitespace(bool val)#
Sets whether to treat whitespace as field delimiter.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &skipinitialspace(bool val)#
Sets whether to skip whitespace after the delimiter.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &skip_blank_lines(bool val)#
Sets whether to ignore empty lines or parse line values as invalid.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &quoting(quote_style style)#
Sets quoting style.
- Parameters:
style – Quoting style used
- Returns:
this for chaining
-
inline csv_reader_options_builder &quotechar(char ch)#
Sets quoting character.
- Parameters:
ch – A character to indicate quoting
- Returns:
this for chaining
-
inline csv_reader_options_builder &doublequote(bool val)#
Sets whether a quote inside a value is double-quoted.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &detect_whitespace_around_quotes(bool val)#
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote is true.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &parse_dates(std::vector<std::string> col_names)#
Sets names of columns to read as datetime.
- Parameters:
col_names – Vector of column names to read as datetime
- Returns:
this for chaining
-
inline csv_reader_options_builder &parse_dates(std::vector<int> col_indices)#
Sets indexes of columns to read as datetime.
- Parameters:
col_indices – Vector of column indices to read as datetime
- Returns:
this for chaining
-
inline csv_reader_options_builder &parse_hex(std::vector<std::string> col_names)#
Sets names of columns to parse as hexadecimal.
- Parameters:
col_names – Vector of column names to parse as hexadecimal
- Returns:
this for chaining
-
inline csv_reader_options_builder &parse_hex(std::vector<int> col_indices)#
Sets indexes of columns to parse as hexadecimal.
- Parameters:
col_indices – Vector of column indices to parse as hexadecimal
- Returns:
this for chaining
-
inline csv_reader_options_builder &dtypes(std::map<std::string, data_type> types)#
Sets per-column types.
- Parameters:
types – Column name -> data type map specifying the columns’ target data types
- Returns:
this for chaining
-
inline csv_reader_options_builder &dtypes(std::vector<data_type> types)#
Sets per-column types.
- Parameters:
types – Vector of data types in which the column needs to be read
- Returns:
this for chaining
-
inline csv_reader_options_builder &true_values(std::vector<std::string> vals)#
Sets additional values to recognize as boolean true values.
- Parameters:
vals – Vector of values to be considered to be
true
- Returns:
this for chaining
-
inline csv_reader_options_builder &false_values(std::vector<std::string> vals)#
Sets additional values to recognize as boolean false values.
- Parameters:
vals – Vector of values to be considered to be
false
- Returns:
this for chaining
-
inline csv_reader_options_builder &na_values(std::vector<std::string> vals)#
Sets additional values to recognize as null values.
- Parameters:
vals – Vector of values to be considered to be null
- Returns:
this for chaining
-
inline csv_reader_options_builder &keep_default_na(bool val)#
Sets whether to keep the built-in default NA values.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &na_filter(bool val)#
Sets whether to enable the null filter.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &dayfirst(bool val)#
Sets whether to parse dates as DD/MM versus MM/DD.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_reader_options_builder &timestamp_type(data_type type)#
Sets timestamp_type to which all timestamp columns will be cast.
- Parameters:
type – Dtype to which all timestamp columns will be cast
- Returns:
this for chaining
-
inline operator csv_reader_options&&()#
move csv_reader_options member once it’s built.
-
inline csv_reader_options &&build()#
move csv_reader_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
csv_reader_options
object’s r-value reference
-
csv_reader_options_builder() = default#
-
struct schema_element#
- #include <json.hpp>
Allows specifying the target types for nested JSON data via json_reader_options’ set_dtypes method.
Public Members
-
std::map<std::string, schema_element> child_types#
Allows specifying this column’s child columns target type.
-
std::map<std::string, schema_element> child_types#
-
class json_reader_options#
- #include <json.hpp>
Input arguments to the read_json interface.
Available parameters are closely patterned after PANDAS’ read_json API. Not all parameters are supported. If the matching PANDAS’ parameter has a default value of None, then a default value of -1 or 0 may be used as the equivalent.
Parameters in PANDAS that are unavailable or handled differently in cudf:
Name
Description
orient
currently fixed-format
typ
data is always returned as a cudf::table
convert_axes
use column functions for axes operations instead
convert_dates
dates are detected automatically
keep_default_dates
dates are detected automatically
numpy
data is always returned as a cudf::table
precise_float
there is only one converter
date_unit
only millisecond units are supported
encoding
only ASCII-encoded data is supported
chunksize
use byte_range_xxx for chunking instead
Public Functions
-
json_reader_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline source_info const &get_source() const#
Returns source info.
- Returns:
Source info
-
inline std::variant<std::vector<data_type>, std::map<std::string, data_type>, std::map<std::string, schema_element>> const &get_dtypes() const#
Returns data types of the columns.
- Returns:
Data types of the columns
-
inline compression_type get_compression() const#
Returns compression format of the source.
- Returns:
Compression format of the source
-
inline size_t get_byte_range_offset() const#
Returns number of bytes to skip from source start.
- Returns:
Number of bytes to skip from source start
-
inline size_t get_byte_range_size() const#
Returns number of bytes to read.
- Returns:
Number of bytes to read
-
inline size_t get_byte_range_size_with_padding() const#
Returns number of bytes to read with padding.
- Returns:
Number of bytes to read with padding
-
inline size_t get_byte_range_padding() const#
Returns number of bytes to pad when reading.
- Returns:
Number of bytes to pad
-
inline char get_delimiter() const#
Returns delimiter separating records in JSON lines.
- Returns:
Delimiter separating records in JSON lines
-
inline bool is_enabled_lines() const#
Whether to read the file as a json object per line.
- Returns:
true
if reading the file as a json object per line
-
inline bool is_enabled_mixed_types_as_string() const#
Whether to parse mixed types as a string column.
- Returns:
true
if mixed types are parsed as a string column
-
inline bool is_enabled_prune_columns() const#
Whether to prune columns on read, selected based on the set_dtypes option.
When set as true, if the reader options include set_dtypes, then the reader will only return those columns which are mentioned in set_dtypes. If false, then all columns are returned, independent of the set_dtypes setting.
- Returns:
True if column pruning is enabled
-
inline bool is_enabled_dayfirst() const#
Whether to parse dates as DD/MM versus MM/DD.
- Returns:
true if dates are parsed as DD/MM, false if MM/DD
-
inline bool is_enabled_keep_quotes() const#
Whether the reader should keep quotes of string values.
- Returns:
true if the reader should keep quotes, false otherwise
-
inline bool is_enabled_normalize_single_quotes() const#
Whether the reader should normalize single quotes around strings.
- Returns:
true if the reader should normalize single quotes, false otherwise
-
inline bool is_enabled_normalize_whitespace() const#
Whether the reader should normalize unquoted whitespace characters.
- Returns:
true if the reader should normalize whitespace, false otherwise
-
inline json_recovery_mode_t recovery_mode() const#
Queries the JSON reader’s behavior on invalid JSON lines.
- Returns:
An enum that specifies the JSON reader’s behavior on invalid JSON lines.
-
inline void set_dtypes(std::vector<data_type> types)#
Set data types for columns to be read.
- Parameters:
types – Vector of dtypes
-
inline void set_dtypes(std::map<std::string, data_type> types)#
Set data types for columns to be read.
- Parameters:
types – Map of column names to dtypes
-
inline void set_dtypes(std::map<std::string, schema_element> types)#
Set data types for a potentially nested column hierarchy.
- Parameters:
types – Map of column names to schema_element to support arbitrary nesting of data types
-
inline void set_compression(compression_type comp_type)#
Set the compression type.
- Parameters:
comp_type – The compression type used
-
inline void set_byte_range_offset(size_t offset)#
Set number of bytes to skip from source start.
- Parameters:
offset – Number of bytes of offset
-
inline void set_byte_range_size(size_t size)#
Set number of bytes to read.
- Parameters:
size – Number of bytes to read
-
inline void set_delimiter(char delimiter)#
Set delimiter separating records in JSON lines.
- Parameters:
delimiter – Delimiter separating records in JSON lines
-
inline void enable_lines(bool val)#
Set whether to read the file as a json object per line.
- Parameters:
val – Boolean value to enable/disable the option to read each line as a json object
-
inline void enable_mixed_types_as_string(bool val)#
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string column using schema.
- Parameters:
val – Boolean value to enable/disable parsing mixed types as a string column
-
inline void enable_prune_columns(bool val)#
Set whether to prune columns on read, selected based on the set_dtypes option.
When set as true, if the reader options include set_dtypes, then the reader will only return those columns which are mentioned in set_dtypes. If false, then all columns are returned, independent of the set_dtypes setting.
- Parameters:
val – Boolean value to enable/disable column pruning
-
inline void enable_dayfirst(bool val)#
Set whether to parse dates as DD/MM versus MM/DD.
- Parameters:
val – Boolean value to enable/disable day first parsing format
-
inline void enable_keep_quotes(bool val)#
Set whether the reader should keep quotes of string values.
- Parameters:
val – Boolean value to indicate whether the reader should keep quotes of string values
-
inline void enable_normalize_single_quotes(bool val)#
Set whether the reader should enable normalization of single quotes around strings.
- Parameters:
val – Boolean value to indicate whether the reader should normalize single quotes around strings
-
inline void enable_normalize_whitespace(bool val)#
Set whether the reader should enable normalization of unquoted whitespace.
- Parameters:
val – Boolean value to indicate whether the reader should normalize unquoted whitespace characters i.e. tabs and spaces
-
inline void set_recovery_mode(json_recovery_mode_t val)#
Specifies the JSON reader’s behavior on invalid JSON lines.
- Parameters:
val – An enum value to indicate the JSON reader’s behavior on invalid JSON lines.
Public Static Functions
-
static json_reader_options_builder builder(source_info src)#
create json_reader_options_builder which will build json_reader_options.
- Parameters:
src – source information used to read json file
- Returns:
builder to build the options
-
json_reader_options() = default#
-
class json_reader_options_builder#
- #include <json.hpp>
Builds settings to use for read_json().
Public Functions
-
explicit json_reader_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit json_reader_options_builder(source_info src)#
Constructor from source info.
- Parameters:
src – The source information used to read json file
-
inline json_reader_options_builder &dtypes(std::vector<data_type> types)#
Set data types for columns to be read.
- Parameters:
types – Vector of dtypes
- Returns:
this for chaining
-
inline json_reader_options_builder &dtypes(std::map<std::string, data_type> types)#
Set data types for columns to be read.
- Parameters:
types – Column name -> dtype map
- Returns:
this for chaining
-
inline json_reader_options_builder &dtypes(std::map<std::string, schema_element> types)#
Set data types for columns to be read.
- Parameters:
types – Column name -> schema_element map
- Returns:
this for chaining
-
inline json_reader_options_builder &compression(compression_type comp_type)#
Set the compression type.
- Parameters:
comp_type – The compression type used
- Returns:
this for chaining
-
inline json_reader_options_builder &byte_range_offset(size_type offset)#
Set number of bytes to skip from source start.
- Parameters:
offset – Number of bytes of offset
- Returns:
this for chaining
-
inline json_reader_options_builder &byte_range_size(size_type size)#
Set number of bytes to read.
- Parameters:
size – Number of bytes to read
- Returns:
this for chaining
-
inline json_reader_options_builder &delimiter(char delimiter)#
Set delimiter separating records in JSON lines.
- Parameters:
delimiter – Delimiter separating records in JSON lines
- Returns:
this for chaining
-
inline json_reader_options_builder &lines(bool val)#
Set whether to read the file as a json object per line.
- Parameters:
val – Boolean value to enable/disable the option to read each line as a json object
- Returns:
this for chaining
-
inline json_reader_options_builder &mixed_types_as_string(bool val)#
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string column using schema.
- Parameters:
val – Boolean value to enable/disable parsing mixed types as a string column
- Returns:
this for chaining
-
inline json_reader_options_builder &prune_columns(bool val)#
Set whether to prune columns on read, selected based on the dtypes option.
When set as true, if the reader options include dtypes, then the reader will only return those columns which are mentioned in dtypes. If false, then all columns are returned, independent of the dtypes setting.
- Parameters:
val – Boolean value to enable/disable column pruning
- Returns:
this for chaining
-
inline json_reader_options_builder &dayfirst(bool val)#
Set whether to parse dates as DD/MM versus MM/DD.
- Parameters:
val – Boolean value to enable/disable day first parsing format
- Returns:
this for chaining
-
inline json_reader_options_builder &keep_quotes(bool val)#
Set whether the reader should keep quotes of string values.
- Parameters:
val – Boolean value to indicate whether the reader should keep quotes of string values
- Returns:
this for chaining
-
inline json_reader_options_builder &normalize_single_quotes(bool val)#
Set whether the reader should normalize single quotes around strings.
- Parameters:
val – Boolean value to indicate whether the reader should normalize single quotes of strings
- Returns:
this for chaining
-
inline json_reader_options_builder &normalize_whitespace(bool val)#
Set whether the reader should normalize unquoted whitespace.
- Parameters:
val – Boolean value to indicate whether the reader should normalize unquoted whitespace
- Returns:
this for chaining
-
inline json_reader_options_builder &recovery_mode(json_recovery_mode_t val)#
Specifies the JSON reader’s behavior on invalid JSON lines.
- Parameters:
val – An enum value to indicate the JSON reader’s behavior on invalid JSON lines.
- Returns:
this for chaining
-
inline operator json_reader_options&&()#
move json_reader_options member once it’s built.
-
inline json_reader_options &&build()#
move json_reader_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
json_reader_options
object r-value reference
-
explicit json_reader_options_builder() = default#
-
class orc_reader_options#
- #include <orc.hpp>
Settings to use for read_orc().
Public Functions
-
orc_reader_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline source_info const &get_source() const#
Returns source info.
- Returns:
Source info
-
inline auto const &get_columns() const#
Returns names of the columns to read, if set.
- Returns:
Names of the columns to read;
nullopt
if the option is not set
-
inline auto const &get_stripes() const#
Returns vector of vectors, stripes to read for each input source.
- Returns:
Vector of vectors, stripes to read for each input source
-
inline int64_t get_skip_rows() const#
Returns number of rows to skip from the start.
- Returns:
Number of rows to skip from the start
-
inline std::optional<int64_t> const &get_num_rows() const#
Returns number of rows to read.
- Returns:
Number of rows to read;
nullopt
if the option hasn’t been set (in which case the file is read until the end)
-
inline bool is_enabled_use_index() const#
Whether to use row index to speed-up reading.
- Returns:
true
if row index is used to speed-up reading
-
inline bool is_enabled_use_np_dtypes() const#
Whether to use numpy-compatible dtypes.
- Returns:
true
if numpy-compatible dtypes are used
-
inline data_type get_timestamp_type() const#
Returns timestamp type to which timestamp column will be cast.
- Returns:
Timestamp type to which timestamp column will be cast
-
inline std::vector<std::string> const &get_decimal128_columns() const#
Returns fully qualified names of columns that should be read as 128-bit Decimal.
- Returns:
Fully qualified names of columns that should be read as 128-bit Decimal
-
inline void set_columns(std::vector<std::string> col_names)#
Sets names of the columns to read.
- Parameters:
col_names – Vector of column names
-
inline void set_stripes(std::vector<std::vector<size_type>> stripes)#
Sets list of stripes to read for each input source.
- Parameters:
stripes – Vector of vectors, mapping stripes to read to input sources
- Throws:
cudf::logic_error – if a non-empty vector is passed, and skip_rows has been previously set
cudf::logic_error – if a non-empty vector is passed, and num_rows has been previously set
-
inline void set_skip_rows(int64_t rows)#
Sets number of rows to skip from the start.
- Parameters:
rows – Number of rows
- Throws:
cudf::logic_error – if a negative value is passed
cudf::logic_error – if stripes have been previously set
-
inline void set_num_rows(int64_t nrows)#
Sets number of rows to read.
- Parameters:
nrows – Number of rows
- Throws:
cudf::logic_error – if a negative value is passed
cudf::logic_error – if stripes have been previously set
-
inline void enable_use_index(bool use)#
Enable/Disable use of row index to speed-up reading.
- Parameters:
use – Boolean value to enable/disable row index use
-
inline void enable_use_np_dtypes(bool use)#
Enable/Disable use of numpy-compatible dtypes.
- Parameters:
use – Boolean value to enable/disable
-
inline void set_timestamp_type(data_type type)#
Sets timestamp type to which timestamp column will be cast.
- Parameters:
type – Type of timestamp
-
inline void set_decimal128_columns(std::vector<std::string> val)#
Set columns that should be read as 128-bit Decimal.
- Parameters:
val – Vector of fully qualified column names
Public Static Functions
-
static orc_reader_options_builder builder(source_info src)#
Creates orc_reader_options_builder which will build orc_reader_options.
- Parameters:
src – Source information to read orc file
- Returns:
Builder to build reader options
-
orc_reader_options() = default#
-
class orc_reader_options_builder#
- #include <orc.hpp>
Builds settings to use for read_orc().
Public Functions
-
explicit orc_reader_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit orc_reader_options_builder(source_info src)#
Constructor from source info.
- Parameters:
src – The source information used to read orc file
-
inline orc_reader_options_builder &columns(std::vector<std::string> col_names)#
Sets names of the columns to read.
- Parameters:
col_names – Vector of column names
- Returns:
this for chaining
-
inline orc_reader_options_builder &stripes(std::vector<std::vector<size_type>> stripes)#
Sets list of individual stripes to read per source.
- Parameters:
stripes – Vector of vectors, mapping stripes to read to input sources
- Returns:
this for chaining
-
inline orc_reader_options_builder &skip_rows(int64_t rows)#
Sets number of rows to skip from the start.
- Parameters:
rows – Number of rows
- Returns:
this for chaining
-
inline orc_reader_options_builder &num_rows(int64_t nrows)#
Sets number of rows to read.
- Parameters:
nrows – Number of rows
- Returns:
this for chaining
-
inline orc_reader_options_builder &use_index(bool use)#
Enable/Disable use of row index to speed-up reading.
- Parameters:
use – Boolean value to enable/disable row index use
- Returns:
this for chaining
-
inline orc_reader_options_builder &use_np_dtypes(bool use)#
Enable/Disable use of numpy-compatible dtypes.
- Parameters:
use – Boolean value to enable/disable
- Returns:
this for chaining
-
inline orc_reader_options_builder &timestamp_type(data_type type)#
Sets timestamp type to which timestamp column will be cast.
- Parameters:
type – Type of timestamp
- Returns:
this for chaining
-
inline orc_reader_options_builder &decimal128_columns(std::vector<std::string> val)#
Columns that should be read as 128-bit Decimal.
- Parameters:
val – Vector of column names
- Returns:
this for chaining
-
inline operator orc_reader_options&&()#
move orc_reader_options member once it’s built.
-
inline orc_reader_options &&build()#
move orc_reader_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
orc_reader_options
object’s r-value reference
-
explicit orc_reader_options_builder() = default#
-
class chunked_orc_reader#
- #include <orc.hpp>
The chunked orc reader class to read an ORC file iteratively into a series of tables, chunk by chunk.
This class is designed to address the reading issue when reading very large ORC files such that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the file content by chunks using this class, each chunk is guaranteed to have its size stay within the given limit.
Public Functions
-
chunked_orc_reader()#
Default constructor, this should never be used.
This is added just to satisfy cython.
-
explicit chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Construct the reader from input/output size limits, output row granularity, along with other ORC reader options.
The typical usage should be similar to this:
do { auto const chunk = reader.read_chunk(); // Process chunk } while (reader.has_next());
If chunk_read_limit == 0 (i.e., no output limit) and pass_read_limit == 0 (no temporary memory size limit), a call to read_chunk() will read the whole data source and return a table containing all rows.
The chunk_read_limit parameter controls the size of the output table to be returned per read_chunk() call. If the user specifies a 100 MB limit, the reader will attempt to return tables that have a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code will not fail if it cannot satisfy the limit.
The pass_read_limit parameter controls how much temporary memory is used in the entire process of loading, decompressing and decoding of data. Again, this is also a soft limit and the reader will try to make the best effort.
Finally, the parameter output_row_granularity controls the changes in row number of the output chunk. For each call to read_chunk(), with respect to the given pass_read_limit, a subset of stripes may be loaded, decompressed and decoded into an intermediate table. The reader will then subdivide that table into smaller tables for final output using output_row_granularity as the subdivision step.
- Parameters:
chunk_read_limit – Limit on total number of bytes to be returned per read_chunk() call, or 0 if there is no limit
pass_read_limit – Limit on temporary memory usage for reading the data sources, or 0 if there is no limit
output_row_granularity – The granularity parameter used for subdividing the decoded table for final output
options – Settings for controlling reading behaviors
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
- Throws:
cudf::logic_error – if
output_row_granularity
is non-positive
-
explicit chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Construct the reader from input/output size limits along with other ORC reader options.
This constructor implicitly calls the other constructor with output_row_granularity set to DEFAULT_OUTPUT_ROW_GRANULARITY rows.
- Parameters:
chunk_read_limit – Limit on total number of bytes to be returned per read_chunk() call, or 0 if there is no limit
pass_read_limit – Limit on temporary memory usage for reading the data sources, or 0 if there is no limit
options – Settings for controlling reading behaviors
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
explicit chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Construct the reader from output size limits along with other ORC reader options.
This constructor implicitly calls the other constructor with pass_read_limit set to 0 and output_row_granularity set to DEFAULT_OUTPUT_ROW_GRANULARITY rows.
- Parameters:
chunk_read_limit – Limit on total number of bytes to be returned per read_chunk() call, or 0 if there is no limit
options – Settings for controlling reading behaviors
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
~chunked_orc_reader()#
Destructor, destroying the internal reader instance.
-
bool has_next() const#
Check if there is any data in the given data sources that has not yet been read.
- Returns:
A boolean value indicating if there is any data left to read
-
table_with_metadata read_chunk() const#
Read a chunk of rows in the given data sources.
The sequence of returned tables, if concatenated in order, is guaranteed to form the same complete dataset as reading the entire given data sources at once.
An empty table will be returned if the given sources are empty, or all the data has been read and returned by the previous calls.
- Returns:
An output
cudf::table
along with its metadata
-
chunked_orc_reader()#
-
class parquet_reader_options#
- #include <parquet.hpp>
Settings for read_parquet().
Public Functions
-
explicit parquet_reader_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline source_info const &get_source() const#
Returns source info.
- Returns:
Source info
-
inline bool is_enabled_convert_strings_to_categories() const#
Returns true/false depending on whether strings should be converted to categories or not.
- Returns:
true
if strings should be converted to categories
-
inline bool is_enabled_use_pandas_metadata() const#
Returns true/false depending on whether to use pandas metadata or not while reading.
- Returns:
true
if pandas metadata is used while reading
-
inline bool is_enabled_use_arrow_schema() const#
Returns true/false depending on whether to use arrow schema while reading.
- Returns:
true
if arrow schema is used while reading
-
inline std::optional<std::vector<reader_column_schema>> get_column_schema() const#
Returns optional tree of metadata.
- Returns:
vector of reader_column_schema objects.
-
inline int64_t get_skip_rows() const#
Returns number of rows to skip from the start.
- Returns:
Number of rows to skip from the start
-
inline std::optional<size_type> const &get_num_rows() const#
Returns number of rows to read.
- Returns:
Number of rows to read;
nullopt
if the option hasn’t been set (in which case the file is read until the end)
-
inline auto const &get_columns() const#
Returns names of columns to be read, if set.
- Returns:
Names of columns to be read;
nullopt
if the option is not set
-
inline auto const &get_row_groups() const#
Returns list of individual row groups to be read.
- Returns:
List of individual row groups to be read
-
inline auto const &get_filter() const#
Returns AST based filter for predicate pushdown.
- Returns:
AST expression to use as filter
-
inline data_type get_timestamp_type() const#
Returns timestamp type used to cast timestamp columns.
- Returns:
Timestamp type used to cast timestamp columns
-
inline void set_columns(std::vector<std::string> col_names)#
Sets names of the columns to be read.
- Parameters:
col_names – Vector of column names
-
void set_row_groups(std::vector<std::vector<size_type>> row_groups)#
Sets vector of individual row groups to read.
- Parameters:
row_groups – Vector of row groups to read
-
inline void set_filter(ast::expression const &filter)#
Sets AST based filter for predicate pushdown.
The filter can utilize cudf::ast::column_name_reference to reference a column by its name, even if it’s not necessarily present in the requested projected columns. To refer to output column indices, you can use cudf::ast::column_reference.
For a parquet with columns [“A”, “B”, “C”, … “X”, “Y”, “Z”], Example 1: with/without column projection
use_columns({"A", "X", "Z"}) .filter(operation(ast_operator::LESS, column_name_reference{"C"}, literal{100}));
Column “C” need not be present in output table. Example 2: without column projection
filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
Here, 1 will refer to column “B” because output will contain all columns in order [“A”, …, “Z”]. Example 3: with column projection
use_columns({"A", "Z", "X"}) .filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
Here, 1 will refer to column “Z” because output will contain 3 columns in order [“A”, “Z”, “X”].
- Parameters:
filter – AST expression to use as filter
-
inline void enable_convert_strings_to_categories(bool val)#
Sets to enable/disable conversion of strings to categories.
- Parameters:
val – Boolean value to enable/disable conversion of string columns to categories
-
inline void enable_use_pandas_metadata(bool val)#
Sets to enable/disable use of pandas metadata to read.
- Parameters:
val – Boolean value whether to use pandas metadata
-
inline void enable_use_arrow_schema(bool val)#
Sets to enable/disable use of arrow schema to read.
- Parameters:
val – Boolean value whether to use arrow schema
-
inline void set_column_schema(std::vector<reader_column_schema> val)#
Sets reader column schema.
- Parameters:
val – Tree of schema nodes to enable/disable conversion of binary to string columns. Note default is to convert to string columns.
-
void set_skip_rows(int64_t val)#
Sets number of rows to skip.
- Parameters:
val – Number of rows to skip from start
Public Static Functions
-
static parquet_reader_options_builder builder(source_info src)#
Creates a parquet_reader_options_builder which will build parquet_reader_options.
- Parameters:
src – Source information to read parquet file
- Returns:
Builder to build reader options
-
explicit parquet_reader_options() = default#
-
class parquet_reader_options_builder#
- #include <parquet.hpp>
Builds parquet_reader_options to use for read_parquet().
Public Functions
-
parquet_reader_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit parquet_reader_options_builder(source_info src)#
Constructor from source info.
- Parameters:
src – The source information used to read parquet file
-
inline parquet_reader_options_builder &columns(std::vector<std::string> col_names)#
Sets names of the columns to be read.
- Parameters:
col_names – Vector of column names
- Returns:
this for chaining
-
inline parquet_reader_options_builder &row_groups(std::vector<std::vector<size_type>> row_groups)#
Sets vector of individual row groups to read.
- Parameters:
row_groups – Vector of row groups to read
- Returns:
this for chaining
-
inline parquet_reader_options_builder &filter(ast::expression const &filter)#
Sets AST based filter for predicate pushdown.
The filter can utilize cudf::ast::column_name_reference to reference a column by its name, even if it’s not necessarily present in the requested projected columns. To refer to output column indices, you can use cudf::ast::column_reference.
For a parquet with columns [“A”, “B”, “C”, … “X”, “Y”, “Z”], Example 1: with/without column projection
use_columns({"A", "X", "Z"}) .filter(operation(ast_operator::LESS, column_name_reference{"C"}, literal{100}));
Column “C” need not be present in output table. Example 2: without column projection
filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
Here, 1 will refer to column “B” because output will contain all columns in order [“A”, …, “Z”]. Example 3: with column projection
use_columns({"A", "Z", "X"}) .filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
Here, 1 will refer to column “Z” because output will contain 3 columns in order [“A”, “Z”, “X”].
- Parameters:
filter – AST expression to use as filter
- Returns:
this for chaining
-
inline parquet_reader_options_builder &convert_strings_to_categories(bool val)#
Sets to enable/disable conversion of strings to categories.
- Parameters:
val – Boolean value to enable/disable conversion of string columns to categories
- Returns:
this for chaining
-
inline parquet_reader_options_builder &use_pandas_metadata(bool val)#
Sets to enable/disable use of pandas metadata to read.
- Parameters:
val – Boolean value whether to use pandas metadata
- Returns:
this for chaining
-
inline parquet_reader_options_builder &use_arrow_schema(bool val)#
Sets to enable/disable use of arrow schema to read.
- Parameters:
val – Boolean value whether to use arrow schema
- Returns:
this for chaining
-
inline parquet_reader_options_builder &set_column_schema(std::vector<reader_column_schema> val)#
Sets reader column schema.
- Parameters:
val – Tree of metadata information.
- Returns:
this for chaining
-
inline parquet_reader_options_builder &skip_rows(int64_t val)#
Sets number of rows to skip.
- Parameters:
val – Number of rows to skip from start
- Returns:
this for chaining
-
inline parquet_reader_options_builder &num_rows(size_type val)#
Sets number of rows to read.
- Parameters:
val – Number of rows to read after skip
- Returns:
this for chaining
-
inline parquet_reader_options_builder &timestamp_type(data_type type)#
timestamp_type used to cast timestamp columns.
- Parameters:
type – The timestamp data_type to which all timestamp columns need to be cast
- Returns:
this for chaining
-
inline operator parquet_reader_options&&()#
move parquet_reader_options member once it’s built.
-
inline parquet_reader_options &&build()#
move parquet_reader_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
parquet_reader_options
object’s r-value reference
-
parquet_reader_options_builder() = default#
-
class chunked_parquet_reader#
- #include <parquet.hpp>
The chunked parquet reader class to read a Parquet file iteratively into a series of tables, chunk by chunk.
This class is designed to address the problem of reading very large Parquet files whose column sizes exceed the limit that can be stored in a cudf column. By reading the file content in chunks using this class, each chunk is guaranteed to have its sizes stay within the given limit.
Public Functions
-
chunked_parquet_reader()#
Default constructor, this should never be used.
This is added just to satisfy Cython, and to avoid leaking the detail API.
-
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Constructor for chunked reader.
This constructor requires the same
parquet_reader_options
parameter as in cudf::read_parquet()
, and an additional parameter to specify the byte size limit of the output table for each reading.
- Parameters:
chunk_read_limit – Limit on total number of bytes to be returned per read, or
0
if there is no limit
options – The options used to read Parquet file
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#
Constructor for chunked reader.
This constructor requires the same
parquet_reader_options
parameter as in cudf::read_parquet()
, with additional parameters to specify the byte size limit of the output table for each reading, and a byte limit on the amount of temporary memory to use when reading. pass_read_limit affects how many row groups we can read at a time by limiting the amount of memory dedicated to decompression space. pass_read_limit is a hint, not an absolute limit - if a single row group cannot fit within the limit given, it will still be loaded.
- Parameters:
chunk_read_limit – Limit on total number of bytes to be returned per read, or
0
if there is no limit
pass_read_limit – Limit on the amount of memory used for reading and decompressing data, or
0
if there is no limit
options – The options used to read Parquet file
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
~chunked_parquet_reader()#
Destructor, destroying the internal reader instance.
Since the declaration of the internal
reader
object does not exist in this header, this destructor needs to be defined in a separate source file which has access to that object’s declaration.
-
bool has_next() const#
Check if there is any data in the given file that has not yet been read.
- Returns:
A boolean value indicating if there is any data left to read
-
table_with_metadata read_chunk() const#
Read a chunk of rows in the given Parquet file.
The sequence of returned tables, if concatenated in order, is guaranteed to form the same complete dataset as reading the entire given file at once.
An empty table will be returned if the given file is empty, or all the data in the file has been read and returned by the previous calls.
- Returns:
An output
cudf::table
along with its metadata
-
chunked_parquet_reader()#
-
class byte_range_info#
- #include <byte_range_info.hpp>
stores offset and size used to indicate a byte range
Public Functions
-
inline constexpr byte_range_info(int64_t offset, int64_t size)#
Constructs a byte_range_info object.
- Parameters:
offset – offset in bytes
size – size in bytes
-
constexpr byte_range_info(byte_range_info const &other) noexcept = default#
Copy constructor.
- Parameters:
other – byte_range_info object to copy
-
constexpr byte_range_info &operator=(byte_range_info const &other) noexcept = default#
Copy assignment operator.
- Parameters:
other – byte_range_info object to copy
- Returns:
this object after copying
-
inline constexpr int64_t offset()#
Get the offset in bytes.
- Returns:
Offset in bytes
-
inline constexpr int64_t size()#
Get the size in bytes.
- Returns:
Size in bytes
-
inline constexpr byte_range_info(int64_t offset, int64_t size)#
-
class device_data_chunk#
- #include <data_chunk_source.hpp>
A contract guaranteeing stream-ordered memory access to the underlying device data.
This class guarantees access to the underlying data for the stream on which the data was allocated. Possible implementations may own the device data, or may only have a view over the data. Any work enqueued to the stream on which this data was allocated is guaranteed to be performed prior to the destruction of the underlying data, but otherwise no guarantees are made regarding if or when the underlying data gets destroyed.
Public Functions
-
virtual char const *data() const = 0#
Returns a pointer to the underlying device data.
- Returns:
A pointer to the underlying device data
-
virtual std::size_t size() const = 0#
Returns the size of the underlying device data.
- Returns:
The size of the underlying device data
-
virtual operator device_span<char const>() const = 0#
Returns a span over the underlying device data.
- Returns:
A span over the underlying device data
-
virtual char const *data() const = 0#
-
class data_chunk_reader#
- #include <data_chunk_source.hpp>
a reader capable of producing views over device memory.
The data chunk reader API encapsulates the idea of statefully traversing and loading a data source. A data source may be a file, a region of device memory, or a region of host memory. Reading data from these data sources efficiently requires different strategies depending on the type of data source, the type of compression, the capabilities of the host and device, and the data’s destination. Whole-file decompression should be hidden behind this interface.
Public Functions
-
virtual void skip_bytes(std::size_t size) = 0#
Skips the specified number of bytes in the data source.
- Parameters:
size – The number of bytes to skip
-
virtual std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t size, rmm::cuda_stream_view stream) = 0#
Get the next chunk of bytes from the data source.
Performs any necessary work to read and prepare the underlying data source for consumption as a view over device memory. Common implementations may read from a file, copy data from host memory, allocate temporary memory, perform iterative decompression, or even launch device kernels.
- Parameters:
size – number of bytes to read
stream – stream to associate allocations or perform work required to obtain chunk
- Returns:
a chunk of data up to
size
bytes. May return less than
size
bytes if reader reaches end of underlying data source. Returned data must be accessed in stream order relative to the specified
stream
-
virtual void skip_bytes(std::size_t size) = 0#
-
class data_chunk_source#
- #include <data_chunk_source.hpp>
a data source capable of creating a reader which can produce views of the data source in device memory.
Public Functions
-
virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0#
Get a reader for the data source.
- Returns:
data_chunk_reader
object for the data source
-
virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0#
-
struct parse_options#
- #include <multibyte_split.hpp>
Parsing options for multibyte_split.
Public Members
-
byte_range_info byte_range = create_byte_range_info_max()#
Only rows starting inside this byte range will be part of the output column.
-
bool strip_delimiters = false#
Whether delimiters at the end of rows should be stripped from the output column.
-
byte_range_info byte_range = create_byte_range_info_max()#
-
table_with_metadata read_avro(avro_reader_options const &options, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())#