27 #include <unordered_map>
32 namespace CUDF_EXPORT
cudf {
44 class csv_reader_options_builder;
// Number of bytes to skip from the start of the source.
58 std::size_t _byte_range_offset = 0;
// Number of bytes to read.
60 std::size_t _byte_range_size = 0;
// Names of the columns.
62 std::vector<std::string> _names;
// Whether to rename duplicate column names.
66 bool _mangle_dupe_cols =
true;
// Names of the columns to be read.
71 std::vector<std::string> _use_cols_names;
// Indexes of the columns to be read.
73 std::vector<int> _use_cols_indexes;
// Line terminator character.
86 char _lineterminator =
'\n';
// Field delimiter character.
88 char _delimiter =
',';
// Thousands separator for numeric data.
// NOTE(review): the NUL default presumably means no separator is used; confirm.
90 char _thousands =
'\0';
// Whether to treat a CR LF pair as the line terminator.
95 bool _windowslinetermination =
false;
// Whether to treat whitespace as the field delimiter.
97 bool _delim_whitespace =
false;
// Whether to skip whitespace after the delimiter.
99 bool _skipinitialspace =
false;
// Whether to ignore empty lines or parse line values as invalid.
101 bool _skip_blank_lines =
true;
// Quoting character.
105 char _quotechar =
'"';
// Whether a quote inside a value is double-quoted.
107 bool _doublequote =
true;
// Whether to detect quotes surrounded by spaces.
110 bool _detect_whitespace_around_quotes =
false;
// Names of columns to read as datetime.
112 std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime.
114 std::vector<int> _parse_dates_indexes;
// Names of columns to parse as hexadecimal.
116 std::vector<std::string> _parse_hex_names;
// Indexes of columns to parse as hexadecimal.
118 std::vector<int> _parse_hex_indexes;
// Per-column types, held either as a vector of data_type or as a map from a
// string key (presumably the column name; confirm) to data_type.
123 std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Values recognized as boolean true. The setter appends to these defaults
// rather than replacing them.
125 std::vector<std::string> _true_values{
"True",
"TRUE",
"true"};
// Values recognized as boolean false. The setter appends to these defaults
// rather than replacing them.
127 std::vector<std::string> _false_values{
"False",
"FALSE",
"false"};
// Additional values to recognize as null values.
129 std::vector<std::string> _na_values;
// Whether to keep the built-in default NA values.
131 bool _keep_default_na =
true;
// Whether NA filtering is enabled. Setting a non-empty na_values list is
// rejected while this is false.
133 bool _na_filter =
true;
// Whether to parse dates as DD/MM versus MM/DD.
135 bool _dayfirst =
false;
// Type to which all timestamp columns will be cast.
// NOTE(review): the EMPTY default presumably means no cast is requested; confirm.
137 data_type _timestamp_type{type_id::EMPTY};
199 if (_byte_range_size == 0) {
202 return _byte_range_size + get_byte_range_padding();
213 auto const num_names = _names.size();
214 auto const num_dtypes = std::visit([](
auto const& dtypes) {
return dtypes.size(); }, _dtypes);
215 auto const num_columns = std::max(num_dtypes, num_names);
217 auto const max_row_bytes = 16 * 1024;
218 auto const column_bytes = 64;
219 auto const base_padding = 1024;
221 if (num_columns == 0) {
223 return max_row_bytes;
227 return base_padding + num_columns * column_bytes;
/**
 * @brief Returns names of the columns.
 *
 * @return Const reference to the stored vector of column names
 */
235 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Returns prefix to be used for column ID.
 *
 * @return Copy of the stored prefix string (returned by value)
 */
242 [[nodiscard]] std::string
get_prefix()
const {
return _prefix; }
258 return _use_cols_names;
388 return _detect_whitespace_around_quotes;
398 return _parse_dates_names;
408 return _parse_dates_indexes;
418 return _parse_hex_names;
433 [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>>
const&
/**
 * @brief Returns the values recognized as boolean true values.
 *
 * @return Const reference to the stored vector of true-value strings
 */
444 [[nodiscard]] std::vector<std::string>
const&
get_true_values()
const {
return _true_values; }
/**
 * @brief Returns the values recognized as boolean false values.
 *
 * @return Const reference to the stored vector of false-value strings
 */
451 [[nodiscard]] std::vector<std::string>
const&
get_false_values()
const {
return _false_values; }
/**
 * @brief Returns additional values to recognize as null values.
 *
 * @return Const reference to the stored vector of null-value strings
 */
458 [[nodiscard]] std::vector<std::string>
const&
get_na_values()
const {
return _na_values; }
502 if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
504 "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
507 _byte_range_offset = offset;
517 if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
519 "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
522 _byte_range_size = size;
/**
 * @brief Sets names of the columns.
 *
 * @param col_names Column names; taken by value and moved into the options,
 * replacing any previously stored names
 */
530 void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
553 _use_cols_names = std::move(col_names);
563 _use_cols_indexes = std::move(col_indices);
573 CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0),
"Cannot use both `nrows` and `skipfooter`");
574 if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
576 "nrows can't be a non negative value if range offset and/or range size has been set");
589 if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
590 CUDF_FAIL(
"skiprows must be zero if range offset or range size has been set",
591 std::invalid_argument);
593 _skiprows = skiprows;
604 "Cannot use both `nrows` and `skipfooter`",
605 std::invalid_argument);
606 if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
607 CUDF_FAIL(
"skipfooter must be zero if range offset or range size has been set",
608 std::invalid_argument);
611 _skipfooter = skipfooter;
696 CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
697 "Only MINIMAL and NONE are supported for quoting.");
730 _parse_dates_names = std::move(col_names);
740 _parse_dates_indexes = std::move(col_indices);
750 _parse_hex_names = std::move(col_names);
/**
 * @brief Sets indexes of columns to parse as hexadecimal.
 *
 * @param col_indices Column indexes; taken by value and moved into the options,
 * replacing any previously stored indexes
 */
758 void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
/**
 * @brief Sets per-column types from a map.
 *
 * Stores the map alternative in the `_dtypes` variant, replacing whichever
 * alternative was held before.
 *
 * @param types Map from a string key (presumably the column name; confirm) to
 * the data type for that column
 */
765 void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
/**
 * @brief Sets per-column types from a vector.
 *
 * Stores the vector alternative in the `_dtypes` variant, replacing whichever
 * alternative was held before.
 *
 * @param types Vector of data types, one entry per column
 * (NOTE(review): presumably in column order; confirm)
 */
772 void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
781 _true_values.insert(_true_values.end(), vals.begin(), vals.end());
791 _false_values.insert(_false_values.end(), vals.begin(), vals.end());
801 if ((!vals.empty()) and (!_na_filter)) {
802 CUDF_FAIL(
"Can't set na_values when na_filtering is disabled");
805 _na_values = std::move(vals);
822 if (!val) { _na_values.clear(); }
871 options._compression = comp;
907 options._names = std::move(col_names);
919 options._prefix = pfx;
931 options._mangle_dupe_cols = val;
943 options._use_cols_names = std::move(col_names);
955 options._use_cols_indexes = std::move(col_indices);
1003 options._header = hdr;
1015 options._lineterminator = term;
1027 options._delimiter = delim;
1039 options._thousands = val;
1051 options._decimal = val;
1063 options._comment = val;
1075 options._windowslinetermination = val;
1087 options._delim_whitespace = val;
1099 options._skipinitialspace = val;
1111 options._skip_blank_lines = val;
1123 options._quoting = style;
1135 options._quotechar = ch;
1147 options._doublequote = val;
1160 options._detect_whitespace_around_quotes = val;
1172 options._parse_dates_names = std::move(col_names);
1184 options._parse_dates_indexes = std::move(col_indices);
1196 options._parse_hex_names = std::move(col_names);
1208 options._parse_hex_indexes = std::move(col_indices);
1220 options._dtypes = std::move(types);
1232 options._dtypes = std::move(types);
1244 options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1256 options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1304 options._dayfirst = val;
1316 options._timestamp_type = type;
// String to use for null entries.
1378 std::string _na_rep =
"";
// Whether to write headers to csv.
1380 bool _include_header =
true;
// Maximum number of rows to process for each file write; defaults to the
// largest size_type value.
1382 size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
// String used for separating lines.
1384 std::string _line_terminator =
"\n";
// Character used for separating column values.
1386 char _inter_column_delimiter =
',';
// String used for values != 0 in INT8 types.
1388 std::string _true_value = std::string{
"true"};
// String used for values == 0 in INT8 types.
1390 std::string _false_value = std::string{
"false"};
// Optional associated column names.
1392 std::vector<std::string> _names;
/**
 * @brief Returns names of the columns.
 *
 * @return Const reference to the stored vector of column names
 */
1446 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Sets optional associated column names.
 *
 * @param names Column names; taken by value and moved into the options,
 * replacing any previously stored names
 */
1515 void set_names(std::vector<std::string> names) { _names = std::move(names); }
1585 CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1586 "Only MINIMAL and NONE are supported for quoting.");
1612 : options{sink,
table}
1624 options._names = names;
1636 options._na_rep = val;
1648 options._include_header = val;
1660 options._rows_per_chunk = val;
1672 options._line_terminator = term;
1684 options._inter_column_delimiter = delim;
1696 options._true_value = val;
1708 options._false_value = val;
Indicator for the logical data type of an element in a column.
Builder to build options for read_csv().
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
csv_reader_options_builder & na_filter(bool val)
Sets whether to enable the null filter.
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
csv_reader_options_builder(source_info src)
Constructor from source info.
csv_reader_options_builder & comment(char val)
Sets comment line start character.
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Settings to use for read_csv().
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
size_type get_skiprows() const
Returns number of rows to skip from start.
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
quote_style get_quoting() const
Returns quoting style.
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
char get_delimiter() const
Returns field delimiter.
char get_lineterminator() const
Returns line terminator.
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
void set_decimal(char val)
Sets decimal point character.
std::string get_prefix() const
Returns prefix to be used for column ID.
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote ...
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
char get_thousands() const
Returns numeric data thousands separator.
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
bool is_enabled_na_filter() const
Whether the null filter is enabled.
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
char get_comment() const
Returns comment line start character.
void set_lineterminator(char term)
Sets line terminator.
void set_quotechar(char ch)
Sets quoting character.
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
void set_compression(compression_type comp)
Sets compression format of the source.
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
std::vector< std::string > const & get_names() const
Returns names of the columns.
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
source_info const & get_source() const
Returns source info.
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
compression_type get_compression() const
Returns compression format of the source.
char get_quotechar() const
Returns quoting character.
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
void set_header(size_type hdr)
Sets header row index.
char get_decimal() const
Returns decimal point character.
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
void set_thousands(char val)
Sets numeric data thousands separator.
void enable_na_filter(bool val)
Sets whether to enable the null filter.
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
void set_delimiter(char delim)
Sets field delimiter.
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
size_type get_nrows() const
Returns number of rows to read.
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
size_type get_skipfooter() const
Returns number of rows to skip from end.
void set_nrows(size_type nrows)
Sets number of rows to read.
void set_comment(char val)
Sets comment line start character.
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
size_type get_header() const
Returns header row index.
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Builder to build options for write_csv().
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
csv_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
csv_writer_options_builder & line_terminator(std::string term)
Sets string used for separating lines.
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Settings to use for write_csv().
void set_table(table_view const &table)
(Re)sets the table being written.
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
std::string get_line_terminator() const
Returns string used for separating lines.
void set_line_terminator(std::string term)
Sets string used for separating lines.
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
std::string get_na_rep() const
Returns string to use for null entries.
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
bool is_enabled_include_header() const
Whether to write headers to csv.
void set_na_rep(std::string val)
Sets string to use for null entries.
char get_inter_column_delimiter() const
Returns character used for separating column values.
sink_info const & get_sink() const
Returns sink used for writer output.
std::vector< std::string > const & get_names() const
Returns names of the columns.
quote_style get_quoting() const
Returns the quote style for the writer.
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
void set_names(std::vector< std::string > names)
Sets optional associated column names.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
size_type num_rows() const noexcept
Returns the number of rows.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a CSV dataset into a set of columns.
quote_style
Behavior when handling quotations in field data.
compression_type
Compression algorithms.
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
int32_t size_type
Row index type for columns and tables.
cuDF-IO API type definitions
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.