29 #include <unordered_map>
34 namespace CUDF_EXPORT
cudf {
46 class csv_reader_options_builder;
60 std::size_t _byte_range_offset = 0;
62 std::size_t _byte_range_size = 0;
64 std::vector<std::string> _names;
68 bool _mangle_dupe_cols =
true;
73 std::vector<std::string> _use_cols_names;
75 std::vector<int> _use_cols_indexes;
88 char _lineterminator =
'\n';
90 char _delimiter =
',';
92 char _thousands =
'\0';
97 bool _windowslinetermination =
false;
99 bool _delim_whitespace =
false;
101 bool _skipinitialspace =
false;
103 bool _skip_blank_lines =
true;
107 char _quotechar =
'"';
109 bool _doublequote =
true;
112 bool _detect_whitespace_around_quotes =
false;
114 std::vector<std::string> _parse_dates_names;
116 std::vector<int> _parse_dates_indexes;
118 std::vector<std::string> _parse_hex_names;
120 std::vector<int> _parse_hex_indexes;
125 std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
127 std::vector<std::string> _true_values{
"True",
"TRUE",
"true"};
129 std::vector<std::string> _false_values{
"False",
"FALSE",
"false"};
131 std::vector<std::string> _na_values;
133 bool _keep_default_na =
true;
135 bool _na_filter =
true;
137 bool _dayfirst =
false;
139 data_type _timestamp_type{type_id::EMPTY};
201 if (_byte_range_size == 0) {
204 return _byte_range_size + get_byte_range_padding();
215 auto const num_names = _names.size();
216 auto const num_dtypes = std::visit([](
auto const& dtypes) {
return dtypes.size(); }, _dtypes);
217 auto const num_columns = std::max(num_dtypes, num_names);
219 auto const max_row_bytes = 16 * 1024;
220 auto const column_bytes = 64;
221 auto const base_padding = 1024;
223 if (num_columns == 0) {
225 return max_row_bytes;
229 return base_padding + num_columns * column_bytes;
/**
 * @brief Returns the column names held in `_names`.
 *
 * @return Const reference to `_names`; the vector is not copied
 */
237 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Returns the prefix held in `_prefix`.
 *
 * @return Copy of `_prefix` (the return type is `std::string` by value, not a reference)
 */
244 [[nodiscard]] std::string
get_prefix()
const {
return _prefix; }
260 return _use_cols_names;
390 return _detect_whitespace_around_quotes;
400 return _parse_dates_names;
410 return _parse_dates_indexes;
420 return _parse_hex_names;
435 [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>>
const&
/**
 * @brief Returns the strings held in `_true_values`.
 *
 * @return Const reference to `_true_values`; the vector is not copied
 */
446 [[nodiscard]] std::vector<std::string>
const&
get_true_values()
const {
return _true_values; }
/**
 * @brief Returns the strings held in `_false_values`.
 *
 * @return Const reference to `_false_values`; the vector is not copied
 */
453 [[nodiscard]] std::vector<std::string>
const&
get_false_values()
const {
return _false_values; }
/**
 * @brief Returns the strings held in `_na_values`.
 *
 * @return Const reference to `_na_values`; the vector is not copied
 */
460 [[nodiscard]] std::vector<std::string>
const&
get_na_values()
const {
return _na_values; }
504 if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
506 "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
509 _byte_range_offset = offset;
519 if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
521 "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
524 _byte_range_size = size;
/**
 * @brief Replaces the stored column names with `col_names`.
 *
 * The argument is taken by value and moved into `_names`, so the previous
 * contents of `_names` are discarded.
 *
 * @param col_names Column names to store
 */
532 void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
555 _use_cols_names = std::move(col_names);
565 _use_cols_indexes = std::move(col_indices);
575 CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0),
"Cannot use both `nrows` and `skipfooter`");
576 if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
578 "nrows can't be a non negative value if range offset and/or range size has been set");
591 if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
592 CUDF_FAIL(
"skiprows must be zero if range offset or range size has been set",
593 std::invalid_argument);
595 _skiprows = skiprows;
606 "Cannot use both `nrows` and `skipfooter`",
607 std::invalid_argument);
608 if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
609 CUDF_FAIL(
"skipfooter must be zero if range offset or range size has been set",
610 std::invalid_argument);
613 _skipfooter = skipfooter;
698 CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
699 "Only MINIMAL and NONE are supported for quoting.");
732 _parse_dates_names = std::move(col_names);
742 _parse_dates_indexes = std::move(col_indices);
752 _parse_hex_names = std::move(col_names);
/**
 * @brief Replaces the stored hexadecimal-parse column indexes with `col_indices`.
 *
 * The argument is taken by value and moved into `_parse_hex_indexes`.
 *
 * @param col_indices Column indexes to store
 */
760 void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
/**
 * @brief Stores a name-to-type mapping in `_dtypes`.
 *
 * `_dtypes` is a `std::variant` of a type vector and a name-to-type map; after
 * this call it holds the `std::map` alternative. `types` is moved into place.
 *
 * @param types Mapping from column name to `data_type`
 */
767 void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
/**
 * @brief Stores a list of types in `_dtypes`.
 *
 * `_dtypes` is a `std::variant` of a type vector and a name-to-type map; after
 * this call it holds the `std::vector` alternative. `types` is moved into place.
 *
 * @param types Vector of `data_type` values
 */
774 void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
783 _true_values.insert(_true_values.end(), vals.begin(), vals.end());
793 _false_values.insert(_false_values.end(), vals.begin(), vals.end());
803 if ((!vals.empty()) and (!_na_filter)) {
804 CUDF_FAIL(
"Can't set na_values when na_filtering is disabled");
807 _na_values = std::move(vals);
824 if (!val) { _na_values.clear(); }
873 options._compression = comp;
909 options._names = std::move(col_names);
921 options._prefix = pfx;
933 options._mangle_dupe_cols = val;
945 options._use_cols_names = std::move(col_names);
957 options._use_cols_indexes = std::move(col_indices);
1005 options._header = hdr;
1017 options._lineterminator = term;
1029 options._delimiter = delim;
1041 options._thousands = val;
1053 options._decimal = val;
1065 options._comment = val;
1077 options._windowslinetermination = val;
1089 options._delim_whitespace = val;
1101 options._skipinitialspace = val;
1113 options._skip_blank_lines = val;
1125 options._quoting = style;
1137 options._quotechar = ch;
1149 options._doublequote = val;
1162 options._detect_whitespace_around_quotes = val;
1174 options._parse_dates_names = std::move(col_names);
1186 options._parse_dates_indexes = std::move(col_indices);
1198 options._parse_hex_names = std::move(col_names);
1210 options._parse_hex_indexes = std::move(col_indices);
1222 options._dtypes = std::move(types);
1234 options._dtypes = std::move(types);
1246 options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1258 options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1306 options._dayfirst = val;
1318 options._timestamp_type = type;
1380 std::string _na_rep =
"";
1382 bool _include_header =
true;
1384 size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1386 std::string _line_terminator =
"\n";
1388 char _inter_column_delimiter =
',';
1390 std::string _true_value = std::string{
"true"};
1392 std::string _false_value = std::string{
"false"};
1394 std::vector<std::string> _names;
/**
 * @brief Returns the column names held in `_names`.
 *
 * @return Const reference to `_names`; the vector is not copied
 */
1448 [[nodiscard]] std::vector<std::string>
const&
get_names()
const {
return _names; }
/**
 * @brief Replaces the stored column names with `names`.
 *
 * The argument is taken by value and moved into `_names`, so the previous
 * contents of `_names` are discarded.
 *
 * @param names Column names to store
 */
1517 void set_names(std::vector<std::string> names) { _names = std::move(names); }
1587 CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1588 "Only MINIMAL and NONE are supported for quoting.");
1614 : options{sink,
table}
1626 options._names = names;
1638 options._na_rep = val;
1650 options._include_header = val;
1662 options._rows_per_chunk = val;
1674 options._line_terminator = term;
1686 options._inter_column_delimiter = delim;
1698 options._true_value = val;
1710 options._false_value = val;
Indicator for the logical data type of an element in a column.
Builder to build options for read_csv().
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
csv_reader_options_builder & na_filter(bool val)
Sets whether to enable the null filter.
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
csv_reader_options_builder(source_info src)
Constructor from source info.
csv_reader_options_builder & comment(char val)
Sets comment line start character.
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Settings to use for read_csv().
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
size_type get_skiprows() const
Returns number of rows to skip from start.
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
quote_style get_quoting() const
Returns quoting style.
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
char get_delimiter() const
Returns field delimiter.
char get_lineterminator() const
Returns line terminator.
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
void set_decimal(char val)
Sets decimal point character.
std::string get_prefix() const
Returns prefix to be used for column ID.
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote ...
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
char get_thousands() const
Returns numeric data thousands separator.
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
bool is_enabled_na_filter() const
Whether the null filter is enabled.
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
char get_comment() const
Returns comment line start character.
void set_lineterminator(char term)
Sets line terminator.
void set_quotechar(char ch)
Sets quoting character.
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
void set_compression(compression_type comp)
Sets compression format of the source.
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
std::vector< std::string > const & get_names() const
Returns names of the columns.
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
source_info const & get_source() const
Returns source info.
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
compression_type get_compression() const
Returns compression format of the source.
char get_quotechar() const
Returns quoting character.
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
void set_header(size_type hdr)
Sets header row index.
char get_decimal() const
Returns decimal point character.
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
void set_thousands(char val)
Sets numeric data thousands separator.
void enable_na_filter(bool val)
Sets whether to enable the null filter.
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
void set_delimiter(char delim)
Sets field delimiter.
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
size_type get_nrows() const
Returns number of rows to read.
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
size_type get_skipfooter() const
Returns number of rows to skip from end.
void set_nrows(size_type nrows)
Sets number of rows to read.
void set_comment(char val)
Sets comment line start character.
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
size_type get_header() const
Returns header row index.
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Builder to build options for write_csv().
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
csv_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Settings to use for write_csv().
void set_table(table_view const &table)
(Re)sets the table being written.
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
std::string get_line_terminator() const
Returns character used for separating lines.
void set_line_terminator(std::string term)
Sets character used for separating lines.
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
std::string get_na_rep() const
Returns string to use for null entries.
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
bool is_enabled_include_header() const
Whether to write headers to csv.
void set_na_rep(std::string val)
Sets string to use for null entries.
char get_inter_column_delimiter() const
Returns character used for separating column values.
sink_info const & get_sink() const
Returns sink used for writer output.
std::vector< std::string > const & get_names() const
Returns names of the columns.
quote_style get_quoting() const
Returns the quote style for the writer.
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
void set_names(std::vector< std::string > names)
Sets optional associated column names.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
size_type num_rows() const noexcept
Returns the number of rows.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Reads a CSV dataset into a set of columns.
compression_type
Compression algorithms.
quote_style
Behavior when handling quotations in field data.
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
int32_t size_type
Row index type for columns and tables.
cuDF-IO API type definitions
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.