csv.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/detail/utils.hpp>
20 #include <cudf/io/types.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/error.hpp>
25 
26 #include <memory>
27 #include <string>
28 #include <unordered_map>
29 #include <utility>
30 #include <variant>
31 #include <vector>
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io {
35 
45 class csv_reader_options_builder;
46 
52  source_info _source;
53 
54  // Read settings
55 
56  // Specify the compression format of the source or infer from file extension
57  compression_type _compression = compression_type::AUTO;
58  // Bytes to skip from the source start
59  std::size_t _byte_range_offset = 0;
60  // Bytes to read; always reads complete rows
61  std::size_t _byte_range_size = 0;
62  // Names of all the columns; if empty then names are auto-generated
63  std::vector<std::string> _names;
64  // If there is no header or names, prepend this to the column ID as the name
65  std::string _prefix;
66  // Whether to rename duplicate column names
67  bool _mangle_dupe_cols = true;
68 
69  // Filter settings
70 
71  // Names of columns to read; empty is all columns
72  std::vector<std::string> _use_cols_names;
73  // Indexes of columns to read; empty is all columns
74  std::vector<int> _use_cols_indexes;
75  // Rows to read; -1 is all
76  size_type _nrows = -1;
77  // Rows to skip from the start
78  size_type _skiprows = 0;
79  // Rows to skip from the end
80  size_type _skipfooter = 0;
81  // Header row index
82  size_type _header = 0;
83 
84  // Parsing settings
85 
86  // Line terminator
87  char _lineterminator = '\n';
88  // Field delimiter
89  char _delimiter = ',';
90  // Numeric data thousands separator; cannot match delimiter
91  char _thousands = '\0';
92  // Decimal point character; cannot match delimiter
93  char _decimal = '.';
94  // Comment line start character
95  char _comment = '\0';
// Whether to treat \r\n as line terminator
96  bool _windowslinetermination = false;
97  // Treat whitespace as field delimiter; overrides character delimiter
98  bool _delim_whitespace = false;
99  // Skip whitespace after the delimiter
100  bool _skipinitialspace = false;
101  // Ignore empty lines or parse line values as invalid
102  bool _skip_blank_lines = true;
103  // Treatment of quoting behavior
104  quote_style _quoting = quote_style::MINIMAL;
105  // Quoting character (relevant when `_quoting` enables quoting; `_quoting` is a quote_style, not a bool)
106  char _quotechar = '"';
107  // Whether a quote inside a value is double-quoted
108  bool _doublequote = true;
109  // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
110  // _doublequote is true
111  bool _detect_whitespace_around_quotes = false;
112  // Names of columns to read as datetime
113  std::vector<std::string> _parse_dates_names;
114  // Indexes of columns to read as datetime
115  std::vector<int> _parse_dates_indexes;
116  // Names of columns to parse as hexadecimal
117  std::vector<std::string> _parse_hex_names;
118  // Indexes of columns to parse as hexadecimal
119  std::vector<int> _parse_hex_indexes;
120 
121  // Conversion settings
122 
123  // Per-column types; disables type inference on those columns
124  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
125  // Additional values to recognize as boolean true values
126  std::vector<std::string> _true_values{"True", "TRUE", "true"};
127  // Additional values to recognize as boolean false values
128  std::vector<std::string> _false_values{"False", "FALSE", "false"};
129  // Additional values to recognize as null values
130  std::vector<std::string> _na_values;
131  // Whether to keep the built-in default NA values
132  bool _keep_default_na = true;
133  // Whether the null filter is enabled; disabling it can improve performance
134  bool _na_filter = true;
135  // Whether to parse dates as DD/MM versus MM/DD
136  bool _dayfirst = false;
137  // Cast timestamp columns to a specific type
138  data_type _timestamp_type{type_id::EMPTY};
139 
// Constructs reader options for the given source; every other setting keeps its in-class default.
// Declared ahead of the `public:` label, so it is not part of the public interface.
145  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
146 
148 
149  public:
// Default constructor: all members take their in-class default values.
155  csv_reader_options() = default;
156 
164 
// Returns the source info these options were constructed with.
170  [[nodiscard]] source_info const& get_source() const { return _source; }
171 
// Returns the compression format of the source (compression_type::AUTO unless changed).
177  [[nodiscard]] compression_type get_compression() const { return _compression; }
178 
// Returns the number of bytes to skip from the source start.
184  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
185 
// Returns the number of bytes to read.
191  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
192 
// Returns the number of bytes to read with padding: the byte range size plus
// get_byte_range_padding(), or 0 when the byte range size is 0.
198  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
199  {
200  if (_byte_range_size == 0) {
// A size of 0 is returned unchanged; no padding is added to it.
201  return 0;
202  } else {
203  return _byte_range_size + get_byte_range_padding();
204  }
205  }
206 
// Returns the number of padding bytes used by get_byte_range_size_with_padding().
// The column count is taken as the larger of the number of user-provided names and the number of
// user-provided dtypes. With no known columns the padding is a flat 16KB; otherwise it is
// 1KB plus 64 bytes per column.
212  [[nodiscard]] std::size_t get_byte_range_padding() const
213  {
214  auto const num_names = _names.size();
// _dtypes is a variant (vector or map); both alternatives expose size().
215  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
216  auto const num_columns = std::max(num_dtypes, num_names);
217 
218  auto const max_row_bytes = 16 * 1024; // 16KB
219  auto const column_bytes = 64;
220  auto const base_padding = 1024; // 1KB
221 
222  if (num_columns == 0) {
223  // Use flat size if the number of columns is not known
224  return max_row_bytes;
225  }
226 
227  // Expand the size based on the number of columns, if available
228  return base_padding + num_columns * column_bytes;
229  }
230 
// Returns the names of the columns.
236  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
237 
// Returns the prefix to be used for column IDs. Returned by value, so each call copies the string.
243  [[nodiscard]] std::string get_prefix() const { return _prefix; }
244 
// Whether to rename duplicate column names.
250  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
251 
// Returns the names of the columns to read; empty means all columns.
257  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
258  {
259  return _use_cols_names;
260  }
261 
// Returns the indexes of the columns to read; empty means all columns.
267  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
268 
// Returns the number of rows to read; -1 means all rows.
274  [[nodiscard]] size_type get_nrows() const { return _nrows; }
275 
// Returns the number of rows to skip from the start.
281  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
282 
// Returns the number of rows to skip from the end.
288  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
289 
// Returns the header row index.
295  [[nodiscard]] size_type get_header() const { return _header; }
296 
// Returns the line terminator.
302  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
303 
// Returns the field delimiter.
309  [[nodiscard]] char get_delimiter() const { return _delimiter; }
310 
// Returns the numeric data thousands separator.
316  [[nodiscard]] char get_thousands() const { return _thousands; }
317 
// Returns the decimal point character.
323  [[nodiscard]] char get_decimal() const { return _decimal; }
324 
// Returns the comment line start character.
330  [[nodiscard]] char get_comment() const { return _comment; }
331 
// Whether to treat \r\n as line terminator.
337  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
338 
// Whether to treat whitespace as field delimiter.
344  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
345 
// Whether to skip whitespace after the delimiter.
351  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
352 
// Whether to ignore empty lines or parse line values as invalid.
358  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
359 
// Returns the quoting style.
365  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
366 
// Returns the quoting character.
372  [[nodiscard]] char get_quotechar() const { return _quotechar; }
373 
// Whether a quote inside a value is double-quoted.
379  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
380 
388  {
389  return _detect_whitespace_around_quotes;
390  }
391 
// Returns the names of columns to read as datetime.
397  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
398  {
399  return _parse_dates_names;
400  }
401 
// Returns the indexes of columns to read as datetime.
407  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
408  {
409  return _parse_dates_indexes;
410  }
411 
// Returns the names of columns to parse as hexadecimal.
417  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
418  {
419  return _parse_hex_names;
420  }
421 
// Returns the indexes of columns to parse as hexadecimal.
427  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
428 
// Returns the per-column types: either a vector of types or a map from column name to type,
// depending on which set_dtypes overload was last used.
434  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
435  get_dtypes() const
436  {
437  return _dtypes;
438  }
439 
// Returns the values recognized as boolean true (defaults plus any added values).
445  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
446 
// Returns the values recognized as boolean false (defaults plus any added values).
452  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
453 
// Returns the additional values to recognize as null values.
459  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
460 
// Whether to keep the built-in default NA values.
466  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
467 
// Whether the null filter is enabled.
473  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
474 
// Whether to parse dates as DD/MM versus MM/DD.
480  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
481 
// Returns the timestamp type to which all timestamp columns will be cast.
487  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
488 
// Sets the compression format of the source.
494  void set_compression(compression_type comp) { _compression = comp; }
495 
// Sets the number of bytes to skip from the source start.
// A non-zero offset is rejected via CUDF_FAIL when row selection is already configured, i.e. when
// skiprows or skipfooter is non-zero or nrows is not -1.
501  void set_byte_range_offset(std::size_t offset)
502  {
503  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
504  CUDF_FAIL(
505  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
506  "value");
507  }
508  _byte_range_offset = offset;
509  }
510 
// Sets the number of bytes to read.
// A non-zero size is rejected via CUDF_FAIL when row selection is already configured, i.e. when
// skiprows or skipfooter is non-zero or nrows is not -1.
516  void set_byte_range_size(std::size_t size)
517  {
518  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
519  CUDF_FAIL(
520  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
521  "non-zero.");
522  }
523  _byte_range_size = size;
524  }
525 
// Sets the names of the columns, replacing any previously set names.
531  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
532 
// Sets the prefix to be used for column IDs. The argument is taken by value and moved into place,
// so passing an rvalue incurs no string copy.
538  void set_prefix(std::string pfx) { _prefix = std::move(pfx); }
539 
// Sets whether to rename duplicate column names.
545  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
546 
// Sets the names of the columns to be read; replaces any previous selection.
552  void set_use_cols_names(std::vector<std::string> col_names)
553  {
554  _use_cols_names = std::move(col_names);
555  }
556 
// Sets the indexes of the columns to be read; replaces any previous selection.
562  void set_use_cols_indexes(std::vector<int> col_indices)
563  {
564  _use_cols_indexes = std::move(col_indices);
565  }
566 
// Sets the number of rows to read; -1 means all rows.
// `nrows` and `skipfooter` are mutually exclusive, and a row count cannot be combined with a byte
// range. The "not set" sentinel for nrows is -1 (as in set_skipfooter and the byte-range setters),
// so the exclusivity check compares against -1: resetting to -1 is always allowed with respect to
// skipfooter, while any real row count requires skipfooter to be 0.
572  void set_nrows(size_type nrows)
573  {
574  CUDF_EXPECTS((nrows == -1) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
575  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
576  CUDF_FAIL(
577  "nrows can't be a non negative value if range offset and/or range size has been set");
578  }
579 
580  _nrows = nrows;
581  }
582 
// Sets the number of rows to skip from the start.
// A non-zero value is rejected (CUDF_FAIL with std::invalid_argument) when a byte range offset or
// size has already been set.
588  void set_skiprows(size_type skiprows)
589  {
590  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
591  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
592  std::invalid_argument);
593  }
594  _skiprows = skiprows;
595  }
596 
// Sets the number of rows to skip from the end.
// A non-zero value is rejected (std::invalid_argument) when nrows has been set (is not -1) or when
// a byte range offset or size has already been set.
602  void set_skipfooter(size_type skipfooter)
603  {
604  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
605  "Cannot use both `nrows` and `skipfooter`",
606  std::invalid_argument);
607  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
608  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
609  std::invalid_argument);
610  }
611 
612  _skipfooter = skipfooter;
613  }
614 
// Sets the header row index.
620  void set_header(size_type hdr) { _header = hdr; }
621 
// Sets the line terminator.
627  void set_lineterminator(char term) { _lineterminator = term; }
628 
// Sets the field delimiter.
634  void set_delimiter(char delim) { _delimiter = delim; }
635 
// Sets the numeric data thousands separator. No check against the delimiter is performed here.
641  void set_thousands(char val) { _thousands = val; }
642 
// Sets the decimal point character. No check against the delimiter is performed here.
648  void set_decimal(char val) { _decimal = val; }
649 
// Sets the comment line start character.
655  void set_comment(char val) { _comment = val; }
656 
// Sets whether to treat \r\n as line terminator.
662  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
663 
// Sets whether to treat whitespace as field delimiter.
669  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
670 
// Sets whether to skip whitespace after the delimiter.
676  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
677 
// Sets whether to ignore empty lines or parse line values as invalid.
683  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
684 
// Sets the expected quoting style used in the input CSV data.
// Only quote_style::MINIMAL and quote_style::NONE are accepted; any other value fails the
// CUDF_EXPECTS check below.
695  void set_quoting(quote_style quoting)
696  {
697  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
698  "Only MINIMAL and NONE are supported for quoting.");
699  _quoting = quoting;
700  }
701 
// Sets the quoting character.
707  void set_quotechar(char ch) { _quotechar = ch; }
708 
// Sets whether a quote inside a value is double-quoted.
714  void enable_doublequote(bool val) { _doublequote = val; }
715 
// Sets whether to detect quotes surrounded by spaces e.g. ` "data" `.
722  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
723 
// Sets the names of columns to read as datetime (the index-based selection is stored separately).
729  void set_parse_dates(std::vector<std::string> col_names)
730  {
731  _parse_dates_names = std::move(col_names);
732  }
733 
// Sets the indexes of columns to read as datetime (the name-based selection is stored separately).
739  void set_parse_dates(std::vector<int> col_indices)
740  {
741  _parse_dates_indexes = std::move(col_indices);
742  }
743 
// Sets the names of columns to parse as hexadecimal.
749  void set_parse_hex(std::vector<std::string> col_names)
750  {
751  _parse_hex_names = std::move(col_names);
752  }
753 
// Sets the indexes of columns to parse as hexadecimal.
759  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
760 
// Sets per-column types keyed by column name; replaces whatever alternative _dtypes held before.
766  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
767 
// Sets per-column types by position; replaces whatever alternative _dtypes held before.
773  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
774 
// Adds values to recognize as boolean true.
// Despite the `set_` name, the values are appended to the existing list (which starts as
// "True", "TRUE", "true"); nothing is removed, and repeated calls keep accumulating.
780  void set_true_values(std::vector<std::string> vals)
781  {
782  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
783  }
784 
// Adds values to recognize as boolean false.
// Despite the `set_` name, the values are appended to the existing list (which starts as
// "False", "FALSE", "false"); nothing is removed, and repeated calls keep accumulating.
790  void set_false_values(std::vector<std::string> vals)
791  {
792  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
793  }
794 
// Sets additional values to recognize as null values, replacing any previously set ones.
// A non-empty list is rejected via CUDF_FAIL while the null filter is disabled; an empty list is
// always accepted.
800  void set_na_values(std::vector<std::string> vals)
801  {
802  if ((!vals.empty()) and (!_na_filter)) {
803  CUDF_FAIL("Can't set na_values when na_filtering is disabled");
804  }
805 
806  _na_values = std::move(vals);
807  }
808 
// Sets whether to keep the built-in default NA values.
814  void enable_keep_default_na(bool val) { _keep_default_na = val; }
815 
// Enables (true) or disables (false) the null filter.
// Disabling also clears any user-provided NA values, keeping the state consistent with
// set_na_values(), which refuses non-empty lists while the filter is off.
821  void enable_na_filter(bool val)
822  {
823  if (!val) { _na_values.clear(); }
824  _na_filter = val;
825  }
826 
// Sets whether to parse dates as DD/MM versus MM/DD.
832  void enable_dayfirst(bool val) { _dayfirst = val; }
833 
// Sets the timestamp type to which all timestamp columns will be cast.
839  void set_timestamp_type(data_type type) { _timestamp_type = type; }
840 };
841 
847  csv_reader_options options;
848 
849  public:
856 
// Constructor from source info: initializes the wrapped options with the given source.
// NOTE(review): not marked `explicit`, so a source_info converts implicitly to a builder —
// confirm that this is intended before changing it, since callers may rely on it.
862  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
863 
871  {
872  options._compression = comp;
873  return *this;
874  }
875 
883  {
884  options.set_byte_range_offset(offset);
885  return *this;
886  }
887 
895  {
896  options.set_byte_range_size(size);
897  return *this;
898  }
899 
// Sets the names of the columns and returns this builder for chaining.
906  csv_reader_options_builder& names(std::vector<std::string> col_names)
907  {
908  options._names = std::move(col_names);
909  return *this;
910  }
911 
919  {
920  options._prefix = std::move(pfx);
921  return *this;
922  }
923 
931  {
932  options._mangle_dupe_cols = val;
933  return *this;
934  }
935 
// Sets the names of the columns to be read and returns this builder for chaining.
942  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
943  {
944  options._use_cols_names = std::move(col_names);
945  return *this;
946  }
947 
// Sets the indexes of the columns to read and returns this builder for chaining.
954  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
955  {
956  options._use_cols_indexes = std::move(col_indices);
957  return *this;
958  }
959 
967  {
968  options.set_nrows(rows);
969  return *this;
970  }
971 
979  {
980  options.set_skiprows(skip);
981  return *this;
982  }
983 
991  {
992  options.set_skipfooter(skip);
993  return *this;
994  }
995 
1003  {
1004  options._header = hdr;
1005  return *this;
1006  }
1007 
1015  {
1016  options._lineterminator = term;
1017  return *this;
1018  }
1019 
1027  {
1028  options._delimiter = delim;
1029  return *this;
1030  }
1031 
1039  {
1040  options._thousands = val;
1041  return *this;
1042  }
1043 
1051  {
1052  options._decimal = val;
1053  return *this;
1054  }
1055 
1063  {
1064  options._comment = val;
1065  return *this;
1066  }
1067 
1075  {
1076  options._windowslinetermination = val;
1077  return *this;
1078  }
1079 
1087  {
1088  options._delim_whitespace = val;
1089  return *this;
1090  }
1091 
1099  {
1100  options._skipinitialspace = val;
1101  return *this;
1102  }
1103 
1111  {
1112  options._skip_blank_lines = val;
1113  return *this;
1114  }
1115 
1123  {
1124  options._quoting = style;
1125  return *this;
1126  }
1127 
1135  {
1136  options._quotechar = ch;
1137  return *this;
1138  }
1139 
1147  {
1148  options._doublequote = val;
1149  return *this;
1150  }
1151 
1160  {
1161  options._detect_whitespace_around_quotes = val;
1162  return *this;
1163  }
1164 
// Sets the names of columns to read as datetime; returns this builder for chaining.
1171  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1172  {
1173  options._parse_dates_names = std::move(col_names);
1174  return *this;
1175  }
1176 
// Sets the indexes of columns to read as datetime; returns this builder for chaining.
1183  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1184  {
1185  options._parse_dates_indexes = std::move(col_indices);
1186  return *this;
1187  }
1188 
// Sets the names of columns to parse as hexadecimal; returns this builder for chaining.
1195  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1196  {
1197  options._parse_hex_names = std::move(col_names);
1198  return *this;
1199  }
1200 
// Sets the indexes of columns to parse as hexadecimal; returns this builder for chaining.
1207  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1208  {
1209  options._parse_hex_indexes = std::move(col_indices);
1210  return *this;
1211  }
1212 
// Sets per-column types keyed by column name; returns this builder for chaining.
1219  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1220  {
1221  options._dtypes = std::move(types);
1222  return *this;
1223  }
1224 
// Sets per-column types by position; returns this builder for chaining.
1231  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1232  {
1233  options._dtypes = std::move(types);
1234  return *this;
1235  }
1236 
// Adds values to recognize as boolean true; returns this builder for chaining.
// The values are appended to the options' existing list rather than replacing it.
1243  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1244  {
1245  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1246  return *this;
1247  }
1248 
// Adds values to recognize as boolean false; returns this builder for chaining.
// The values are appended to the options' existing list rather than replacing it.
1255  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1256  {
1257  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1258  return *this;
1259  }
1260 
// Sets additional values to recognize as null values; returns this builder for chaining.
// Delegates to csv_reader_options::set_na_values, so that setter's validation applies here too.
1267  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1268  {
1269  options.set_na_values(std::move(vals));
1270  return *this;
1271  }
1272 
1280  {
1281  options.enable_keep_default_na(val);
1282  return *this;
1283  }
1284 
1292  {
1293  options.enable_na_filter(val);
1294  return *this;
1295  }
1296 
1304  {
1305  options._dayfirst = val;
1306  return *this;
1307  }
1308 
1316  {
1317  options._timestamp_type = type;
1318  return *this;
1319  }
1320 
// Conversion to csv_reader_options&&: moves out the options held by this builder.
// The builder's `options` member is left in a moved-from state afterwards.
1324  operator csv_reader_options&&() { return std::move(options); }
1325 
// Moves the csv_reader_options member out once it is built. Returns an rvalue reference to the
// member, so the result must be consumed while the builder is still alive.
1333  csv_reader_options&& build() { return std::move(options); }
1334 };
1335 
1354  csv_reader_options options,
1357  // end of group
1369 
1374  // Specify the sink to use for writer output
1375  sink_info _sink;
1376  // Set of columns to output
1377  table_view _table;
1378  // string to use for null entries
1379  std::string _na_rep = "";
1380  // Indicates whether to write headers to csv
1381  bool _include_header = true;
1382  // maximum number of rows to write in each chunk (limits memory use)
1383  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1384  // string to use for separating lines (default "\n")
1385  std::string _line_terminator = "\n";
1386  // character to use for separating column values (default ",")
1387  char _inter_column_delimiter = ',';
1388  // string to use for values != 0 in INT8 types (default 'true')
1389  std::string _true_value = std::string{"true"};
1390  // string to use for values == 0 in INT8 types (default 'false')
1391  std::string _false_value = std::string{"false"};
1392  // Names of all columns; if empty, writer will generate column names
1393  std::vector<std::string> _names;
1394  // Quote style. Currently only MINIMAL and NONE are supported.
1395  quote_style _quoting = quote_style::MINIMAL;
1396 
// Constructs writer options from a sink and the table to write.
// _rows_per_chunk is initialized to the table's row count here, overriding the in-class default
// of std::numeric_limits<size_type>::max(). Declared ahead of the `public:` label.
1403  explicit csv_writer_options(sink_info sink, table_view const& table)
1404  : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
1405  {
1406  }
1407 
1409 
1410  public:
// Default constructor: all members take their in-class default values.
1416  explicit csv_writer_options() = default;
1417 
1427 
// Returns the sink used for writer output.
1433  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1434 
// Returns the table (set of columns) to output.
1440  [[nodiscard]] table_view const& get_table() const { return _table; }
1441 
// Returns the column names; if empty, the writer will generate column names.
1447  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1448 
// Returns the string used for null entries.
1454  [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
1455 
// Whether to write headers to the CSV.
1461  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1462 
// Returns the maximum number of rows to write in each chunk.
1468  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1469 
// Returns the string used for separating lines.
1475  [[nodiscard]] std::string const& get_line_terminator() const { return _line_terminator; }
1476 
// Returns the character used for separating column values.
1482  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1483 
// Returns the string used for values != 0 in INT8 types.
1489  [[nodiscard]] std::string const& get_true_value() const { return _true_value; }
1490 
// Returns the string used for values == 0 in INT8 types.
1496  [[nodiscard]] std::string const& get_false_value() const { return _false_value; }
1497 
// Returns the quoting style.
1508  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1509 
1510  // Setter
// Sets the column names; if empty, the writer will generate column names.
1516  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1517 
// Sets the string to use for null entries.
1523  void set_na_rep(std::string val) { _na_rep = std::move(val); }
1524 
// Sets whether to write headers to the CSV.
1530  void enable_include_header(bool val) { _include_header = val; }
1531 
// Sets the maximum number of rows to write in each chunk. The value is stored without validation.
1537  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1538 
// Sets the string to use for separating lines.
1544  void set_line_terminator(std::string term) { _line_terminator = std::move(term); }
1545 
// Sets the character to use for separating column values.
1551  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1552 
// Sets the string to use for values != 0 in INT8 types.
1558  void set_true_value(std::string val) { _true_value = std::move(val); }
1559 
// Sets the string to use for values == 0 in INT8 types.
1565  void set_false_value(std::string val) { _false_value = std::move(val); }
1566 
// Sets the table to output. Only _table is updated; _rows_per_chunk is left unchanged.
1572  void set_table(table_view const& table) { _table = table; }
1573 
// Sets the quoting style for the writer.
// Only quote_style::MINIMAL and quote_style::NONE are accepted; any other value fails the
// CUDF_EXPECTS check below.
1584  void set_quoting(quote_style quoting)
1585  {
1586  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1587  "Only MINIMAL and NONE are supported for quoting.");
1588  _quoting = quoting;
1589  }
1590 };
1591 
1596  csv_writer_options options;
1597 
1598  public:
// Default constructor: the wrapped options are default-constructed.
1604  explicit csv_writer_options_builder() = default;
1605 
1613  : options{sink, table}
1614  {
1615  }
1616 
// Sets the column names to write and returns this builder for chaining.
// The vector is taken by value and moved into the options, avoiding a second copy of every name.
1623  csv_writer_options_builder& names(std::vector<std::string> names)
1624  {
1625  options._names = std::move(names);
1626  return *this;
1627  }
1628 
1636  {
1637  options._na_rep = val;
1638  return *this;
1639  };
1640 
1648  {
1649  options._include_header = val;
1650  return *this;
1651  }
1652 
1660  {
1661  options._rows_per_chunk = val;
1662  return *this;
1663  }
1664 
1672  {
1673  options._line_terminator = term;
1674  return *this;
1675  }
1676 
1684  {
1685  options._inter_column_delimiter = delim;
1686  return *this;
1687  }
1688 
1696  {
1697  options._true_value = val;
1698  return *this;
1699  }
1700 
1708  {
1709  options._false_value = val;
1710  return *this;
1711  }
1712 
1722  {
1723  options.set_quoting(quoting);
1724  return *this;
1725  }
1726 
// Conversion to csv_writer_options&&: moves out the options held by this builder.
// The builder's `options` member is left in a moved-from state afterwards.
1730  operator csv_writer_options&&() { return std::move(options); }
1731 
// Moves the csv_writer_options member out once it is built. Returns an rvalue reference to the
// member, so the result must be consumed while the builder is still alive.
1739  csv_writer_options&& build() { return std::move(options); }
1740 };
1741 
1759 void write_csv(csv_writer_options const& options,
1761 
// Functor for use with cudf::type_dispatcher: its templated call operator reports, for the
// dispatched type T, the result of cudf::io::detail::is_convertible_to_string_column<T>().
1763 struct is_supported_csv_write_type_fn {
1764  template <typename T>
1765  constexpr bool operator()() const
1766  {
1767  return cudf::io::detail::is_convertible_to_string_column<T>();
1768  }
1769 };
1771 
// Returns whether `type` is supported for CSV writing, by dispatching the runtime data_type to
// is_supported_csv_write_type_fn.
1778 constexpr bool is_supported_write_csv(data_type type)
1779 {
1780  return cudf::type_dispatcher(type, is_supported_csv_write_type_fn{});
1781 }
1782  // end of group
1784 } // namespace io
1785 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
Builder to build options for read_csv().
Definition: csv.hpp:846
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1231
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1255
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:942
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1146
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1195
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:882
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1086
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:978
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1110
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1333
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1219
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1134
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1267
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1243
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1050
csv_reader_options_builder & na_filter(bool val)
Sets whether the null filter is enabled.
Definition: csv.hpp:1291
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1038
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1207
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:1159
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1074
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1183
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:966
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:906
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1315
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:930
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:990
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:894
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1279
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1122
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:1014
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1026
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:954
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1171
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:862
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1062
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:870
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:1002
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1303
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:918
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1098
Settings to use for read_csv().
Definition: csv.hpp:51
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:714
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:562
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:281
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:344
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:407
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:501
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:365
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:729
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:739
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:379
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:309
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:302
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:722
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:790
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:648
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:243
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote ...
Definition: csv.hpp:387
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:459
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:316
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:250
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:766
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:198
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:427
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:452
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:832
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:800
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:695
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:435
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:839
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:184
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:487
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:473
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:358
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:330
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:627
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:707
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:337
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:683
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:662
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:588
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:494
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:669
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:236
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:773
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:602
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:480
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:212
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:531
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:170
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:814
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:397
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:538
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:191
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:267
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:257
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:552
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:177
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:372
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:780
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:466
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:620
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:323
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:445
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:759
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:641
void enable_na_filter(bool val)
Sets whether the null filter is enabled.
Definition: csv.hpp:821
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:516
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:634
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:545
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:274
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:417
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:676
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:288
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:572
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:655
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:351
size_type get_header() const
Returns header row index.
Definition: csv.hpp:295
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:749
Builder to build options for write_csv().
Definition: csv.hpp:1595
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1739
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1721
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1695
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1647
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1635
csv_writer_options_builder & line_terminator(std::string term)
Sets the string used for separating lines.
Definition: csv.hpp:1671
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1707
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1623
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1683
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1612
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1659
Settings to use for write_csv().
Definition: csv.hpp:1373
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1572
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1537
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1584
void set_line_terminator(std::string term)
Sets the string used for separating lines.
Definition: csv.hpp:1544
csv_writer_options()=default
Default constructor.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1551
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1558
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1440
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1530
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1461
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1523
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1482
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1496
std::string const & get_line_terminator() const
Returns the string used for separating lines.
Definition: csv.hpp:1475
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1433
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1447
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1508
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1489
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1565
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1468
std::string const & get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1454
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1516
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:93
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a CSV dataset into a set of columns.
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:86
compression_type
Compression algorithms.
Definition: io/types.hpp:57
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
constexpr bool is_supported_write_csv(data_type type)
Checks if a cudf::data_type is supported for CSV writing.
Definition: csv.hpp:1778
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
CUDF_HOST_DEVICE constexpr decltype(auto) __forceinline__ type_dispatcher(cudf::data_type dtype, Functor f, Ts &&... args)
Invokes an operator() template with the type instantiation based on the specified cudf::data_type's i...
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:37
Destination information for write interfaces.
Definition: io/types.hpp:523
Source information for read interfaces.
Definition: io/types.hpp:348
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:303
Class definitions for (mutable)_table_view
Type declarations for libcudf.