csv.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/detail/utils.hpp>
9 #include <cudf/io/types.hpp>
11 #include <cudf/types.hpp>
12 #include <cudf/utilities/error.hpp>
14 
15 #include <memory>
16 #include <string>
17 #include <unordered_map>
18 #include <utility>
19 #include <variant>
20 #include <vector>
21 
22 namespace CUDF_EXPORT cudf {
23 namespace io {
24 
34 class csv_reader_options_builder;
35 
41  source_info _source;
42 
43  // Read settings
44 
45  // Specify the compression format of the source or infer from file extension
46  compression_type _compression = compression_type::AUTO;
47  // Bytes to skip from the source start
48  std::size_t _byte_range_offset = 0;
49  // Bytes to read; always reads complete rows
50  std::size_t _byte_range_size = 0;
51  // Names of all the columns; if empty then names are auto-generated
52  std::vector<std::string> _names;
53  // If there is no header or names, prepend this to the column ID as the name
54  std::string _prefix;
55  // Whether to rename duplicate column names
56  bool _mangle_dupe_cols = true;
57 
58  // Filter settings
59 
60  // Names of columns to read; empty is all columns
61  std::vector<std::string> _use_cols_names;
62  // Indexes of columns to read; empty is all columns
63  std::vector<int> _use_cols_indexes;
64  // Rows to read; -1 is all
65  size_type _nrows = -1;
66  // Rows to skip from the start
67  size_type _skiprows = 0;
68  // Rows to skip from the end
69  size_type _skipfooter = 0;
70  // Header row index
71  size_type _header = 0;
72 
73  // Parsing settings
74 
75  // Line terminator
76  char _lineterminator = '\n';
77  // Field delimiter
78  char _delimiter = ',';
79  // Numeric data thousands separator; cannot match delimiter
80  char _thousands = '\0';
81  // Decimal point character; cannot match delimiter
82  char _decimal = '.';
83  // Comment line start character
84  char _comment = '\0';
85  bool _windowslinetermination = false;
86  // Treat whitespace as field delimiter; overrides character delimiter
87  bool _delim_whitespace = false;
88  // Skip whitespace after the delimiter
89  bool _skipinitialspace = false;
90  // Ignore empty lines or parse line values as invalid
91  bool _skip_blank_lines = true;
92  // Treatment of quoting behavior
93  quote_style _quoting = quote_style::MINIMAL;
94  // Quoting character (if `quoting` is true)
95  char _quotechar = '"';
96  // Whether a quote inside a value is double-quoted
97  bool _doublequote = true;
98  // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
99  // _doublequote is true
100  bool _detect_whitespace_around_quotes = false;
101  // Names of columns to read as datetime
102  std::vector<std::string> _parse_dates_names;
103  // Indexes of columns to read as datetime
104  std::vector<int> _parse_dates_indexes;
105  // Names of columns to parse as hexadecimal
106  std::vector<std::string> _parse_hex_names;
107  // Indexes of columns to parse as hexadecimal
108  std::vector<int> _parse_hex_indexes;
109 
110  // Conversion settings
111 
112  // Per-column types; disables type inference on those columns
113  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
114  // Additional values to recognize as boolean true values
115  std::vector<std::string> _true_values{"True", "TRUE", "true"};
116  // Additional values to recognize as boolean false values
117  std::vector<std::string> _false_values{"False", "FALSE", "false"};
118  // Additional values to recognize as null values
119  std::vector<std::string> _na_values;
120  // Whether to keep the built-in default NA values
121  bool _keep_default_na = true;
122  // Whether the null filter is enabled; disabling it can improve performance
123  bool _na_filter = true;
124  // Whether to parse dates as DD/MM versus MM/DD
125  bool _dayfirst = false;
126  // Cast timestamp columns to a specific type
127  data_type _timestamp_type{type_id::EMPTY};
128 
134  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
135 
137 
138  public:
144  csv_reader_options() = default;
145 
153 
159  [[nodiscard]] source_info const& get_source() const { return _source; }
160 
166  [[nodiscard]] compression_type get_compression() const { return _compression; }
167 
173  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
174 
180  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
181 
187  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
188  {
189  if (_byte_range_size == 0) {
190  return 0;
191  } else {
192  return _byte_range_size + get_byte_range_padding();
193  }
194  }
195 
201  [[nodiscard]] std::size_t get_byte_range_padding() const
202  {
203  auto const num_names = _names.size();
204  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
205  auto const num_columns = std::max(num_dtypes, num_names);
206 
207  auto const max_row_bytes = 16 * 1024; // 16KB
208  auto const column_bytes = 64;
209  auto const base_padding = 1024; // 1KB
210 
211  if (num_columns == 0) {
212  // Use flat size if the number of columns is not known
213  return max_row_bytes;
214  }
215 
216  // Expand the size based on the number of columns, if available
217  return base_padding + num_columns * column_bytes;
218  }
219 
225  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
226 
232  [[nodiscard]] std::string get_prefix() const { return _prefix; }
233 
239  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
240 
246  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
247  {
248  return _use_cols_names;
249  }
250 
256  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
257 
263  [[nodiscard]] size_type get_nrows() const { return _nrows; }
264 
270  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
271 
277  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
278 
284  [[nodiscard]] size_type get_header() const { return _header; }
285 
291  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
292 
298  [[nodiscard]] char get_delimiter() const { return _delimiter; }
299 
305  [[nodiscard]] char get_thousands() const { return _thousands; }
306 
312  [[nodiscard]] char get_decimal() const { return _decimal; }
313 
319  [[nodiscard]] char get_comment() const { return _comment; }
320 
326  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
327 
333  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
334 
340  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
341 
347  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
348 
354  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
355 
361  [[nodiscard]] char get_quotechar() const { return _quotechar; }
362 
368  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
369 
377  {
378  return _detect_whitespace_around_quotes;
379  }
380 
386  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
387  {
388  return _parse_dates_names;
389  }
390 
396  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
397  {
398  return _parse_dates_indexes;
399  }
400 
406  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
407  {
408  return _parse_hex_names;
409  }
410 
416  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
417 
423  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
424  get_dtypes() const
425  {
426  return _dtypes;
427  }
428 
434  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
435 
441  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
442 
448  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
449 
455  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
456 
462  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
463 
469  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
470 
476  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
477 
483  void set_source(source_info src) { _source = std::move(src); }
484 
490  void set_compression(compression_type comp) { _compression = comp; }
491 
497  void set_byte_range_offset(std::size_t offset)
498  {
499  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
500  CUDF_FAIL(
501  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
502  "value");
503  }
504  _byte_range_offset = offset;
505  }
506 
512  void set_byte_range_size(std::size_t size)
513  {
514  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
515  CUDF_FAIL(
516  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
517  "non-zero.");
518  }
519  _byte_range_size = size;
520  }
521 
527  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
528 
534  void set_prefix(std::string pfx) { _prefix = pfx; }
535 
541  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
542 
548  void set_use_cols_names(std::vector<std::string> col_names)
549  {
550  _use_cols_names = std::move(col_names);
551  }
552 
558  void set_use_cols_indexes(std::vector<int> col_indices)
559  {
560  _use_cols_indexes = std::move(col_indices);
561  }
562 
568  void set_nrows(size_type nrows)
569  {
570  CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
571  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
572  CUDF_FAIL(
573  "nrows can't be a non negative value if range offset and/or range size has been set");
574  }
575 
576  _nrows = nrows;
577  }
578 
584  void set_skiprows(size_type skiprows)
585  {
586  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
587  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
588  std::invalid_argument);
589  }
590  _skiprows = skiprows;
591  }
592 
598  void set_skipfooter(size_type skipfooter)
599  {
600  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
601  "Cannot use both `nrows` and `skipfooter`",
602  std::invalid_argument);
603  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
604  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
605  std::invalid_argument);
606  }
607 
608  _skipfooter = skipfooter;
609  }
610 
616  void set_header(size_type hdr) { _header = hdr; }
617 
623  void set_lineterminator(char term) { _lineterminator = term; }
624 
630  void set_delimiter(char delim) { _delimiter = delim; }
631 
637  void set_thousands(char val) { _thousands = val; }
638 
644  void set_decimal(char val) { _decimal = val; }
645 
651  void set_comment(char val) { _comment = val; }
652 
658  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
659 
665  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
666 
672  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
673 
679  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
680 
691  void set_quoting(quote_style quoting)
692  {
693  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
694  "Only MINIMAL and NONE are supported for quoting.");
695  _quoting = quoting;
696  }
697 
703  void set_quotechar(char ch) { _quotechar = ch; }
704 
710  void enable_doublequote(bool val) { _doublequote = val; }
711 
718  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
719 
725  void set_parse_dates(std::vector<std::string> col_names)
726  {
727  _parse_dates_names = std::move(col_names);
728  }
729 
735  void set_parse_dates(std::vector<int> col_indices)
736  {
737  _parse_dates_indexes = std::move(col_indices);
738  }
739 
745  void set_parse_hex(std::vector<std::string> col_names)
746  {
747  _parse_hex_names = std::move(col_names);
748  }
749 
755  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
756 
762  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
763 
769  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
770 
776  void set_true_values(std::vector<std::string> vals)
777  {
778  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
779  }
780 
786  void set_false_values(std::vector<std::string> vals)
787  {
788  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
789  }
790 
796  void set_na_values(std::vector<std::string> vals)
797  {
798  CUDF_EXPECTS(vals.empty() or _na_filter, "Can't set na_values when na_filtering is disabled");
799 
800  _na_values = std::move(vals);
801  }
802 
808  void enable_keep_default_na(bool val) { _keep_default_na = val; }
809 
815  void enable_na_filter(bool val)
816  {
817  if (!val) { _na_values.clear(); }
818  _na_filter = val;
819  }
820 
826  void enable_dayfirst(bool val) { _dayfirst = val; }
827 
833  void set_timestamp_type(data_type type) { _timestamp_type = type; }
834 };
835 
841  csv_reader_options options;
842 
843  public:
850 
856  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
857 
865  {
866  options._compression = comp;
867  return *this;
868  }
869 
877  {
878  options.set_byte_range_offset(offset);
879  return *this;
880  }
881 
889  {
890  options.set_byte_range_size(size);
891  return *this;
892  }
893 
900  csv_reader_options_builder& names(std::vector<std::string> col_names)
901  {
902  options._names = std::move(col_names);
903  return *this;
904  }
905 
913  {
914  options._prefix = std::move(pfx);
915  return *this;
916  }
917 
925  {
926  options._mangle_dupe_cols = val;
927  return *this;
928  }
929 
936  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
937  {
938  options._use_cols_names = std::move(col_names);
939  return *this;
940  }
941 
948  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
949  {
950  options._use_cols_indexes = std::move(col_indices);
951  return *this;
952  }
953 
961  {
962  options.set_nrows(rows);
963  return *this;
964  }
965 
973  {
974  options.set_skiprows(skip);
975  return *this;
976  }
977 
985  {
986  options.set_skipfooter(skip);
987  return *this;
988  }
989 
997  {
998  options._header = hdr;
999  return *this;
1000  }
1001 
1009  {
1010  options._lineterminator = term;
1011  return *this;
1012  }
1013 
1021  {
1022  options._delimiter = delim;
1023  return *this;
1024  }
1025 
1033  {
1034  options._thousands = val;
1035  return *this;
1036  }
1037 
1045  {
1046  options._decimal = val;
1047  return *this;
1048  }
1049 
1057  {
1058  options._comment = val;
1059  return *this;
1060  }
1061 
1069  {
1070  options._windowslinetermination = val;
1071  return *this;
1072  }
1073 
1081  {
1082  options._delim_whitespace = val;
1083  return *this;
1084  }
1085 
1093  {
1094  options._skipinitialspace = val;
1095  return *this;
1096  }
1097 
1105  {
1106  options._skip_blank_lines = val;
1107  return *this;
1108  }
1109 
1117  {
1118  options._quoting = style;
1119  return *this;
1120  }
1121 
1129  {
1130  options._quotechar = ch;
1131  return *this;
1132  }
1133 
1141  {
1142  options._doublequote = val;
1143  return *this;
1144  }
1145 
1154  {
1155  options._detect_whitespace_around_quotes = val;
1156  return *this;
1157  }
1158 
1165  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1166  {
1167  options._parse_dates_names = std::move(col_names);
1168  return *this;
1169  }
1170 
1177  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1178  {
1179  options._parse_dates_indexes = std::move(col_indices);
1180  return *this;
1181  }
1182 
1189  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1190  {
1191  options._parse_hex_names = std::move(col_names);
1192  return *this;
1193  }
1194 
1201  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1202  {
1203  options._parse_hex_indexes = std::move(col_indices);
1204  return *this;
1205  }
1206 
1213  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1214  {
1215  options._dtypes = std::move(types);
1216  return *this;
1217  }
1218 
1225  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1226  {
1227  options._dtypes = std::move(types);
1228  return *this;
1229  }
1230 
1237  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1238  {
1239  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1240  return *this;
1241  }
1242 
1249  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1250  {
1251  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1252  return *this;
1253  }
1254 
1261  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1262  {
1263  options.set_na_values(std::move(vals));
1264  return *this;
1265  }
1266 
1274  {
1275  options.enable_keep_default_na(val);
1276  return *this;
1277  }
1278 
1286  {
1287  options.enable_na_filter(val);
1288  return *this;
1289  }
1290 
1298  {
1299  options._dayfirst = val;
1300  return *this;
1301  }
1302 
1310  {
1311  options._timestamp_type = type;
1312  return *this;
1313  }
1314 
1318  operator csv_reader_options&&() { return std::move(options); }
1319 
1327  csv_reader_options&& build() { return std::move(options); }
1328 };
1329 
1348  csv_reader_options options,
1351  // end of group
1363 
1368  // Specify the sink to use for writer output
1369  sink_info _sink;
1370  // Set of columns to output
1371  table_view _table;
1372  // string to use for null entries
1373  std::string _na_rep = "";
1374  // Indicates whether to write headers to csv
1375  bool _include_header = true;
1376  // maximum number of rows to write in each chunk (limits memory use)
1377  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1378  // character to use for separating lines (default "\n")
1379  std::string _line_terminator = "\n";
1380  // character to use for separating column values (default ",")
1381  char _inter_column_delimiter = ',';
1382  // string to use for values != 0 in INT8 types (default 'true')
1383  std::string _true_value = std::string{"true"};
1384  // string to use for values == 0 in INT8 types (default 'false')
1385  std::string _false_value = std::string{"false"};
1386  // Names of all columns; if empty, writer will generate column names
1387  std::vector<std::string> _names;
1388  // Quote style. Currently only MINIMAL and NONE are supported.
1389  quote_style _quoting = quote_style::MINIMAL;
1390 
1397  explicit csv_writer_options(sink_info sink, table_view const& table)
1398  : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
1399  {
1400  }
1401 
1403 
1404  public:
1410  explicit csv_writer_options() = default;
1411 
1421 
1427  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1428 
1434  [[nodiscard]] table_view const& get_table() const { return _table; }
1435 
1441  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1442 
1448  [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
1449 
1455  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1456 
1462  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1463 
1469  [[nodiscard]] std::string const& get_line_terminator() const { return _line_terminator; }
1470 
1476  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1477 
1483  [[nodiscard]] std::string const& get_true_value() const { return _true_value; }
1484 
1490  [[nodiscard]] std::string const& get_false_value() const { return _false_value; }
1491 
1502  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1503 
1504  // Setter
1510  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1511 
1517  void set_na_rep(std::string val) { _na_rep = std::move(val); }
1518 
1524  void enable_include_header(bool val) { _include_header = val; }
1525 
1531  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1532 
1538  void set_line_terminator(std::string term) { _line_terminator = std::move(term); }
1539 
1545  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1546 
1552  void set_true_value(std::string val) { _true_value = std::move(val); }
1553 
1559  void set_false_value(std::string val) { _false_value = std::move(val); }
1560 
1566  void set_table(table_view const& table) { _table = table; }
1567 
1578  void set_quoting(quote_style quoting)
1579  {
1580  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1581  "Only MINIMAL and NONE are supported for quoting.");
1582  _quoting = quoting;
1583  }
1584 };
1585 
1590  csv_writer_options options;
1591 
1592  public:
1598  explicit csv_writer_options_builder() = default;
1599 
1607  : options{sink, table}
1608  {
1609  }
1610 
1617  csv_writer_options_builder& names(std::vector<std::string> names)
1618  {
1619  options._names = names;
1620  return *this;
1621  }
1622 
1630  {
1631  options._na_rep = val;
1632  return *this;
1633  };
1634 
1642  {
1643  options._include_header = val;
1644  return *this;
1645  }
1646 
1654  {
1655  options._rows_per_chunk = val;
1656  return *this;
1657  }
1658 
1666  {
1667  options._line_terminator = term;
1668  return *this;
1669  }
1670 
1678  {
1679  options._inter_column_delimiter = delim;
1680  return *this;
1681  }
1682 
1690  {
1691  options._true_value = val;
1692  return *this;
1693  }
1694 
1702  {
1703  options._false_value = val;
1704  return *this;
1705  }
1706 
1716  {
1717  options.set_quoting(quoting);
1718  return *this;
1719  }
1720 
1724  operator csv_writer_options&&() { return std::move(options); }
1725 
1733  csv_writer_options&& build() { return std::move(options); }
1734 };
1735 
1753 void write_csv(csv_writer_options const& options,
1755 
1757 struct is_supported_csv_write_type_fn {
1758  template <typename T>
1759  constexpr bool operator()() const
1760  {
1761  return cudf::io::detail::is_convertible_to_string_column<T>();
1762  }
1763 };
1765 
1772 constexpr bool is_supported_write_csv(data_type type)
1773 {
1774  return cudf::type_dispatcher(type, is_supported_csv_write_type_fn{});
1775 }
1776  // end of group
1778 } // namespace io
1779 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:277
Builder to build options for read_csv().
Definition: csv.hpp:840
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1225
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1249
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:936
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1140
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1189
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:876
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1080
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:972
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1104
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1327
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1213
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1128
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1261
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1237
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1044
csv_reader_options_builder & na_filter(bool val)
Sets whether the null filter is enabled (passing false disables it).
Definition: csv.hpp:1285
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1032
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1201
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces, e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:1153
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1068
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1177
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:960
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:900
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1309
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:924
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:984
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:888
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1273
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1116
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:1008
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1020
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:948
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1165
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:856
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1056
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:864
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:996
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1297
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:912
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1092
Settings to use for read_csv().
Definition: csv.hpp:40
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:710
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:558
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:270
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:333
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:396
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:497
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:354
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:725
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:735
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:368
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:298
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:291
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces, e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:718
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:786
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:644
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:232
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces, e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:376
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:448
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:305
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:239
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:762
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:187
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:416
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:441
void set_source(source_info src)
Sets source info.
Definition: csv.hpp:483
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:826
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:796
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:691
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:424
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:833
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:173
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:476
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:462
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:347
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:319
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:623
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:703
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:326
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:679
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:658
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:584
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:490
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:665
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:225
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:769
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:598
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:469
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:201
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:527
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:159
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:808
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:386
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:534
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:180
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:256
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:246
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:548
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:166
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:361
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:776
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:455
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:616
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:312
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:434
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:755
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:637
void enable_na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:815
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:512
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:630
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:541
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:263
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:406
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:672
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:277
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:568
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:651
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:340
size_type get_header() const
Returns header row index.
Definition: csv.hpp:284
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:745
Builder to build options for write_csv()
Definition: csv.hpp:1589
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1733
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1715
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1689
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1641
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1629
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1665
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1701
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1617
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1677
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1606
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1653
Settings to use for write_csv().
Definition: csv.hpp:1367
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1566
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1531
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1578
void set_line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1538
csv_writer_options()=default
Default constructor.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1545
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1552
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1434
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1524
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1455
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1517
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1476
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1490
std::string const & get_line_terminator() const
Returns character used for separating lines.
Definition: csv.hpp:1469
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1427
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1441
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1502
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1483
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1559
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1462
std::string const & get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1448
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1510
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:82
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a CSV dataset into a set of columns.
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:75
compression_type
Compression algorithms.
Definition: io/types.hpp:46
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
constexpr bool is_supported_write_csv(data_type type)
Checks if a cudf::data_type is supported for CSV writing.
Definition: csv.hpp:1772
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
CUDF_HOST_DEVICE constexpr decltype(auto) __forceinline__ type_dispatcher(cudf::data_type dtype, Functor f, Ts &&... args)
Invokes an operator() template with the type instantiation based on the specified cudf::data_type's i...
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:143
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:182
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Destination information for write interfaces.
Definition: io/types.hpp:471
Source information for read interfaces.
Definition: io/types.hpp:316
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.