csv.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
21 #include <cudf/types.hpp>
22 #include <cudf/utilities/error.hpp>
24 
25 #include <memory>
26 #include <string>
27 #include <unordered_map>
28 #include <utility>
29 #include <variant>
30 #include <vector>
31 
32 namespace CUDF_EXPORT cudf {
33 namespace io {
34 
44 class csv_reader_options_builder;
45 
51  source_info _source;
52 
53  // Read settings
54 
55  // Specify the compression format of the source or infer from file extension
56  compression_type _compression = compression_type::AUTO;
57  // Bytes to skip from the source start
58  std::size_t _byte_range_offset = 0;
59  // Bytes to read; always reads complete rows
60  std::size_t _byte_range_size = 0;
61  // Names of all the columns; if empty then names are auto-generated
62  std::vector<std::string> _names;
63  // If there is no header or names, prepend this to the column ID as the name
64  std::string _prefix;
65  // Whether to rename duplicate column names
66  bool _mangle_dupe_cols = true;
67 
68  // Filter settings
69 
70  // Names of columns to read; empty is all columns
71  std::vector<std::string> _use_cols_names;
72  // Indexes of columns to read; empty is all columns
73  std::vector<int> _use_cols_indexes;
74  // Rows to read; -1 is all
75  size_type _nrows = -1;
76  // Rows to skip from the start
77  size_type _skiprows = 0;
78  // Rows to skip from the end
79  size_type _skipfooter = 0;
80  // Header row index
81  size_type _header = 0;
82 
83  // Parsing settings
84 
85  // Line terminator
86  char _lineterminator = '\n';
87  // Field delimiter
88  char _delimiter = ',';
89  // Numeric data thousands separator; cannot match delimiter
90  char _thousands = '\0';
91  // Decimal point character; cannot match delimiter
92  char _decimal = '.';
93  // Comment line start character
94  char _comment = '\0';
95  bool _windowslinetermination = false;
96  // Treat whitespace as field delimiter; overrides character delimiter
97  bool _delim_whitespace = false;
98  // Skip whitespace after the delimiter
99  bool _skipinitialspace = false;
100  // Ignore empty lines or parse line values as invalid
101  bool _skip_blank_lines = true;
102  // Treatment of quoting behavior
103  quote_style _quoting = quote_style::MINIMAL;
104  // Quoting character (applies when `_quoting` is not `quote_style::NONE`)
105  char _quotechar = '"';
106  // Whether a quote inside a value is double-quoted
107  bool _doublequote = true;
108  // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
109  // _doublequote is true
110  bool _detect_whitespace_around_quotes = false;
111  // Names of columns to read as datetime
112  std::vector<std::string> _parse_dates_names;
113  // Indexes of columns to read as datetime
114  std::vector<int> _parse_dates_indexes;
115  // Names of columns to parse as hexadecimal
116  std::vector<std::string> _parse_hex_names;
117  // Indexes of columns to parse as hexadecimal
118  std::vector<int> _parse_hex_indexes;
119 
120  // Conversion settings
121 
122  // Per-column types; disables type inference on those columns
123  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
124  // Additional values to recognize as boolean true values
125  std::vector<std::string> _true_values{"True", "TRUE", "true"};
126  // Additional values to recognize as boolean false values
127  std::vector<std::string> _false_values{"False", "FALSE", "false"};
128  // Additional values to recognize as null values
129  std::vector<std::string> _na_values;
130  // Whether to keep the built-in default NA values
131  bool _keep_default_na = true;
132  // Whether the null filter is enabled; disabling it can improve performance
133  bool _na_filter = true;
134  // Whether to parse dates as DD/MM versus MM/DD
135  bool _dayfirst = false;
136  // Cast timestamp columns to a specific type
137  data_type _timestamp_type{type_id::EMPTY};
138 
144  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
145 
147 
148  public:
154  csv_reader_options() = default;
155 
163 
169  [[nodiscard]] source_info const& get_source() const { return _source; }
170 
176  [[nodiscard]] compression_type get_compression() const { return _compression; }
177 
183  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
184 
190  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
191 
197  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
198  {
199  if (_byte_range_size == 0) {
200  return 0;
201  } else {
202  return _byte_range_size + get_byte_range_padding();
203  }
204  }
205 
211  [[nodiscard]] std::size_t get_byte_range_padding() const
212  {
213  auto const num_names = _names.size();
214  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
215  auto const num_columns = std::max(num_dtypes, num_names);
216 
217  auto const max_row_bytes = 16 * 1024; // 16KB
218  auto const column_bytes = 64;
219  auto const base_padding = 1024; // 1KB
220 
221  if (num_columns == 0) {
222  // Use flat size if the number of columns is not known
223  return max_row_bytes;
224  }
225 
226  // Expand the size based on the number of columns, if available
227  return base_padding + num_columns * column_bytes;
228  }
229 
235  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
236 
242  [[nodiscard]] std::string get_prefix() const { return _prefix; }
243 
249  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
250 
256  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
257  {
258  return _use_cols_names;
259  }
260 
266  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
267 
273  [[nodiscard]] size_type get_nrows() const { return _nrows; }
274 
280  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
281 
287  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
288 
294  [[nodiscard]] size_type get_header() const { return _header; }
295 
301  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
302 
308  [[nodiscard]] char get_delimiter() const { return _delimiter; }
309 
315  [[nodiscard]] char get_thousands() const { return _thousands; }
316 
322  [[nodiscard]] char get_decimal() const { return _decimal; }
323 
329  [[nodiscard]] char get_comment() const { return _comment; }
330 
336  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
337 
343  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
344 
350  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
351 
357  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
358 
364  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
365 
371  [[nodiscard]] char get_quotechar() const { return _quotechar; }
372 
378  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
379 
387  {
388  return _detect_whitespace_around_quotes;
389  }
390 
396  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
397  {
398  return _parse_dates_names;
399  }
400 
406  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
407  {
408  return _parse_dates_indexes;
409  }
410 
416  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
417  {
418  return _parse_hex_names;
419  }
420 
426  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
427 
433  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
434  get_dtypes() const
435  {
436  return _dtypes;
437  }
438 
444  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
445 
451  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
452 
458  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
459 
465  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
466 
472  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
473 
479  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
480 
486  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
487 
493  void set_compression(compression_type comp) { _compression = comp; }
494 
500  void set_byte_range_offset(std::size_t offset)
501  {
502  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
503  CUDF_FAIL(
504  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
505  "value");
506  }
507  _byte_range_offset = offset;
508  }
509 
515  void set_byte_range_size(std::size_t size)
516  {
517  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
518  CUDF_FAIL(
519  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
520  "non-zero.");
521  }
522  _byte_range_size = size;
523  }
524 
530  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
531 
537  void set_prefix(std::string pfx) { _prefix = pfx; }
538 
544  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
545 
551  void set_use_cols_names(std::vector<std::string> col_names)
552  {
553  _use_cols_names = std::move(col_names);
554  }
555 
561  void set_use_cols_indexes(std::vector<int> col_indices)
562  {
563  _use_cols_indexes = std::move(col_indices);
564  }
565 
571  void set_nrows(size_type nrows)
572  {
573  CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
574  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
575  CUDF_FAIL(
576  "nrows can't be a non negative value if range offset and/or range size has been set");
577  }
578 
579  _nrows = nrows;
580  }
581 
587  void set_skiprows(size_type skiprows)
588  {
589  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
590  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
591  std::invalid_argument);
592  }
593  _skiprows = skiprows;
594  }
595 
601  void set_skipfooter(size_type skipfooter)
602  {
603  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
604  "Cannot use both `nrows` and `skipfooter`",
605  std::invalid_argument);
606  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
607  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
608  std::invalid_argument);
609  }
610 
611  _skipfooter = skipfooter;
612  }
613 
619  void set_header(size_type hdr) { _header = hdr; }
620 
626  void set_lineterminator(char term) { _lineterminator = term; }
627 
633  void set_delimiter(char delim) { _delimiter = delim; }
634 
640  void set_thousands(char val) { _thousands = val; }
641 
647  void set_decimal(char val) { _decimal = val; }
648 
654  void set_comment(char val) { _comment = val; }
655 
661  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
662 
668  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
669 
675  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
676 
682  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
683 
694  void set_quoting(quote_style quoting)
695  {
696  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
697  "Only MINIMAL and NONE are supported for quoting.");
698  _quoting = quoting;
699  }
700 
706  void set_quotechar(char ch) { _quotechar = ch; }
707 
713  void enable_doublequote(bool val) { _doublequote = val; }
714 
721  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
722 
728  void set_parse_dates(std::vector<std::string> col_names)
729  {
730  _parse_dates_names = std::move(col_names);
731  }
732 
738  void set_parse_dates(std::vector<int> col_indices)
739  {
740  _parse_dates_indexes = std::move(col_indices);
741  }
742 
748  void set_parse_hex(std::vector<std::string> col_names)
749  {
750  _parse_hex_names = std::move(col_names);
751  }
752 
758  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
759 
765  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
766 
772  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
773 
779  void set_true_values(std::vector<std::string> vals)
780  {
781  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
782  }
783 
789  void set_false_values(std::vector<std::string> vals)
790  {
791  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
792  }
793 
799  void set_na_values(std::vector<std::string> vals)
800  {
801  if ((!vals.empty()) and (!_na_filter)) {
802  CUDF_FAIL("Can't set na_values when na_filtering is disabled");
803  }
804 
805  _na_values = std::move(vals);
806  }
807 
813  void enable_keep_default_na(bool val) { _keep_default_na = val; }
814 
820  void enable_na_filter(bool val)
821  {
822  if (!val) { _na_values.clear(); }
823  _na_filter = val;
824  }
825 
831  void enable_dayfirst(bool val) { _dayfirst = val; }
832 
838  void set_timestamp_type(data_type type) { _timestamp_type = type; }
839 };
840 
846  csv_reader_options options;
847 
848  public:
855 
861  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
862 
870  {
871  options._compression = comp;
872  return *this;
873  }
874 
882  {
883  options.set_byte_range_offset(offset);
884  return *this;
885  }
886 
894  {
895  options.set_byte_range_size(size);
896  return *this;
897  }
898 
905  csv_reader_options_builder& names(std::vector<std::string> col_names)
906  {
907  options._names = std::move(col_names);
908  return *this;
909  }
910 
918  {
919  options._prefix = pfx;
920  return *this;
921  }
922 
930  {
931  options._mangle_dupe_cols = val;
932  return *this;
933  }
934 
941  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
942  {
943  options._use_cols_names = std::move(col_names);
944  return *this;
945  }
946 
953  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
954  {
955  options._use_cols_indexes = std::move(col_indices);
956  return *this;
957  }
958 
966  {
967  options.set_nrows(rows);
968  return *this;
969  }
970 
978  {
979  options.set_skiprows(skip);
980  return *this;
981  }
982 
990  {
991  options.set_skipfooter(skip);
992  return *this;
993  }
994 
1002  {
1003  options._header = hdr;
1004  return *this;
1005  }
1006 
1014  {
1015  options._lineterminator = term;
1016  return *this;
1017  }
1018 
1026  {
1027  options._delimiter = delim;
1028  return *this;
1029  }
1030 
1038  {
1039  options._thousands = val;
1040  return *this;
1041  }
1042 
1050  {
1051  options._decimal = val;
1052  return *this;
1053  }
1054 
1062  {
1063  options._comment = val;
1064  return *this;
1065  }
1066 
1074  {
1075  options._windowslinetermination = val;
1076  return *this;
1077  }
1078 
1086  {
1087  options._delim_whitespace = val;
1088  return *this;
1089  }
1090 
1098  {
1099  options._skipinitialspace = val;
1100  return *this;
1101  }
1102 
1110  {
1111  options._skip_blank_lines = val;
1112  return *this;
1113  }
1114 
1122  {
1123  options._quoting = style;
1124  return *this;
1125  }
1126 
1134  {
1135  options._quotechar = ch;
1136  return *this;
1137  }
1138 
1146  {
1147  options._doublequote = val;
1148  return *this;
1149  }
1150 
1159  {
1160  options._detect_whitespace_around_quotes = val;
1161  return *this;
1162  }
1163 
1170  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1171  {
1172  options._parse_dates_names = std::move(col_names);
1173  return *this;
1174  }
1175 
1182  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1183  {
1184  options._parse_dates_indexes = std::move(col_indices);
1185  return *this;
1186  }
1187 
1194  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1195  {
1196  options._parse_hex_names = std::move(col_names);
1197  return *this;
1198  }
1199 
1206  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1207  {
1208  options._parse_hex_indexes = std::move(col_indices);
1209  return *this;
1210  }
1211 
1218  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1219  {
1220  options._dtypes = std::move(types);
1221  return *this;
1222  }
1223 
1230  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1231  {
1232  options._dtypes = std::move(types);
1233  return *this;
1234  }
1235 
1242  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1243  {
1244  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1245  return *this;
1246  }
1247 
1254  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1255  {
1256  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1257  return *this;
1258  }
1259 
1266  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1267  {
1268  options.set_na_values(std::move(vals));
1269  return *this;
1270  }
1271 
1279  {
1280  options.enable_keep_default_na(val);
1281  return *this;
1282  }
1283 
1291  {
1292  options.enable_na_filter(val);
1293  return *this;
1294  }
1295 
1303  {
1304  options._dayfirst = val;
1305  return *this;
1306  }
1307 
1315  {
1316  options._timestamp_type = type;
1317  return *this;
1318  }
1319 
1323  operator csv_reader_options&&() { return std::move(options); }
1324 
1332  csv_reader_options&& build() { return std::move(options); }
1333 };
1334 
1353  csv_reader_options options,
1356  // end of group
1368 
1373  // Specify the sink to use for writer output
1374  sink_info _sink;
1375  // Set of columns to output
1376  table_view _table;
1377  // string to use for null entries
1378  std::string _na_rep = "";
1379  // Indicates whether to write headers to csv
1380  bool _include_header = true;
1381  // maximum number of rows to write in each chunk (limits memory use)
1382  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1383  // string to use for separating lines (default "\n")
1384  std::string _line_terminator = "\n";
1385  // character to use for separating column values (default ",")
1386  char _inter_column_delimiter = ',';
1387  // string to use for values != 0 in INT8 types (default 'true')
1388  std::string _true_value = std::string{"true"};
1389  // string to use for values == 0 in INT8 types (default 'false')
1390  std::string _false_value = std::string{"false"};
1391  // Names of all columns; if empty, writer will generate column names
1392  std::vector<std::string> _names;
1393  // Quote style. Currently only MINIMAL and NONE are supported.
1394  quote_style _quoting = quote_style::MINIMAL;
1395 
1402  explicit csv_writer_options(sink_info sink, table_view const& table)
1403  : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
1404  {
1405  }
1406 
1408 
1409  public:
1415  explicit csv_writer_options() = default;
1416 
1426 
1432  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1433 
1439  [[nodiscard]] table_view const& get_table() const { return _table; }
1440 
1446  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1447 
1453  [[nodiscard]] std::string get_na_rep() const { return _na_rep; }
1454 
1460  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1461 
1467  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1468 
1474  [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; }
1475 
1481  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1482 
1488  [[nodiscard]] std::string get_true_value() const { return _true_value; }
1489 
1495  [[nodiscard]] std::string get_false_value() const { return _false_value; }
1496 
1507  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1508 
1509  // Setter
1515  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1516 
1522  void set_na_rep(std::string val) { _na_rep = val; }
1523 
1529  void enable_include_header(bool val) { _include_header = val; }
1530 
1536  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1537 
1543  void set_line_terminator(std::string term) { _line_terminator = term; }
1544 
1550  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1551 
1557  void set_true_value(std::string val) { _true_value = val; }
1558 
1564  void set_false_value(std::string val) { _false_value = val; }
1565 
1571  void set_table(table_view const& table) { _table = table; }
1572 
1583  void set_quoting(quote_style quoting)
1584  {
1585  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1586  "Only MINIMAL and NONE are supported for quoting.");
1587  _quoting = quoting;
1588  }
1589 };
1590 
1595  csv_writer_options options;
1596 
1597  public:
1603  explicit csv_writer_options_builder() = default;
1604 
1612  : options{sink, table}
1613  {
1614  }
1615 
1622  csv_writer_options_builder& names(std::vector<std::string> names)
1623  {
1624  options._names = names;
1625  return *this;
1626  }
1627 
1635  {
1636  options._na_rep = val;
1637  return *this;
1638  };
1639 
1647  {
1648  options._include_header = val;
1649  return *this;
1650  }
1651 
1659  {
1660  options._rows_per_chunk = val;
1661  return *this;
1662  }
1663 
1671  {
1672  options._line_terminator = term;
1673  return *this;
1674  }
1675 
1683  {
1684  options._inter_column_delimiter = delim;
1685  return *this;
1686  }
1687 
1695  {
1696  options._true_value = val;
1697  return *this;
1698  }
1699 
1707  {
1708  options._false_value = val;
1709  return *this;
1710  }
1711 
1721  {
1722  options.set_quoting(quoting);
1723  return *this;
1724  }
1725 
1729  operator csv_writer_options&&() { return std::move(options); }
1730 
1738  csv_writer_options&& build() { return std::move(options); }
1739 };
1740 
1758 void write_csv(csv_writer_options const& options,
1760  // end of group
1762 } // namespace io
1763 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
Builder to build options for read_csv().
Definition: csv.hpp:845
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1230
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1254
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:941
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1145
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1194
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:881
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1085
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:977
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1109
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1332
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1218
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1133
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1266
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1242
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1049
csv_reader_options_builder & na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:1290
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1037
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1206
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:1158
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1073
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1182
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:965
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:905
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1314
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:929
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:989
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:893
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1278
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1121
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:1013
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1025
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:953
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1170
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:861
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1061
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:869
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:1001
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1302
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:917
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1097
Settings to use for read_csv().
Definition: csv.hpp:50
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:713
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:561
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:280
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:343
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:406
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:500
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:364
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:728
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:738
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:378
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:308
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:301
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:721
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:789
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:647
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:242
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote ...
Definition: csv.hpp:386
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:458
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:315
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:249
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:765
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:197
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:426
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:451
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:831
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:799
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:694
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:434
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:838
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:183
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:486
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:472
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:357
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:329
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:626
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:706
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:336
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:682
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:661
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:587
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:493
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:668
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:235
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:772
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:601
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:479
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:211
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:530
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:169
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:813
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:396
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:537
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:190
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:266
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:256
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:551
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:176
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:371
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:779
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:465
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:619
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:322
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:444
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:758
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:640
void enable_na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:820
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:515
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:633
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:544
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:273
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:416
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:675
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:287
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:571
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:654
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:350
size_type get_header() const
Returns header row index.
Definition: csv.hpp:294
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:748
Builder to build options for write_csv().
Definition: csv.hpp:1594
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1738
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1720
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1694
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1646
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1634
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1670
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1706
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1622
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1682
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1611
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1658
Settings to use for write_csv().
Definition: csv.hpp:1372
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1571
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1536
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1495
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1583
std::string get_line_terminator() const
Returns character used for separating lines.
Definition: csv.hpp:1474
void set_line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1543
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1488
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1550
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1557
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1439
std::string get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1453
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1529
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1460
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1522
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1481
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1432
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1446
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1507
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1564
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1467
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1515
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:40
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:93
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a CSV dataset into a set of columns.
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:86
compression_type
Compression algorithms.
Definition: io/types.hpp:57
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
Destination information for write interfaces.
Definition: io/types.hpp:512
Source information for read interfaces.
Definition: io/types.hpp:337
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.