csv.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
21 #include <cudf/types.hpp>
22 #include <cudf/utilities/error.hpp>
23 
25 
#include <algorithm>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>
31 
32 namespace cudf {
33 namespace io {
34 
44 class csv_reader_options_builder;
45 
51  source_info _source;
52 
53  // Read settings
54 
55  // Specify the compression format of the source or infer from file extension
57  // Bytes to skip from the source start
58  std::size_t _byte_range_offset = 0;
59  // Bytes to read; always reads complete rows
60  std::size_t _byte_range_size = 0;
61  // Names of all the columns; if empty then names are auto-generated
62  std::vector<std::string> _names;
63  // If there is no header or names, prepend this to the column ID as the name
64  std::string _prefix;
65  // Whether to rename duplicate column names
66  bool _mangle_dupe_cols = true;
67 
68  // Filter settings
69 
70  // Names of columns to read; empty is all columns
71  std::vector<std::string> _use_cols_names;
72  // Indexes of columns to read; empty is all columns
73  std::vector<int> _use_cols_indexes;
74  // Rows to read; -1 is all
75  size_type _nrows = -1;
76  // Rows to skip from the start
77  size_type _skiprows = 0;
78  // Rows to skip from the end
79  size_type _skipfooter = 0;
80  // Header row index
81  size_type _header = 0;
82 
83  // Parsing settings
84 
85  // Line terminator
86  char _lineterminator = '\n';
87  // Field delimiter
88  char _delimiter = ',';
89  // Numeric data thousands separator; cannot match delimiter
90  char _thousands = '\0';
91  // Decimal point character; cannot match delimiter
92  char _decimal = '.';
93  // Comment line start character
94  char _comment = '\0';
95  bool _windowslinetermination = false;
96  // Treat whitespace as field delimiter; overrides character delimiter
97  bool _delim_whitespace = false;
98  // Skip whitespace after the delimiter
99  bool _skipinitialspace = false;
100  // Ignore empty lines or parse line values as invalid
101  bool _skip_blank_lines = true;
102  // Treatment of quoting behavior
104  // Quoting character (if `quoting` is true)
105  char _quotechar = '"';
106  // Whether a quote inside a value is double-quoted
107  bool _doublequote = true;
108  // Names of columns to read as datetime
109  std::vector<std::string> _parse_dates_names;
110  // Indexes of columns to read as datetime
111  std::vector<int> _parse_dates_indexes;
112  // Names of columns to parse as hexadecimal
113  std::vector<std::string> _parse_hex_names;
114  // Indexes of columns to parse as hexadecimal
115  std::vector<int> _parse_hex_indexes;
116 
117  // Conversion settings
118 
119  // Per-column types; disables type inference on those columns
120  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
121  // Additional values to recognize as boolean true values
122  std::vector<std::string> _true_values{"True", "TRUE", "true"};
123  // Additional values to recognize as boolean false values
124  std::vector<std::string> _false_values{"False", "FALSE", "false"};
125  // Additional values to recognize as null values
126  std::vector<std::string> _na_values;
127  // Whether to keep the built-in default NA values
128  bool _keep_default_na = true;
129  // Whether the null filter is enabled; disabling it can improve performance
130  bool _na_filter = true;
131  // Whether to parse dates as DD/MM versus MM/DD
132  bool _dayfirst = false;
133  // Cast timestamp columns to a specific type
134  data_type _timestamp_type{type_id::EMPTY};
135 
141  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
142 
144 
145  public:
/**
 * @brief Default constructor; every setting keeps its in-class default value.
 */
151  csv_reader_options() = default;
152 
160 
166  [[nodiscard]] source_info const& get_source() const { return _source; }
167 
173  [[nodiscard]] compression_type get_compression() const { return _compression; }
174 
180  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
181 
187  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
188 
194  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
195  {
196  if (_byte_range_size == 0) {
197  return 0;
198  } else {
199  return _byte_range_size + get_byte_range_padding();
200  }
201  }
202 
208  [[nodiscard]] std::size_t get_byte_range_padding() const
209  {
210  auto const num_names = _names.size();
211  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
212  auto const num_columns = std::max(num_dtypes, num_names);
213 
214  auto const max_row_bytes = 16 * 1024; // 16KB
215  auto const column_bytes = 64;
216  auto const base_padding = 1024; // 1KB
217 
218  if (num_columns == 0) {
219  // Use flat size if the number of columns is not known
220  return max_row_bytes;
221  }
222 
223  // Expand the size based on the number of columns, if available
224  return base_padding + num_columns * column_bytes;
225  }
226 
232  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
233 
239  [[nodiscard]] std::string get_prefix() const { return _prefix; }
240 
246  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
247 
253  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
254  {
255  return _use_cols_names;
256  }
257 
263  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
264 
270  [[nodiscard]] size_type get_nrows() const { return _nrows; }
271 
277  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
278 
284  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
285 
291  [[nodiscard]] size_type get_header() const { return _header; }
292 
298  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
299 
305  [[nodiscard]] char get_delimiter() const { return _delimiter; }
306 
312  [[nodiscard]] char get_thousands() const { return _thousands; }
313 
319  [[nodiscard]] char get_decimal() const { return _decimal; }
320 
326  [[nodiscard]] char get_comment() const { return _comment; }
327 
333  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
334 
340  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
341 
347  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
348 
354  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
355 
361  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
362 
368  [[nodiscard]] char get_quotechar() const { return _quotechar; }
369 
375  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
376 
382  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
383  {
384  return _parse_dates_names;
385  }
386 
392  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
393  {
394  return _parse_dates_indexes;
395  }
396 
402  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
403  {
404  return _parse_hex_names;
405  }
406 
412  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
413 
419  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
420  {
421  return _dtypes;
422  }
423 
429  std::vector<std::string> const& get_true_values() const { return _true_values; }
430 
436  std::vector<std::string> const& get_false_values() const { return _false_values; }
437 
443  std::vector<std::string> const& get_na_values() const { return _na_values; }
444 
450  bool is_enabled_keep_default_na() const { return _keep_default_na; }
451 
457  bool is_enabled_na_filter() const { return _na_filter; }
458 
464  bool is_enabled_dayfirst() const { return _dayfirst; }
465 
471  data_type get_timestamp_type() const { return _timestamp_type; }
472 
478  void set_compression(compression_type comp) { _compression = comp; }
479 
485  void set_byte_range_offset(std::size_t offset)
486  {
487  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
488  CUDF_FAIL(
489  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
490  "value");
491  }
492  _byte_range_offset = offset;
493  }
494 
500  void set_byte_range_size(std::size_t size)
501  {
502  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
503  CUDF_FAIL(
504  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
505  "non-zero.");
506  }
507  _byte_range_size = size;
508  }
509 
515  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
516 
522  void set_prefix(std::string pfx) { _prefix = pfx; }
523 
529  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
530 
536  void set_use_cols_names(std::vector<std::string> col_names)
537  {
538  _use_cols_names = std::move(col_names);
539  }
540 
546  void set_use_cols_indexes(std::vector<int> col_indices)
547  {
548  _use_cols_indexes = std::move(col_indices);
549  }
550 
556  void set_nrows(size_type nrows)
557  {
558  CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
559  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
560  CUDF_FAIL(
561  "nrows can't be a non negative value if range offset and/or range size has been set");
562  }
563 
564  _nrows = nrows;
565  }
566 
572  void set_skiprows(size_type skiprows)
573  {
574  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
575  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
576  std::invalid_argument);
577  }
578  _skiprows = skiprows;
579  }
580 
586  void set_skipfooter(size_type skipfooter)
587  {
588  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
589  "Cannot use both `nrows` and `skipfooter`",
590  std::invalid_argument);
591  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
592  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
593  std::invalid_argument);
594  }
595 
596  _skipfooter = skipfooter;
597  }
598 
604  void set_header(size_type hdr) { _header = hdr; }
605 
611  void set_lineterminator(char term) { _lineterminator = term; }
612 
618  void set_delimiter(char delim) { _delimiter = delim; }
619 
625  void set_thousands(char val) { _thousands = val; }
626 
632  void set_decimal(char val) { _decimal = val; }
633 
639  void set_comment(char val) { _comment = val; }
640 
646  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
647 
653  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
654 
660  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
661 
667  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
668 
679  void set_quoting(quote_style quoting)
680  {
681  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
682  "Only MINIMAL and NONE are supported for quoting.");
683  _quoting = quoting;
684  }
685 
691  void set_quotechar(char ch) { _quotechar = ch; }
692 
698  void enable_doublequote(bool val) { _doublequote = val; }
699 
705  void set_parse_dates(std::vector<std::string> col_names)
706  {
707  _parse_dates_names = std::move(col_names);
708  }
709 
715  void set_parse_dates(std::vector<int> col_indices)
716  {
717  _parse_dates_indexes = std::move(col_indices);
718  }
719 
725  void set_parse_hex(std::vector<std::string> col_names)
726  {
727  _parse_hex_names = std::move(col_names);
728  }
729 
735  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
736 
742  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
743 
749  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
750 
756  void set_true_values(std::vector<std::string> vals)
757  {
758  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
759  }
760 
766  void set_false_values(std::vector<std::string> vals)
767  {
768  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
769  }
770 
776  void set_na_values(std::vector<std::string> vals)
777  {
778  if ((!vals.empty()) and (!_na_filter)) {
779  CUDF_FAIL("Can't set na_values when na_filtering is disabled");
780  }
781 
782  _na_values = std::move(vals);
783  }
784 
790  void enable_keep_default_na(bool val) { _keep_default_na = val; }
791 
797  void enable_na_filter(bool val)
798  {
799  if (!val) { _na_values.clear(); }
800  _na_filter = val;
801  }
802 
808  void enable_dayfirst(bool val) { _dayfirst = val; }
809 
815  void set_timestamp_type(data_type type) { _timestamp_type = type; }
816 };
817 
823  csv_reader_options options;
824 
825  public:
832 
838  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
839 
847  {
848  options._compression = comp;
849  return *this;
850  }
851 
859  {
860  options.set_byte_range_offset(offset);
861  return *this;
862  }
863 
871  {
872  options.set_byte_range_size(size);
873  return *this;
874  }
875 
882  csv_reader_options_builder& names(std::vector<std::string> col_names)
883  {
884  options._names = std::move(col_names);
885  return *this;
886  }
887 
895  {
896  options._prefix = pfx;
897  return *this;
898  }
899 
907  {
908  options._mangle_dupe_cols = val;
909  return *this;
910  }
911 
918  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
919  {
920  options._use_cols_names = std::move(col_names);
921  return *this;
922  }
923 
930  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
931  {
932  options._use_cols_indexes = std::move(col_indices);
933  return *this;
934  }
935 
943  {
944  options.set_nrows(rows);
945  return *this;
946  }
947 
955  {
956  options.set_skiprows(skip);
957  return *this;
958  }
959 
967  {
968  options.set_skipfooter(skip);
969  return *this;
970  }
971 
979  {
980  options._header = hdr;
981  return *this;
982  }
983 
991  {
992  options._lineterminator = term;
993  return *this;
994  }
995 
1003  {
1004  options._delimiter = delim;
1005  return *this;
1006  }
1007 
1015  {
1016  options._thousands = val;
1017  return *this;
1018  }
1019 
1027  {
1028  options._decimal = val;
1029  return *this;
1030  }
1031 
1039  {
1040  options._comment = val;
1041  return *this;
1042  }
1043 
1051  {
1052  options._windowslinetermination = val;
1053  return *this;
1054  }
1055 
1063  {
1064  options._delim_whitespace = val;
1065  return *this;
1066  }
1067 
1075  {
1076  options._skipinitialspace = val;
1077  return *this;
1078  }
1079 
1087  {
1088  options._skip_blank_lines = val;
1089  return *this;
1090  }
1091 
1099  {
1100  options._quoting = style;
1101  return *this;
1102  }
1103 
1111  {
1112  options._quotechar = ch;
1113  return *this;
1114  }
1115 
1123  {
1124  options._doublequote = val;
1125  return *this;
1126  }
1127 
1134  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1135  {
1136  options._parse_dates_names = std::move(col_names);
1137  return *this;
1138  }
1139 
1146  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1147  {
1148  options._parse_dates_indexes = std::move(col_indices);
1149  return *this;
1150  }
1151 
1158  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1159  {
1160  options._parse_hex_names = std::move(col_names);
1161  return *this;
1162  }
1163 
1170  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1171  {
1172  options._parse_hex_indexes = std::move(col_indices);
1173  return *this;
1174  }
1175 
1182  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1183  {
1184  options._dtypes = std::move(types);
1185  return *this;
1186  }
1187 
1194  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1195  {
1196  options._dtypes = std::move(types);
1197  return *this;
1198  }
1199 
1206  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1207  {
1208  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1209  return *this;
1210  }
1211 
1218  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1219  {
1220  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1221  return *this;
1222  }
1223 
1230  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1231  {
1232  options.set_na_values(std::move(vals));
1233  return *this;
1234  }
1235 
1243  {
1244  options.enable_keep_default_na(val);
1245  return *this;
1246  }
1247 
1255  {
1256  options.enable_na_filter(val);
1257  return *this;
1258  }
1259 
1267  {
1268  options._dayfirst = val;
1269  return *this;
1270  }
1271 
1279  {
1280  options._timestamp_type = type;
1281  return *this;
1282  }
1283 
1287  operator csv_reader_options&&() { return std::move(options); }
1288 
1296  csv_reader_options&& build() { return std::move(options); }
1297 };
1298 
1317  csv_reader_options options,
1320  // end of group
1332 
1337  // Specify the sink to use for writer output
1338  sink_info _sink;
1339  // Set of columns to output
1340  table_view _table;
1341  // string to use for null entries
1342  std::string _na_rep = "";
1343  // Indicates whether to write headers to csv
1344  bool _include_header = true;
1345  // maximum number of rows to write in each chunk (limits memory use)
1346  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1347  // character to use for separating lines (default "\n")
1348  std::string _line_terminator = "\n";
1349  // character to use for separating column values (default ",")
1350  char _inter_column_delimiter = ',';
1351  // string to use for values != 0 in INT8 types (default 'true')
1352  std::string _true_value = std::string{"true"};
1353  // string to use for values == 0 in INT8 types (default 'false')
1354  std::string _false_value = std::string{"false"};
1355  // Names of all columns; if empty, writer will generate column names
1356  std::vector<std::string> _names;
1357  // Quote style. Currently only MINIMAL and NONE are supported.
1358  quote_style _quoting = quote_style::MINIMAL;
1359 
1366  explicit csv_writer_options(sink_info const& sink, table_view const& table)
1367  : _sink(sink), _table(table), _rows_per_chunk(table.num_rows())
1368  {
1369  }
1370 
1372 
1373  public:
/**
 * @brief Default constructor; every setting keeps its in-class default value.
 */
1379  explicit csv_writer_options() = default;
1380 
1390 
1396  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1397 
1403  [[nodiscard]] table_view const& get_table() const { return _table; }
1404 
1410  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1411 
1417  [[nodiscard]] std::string get_na_rep() const { return _na_rep; }
1418 
1424  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1425 
1431  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1432 
1438  [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; }
1439 
1445  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1446 
1452  [[nodiscard]] std::string get_true_value() const { return _true_value; }
1453 
1459  [[nodiscard]] std::string get_false_value() const { return _false_value; }
1460 
1471  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1472 
1473  // Setter
1479  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1480 
1486  void set_na_rep(std::string val) { _na_rep = val; }
1487 
1493  void enable_include_header(bool val) { _include_header = val; }
1494 
1500  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1501 
1507  void set_line_terminator(std::string term) { _line_terminator = term; }
1508 
1514  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1515 
1521  void set_true_value(std::string val) { _true_value = val; }
1522 
1528  void set_false_value(std::string val) { _false_value = val; }
1529 
1535  void set_table(table_view const& table) { _table = table; }
1536 
1547  void set_quoting(quote_style quoting)
1548  {
1549  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1550  "Only MINIMAL and NONE are supported for quoting.");
1551  _quoting = quoting;
1552  }
1553 };
1554 
1559  csv_writer_options options;
1560 
1561  public:
/**
 * @brief Default constructor; the wrapped options object is default-constructed.
 */
1567  explicit csv_writer_options_builder() = default;
1568 
1576  : options{sink, table}
1577  {
1578  }
1579 
1586  csv_writer_options_builder& names(std::vector<std::string> names)
1587  {
1588  options._names = names;
1589  return *this;
1590  }
1591 
1599  {
1600  options._na_rep = val;
1601  return *this;
1602  };
1603 
1611  {
1612  options._include_header = val;
1613  return *this;
1614  }
1615 
1623  {
1624  options._rows_per_chunk = val;
1625  return *this;
1626  }
1627 
1635  {
1636  options._line_terminator = term;
1637  return *this;
1638  }
1639 
1647  {
1648  options._inter_column_delimiter = delim;
1649  return *this;
1650  }
1651 
1659  {
1660  options._true_value = val;
1661  return *this;
1662  }
1663 
1671  {
1672  options._false_value = val;
1673  return *this;
1674  }
1675 
1685  {
1686  options.set_quoting(quoting);
1687  return *this;
1688  }
1689 
1693  operator csv_writer_options&&() { return std::move(options); }
1694 
1702  csv_writer_options&& build() { return std::move(options); }
1703 };
1704 
1723 void write_csv(csv_writer_options const& options,
1726  // end of group
1728 } // namespace io
1729 } // namespace cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:241
Builder to build options for read_csv().
Definition: csv.hpp:822
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1194
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1218
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:918
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1122
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1158
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:858
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1062
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:954
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1086
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1296
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1182
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1110
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1230
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1206
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1026
csv_reader_options_builder & na_filter(bool val)
Sets whether the null filter is enabled.
Definition: csv.hpp:1254
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1014
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1170
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1050
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1146
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:942
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:882
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1278
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:906
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:966
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:870
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1242
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1098
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:990
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1002
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:930
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1134
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:838
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1038
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:846
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:978
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1266
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:894
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1074
Settings to use for read_csv().
Definition: csv.hpp:50
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:698
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:546
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:277
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:340
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:392
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:485
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:361
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:705
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:715
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:375
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:305
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:298
csv_reader_options()=default
Default constructor.
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:766
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:632
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:239
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:443
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:312
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:246
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:742
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:194
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:412
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:436
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:808
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:776
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:679
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:419
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:815
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:180
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:471
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:457
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:354
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:326
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:611
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:691
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:333
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:667
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:646
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:572
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:478
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:653
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:232
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:749
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:586
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:464
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:208
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:515
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:166
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:790
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:382
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:522
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:187
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:263
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:253
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:536
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:173
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:368
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:756
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:450
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:604
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:319
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:429
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:735
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:625
void enable_na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:797
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:500
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:618
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:529
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:270
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:402
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:660
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:284
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:556
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:639
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:347
size_type get_header() const
Returns header row index.
Definition: csv.hpp:291
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:725
Builder to build options for write_csv().
Definition: csv.hpp:1558
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1702
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1684
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1658
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1610
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1598
csv_writer_options_builder & line_terminator(std::string term)
Sets string used for separating lines.
Definition: csv.hpp:1634
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1670
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1586
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1646
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1575
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1622
Settings to use for write_csv().
Definition: csv.hpp:1336
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1535
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1500
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1459
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1547
std::string get_line_terminator() const
Returns string used for separating lines.
Definition: csv.hpp:1438
void set_line_terminator(std::string term)
Sets string used for separating lines.
Definition: csv.hpp:1507
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1452
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1514
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1521
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1403
std::string get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1417
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1493
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1424
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1486
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1445
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1396
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1410
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1471
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1528
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1431
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1479
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
A set of cudf::column's of the same size.
Definition: table.hpp:40
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:93
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads a CSV dataset into a set of columns.
compression_type
Compression algorithms.
Definition: io/types.hpp:56
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:85
@ AUTO
Automatically detect or select compression format.
@ MINIMAL
Quote only fields which contain special characters.
@ NONE
Never quote fields; disable quotation parsing.
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Writes a set of columns to CSV format.
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:176
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:215
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:34
Destination information for write interfaces.
Definition: io/types.hpp:469
Source information for read interfaces.
Definition: io/types.hpp:294
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:249
Class definitions for (mutable)_table_view
Type declarations for libcudf.