csv.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
21 #include <cudf/types.hpp>
22 #include <cudf/utilities/error.hpp>
23 
25 #include <rmm/resource_ref.hpp>
26 
27 #include <memory>
28 #include <string>
29 #include <unordered_map>
30 #include <utility>
31 #include <variant>
32 #include <vector>
33 
34 namespace CUDF_EXPORT cudf {
35 namespace io {
36 
46 class csv_reader_options_builder;
47 
53  source_info _source;
54 
55  // Read settings
56 
57  // Specify the compression format of the source or infer from file extension
58  compression_type _compression = compression_type::AUTO;
59  // Bytes to skip from the source start
60  std::size_t _byte_range_offset = 0;
61  // Bytes to read; always reads complete rows
62  std::size_t _byte_range_size = 0;
63  // Names of all the columns; if empty then names are auto-generated
64  std::vector<std::string> _names;
65  // If there is no header or names, prepend this to the column ID as the name
66  std::string _prefix;
67  // Whether to rename duplicate column names
68  bool _mangle_dupe_cols = true;
69 
70  // Filter settings
71 
72  // Names of columns to read; empty is all columns
73  std::vector<std::string> _use_cols_names;
74  // Indexes of columns to read; empty is all columns
75  std::vector<int> _use_cols_indexes;
76  // Rows to read; -1 is all
77  size_type _nrows = -1;
78  // Rows to skip from the start
79  size_type _skiprows = 0;
80  // Rows to skip from the end
81  size_type _skipfooter = 0;
82  // Header row index
83  size_type _header = 0;
84 
85  // Parsing settings
86 
87  // Line terminator
88  char _lineterminator = '\n';
89  // Field delimiter
90  char _delimiter = ',';
91  // Numeric data thousands separator; cannot match delimiter
92  char _thousands = '\0';
93  // Decimal point character; cannot match delimiter
94  char _decimal = '.';
95  // Comment line start character
96  char _comment = '\0';
97  bool _windowslinetermination = false;
98  // Treat whitespace as field delimiter; overrides character delimiter
99  bool _delim_whitespace = false;
100  // Skip whitespace after the delimiter
101  bool _skipinitialspace = false;
102  // Ignore empty lines or parse line values as invalid
103  bool _skip_blank_lines = true;
104  // Treatment of quoting behavior
105  quote_style _quoting = quote_style::MINIMAL;
106  // Quoting character (if `quoting` is true)
107  char _quotechar = '"';
108  // Whether a quote inside a value is double-quoted
109  bool _doublequote = true;
110  // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
111  // _doublequote is true
112  bool _detect_whitespace_around_quotes = false;
113  // Names of columns to read as datetime
114  std::vector<std::string> _parse_dates_names;
115  // Indexes of columns to read as datetime
116  std::vector<int> _parse_dates_indexes;
117  // Names of columns to parse as hexadecimal
118  std::vector<std::string> _parse_hex_names;
119  // Indexes of columns to parse as hexadecimal
120  std::vector<int> _parse_hex_indexes;
121 
122  // Conversion settings
123 
124  // Per-column types; disables type inference on those columns
125  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
126  // Additional values to recognize as boolean true values
127  std::vector<std::string> _true_values{"True", "TRUE", "true"};
128  // Additional values to recognize as boolean false values
129  std::vector<std::string> _false_values{"False", "FALSE", "false"};
130  // Additional values to recognize as null values
131  std::vector<std::string> _na_values;
132  // Whether to keep the built-in default NA values
133  bool _keep_default_na = true;
134  // Whether to disable null filter; disabling can improve performance
135  bool _na_filter = true;
136  // Whether to parse dates as DD/MM versus MM/DD
137  bool _dayfirst = false;
138  // Cast timestamp columns to a specific type
139  data_type _timestamp_type{type_id::EMPTY};
140 
146  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
147 
149 
150  public:
156  csv_reader_options() = default;
157 
165 
171  [[nodiscard]] source_info const& get_source() const { return _source; }
172 
178  [[nodiscard]] compression_type get_compression() const { return _compression; }
179 
185  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
186 
192  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
193 
199  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
200  {
201  if (_byte_range_size == 0) {
202  return 0;
203  } else {
204  return _byte_range_size + get_byte_range_padding();
205  }
206  }
207 
213  [[nodiscard]] std::size_t get_byte_range_padding() const
214  {
215  auto const num_names = _names.size();
216  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
217  auto const num_columns = std::max(num_dtypes, num_names);
218 
219  auto const max_row_bytes = 16 * 1024; // 16KB
220  auto const column_bytes = 64;
221  auto const base_padding = 1024; // 1KB
222 
223  if (num_columns == 0) {
224  // Use flat size if the number of columns is not known
225  return max_row_bytes;
226  }
227 
228  // Expand the size based on the number of columns, if available
229  return base_padding + num_columns * column_bytes;
230  }
231 
237  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
238 
244  [[nodiscard]] std::string get_prefix() const { return _prefix; }
245 
251  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
252 
258  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
259  {
260  return _use_cols_names;
261  }
262 
268  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
269 
275  [[nodiscard]] size_type get_nrows() const { return _nrows; }
276 
282  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
283 
289  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
290 
296  [[nodiscard]] size_type get_header() const { return _header; }
297 
303  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
304 
310  [[nodiscard]] char get_delimiter() const { return _delimiter; }
311 
317  [[nodiscard]] char get_thousands() const { return _thousands; }
318 
324  [[nodiscard]] char get_decimal() const { return _decimal; }
325 
331  [[nodiscard]] char get_comment() const { return _comment; }
332 
338  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
339 
345  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
346 
352  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
353 
359  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
360 
366  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
367 
373  [[nodiscard]] char get_quotechar() const { return _quotechar; }
374 
380  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
381 
389  {
390  return _detect_whitespace_around_quotes;
391  }
392 
398  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
399  {
400  return _parse_dates_names;
401  }
402 
408  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
409  {
410  return _parse_dates_indexes;
411  }
412 
418  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
419  {
420  return _parse_hex_names;
421  }
422 
428  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
429 
435  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
436  get_dtypes() const
437  {
438  return _dtypes;
439  }
440 
446  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
447 
453  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
454 
460  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
461 
467  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
468 
474  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
475 
481  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
482 
488  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
489 
495  void set_compression(compression_type comp) { _compression = comp; }
496 
502  void set_byte_range_offset(std::size_t offset)
503  {
504  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
505  CUDF_FAIL(
506  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
507  "value");
508  }
509  _byte_range_offset = offset;
510  }
511 
517  void set_byte_range_size(std::size_t size)
518  {
519  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
520  CUDF_FAIL(
521  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
522  "non-zero.");
523  }
524  _byte_range_size = size;
525  }
526 
532  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
533 
539  void set_prefix(std::string pfx) { _prefix = pfx; }
540 
546  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
547 
553  void set_use_cols_names(std::vector<std::string> col_names)
554  {
555  _use_cols_names = std::move(col_names);
556  }
557 
563  void set_use_cols_indexes(std::vector<int> col_indices)
564  {
565  _use_cols_indexes = std::move(col_indices);
566  }
567 
573  void set_nrows(size_type nrows)
574  {
575  CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
576  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
577  CUDF_FAIL(
578  "nrows can't be a non negative value if range offset and/or range size has been set");
579  }
580 
581  _nrows = nrows;
582  }
583 
589  void set_skiprows(size_type skiprows)
590  {
591  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
592  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
593  std::invalid_argument);
594  }
595  _skiprows = skiprows;
596  }
597 
603  void set_skipfooter(size_type skipfooter)
604  {
605  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
606  "Cannot use both `nrows` and `skipfooter`",
607  std::invalid_argument);
608  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
609  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
610  std::invalid_argument);
611  }
612 
613  _skipfooter = skipfooter;
614  }
615 
621  void set_header(size_type hdr) { _header = hdr; }
622 
628  void set_lineterminator(char term) { _lineterminator = term; }
629 
635  void set_delimiter(char delim) { _delimiter = delim; }
636 
642  void set_thousands(char val) { _thousands = val; }
643 
649  void set_decimal(char val) { _decimal = val; }
650 
656  void set_comment(char val) { _comment = val; }
657 
663  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
664 
670  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
671 
677  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
678 
684  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
685 
696  void set_quoting(quote_style quoting)
697  {
698  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
699  "Only MINIMAL and NONE are supported for quoting.");
700  _quoting = quoting;
701  }
702 
708  void set_quotechar(char ch) { _quotechar = ch; }
709 
715  void enable_doublequote(bool val) { _doublequote = val; }
716 
723  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
724 
730  void set_parse_dates(std::vector<std::string> col_names)
731  {
732  _parse_dates_names = std::move(col_names);
733  }
734 
740  void set_parse_dates(std::vector<int> col_indices)
741  {
742  _parse_dates_indexes = std::move(col_indices);
743  }
744 
750  void set_parse_hex(std::vector<std::string> col_names)
751  {
752  _parse_hex_names = std::move(col_names);
753  }
754 
760  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
761 
767  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
768 
774  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
775 
781  void set_true_values(std::vector<std::string> vals)
782  {
783  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
784  }
785 
791  void set_false_values(std::vector<std::string> vals)
792  {
793  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
794  }
795 
801  void set_na_values(std::vector<std::string> vals)
802  {
803  if ((!vals.empty()) and (!_na_filter)) {
804  CUDF_FAIL("Can't set na_values when na_filtering is disabled");
805  }
806 
807  _na_values = std::move(vals);
808  }
809 
815  void enable_keep_default_na(bool val) { _keep_default_na = val; }
816 
822  void enable_na_filter(bool val)
823  {
824  if (!val) { _na_values.clear(); }
825  _na_filter = val;
826  }
827 
833  void enable_dayfirst(bool val) { _dayfirst = val; }
834 
840  void set_timestamp_type(data_type type) { _timestamp_type = type; }
841 };
842 
848  csv_reader_options options;
849 
850  public:
857 
863  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
864 
872  {
873  options._compression = comp;
874  return *this;
875  }
876 
884  {
885  options.set_byte_range_offset(offset);
886  return *this;
887  }
888 
896  {
897  options.set_byte_range_size(size);
898  return *this;
899  }
900 
907  csv_reader_options_builder& names(std::vector<std::string> col_names)
908  {
909  options._names = std::move(col_names);
910  return *this;
911  }
912 
920  {
921  options._prefix = pfx;
922  return *this;
923  }
924 
932  {
933  options._mangle_dupe_cols = val;
934  return *this;
935  }
936 
943  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
944  {
945  options._use_cols_names = std::move(col_names);
946  return *this;
947  }
948 
955  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
956  {
957  options._use_cols_indexes = std::move(col_indices);
958  return *this;
959  }
960 
968  {
969  options.set_nrows(rows);
970  return *this;
971  }
972 
980  {
981  options.set_skiprows(skip);
982  return *this;
983  }
984 
992  {
993  options.set_skipfooter(skip);
994  return *this;
995  }
996 
1004  {
1005  options._header = hdr;
1006  return *this;
1007  }
1008 
1016  {
1017  options._lineterminator = term;
1018  return *this;
1019  }
1020 
1028  {
1029  options._delimiter = delim;
1030  return *this;
1031  }
1032 
1040  {
1041  options._thousands = val;
1042  return *this;
1043  }
1044 
1052  {
1053  options._decimal = val;
1054  return *this;
1055  }
1056 
1064  {
1065  options._comment = val;
1066  return *this;
1067  }
1068 
1076  {
1077  options._windowslinetermination = val;
1078  return *this;
1079  }
1080 
1088  {
1089  options._delim_whitespace = val;
1090  return *this;
1091  }
1092 
1100  {
1101  options._skipinitialspace = val;
1102  return *this;
1103  }
1104 
1112  {
1113  options._skip_blank_lines = val;
1114  return *this;
1115  }
1116 
1124  {
1125  options._quoting = style;
1126  return *this;
1127  }
1128 
1136  {
1137  options._quotechar = ch;
1138  return *this;
1139  }
1140 
1148  {
1149  options._doublequote = val;
1150  return *this;
1151  }
1152 
1161  {
1162  options._detect_whitespace_around_quotes = val;
1163  return *this;
1164  }
1165 
1172  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1173  {
1174  options._parse_dates_names = std::move(col_names);
1175  return *this;
1176  }
1177 
1184  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1185  {
1186  options._parse_dates_indexes = std::move(col_indices);
1187  return *this;
1188  }
1189 
1196  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1197  {
1198  options._parse_hex_names = std::move(col_names);
1199  return *this;
1200  }
1201 
1208  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1209  {
1210  options._parse_hex_indexes = std::move(col_indices);
1211  return *this;
1212  }
1213 
1220  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1221  {
1222  options._dtypes = std::move(types);
1223  return *this;
1224  }
1225 
1232  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1233  {
1234  options._dtypes = std::move(types);
1235  return *this;
1236  }
1237 
1244  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1245  {
1246  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1247  return *this;
1248  }
1249 
1256  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1257  {
1258  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1259  return *this;
1260  }
1261 
1268  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1269  {
1270  options.set_na_values(std::move(vals));
1271  return *this;
1272  }
1273 
1281  {
1282  options.enable_keep_default_na(val);
1283  return *this;
1284  }
1285 
1293  {
1294  options.enable_na_filter(val);
1295  return *this;
1296  }
1297 
1305  {
1306  options._dayfirst = val;
1307  return *this;
1308  }
1309 
1317  {
1318  options._timestamp_type = type;
1319  return *this;
1320  }
1321 
1325  operator csv_reader_options&&() { return std::move(options); }
1326 
1334  csv_reader_options&& build() { return std::move(options); }
1335 };
1336 
1355  csv_reader_options options,
1358  // end of group
1370 
1375  // Specify the sink to use for writer output
1376  sink_info _sink;
1377  // Set of columns to output
1378  table_view _table;
1379  // string to use for null entries
1380  std::string _na_rep = "";
1381  // Indicates whether to write headers to csv
1382  bool _include_header = true;
1383  // maximum number of rows to write in each chunk (limits memory use)
1384  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1385  // character to use for separating lines (default "\n")
1386  std::string _line_terminator = "\n";
1387  // character to use for separating column values (default ",")
1388  char _inter_column_delimiter = ',';
1389  // string to use for values != 0 in INT8 types (default 'true')
1390  std::string _true_value = std::string{"true"};
1391  // string to use for values == 0 in INT8 types (default 'false')
1392  std::string _false_value = std::string{"false"};
1393  // Names of all columns; if empty, writer will generate column names
1394  std::vector<std::string> _names;
1395  // Quote style. Currently only MINIMAL and NONE are supported.
1396  quote_style _quoting = quote_style::MINIMAL;
1397 
1404  explicit csv_writer_options(sink_info sink, table_view const& table)
1405  : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
1406  {
1407  }
1408 
1410 
1411  public:
1417  explicit csv_writer_options() = default;
1418 
1428 
1434  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1435 
1441  [[nodiscard]] table_view const& get_table() const { return _table; }
1442 
1448  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1449 
1455  [[nodiscard]] std::string get_na_rep() const { return _na_rep; }
1456 
1462  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1463 
1469  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1470 
1476  [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; }
1477 
1483  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1484 
1490  [[nodiscard]] std::string get_true_value() const { return _true_value; }
1491 
1497  [[nodiscard]] std::string get_false_value() const { return _false_value; }
1498 
1509  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1510 
1511  // Setter
1517  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1518 
1524  void set_na_rep(std::string val) { _na_rep = val; }
1525 
1531  void enable_include_header(bool val) { _include_header = val; }
1532 
1538  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1539 
1545  void set_line_terminator(std::string term) { _line_terminator = term; }
1546 
1552  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1553 
1559  void set_true_value(std::string val) { _true_value = val; }
1560 
1566  void set_false_value(std::string val) { _false_value = val; }
1567 
1573  void set_table(table_view const& table) { _table = table; }
1574 
1585  void set_quoting(quote_style quoting)
1586  {
1587  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1588  "Only MINIMAL and NONE are supported for quoting.");
1589  _quoting = quoting;
1590  }
1591 };
1592 
1597  csv_writer_options options;
1598 
1599  public:
1605  explicit csv_writer_options_builder() = default;
1606 
1614  : options{sink, table}
1615  {
1616  }
1617 
1624  csv_writer_options_builder& names(std::vector<std::string> names)
1625  {
1626  options._names = names;
1627  return *this;
1628  }
1629 
1637  {
1638  options._na_rep = val;
1639  return *this;
1640  };
1641 
1649  {
1650  options._include_header = val;
1651  return *this;
1652  }
1653 
1661  {
1662  options._rows_per_chunk = val;
1663  return *this;
1664  }
1665 
1673  {
1674  options._line_terminator = term;
1675  return *this;
1676  }
1677 
1685  {
1686  options._inter_column_delimiter = delim;
1687  return *this;
1688  }
1689 
1697  {
1698  options._true_value = val;
1699  return *this;
1700  }
1701 
1709  {
1710  options._false_value = val;
1711  return *this;
1712  }
1713 
1723  {
1724  options.set_quoting(quoting);
1725  return *this;
1726  }
1727 
1731  operator csv_writer_options&&() { return std::move(options); }
1732 
1740  csv_writer_options&& build() { return std::move(options); }
1741 };
1742 
1760 void write_csv(csv_writer_options const& options,
1762  // end of group
1764 } // namespace io
1765 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
Builder to build options for read_csv().
Definition: csv.hpp:847
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1232
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1256
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:943
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1147
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1196
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:883
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1087
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:979
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1111
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1334
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1220
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1135
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1268
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1244
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1051
csv_reader_options_builder & na_filter(bool val)
Sets whether to disable null filter.
Definition: csv.hpp:1292
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1039
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1208
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:1160
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1075
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1184
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:967
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:907
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1316
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:931
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:991
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:895
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1280
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1123
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:1015
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1027
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:955
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1172
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:863
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1063
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:871
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:1003
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1304
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:919
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1099
Settings to use for read_csv().
Definition: csv.hpp:52
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:715
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:563
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:282
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:345
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:408
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:502
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:366
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:730
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:740
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:380
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:310
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:303
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:723
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:791
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:649
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:244
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when _doublequote is true.
Definition: csv.hpp:388
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:460
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:317
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:251
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:767
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:199
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:428
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:453
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:833
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:801
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:696
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:436
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:840
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:185
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:488
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:474
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:359
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:331
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:628
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:708
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:338
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:684
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:663
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:589
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:495
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:670
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:237
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:774
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:603
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:481
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:213
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:532
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:171
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:815
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:398
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:539
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:192
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:268
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:258
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:553
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:178
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:373
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:781
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:467
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:621
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:324
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:446
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:760
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:642
void enable_na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:822
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:517
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:635
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:546
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:275
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:418
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:677
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:289
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:573
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:656
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:352
size_type get_header() const
Returns header row index.
Definition: csv.hpp:296
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:750
Builder to build options for write_csv()
Definition: csv.hpp:1596
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1740
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1722
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1696
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1648
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1636
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1672
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1708
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1624
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1684
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1613
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1660
Settings to use for write_csv().
Definition: csv.hpp:1374
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1573
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1538
std::string get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1497
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1585
std::string get_line_terminator() const
Returns character used for separating lines.
Definition: csv.hpp:1476
void set_line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1545
csv_writer_options()=default
Default constructor.
std::string get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1490
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1552
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1559
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1441
std::string get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1455
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1531
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1462
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1524
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1483
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1434
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1448
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1509
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1566
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1469
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1517
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:41
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:94
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Reads a CSV dataset into a set of columns.
compression_type
Compression algorithms.
Definition: io/types.hpp:57
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:86
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF-IO API type definitions
cuDF interfaces
Definition: aggregation.hpp:35
Destination information for write interfaces.
Definition: io/types.hpp:512
Source information for read interfaces.
Definition: io/types.hpp:337
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.