csv.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/io/detail/utils.hpp>
9 #include <cudf/io/types.hpp>
11 #include <cudf/types.hpp>
12 #include <cudf/utilities/error.hpp>
14 
15 #include <memory>
16 #include <string>
17 #include <unordered_map>
18 #include <utility>
19 #include <variant>
20 #include <vector>
21 
22 namespace CUDF_EXPORT cudf {
23 namespace io {
24 
34 class csv_reader_options_builder;
35 
41  source_info _source;
42 
43  // Read settings
44 
45  // Specify the compression format of the source or infer from file extension
46  compression_type _compression = compression_type::AUTO;
47  // Bytes to skip from the source start
48  std::size_t _byte_range_offset = 0;
49  // Bytes to read; always reads complete rows
50  std::size_t _byte_range_size = 0;
51  // Names of all the columns; if empty then names are auto-generated
52  std::vector<std::string> _names;
53  // If there is no header or names, prepend this to the column ID as the name
54  std::string _prefix;
55  // Whether to rename duplicate column names
56  bool _mangle_dupe_cols = true;
57 
58  // Filter settings
59 
60  // Names of columns to read; empty is all columns
61  std::vector<std::string> _use_cols_names;
62  // Indexes of columns to read; empty is all columns
63  std::vector<int> _use_cols_indexes;
64  // Rows to read; -1 is all
65  size_type _nrows = -1;
66  // Rows to skip from the start
67  size_type _skiprows = 0;
68  // Rows to skip from the end
69  size_type _skipfooter = 0;
70  // Header row index
71  size_type _header = 0;
72 
73  // Parsing settings
74 
75  // Line terminator
76  char _lineterminator = '\n';
77  // Field delimiter
78  char _delimiter = ',';
79  // Numeric data thousands separator; cannot match delimiter
80  char _thousands = '\0';
81  // Decimal point character; cannot match delimiter
82  char _decimal = '.';
83  // Comment line start character
84  char _comment = '\0';
85  bool _windowslinetermination = false;
86  // Treat whitespace as field delimiter; overrides character delimiter
87  bool _delim_whitespace = false;
88  // Skip whitespace after the delimiter
89  bool _skipinitialspace = false;
90  // Ignore empty lines or parse line values as invalid
91  bool _skip_blank_lines = true;
92  // Treatment of quoting behavior
93  quote_style _quoting = quote_style::MINIMAL;
94  // Quoting character (if quoting is enabled)
95  char _quotechar = '"';
96  // Whether a quote inside a value is double-quoted
97  bool _doublequote = true;
98  // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
99  // _doublequote is true
100  bool _detect_whitespace_around_quotes = false;
101  // Names of columns to read as datetime
102  std::vector<std::string> _parse_dates_names;
103  // Indexes of columns to read as datetime
104  std::vector<int> _parse_dates_indexes;
105  // Names of columns to parse as hexadecimal
106  std::vector<std::string> _parse_hex_names;
107  // Indexes of columns to parse as hexadecimal
108  std::vector<int> _parse_hex_indexes;
109 
110  // Conversion settings
111 
112  // Per-column types; disables type inference on those columns
113  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
114  // Additional values to recognize as boolean true values
115  std::vector<std::string> _true_values{"True", "TRUE", "true"};
116  // Additional values to recognize as boolean false values
117  std::vector<std::string> _false_values{"False", "FALSE", "false"};
118  // Additional values to recognize as null values
119  std::vector<std::string> _na_values;
120  // Whether to keep the built-in default NA values
121  bool _keep_default_na = true;
122  // Whether to disable null filter; disabling can improve performance
123  bool _na_filter = true;
124  // Whether to parse dates as DD/MM versus MM/DD
125  bool _dayfirst = false;
126  // Cast timestamp columns to a specific type
127  data_type _timestamp_type{type_id::EMPTY};
128 
134  explicit csv_reader_options(source_info src) : _source{std::move(src)} {}
135 
137 
138  public:
144  csv_reader_options() = default;
145 
153 
159  [[nodiscard]] source_info const& get_source() const { return _source; }
160 
166  [[nodiscard]] compression_type get_compression() const { return _compression; }
167 
173  [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; }
174 
180  [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; }
181 
187  [[nodiscard]] std::size_t get_byte_range_size_with_padding() const
188  {
189  if (_byte_range_size == 0) {
190  return 0;
191  } else {
192  return _byte_range_size + get_byte_range_padding();
193  }
194  }
195 
201  [[nodiscard]] std::size_t get_byte_range_padding() const
202  {
203  auto const num_names = _names.size();
204  auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
205  auto const num_columns = std::max(num_dtypes, num_names);
206 
207  auto const max_row_bytes = 16 * 1024; // 16KB
208  auto const column_bytes = 64;
209  auto const base_padding = 1024; // 1KB
210 
211  if (num_columns == 0) {
212  // Use flat size if the number of columns is not known
213  return max_row_bytes;
214  }
215 
216  // Expand the size based on the number of columns, if available
217  return base_padding + num_columns * column_bytes;
218  }
219 
225  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
226 
232  [[nodiscard]] std::string get_prefix() const { return _prefix; }
233 
239  [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; }
240 
246  [[nodiscard]] std::vector<std::string> const& get_use_cols_names() const
247  {
248  return _use_cols_names;
249  }
250 
256  [[nodiscard]] std::vector<int> const& get_use_cols_indexes() const { return _use_cols_indexes; }
257 
263  [[nodiscard]] size_type get_nrows() const { return _nrows; }
264 
270  [[nodiscard]] size_type get_skiprows() const { return _skiprows; }
271 
277  [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; }
278 
284  [[nodiscard]] size_type get_header() const { return _header; }
285 
291  [[nodiscard]] char get_lineterminator() const { return _lineterminator; }
292 
298  [[nodiscard]] char get_delimiter() const { return _delimiter; }
299 
305  [[nodiscard]] char get_thousands() const { return _thousands; }
306 
312  [[nodiscard]] char get_decimal() const { return _decimal; }
313 
319  [[nodiscard]] char get_comment() const { return _comment; }
320 
326  [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; }
327 
333  [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; }
334 
340  [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; }
341 
347  [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; }
348 
354  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
355 
361  [[nodiscard]] char get_quotechar() const { return _quotechar; }
362 
368  [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
369 
377  {
378  return _detect_whitespace_around_quotes;
379  }
380 
386  [[nodiscard]] std::vector<std::string> const& get_parse_dates_names() const
387  {
388  return _parse_dates_names;
389  }
390 
396  [[nodiscard]] std::vector<int> const& get_parse_dates_indexes() const
397  {
398  return _parse_dates_indexes;
399  }
400 
406  [[nodiscard]] std::vector<std::string> const& get_parse_hex_names() const
407  {
408  return _parse_hex_names;
409  }
410 
416  [[nodiscard]] std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
417 
423  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
424  get_dtypes() const
425  {
426  return _dtypes;
427  }
428 
434  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
435 
441  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
442 
448  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
449 
455  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
456 
462  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
463 
469  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
470 
476  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
477 
483  void set_source(source_info src) { _source = std::move(src); }
484 
490  void set_compression(compression_type comp) { _compression = comp; }
491 
497  void set_byte_range_offset(std::size_t offset)
498  {
499  if ((offset != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
500  CUDF_FAIL(
501  "When there is valid value in skiprows or skipfooter or nrows, offset can't have non-zero "
502  "value");
503  }
504  _byte_range_offset = offset;
505  }
506 
512  void set_byte_range_size(std::size_t size)
513  {
514  if ((size != 0) and ((_skiprows != 0) or (_skipfooter != 0) or (_nrows != -1))) {
515  CUDF_FAIL(
516  "If the value of any of skiprows, skipfooter or nrows is valid, range size cannot be "
517  "non-zero.");
518  }
519  _byte_range_size = size;
520  }
521 
527  void set_names(std::vector<std::string> col_names) { _names = std::move(col_names); }
528 
534  void set_prefix(std::string pfx) { _prefix = pfx; }
535 
541  void enable_mangle_dupe_cols(bool val) { _mangle_dupe_cols = val; }
542 
548  void set_use_cols_names(std::vector<std::string> col_names)
549  {
550  _use_cols_names = std::move(col_names);
551  }
552 
558  void set_use_cols_indexes(std::vector<int> col_indices)
559  {
560  _use_cols_indexes = std::move(col_indices);
561  }
562 
568  void set_nrows(size_type nrows)
569  {
570  CUDF_EXPECTS((nrows == 0) or (_skipfooter == 0), "Cannot use both `nrows` and `skipfooter`");
571  if ((nrows != -1) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
572  CUDF_FAIL(
573  "nrows can't be a non negative value if range offset and/or range size has been set");
574  }
575 
576  _nrows = nrows;
577  }
578 
584  void set_skiprows(size_type skiprows)
585  {
586  if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
587  CUDF_FAIL("skiprows must be zero if range offset or range size has been set",
588  std::invalid_argument);
589  }
590  _skiprows = skiprows;
591  }
592 
598  void set_skipfooter(size_type skipfooter)
599  {
600  CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1),
601  "Cannot use both `nrows` and `skipfooter`",
602  std::invalid_argument);
603  if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) {
604  CUDF_FAIL("skipfooter must be zero if range offset or range size has been set",
605  std::invalid_argument);
606  }
607 
608  _skipfooter = skipfooter;
609  }
610 
616  void set_header(size_type hdr) { _header = hdr; }
617 
623  void set_lineterminator(char term) { _lineterminator = term; }
624 
630  void set_delimiter(char delim) { _delimiter = delim; }
631 
637  void set_thousands(char val) { _thousands = val; }
638 
644  void set_decimal(char val) { _decimal = val; }
645 
651  void set_comment(char val) { _comment = val; }
652 
658  void enable_windowslinetermination(bool val) { _windowslinetermination = val; }
659 
665  void enable_delim_whitespace(bool val) { _delim_whitespace = val; }
666 
672  void enable_skipinitialspace(bool val) { _skipinitialspace = val; }
673 
679  void enable_skip_blank_lines(bool val) { _skip_blank_lines = val; }
680 
691  void set_quoting(quote_style quoting)
692  {
693  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
694  "Only MINIMAL and NONE are supported for quoting.");
695  _quoting = quoting;
696  }
697 
703  void set_quotechar(char ch) { _quotechar = ch; }
704 
710  void enable_doublequote(bool val) { _doublequote = val; }
711 
718  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
719 
725  void set_parse_dates(std::vector<std::string> col_names)
726  {
727  _parse_dates_names = std::move(col_names);
728  }
729 
735  void set_parse_dates(std::vector<int> col_indices)
736  {
737  _parse_dates_indexes = std::move(col_indices);
738  }
739 
745  void set_parse_hex(std::vector<std::string> col_names)
746  {
747  _parse_hex_names = std::move(col_names);
748  }
749 
755  void set_parse_hex(std::vector<int> col_indices) { _parse_hex_indexes = std::move(col_indices); }
756 
762  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
763 
769  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
770 
776  void set_true_values(std::vector<std::string> vals)
777  {
778  _true_values.insert(_true_values.end(), vals.begin(), vals.end());
779  }
780 
786  void set_false_values(std::vector<std::string> vals)
787  {
788  _false_values.insert(_false_values.end(), vals.begin(), vals.end());
789  }
790 
796  void set_na_values(std::vector<std::string> vals)
797  {
798  if ((!vals.empty()) and (!_na_filter)) {
799  CUDF_FAIL("Can't set na_values when na_filtering is disabled");
800  }
801 
802  _na_values = std::move(vals);
803  }
804 
810  void enable_keep_default_na(bool val) { _keep_default_na = val; }
811 
817  void enable_na_filter(bool val)
818  {
819  if (!val) { _na_values.clear(); }
820  _na_filter = val;
821  }
822 
828  void enable_dayfirst(bool val) { _dayfirst = val; }
829 
835  void set_timestamp_type(data_type type) { _timestamp_type = type; }
836 };
837 
843  csv_reader_options options;
844 
845  public:
852 
858  csv_reader_options_builder(source_info src) : options{std::move(src)} {}
859 
867  {
868  options._compression = comp;
869  return *this;
870  }
871 
879  {
880  options.set_byte_range_offset(offset);
881  return *this;
882  }
883 
891  {
892  options.set_byte_range_size(size);
893  return *this;
894  }
895 
902  csv_reader_options_builder& names(std::vector<std::string> col_names)
903  {
904  options._names = std::move(col_names);
905  return *this;
906  }
907 
915  {
916  options._prefix = std::move(pfx);
917  return *this;
918  }
919 
927  {
928  options._mangle_dupe_cols = val;
929  return *this;
930  }
931 
938  csv_reader_options_builder& use_cols_names(std::vector<std::string> col_names)
939  {
940  options._use_cols_names = std::move(col_names);
941  return *this;
942  }
943 
950  csv_reader_options_builder& use_cols_indexes(std::vector<int> col_indices)
951  {
952  options._use_cols_indexes = std::move(col_indices);
953  return *this;
954  }
955 
963  {
964  options.set_nrows(rows);
965  return *this;
966  }
967 
975  {
976  options.set_skiprows(skip);
977  return *this;
978  }
979 
987  {
988  options.set_skipfooter(skip);
989  return *this;
990  }
991 
999  {
1000  options._header = hdr;
1001  return *this;
1002  }
1003 
1011  {
1012  options._lineterminator = term;
1013  return *this;
1014  }
1015 
1023  {
1024  options._delimiter = delim;
1025  return *this;
1026  }
1027 
1035  {
1036  options._thousands = val;
1037  return *this;
1038  }
1039 
1047  {
1048  options._decimal = val;
1049  return *this;
1050  }
1051 
1059  {
1060  options._comment = val;
1061  return *this;
1062  }
1063 
1071  {
1072  options._windowslinetermination = val;
1073  return *this;
1074  }
1075 
1083  {
1084  options._delim_whitespace = val;
1085  return *this;
1086  }
1087 
1095  {
1096  options._skipinitialspace = val;
1097  return *this;
1098  }
1099 
1107  {
1108  options._skip_blank_lines = val;
1109  return *this;
1110  }
1111 
1119  {
1120  options._quoting = style;
1121  return *this;
1122  }
1123 
1131  {
1132  options._quotechar = ch;
1133  return *this;
1134  }
1135 
1143  {
1144  options._doublequote = val;
1145  return *this;
1146  }
1147 
1156  {
1157  options._detect_whitespace_around_quotes = val;
1158  return *this;
1159  }
1160 
1167  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
1168  {
1169  options._parse_dates_names = std::move(col_names);
1170  return *this;
1171  }
1172 
1179  csv_reader_options_builder& parse_dates(std::vector<int> col_indices)
1180  {
1181  options._parse_dates_indexes = std::move(col_indices);
1182  return *this;
1183  }
1184 
1191  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
1192  {
1193  options._parse_hex_names = std::move(col_names);
1194  return *this;
1195  }
1196 
1203  csv_reader_options_builder& parse_hex(std::vector<int> col_indices)
1204  {
1205  options._parse_hex_indexes = std::move(col_indices);
1206  return *this;
1207  }
1208 
1215  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
1216  {
1217  options._dtypes = std::move(types);
1218  return *this;
1219  }
1220 
1227  csv_reader_options_builder& dtypes(std::vector<data_type> types)
1228  {
1229  options._dtypes = std::move(types);
1230  return *this;
1231  }
1232 
1239  csv_reader_options_builder& true_values(std::vector<std::string> vals)
1240  {
1241  options._true_values.insert(options._true_values.end(), vals.begin(), vals.end());
1242  return *this;
1243  }
1244 
1251  csv_reader_options_builder& false_values(std::vector<std::string> vals)
1252  {
1253  options._false_values.insert(options._false_values.end(), vals.begin(), vals.end());
1254  return *this;
1255  }
1256 
1263  csv_reader_options_builder& na_values(std::vector<std::string> vals)
1264  {
1265  options.set_na_values(std::move(vals));
1266  return *this;
1267  }
1268 
1276  {
1277  options.enable_keep_default_na(val);
1278  return *this;
1279  }
1280 
1288  {
1289  options.enable_na_filter(val);
1290  return *this;
1291  }
1292 
1300  {
1301  options._dayfirst = val;
1302  return *this;
1303  }
1304 
1312  {
1313  options._timestamp_type = type;
1314  return *this;
1315  }
1316 
1320  operator csv_reader_options&&() { return std::move(options); }
1321 
1329  csv_reader_options&& build() { return std::move(options); }
1330 };
1331 
1350  csv_reader_options options,
1353  // end of group
1365 
1370  // Specify the sink to use for writer output
1371  sink_info _sink;
1372  // Set of columns to output
1373  table_view _table;
1374  // string to use for null entries
1375  std::string _na_rep = "";
1376  // Indicates whether to write headers to csv
1377  bool _include_header = true;
1378  // maximum number of rows to write in each chunk (limits memory use)
1379  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
1380  // string to use for separating lines (default "\n")
1381  std::string _line_terminator = "\n";
1382  // character to use for separating column values (default ",")
1383  char _inter_column_delimiter = ',';
1384  // string to use for values != 0 in INT8 types (default 'true')
1385  std::string _true_value = std::string{"true"};
1386  // string to use for values == 0 in INT8 types (default 'false')
1387  std::string _false_value = std::string{"false"};
1388  // Names of all columns; if empty, writer will generate column names
1389  std::vector<std::string> _names;
1390  // Quote style. Currently only MINIMAL and NONE are supported.
1391  quote_style _quoting = quote_style::MINIMAL;
1392 
1399  explicit csv_writer_options(sink_info sink, table_view const& table)
1400  : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
1401  {
1402  }
1403 
1405 
1406  public:
1412  explicit csv_writer_options() = default;
1413 
1423 
1429  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
1430 
1436  [[nodiscard]] table_view const& get_table() const { return _table; }
1437 
1443  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
1444 
1450  [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
1451 
1457  [[nodiscard]] bool is_enabled_include_header() const { return _include_header; }
1458 
1464  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
1465 
1471  [[nodiscard]] std::string const& get_line_terminator() const { return _line_terminator; }
1472 
1478  [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; }
1479 
1485  [[nodiscard]] std::string const& get_true_value() const { return _true_value; }
1486 
1492  [[nodiscard]] std::string const& get_false_value() const { return _false_value; }
1493 
1504  [[nodiscard]] quote_style get_quoting() const { return _quoting; }
1505 
1506  // Setter
1512  void set_names(std::vector<std::string> names) { _names = std::move(names); }
1513 
1519  void set_na_rep(std::string val) { _na_rep = std::move(val); }
1520 
1526  void enable_include_header(bool val) { _include_header = val; }
1527 
1533  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
1534 
1540  void set_line_terminator(std::string term) { _line_terminator = std::move(term); }
1541 
1547  void set_inter_column_delimiter(char delim) { _inter_column_delimiter = delim; }
1548 
1554  void set_true_value(std::string val) { _true_value = std::move(val); }
1555 
1561  void set_false_value(std::string val) { _false_value = std::move(val); }
1562 
1568  void set_table(table_view const& table) { _table = table; }
1569 
1580  void set_quoting(quote_style quoting)
1581  {
1582  CUDF_EXPECTS(quoting == quote_style::MINIMAL || quoting == quote_style::NONE,
1583  "Only MINIMAL and NONE are supported for quoting.");
1584  _quoting = quoting;
1585  }
1586 };
1587 
1592  csv_writer_options options;
1593 
1594  public:
1600  explicit csv_writer_options_builder() = default;
1601 
1609  : options{sink, table}
1610  {
1611  }
1612 
1619  csv_writer_options_builder& names(std::vector<std::string> names)
1620  {
1621  options._names = names;
1622  return *this;
1623  }
1624 
1632  {
1633  options._na_rep = val;
1634  return *this;
1635  };
1636 
1644  {
1645  options._include_header = val;
1646  return *this;
1647  }
1648 
1656  {
1657  options._rows_per_chunk = val;
1658  return *this;
1659  }
1660 
1668  {
1669  options._line_terminator = term;
1670  return *this;
1671  }
1672 
1680  {
1681  options._inter_column_delimiter = delim;
1682  return *this;
1683  }
1684 
1692  {
1693  options._true_value = val;
1694  return *this;
1695  }
1696 
1704  {
1705  options._false_value = val;
1706  return *this;
1707  }
1708 
1718  {
1719  options.set_quoting(quoting);
1720  return *this;
1721  }
1722 
1726  operator csv_writer_options&&() { return std::move(options); }
1727 
1735  csv_writer_options&& build() { return std::move(options); }
1736 };
1737 
1755 void write_csv(csv_writer_options const& options,
1757 
1759 struct is_supported_csv_write_type_fn {
1760  template <typename T>
1761  constexpr bool operator()() const
1762  {
1763  return cudf::io::detail::is_convertible_to_string_column<T>();
1764  }
1765 };
1767 
1774 constexpr bool is_supported_write_csv(data_type type)
1775 {
1776  return cudf::type_dispatcher(type, is_supported_csv_write_type_fn{});
1777 }
1778  // end of group
1780 } // namespace io
1781 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:238
Builder to build options for read_csv().
Definition: csv.hpp:842
csv_reader_options_builder & dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:1227
csv_reader_options_builder & false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:1251
csv_reader_options_builder & use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:938
csv_reader_options_builder & doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:1142
csv_reader_options_builder & parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:1191
csv_reader_options_builder & byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:878
csv_reader_options_builder & delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:1082
csv_reader_options_builder & skiprows(size_type skip)
Sets number of rows to skip from start.
Definition: csv.hpp:974
csv_reader_options_builder & skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:1106
csv_reader_options && build()
Moves the csv_reader_options member once it's built.
Definition: csv.hpp:1329
csv_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:1215
csv_reader_options_builder & quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:1130
csv_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:1263
csv_reader_options_builder & true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:1239
csv_reader_options_builder & decimal(char val)
Sets decimal point character.
Definition: csv.hpp:1046
csv_reader_options_builder & na_filter(bool val)
Sets whether to disable null filter.
Definition: csv.hpp:1287
csv_reader_options_builder & thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:1034
csv_reader_options_builder & parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:1203
csv_reader_options_builder & detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:1155
csv_reader_options_builder & windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:1070
csv_reader_options_builder & parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:1179
csv_reader_options_builder & nrows(size_type rows)
Sets number of rows to read.
Definition: csv.hpp:962
csv_reader_options_builder & names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:902
csv_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:1311
csv_reader_options_builder & mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:926
csv_reader_options_builder & skipfooter(size_type skip)
Sets number of rows to skip from end.
Definition: csv.hpp:986
csv_reader_options_builder()=default
Default constructor.
csv_reader_options_builder & byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:890
csv_reader_options_builder & keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:1275
csv_reader_options_builder & quoting(quote_style style)
Sets quoting style.
Definition: csv.hpp:1118
csv_reader_options_builder & lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:1010
csv_reader_options_builder & delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:1022
csv_reader_options_builder & use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:950
csv_reader_options_builder & parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:1167
csv_reader_options_builder(source_info src)
Constructor from source info.
Definition: csv.hpp:858
csv_reader_options_builder & comment(char val)
Sets comment line start character.
Definition: csv.hpp:1058
csv_reader_options_builder & compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:866
csv_reader_options_builder & header(size_type hdr)
Sets header row index.
Definition: csv.hpp:998
csv_reader_options_builder & dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:1299
csv_reader_options_builder & prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:914
csv_reader_options_builder & skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:1094
Settings to use for read_csv().
Definition: csv.hpp:40
void enable_doublequote(bool val)
Sets whether a quote inside a value is double-quoted.
Definition: csv.hpp:710
void set_use_cols_indexes(std::vector< int > col_indices)
Sets indexes of columns to read.
Definition: csv.hpp:558
size_type get_skiprows() const
Returns number of rows to skip from start.
Definition: csv.hpp:270
bool is_enabled_delim_whitespace() const
Whether to treat whitespace as field delimiter.
Definition: csv.hpp:333
std::vector< int > const & get_parse_dates_indexes() const
Returns indexes of columns to read as datetime.
Definition: csv.hpp:396
void set_byte_range_offset(std::size_t offset)
Sets number of bytes to skip from source start.
Definition: csv.hpp:497
quote_style get_quoting() const
Returns quoting style.
Definition: csv.hpp:354
void set_parse_dates(std::vector< std::string > col_names)
Sets names of columns to read as datetime.
Definition: csv.hpp:725
void set_parse_dates(std::vector< int > col_indices)
Sets indexes of columns to read as datetime.
Definition: csv.hpp:735
bool is_enabled_doublequote() const
Whether a quote inside a value is double-quoted.
Definition: csv.hpp:368
char get_delimiter() const
Returns field delimiter.
Definition: csv.hpp:298
char get_lineterminator() const
Returns line terminator.
Definition: csv.hpp:291
csv_reader_options()=default
Default constructor.
void enable_detect_whitespace_around_quotes(bool val)
Sets whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doubleq...
Definition: csv.hpp:718
void set_false_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean false values.
Definition: csv.hpp:786
void set_decimal(char val)
Sets decimal point character.
Definition: csv.hpp:644
std::string get_prefix() const
Returns prefix to be used for column ID.
Definition: csv.hpp:232
bool is_enabled_detect_whitespace_around_quotes() const
Whether to detect quotes surrounded by spaces e.g. "data". This flag has no effect when _doublequote ...
Definition: csv.hpp:376
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
Definition: csv.hpp:448
char get_thousands() const
Returns numeric data thousands separator.
Definition: csv.hpp:305
bool is_enabled_mangle_dupe_cols() const
Whether to rename duplicate column names.
Definition: csv.hpp:239
void set_dtypes(std::map< std::string, data_type > types)
Sets per-column types.
Definition: csv.hpp:762
std::size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: csv.hpp:187
std::vector< int > const & get_parse_hex_indexes() const
Returns indexes of columns to read as hexadecimal.
Definition: csv.hpp:416
std::vector< std::string > const & get_false_values() const
Returns additional values to recognize as boolean false values.
Definition: csv.hpp:441
void set_source(source_info src)
Sets source info.
Definition: csv.hpp:483
void enable_dayfirst(bool val)
Sets whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:828
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
Definition: csv.hpp:796
void set_quoting(quote_style quoting)
Sets the expected quoting style used in the input CSV data.
Definition: csv.hpp:691
std::variant< std::vector< data_type >, std::map< std::string, data_type > > const & get_dtypes() const
Returns per-column types.
Definition: csv.hpp:424
void set_timestamp_type(data_type type)
Sets timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:835
std::size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: csv.hpp:173
data_type get_timestamp_type() const
Returns timestamp_type to which all timestamp columns will be cast.
Definition: csv.hpp:476
bool is_enabled_na_filter() const
Whether the null filter is enabled.
Definition: csv.hpp:462
bool is_enabled_skip_blank_lines() const
Whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:347
char get_comment() const
Returns comment line start character.
Definition: csv.hpp:319
void set_lineterminator(char term)
Sets line terminator.
Definition: csv.hpp:623
void set_quotechar(char ch)
Sets quoting character.
Definition: csv.hpp:703
bool is_enabled_windowslinetermination() const
Whether to treat \r\n as line terminator.
Definition: csv.hpp:326
void enable_skip_blank_lines(bool val)
Sets whether to ignore empty lines or parse line values as invalid.
Definition: csv.hpp:679
void enable_windowslinetermination(bool val)
Sets whether to treat \r\n as line terminator.
Definition: csv.hpp:658
void set_skiprows(size_type skiprows)
Sets number of rows to skip from start.
Definition: csv.hpp:584
void set_compression(compression_type comp)
Sets compression format of the source.
Definition: csv.hpp:490
void enable_delim_whitespace(bool val)
Sets whether to treat whitespace as field delimiter.
Definition: csv.hpp:665
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:225
void set_dtypes(std::vector< data_type > types)
Sets per-column types.
Definition: csv.hpp:769
void set_skipfooter(size_type skipfooter)
Sets number of rows to skip from end.
Definition: csv.hpp:598
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: csv.hpp:469
std::size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: csv.hpp:201
void set_names(std::vector< std::string > col_names)
Sets names of the columns.
Definition: csv.hpp:527
source_info const & get_source() const
Returns source info.
Definition: csv.hpp:159
void enable_keep_default_na(bool val)
Sets whether to keep the built-in default NA values.
Definition: csv.hpp:810
std::vector< std::string > const & get_parse_dates_names() const
Returns names of columns to read as datetime.
Definition: csv.hpp:386
void set_prefix(std::string pfx)
Sets prefix to be used for column ID.
Definition: csv.hpp:534
static csv_reader_options_builder builder(source_info src)
Creates a csv_reader_options_builder which will build csv_reader_options.
std::size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: csv.hpp:180
std::vector< int > const & get_use_cols_indexes() const
Returns indexes of columns to read.
Definition: csv.hpp:256
std::vector< std::string > const & get_use_cols_names() const
Returns names of the columns to be read.
Definition: csv.hpp:246
void set_use_cols_names(std::vector< std::string > col_names)
Sets names of the columns to be read.
Definition: csv.hpp:548
compression_type get_compression() const
Returns compression format of the source.
Definition: csv.hpp:166
char get_quotechar() const
Returns quoting character.
Definition: csv.hpp:361
void set_true_values(std::vector< std::string > vals)
Sets additional values to recognize as boolean true values.
Definition: csv.hpp:776
bool is_enabled_keep_default_na() const
Whether to keep the built-in default NA values.
Definition: csv.hpp:455
void set_header(size_type hdr)
Sets header row index.
Definition: csv.hpp:616
char get_decimal() const
Returns decimal point character.
Definition: csv.hpp:312
std::vector< std::string > const & get_true_values() const
Returns additional values to recognize as boolean true values.
Definition: csv.hpp:434
void set_parse_hex(std::vector< int > col_indices)
Sets indexes of columns to parse as hexadecimal.
Definition: csv.hpp:755
void set_thousands(char val)
Sets numeric data thousands separator.
Definition: csv.hpp:637
void enable_na_filter(bool val)
Sets whether to enable the null filter.
Definition: csv.hpp:817
void set_byte_range_size(std::size_t size)
Sets number of bytes to read.
Definition: csv.hpp:512
void set_delimiter(char delim)
Sets field delimiter.
Definition: csv.hpp:630
void enable_mangle_dupe_cols(bool val)
Sets whether to rename duplicate column names.
Definition: csv.hpp:541
size_type get_nrows() const
Returns number of rows to read.
Definition: csv.hpp:263
std::vector< std::string > const & get_parse_hex_names() const
Returns names of columns to read as hexadecimal.
Definition: csv.hpp:406
void enable_skipinitialspace(bool val)
Sets whether to skip whitespace after the delimiter.
Definition: csv.hpp:672
size_type get_skipfooter() const
Returns number of rows to skip from end.
Definition: csv.hpp:277
void set_nrows(size_type nrows)
Sets number of rows to read.
Definition: csv.hpp:568
void set_comment(char val)
Sets comment line start character.
Definition: csv.hpp:651
bool is_enabled_skipinitialspace() const
Whether to skip whitespace after the delimiter.
Definition: csv.hpp:340
size_type get_header() const
Returns header row index.
Definition: csv.hpp:284
void set_parse_hex(std::vector< std::string > col_names)
Sets names of columns to parse as hexadecimal.
Definition: csv.hpp:745
Builder to build options for write_csv().
Definition: csv.hpp:1591
csv_writer_options_builder()=default
Default constructor.
csv_writer_options && build()
Moves the csv_writer_options member once it's built.
Definition: csv.hpp:1735
csv_writer_options_builder & quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1717
csv_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1691
csv_writer_options_builder & include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1643
csv_writer_options_builder & na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1631
csv_writer_options_builder & line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1667
csv_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1703
csv_writer_options_builder & names(std::vector< std::string > names)
Sets optional column names.
Definition: csv.hpp:1619
csv_writer_options_builder & inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1679
csv_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: csv.hpp:1608
csv_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1655
Settings to use for write_csv().
Definition: csv.hpp:1369
void set_table(table_view const &table)
(Re)sets the table being written.
Definition: csv.hpp:1568
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: csv.hpp:1533
void set_quoting(quote_style quoting)
Sets the quote style for the writer.
Definition: csv.hpp:1580
void set_line_terminator(std::string term)
Sets character used for separating lines.
Definition: csv.hpp:1540
csv_writer_options()=default
Default constructor.
void set_inter_column_delimiter(char delim)
Sets character used for separating column values.
Definition: csv.hpp:1547
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: csv.hpp:1554
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create csv_writer_options.
table_view const & get_table() const
Returns table that would be written to output.
Definition: csv.hpp:1436
void enable_include_header(bool val)
Enables/Disables headers being written to csv.
Definition: csv.hpp:1526
bool is_enabled_include_header() const
Whether to write headers to csv.
Definition: csv.hpp:1457
void set_na_rep(std::string val)
Sets string to be used for null entries.
Definition: csv.hpp:1519
char get_inter_column_delimiter() const
Returns character used for separating column values.
Definition: csv.hpp:1478
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: csv.hpp:1492
std::string const & get_line_terminator() const
Returns character used for separating lines.
Definition: csv.hpp:1471
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: csv.hpp:1429
std::vector< std::string > const & get_names() const
Returns names of the columns.
Definition: csv.hpp:1443
quote_style get_quoting() const
Returns the quote style for the writer.
Definition: csv.hpp:1504
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: csv.hpp:1485
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: csv.hpp:1561
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: csv.hpp:1464
std::string const & get_na_rep() const
Returns string to be used for null entries.
Definition: csv.hpp:1450
void set_names(std::vector< std::string > names)
Sets optional associated column names.
Definition: csv.hpp:1512
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:82
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_csv(csv_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a CSV dataset into a set of columns.
quote_style
Behavior when handling quotations in field data.
Definition: io/types.hpp:75
compression_type
Compression algorithms.
Definition: io/types.hpp:46
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to CSV format.
constexpr bool is_supported_write_csv(data_type type)
Checks if a cudf::data_type is supported for CSV writing.
Definition: csv.hpp:1774
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
CUDF_HOST_DEVICE constexpr decltype(auto) __forceinline__ type_dispatcher(cudf::data_type dtype, Functor f, Ts &&... args)
Invokes an operator() template with the type instantiation based on the specified cudf::data_type's id().
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:143
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:182
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:26
Destination information for write interfaces.
Definition: io/types.hpp:471
Source information for read interfaces.
Definition: io/types.hpp:316
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.