io/json.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "types.hpp"
20 
22 #include <cudf/types.hpp>
23 
24 #include <rmm/mr/device/per_device_resource.hpp>
25 
26 #include <map>
27 #include <string>
28 #include <variant>
29 #include <vector>
30 
31 namespace cudf {
32 namespace io {
39 class json_reader_options_builder;
40 
50 
54  std::map<std::string, schema_element> child_types;
55 };
56 
61  FAIL,
63 };
64 
89  source_info _source;
90 
91  // Data types of the column; empty to infer dtypes
92  std::variant<std::vector<data_type>,
93  std::map<std::string, data_type>,
94  std::map<std::string, schema_element>>
95  _dtypes;
96  // Specify the compression format of the source or infer from file extension
98 
99  // Read the file as a json object per line
100  bool _lines = false;
101  // Parse mixed types as a string column
102  bool _mixed_types_as_string = false;
103 
104  // Bytes to skip from the start
105  size_t _byte_range_offset = 0;
106  // Bytes to read; always reads complete rows
107  size_t _byte_range_size = 0;
108 
109  // Whether to parse dates as DD/MM versus MM/DD
110  bool _dayfirst = false;
111 
112  // Whether to use the legacy reader
113  bool _legacy = false;
114 
115  // Whether to keep the quote characters of string values
116  bool _keep_quotes = false;
117 
118  // Normalize single quotes
119  bool _normalize_single_quotes = false;
120 
121  // Normalize unquoted spaces and tabs
122  bool _normalize_whitespace = false;
123 
124  // Whether to recover after an invalid JSON line
126 
/**
 * @brief Constructor from source info.
 *
 * Only `_source` is set here; every other option keeps its in-class default.
 *
 * @param src Source information for the read; moved into `_source`
 */
132  explicit json_reader_options(source_info src) : _source{std::move(src)} {}
133 
135 
136  public:
/**
 * @brief Default constructor.
 *
 * Defaulted: all members take their in-class default values.
 */
142  json_reader_options() = default;
143 
151 
/**
 * @brief Returns source info.
 *
 * @return Const reference to the stored source info
 */
157  [[nodiscard]] source_info const& get_source() const { return _source; }
158 
/**
 * @brief Returns data types of the columns.
 *
 * The variant holds exactly one of: a vector of `data_type`, a string-keyed map of
 * `data_type`, or a string-keyed map of `schema_element`.
 *
 * @return Const reference to the stored dtypes variant
 */
164  std::variant<std::vector<data_type>,
165  std::map<std::string, data_type>,
166  std::map<std::string, schema_element>> const&
167  get_dtypes() const
168  {
169  return _dtypes;
170  }
171 
/**
 * @brief Returns compression format of the source.
 *
 * @return The stored compression type
 */
177  compression_type get_compression() const { return _compression; }
178 
/**
 * @brief Returns number of bytes to skip from source start.
 *
 * @return Byte offset (0 by default)
 */
184  size_t get_byte_range_offset() const { return _byte_range_offset; }
185 
/**
 * @brief Returns number of bytes to read.
 *
 * @return Byte range size (0 by default)
 */
191  size_t get_byte_range_size() const { return _byte_range_size; }
192 
199  {
200  if (_byte_range_size == 0) {
201  return 0;
202  } else {
203  return _byte_range_size + get_byte_range_padding();
204  }
205  }
206 
212  size_t get_byte_range_padding() const
213  {
214  auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
215 
216  auto const max_row_bytes = 16 * 1024; // 16KB
217  auto const column_bytes = 64;
218  auto const base_padding = 1024; // 1KB
219 
220  if (num_columns == 0) {
221  // Use flat size if the number of columns is not known
222  return max_row_bytes;
223  }
224 
225  // Expand the size based on the number of columns, if available
226  return base_padding + num_columns * column_bytes;
227  }
228 
/**
 * @brief Whether to read the file as a json object per line.
 *
 * @return `true` if reading the file as a json object per line (false by default)
 */
234  bool is_enabled_lines() const { return _lines; }
235 
/**
 * @brief Whether to parse mixed types as a string column.
 *
 * @return `true` if mixed types are parsed as a string column (false by default)
 */
241  bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
242 
/**
 * @brief Whether to parse dates as DD/MM versus MM/DD.
 *
 * @return `true` if dates are parsed as DD/MM, `false` if MM/DD (false by default)
 */
248  bool is_enabled_dayfirst() const { return _dayfirst; }
249 
/**
 * @brief Whether the legacy reader should be used.
 *
 * @return `true` if the legacy reader is selected (false by default)
 */
255  bool is_enabled_legacy() const { return _legacy; }
256 
/**
 * @brief Whether the reader should keep quotes of string values.
 *
 * @return `true` if quote characters of string values are kept (false by default)
 */
262  bool is_enabled_keep_quotes() const { return _keep_quotes; }
263 
/**
 * @brief Whether the reader should normalize single quotes around strings.
 *
 * @return `true` if single-quote normalization is enabled (false by default)
 */
269  bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
270 
/**
 * @brief Whether the reader should normalize unquoted whitespace characters.
 *
 * @return `true` if unquoted whitespace normalization is enabled (false by default)
 */
276  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
277 
/**
 * @brief Queries the JSON reader's behavior on invalid JSON lines.
 *
 * @return The stored recovery mode
 */
283  json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
284 
/**
 * @brief Set data types for columns to be read.
 *
 * Replaces whatever alternative `_dtypes` currently holds.
 *
 * @param types Vector of dtypes; moved into the options
 */
290  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
291 
/**
 * @brief Set data types for columns to be read.
 *
 * Replaces whatever alternative `_dtypes` currently holds.
 *
 * @param types String-keyed map of dtypes; moved into the options
 */
297  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
298 
/**
 * @brief Set data types for a potentially nested column hierarchy.
 *
 * Replaces whatever alternative `_dtypes` currently holds.
 *
 * @param types String-keyed map of schema elements; moved into the options
 */
304  void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); }
305 
/**
 * @brief Set the compression type.
 *
 * @param comp_type The compression type to store
 */
311  void set_compression(compression_type comp_type) { _compression = comp_type; }
312 
318  void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; }
319 
325  void set_byte_range_size(size_type size) { _byte_range_size = size; }
326 
/**
 * @brief Set whether to read the file as a json object per line.
 *
 * @param val Boolean value to enable/disable the option
 */
332  void enable_lines(bool val) { _lines = val; }
333 
/**
 * @brief Set whether to parse mixed types as a string column.
 *
 * @param val Boolean value to enable/disable the option
 */
340  void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
341 
/**
 * @brief Set whether to parse dates as DD/MM versus MM/DD.
 *
 * @param val `true` for DD/MM, `false` for MM/DD
 */
347  void enable_dayfirst(bool val) { _dayfirst = val; }
348 
/**
 * @brief Set whether to use the legacy reader.
 *
 * @param val Boolean value to enable/disable the legacy reader
 */
354  void enable_legacy(bool val) { _legacy = val; }
355 
/**
 * @brief Set whether the reader should keep quotes of string values.
 *
 * @param val Boolean value to enable/disable keeping quote characters
 */
362  void enable_keep_quotes(bool val) { _keep_quotes = val; }
363 
/**
 * @brief Set whether the reader should enable normalization of single quotes around strings.
 *
 * @param val Boolean value to enable/disable the option
 */
370  void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
371 
/**
 * @brief Set whether the reader should enable normalization of unquoted whitespace.
 *
 * @param val Boolean value to enable/disable the option
 */
378  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
379 
/**
 * @brief Specifies the JSON reader's behavior on invalid JSON lines.
 *
 * @param val The recovery mode to store
 */
385  void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }
386 };
387 
392  json_reader_options options;
393 
394  public:
/**
 * @brief Default constructor.
 *
 * Defaulted: the contained `options` member is default-constructed.
 */
400  explicit json_reader_options_builder() = default;
401 
/**
 * @brief Constructor from source info.
 *
 * @param src Source information for the read; moved into the contained options
 */
407  explicit json_reader_options_builder(source_info src) : options{std::move(src)} {}
408 
/**
 * @brief Set data types for columns to be read.
 *
 * @param types Vector of dtypes; moved into the contained options
 * @return this for chaining
 */
415  json_reader_options_builder& dtypes(std::vector<data_type> types)
416  {
417  options._dtypes = std::move(types);
418  return *this;
419  }
420 
/**
 * @brief Set data types for columns to be read.
 *
 * @param types String-keyed map of dtypes; moved into the contained options
 * @return this for chaining
 */
427  json_reader_options_builder& dtypes(std::map<std::string, data_type> types)
428  {
429  options._dtypes = std::move(types);
430  return *this;
431  }
432 
/**
 * @brief Set data types for columns to be read.
 *
 * @param types String-keyed map of schema elements; moved into the contained options
 * @return this for chaining
 */
439  json_reader_options_builder& dtypes(std::map<std::string, schema_element> types)
440  {
441  options._dtypes = std::move(types);
442  return *this;
443  }
444 
452  {
453  options._compression = comp_type;
454  return *this;
455  }
456 
464  {
465  options._byte_range_offset = offset;
466  return *this;
467  }
468 
476  {
477  options._byte_range_size = size;
478  return *this;
479  }
480 
488  {
489  options._lines = val;
490  return *this;
491  }
492 
501  {
502  options._mixed_types_as_string = val;
503  return *this;
504  }
505 
513  {
514  options._dayfirst = val;
515  return *this;
516  }
517 
525  {
526  options._legacy = val;
527  return *this;
528  }
529 
538  {
539  options._keep_quotes = val;
540  return *this;
541  }
542 
551  {
552  options._normalize_single_quotes = val;
553  return *this;
554  }
555 
564  {
565  options._normalize_whitespace = val;
566  return *this;
567  }
568 
576  {
577  options._recovery_mode = val;
578  return *this;
579  }
580 
/**
 * @brief Move json_reader_options member once it's built.
 *
 * Implicit conversion that hands out the contained options as an rvalue reference.
 */
584  operator json_reader_options&&() { return std::move(options); }
585 
/**
 * @brief Move json_reader_options member once it's built.
 *
 * @return Rvalue reference to the contained options
 */
593  json_reader_options&& build() { return std::move(options); }
594 };
595 
614  json_reader_options options,
615  rmm::cuda_stream_view stream = cudf::get_default_stream(),
616  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
617  // end of group
619 
630 
635  // Specify the sink to use for writer output
636  sink_info _sink;
637  // Set of columns to output
638  table_view _table;
639  // string to use for null entries
640  std::string _na_rep = "";
641  // Indicates whether to output nulls as 'null' or exclude the field
642  bool _include_nulls = false;
643  // Indicates whether to use JSON lines for records format
644  bool _lines = false;
645  // maximum number of rows to write in each chunk (limits memory use)
646  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
647  // string to use for values != 0 in INT8 types (default 'true')
648  std::string _true_value = std::string{"true"};
649  // string to use for values == 0 in INT8 types (default 'false')
650  std::string _false_value = std::string{"false"};
651  // Names of all columns; if empty, writer will generate column names
652  std::optional<table_metadata> _metadata; // Optional column names
653 
/**
 * @brief Constructor from sink and table.
 *
 * `_rows_per_chunk` is initialized to the table's row count here, replacing the
 * in-class default of `std::numeric_limits<size_type>::max()`.
 *
 * @param sink The sink used for writer output
 * @param table Table to be written to output
 */
660  explicit json_writer_options(sink_info const& sink, table_view const& table)
661  : _sink(sink), _table(table), _rows_per_chunk(table.num_rows())
662  {
663  }
664 
666 
667  public:
/**
 * @brief Default constructor.
 *
 * Defaulted: all members take their in-class default values.
 */
673  explicit json_writer_options() = default;
674 
684 
/**
 * @brief Returns sink used for writer output.
 *
 * @return Const reference to the stored sink info
 */
690  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
691 
/**
 * @brief Returns table that would be written to output.
 *
 * @return Const reference to the stored table view
 */
697  [[nodiscard]] table_view const& get_table() const { return _table; }
698 
/**
 * @brief Returns metadata information.
 *
 * @return Const reference to the optional metadata; empty unless metadata was set
 */
704  [[nodiscard]] std::optional<table_metadata> const& get_metadata() const { return _metadata; }
705 
/**
 * @brief Returns string to use for null entries.
 *
 * @return Const reference to the stored string (empty by default)
 */
711  [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
712 
/**
 * @brief Whether to output nulls as 'null'.
 *
 * @return `true` if nulls are output as 'null' (false by default)
 */
718  [[nodiscard]] bool is_enabled_include_nulls() const { return _include_nulls; }
719 
/**
 * @brief Whether to use JSON lines for records format.
 *
 * @return `true` if JSON lines is used for records format (false by default)
 */
725  [[nodiscard]] bool is_enabled_lines() const { return _lines; }
726 
/**
 * @brief Returns maximum number of rows to process for each file write.
 *
 * @return The stored rows-per-chunk limit
 */
732  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
733 
/**
 * @brief Returns string used for values != 0 in INT8 types.
 *
 * @return Const reference to the stored string ("true" by default)
 */
739  [[nodiscard]] std::string const& get_true_value() const { return _true_value; }
740 
/**
 * @brief Returns string used for values == 0 in INT8 types.
 *
 * @return Const reference to the stored string ("false" by default)
 */
746  [[nodiscard]] std::string const& get_false_value() const { return _false_value; }
747 
748  // Setter
749 
/**
 * @brief Sets table to be written to output.
 *
 * @param tbl Table view to store
 */
755  void set_table(table_view tbl) { _table = tbl; }
756 
/**
 * @brief Sets metadata.
 *
 * @param metadata Metadata to store; moved into the options
 */
762  void set_metadata(table_metadata metadata) { _metadata = std::move(metadata); }
763 
/**
 * @brief Sets string to use for null entries.
 *
 * @param val String to represent null value; moved into the options
 */
769  void set_na_rep(std::string val) { _na_rep = std::move(val); }
770 
/**
 * @brief Enables/Disables output of nulls as 'null'.
 *
 * @param val Boolean value to enable/disable the option
 */
776  void enable_include_nulls(bool val) { _include_nulls = val; }
777 
/**
 * @brief Enables/Disables JSON lines for records format.
 *
 * @param val Boolean value to enable/disable JSON lines
 */
783  void enable_lines(bool val) { _lines = val; }
784 
/**
 * @brief Sets maximum number of rows to process for each file write.
 *
 * @param val Number of rows per chunk
 */
790  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
791 
/**
 * @brief Sets string used for values != 0 in INT8 types.
 *
 * @param val String to represent values != 0 in INT8 types; moved into the options
 */
797  void set_true_value(std::string val) { _true_value = std::move(val); }
798 
/**
 * @brief Sets string used for values == 0 in INT8 types.
 *
 * @param val String to represent values == 0 in INT8 types; moved into the options
 */
804  void set_false_value(std::string val) { _false_value = std::move(val); }
805 };
806 
811  json_writer_options options;
812 
813  public:
/**
 * @brief Default constructor.
 *
 * Defaulted: the contained `options` member is default-constructed.
 */
819  explicit json_writer_options_builder() = default;
820 
828  : options{sink, table}
829  {
830  }
831 
839  {
840  options._table = tbl;
841  return *this;
842  }
843 
851  {
852  options._metadata = std::move(metadata);
853  return *this;
854  }
855 
863  {
864  options._na_rep = std::move(val);
865  return *this;
866  };
867 
875  {
876  options._include_nulls = val;
877  return *this;
878  }
879 
887  {
888  options._lines = val;
889  return *this;
890  }
891 
899  {
900  options._rows_per_chunk = val;
901  return *this;
902  }
903 
911  {
912  options._true_value = std::move(val);
913  return *this;
914  }
915 
923  {
924  options._false_value = std::move(val);
925  return *this;
926  }
927 
/**
 * @brief Move json_writer_options member once it's built.
 *
 * Implicit conversion that hands out the contained options as an rvalue reference.
 */
931  operator json_writer_options&&() { return std::move(options); }
932 
/**
 * @brief Move json_writer_options member once it's built.
 *
 * @return Rvalue reference to the contained options
 */
940  json_writer_options&& build() { return std::move(options); }
941 };
942 
/**
 * @brief Writes a set of columns to JSON format.
 *
 * @param options Settings for controlling writing behavior
 * @param stream CUDA stream to use; defaults to `cudf::get_default_stream()`
 * @param mr Device memory resource; defaults to `rmm::mr::get_current_device_resource()`
 *           (NOTE(review): presumably used for device memory allocation during the write — confirm)
 */
961 void write_json(json_writer_options const& options,
962  rmm::cuda_stream_view stream = cudf::get_default_stream(),
963  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
964  // end of group
966 } // namespace io
967 } // namespace cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:241
Builds settings to use for read_json().
Definition: io/json.hpp:391
json_reader_options_builder & normalize_single_quotes(bool val)
Set whether the reader should normalize single quotes around strings.
Definition: io/json.hpp:550
json_reader_options_builder & keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
Definition: io/json.hpp:537
json_reader_options_builder & normalize_whitespace(bool val)
Set whether the reader should normalize unquoted whitespace.
Definition: io/json.hpp:563
json_reader_options_builder & dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:512
json_reader_options_builder & recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:575
json_reader_options_builder & lines(bool val)
Set whether to read the file as a json object per line.
Definition: io/json.hpp:487
json_reader_options_builder & dtypes(std::vector< data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:415
json_reader_options && build()
Move the json_reader_options member once it's built.
Definition: io/json.hpp:593
json_reader_options_builder & mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
Definition: io/json.hpp:500
json_reader_options_builder & compression(compression_type comp_type)
Set the compression type.
Definition: io/json.hpp:451
json_reader_options_builder(source_info src)
Constructor from source info.
Definition: io/json.hpp:407
json_reader_options_builder & byte_range_size(size_type size)
Set number of bytes to read.
Definition: io/json.hpp:475
json_reader_options_builder & dtypes(std::map< std::string, schema_element > types)
Set data types for columns to be read.
Definition: io/json.hpp:439
json_reader_options_builder & legacy(bool val)
Set whether to use the legacy reader.
Definition: io/json.hpp:524
json_reader_options_builder & byte_range_offset(size_type offset)
Set number of bytes to skip from source start.
Definition: io/json.hpp:463
json_reader_options_builder()=default
Default constructor.
json_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:427
Input arguments to the read_json interface.
Definition: io/json.hpp:88
void enable_mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
Definition: io/json.hpp:340
void set_compression(compression_type comp_type)
Set the compression type.
Definition: io/json.hpp:311
void set_dtypes(std::vector< data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:290
void enable_normalize_single_quotes(bool val)
Set whether the reader should enable normalization of single quotes around strings.
Definition: io/json.hpp:370
bool is_enabled_keep_quotes() const
Whether the reader should keep quotes of string values.
Definition: io/json.hpp:262
void enable_normalize_whitespace(bool val)
Set whether the reader should enable normalization of unquoted whitespace.
Definition: io/json.hpp:378
size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: io/json.hpp:184
source_info const & get_source() const
Returns source info.
Definition: io/json.hpp:157
void enable_legacy(bool val)
Set whether to use the legacy reader.
Definition: io/json.hpp:354
void set_dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:297
bool is_enabled_lines() const
Whether to read the file as a json object per line.
Definition: io/json.hpp:234
void set_byte_range_size(size_type size)
Set number of bytes to read.
Definition: io/json.hpp:325
bool is_enabled_mixed_types_as_string() const
Whether to parse mixed types as a string column.
Definition: io/json.hpp:241
json_reader_options()=default
Default constructor.
void enable_dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:347
std::variant< std::vector< data_type >, std::map< std::string, data_type >, std::map< std::string, schema_element > > const & get_dtypes() const
Returns data types of the columns.
Definition: io/json.hpp:167
size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: io/json.hpp:198
void set_recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:385
bool is_enabled_normalize_whitespace() const
Whether the reader should normalize unquoted whitespace characters.
Definition: io/json.hpp:276
void enable_lines(bool val)
Set whether to read the file as a json object per line.
Definition: io/json.hpp:332
void enable_keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
Definition: io/json.hpp:362
bool is_enabled_normalize_single_quotes() const
Whether the reader should normalize single quotes around strings.
Definition: io/json.hpp:269
compression_type get_compression() const
Returns compression format of the source.
Definition: io/json.hpp:177
void set_dtypes(std::map< std::string, schema_element > types)
Set data types for a potentially nested column hierarchy.
Definition: io/json.hpp:304
size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: io/json.hpp:191
json_recovery_mode_t recovery_mode() const
Queries the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:283
static json_reader_options_builder builder(source_info src)
Create a json_reader_options_builder which will build json_reader_options.
void set_byte_range_offset(size_type offset)
Set number of bytes to skip from source start.
Definition: io/json.hpp:318
bool is_enabled_legacy() const
Whether the legacy reader should be used.
Definition: io/json.hpp:255
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:248
size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: io/json.hpp:212
Builder to build options for write_json()
Definition: io/json.hpp:810
json_writer_options_builder & include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
Definition: io/json.hpp:874
json_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: io/json.hpp:838
json_writer_options_builder()=default
Default constructor.
json_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: io/json.hpp:898
json_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: io/json.hpp:910
json_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: io/json.hpp:922
json_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: io/json.hpp:827
json_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
Definition: io/json.hpp:862
json_writer_options_builder & metadata(table_metadata metadata)
Sets optional metadata (with column names).
Definition: io/json.hpp:850
json_writer_options && build()
Move the json_writer_options member once it's built.
Definition: io/json.hpp:940
json_writer_options_builder & lines(bool val)
Enables/Disables JSON lines for records format.
Definition: io/json.hpp:886
Settings to use for write_json().
Definition: io/json.hpp:634
table_view const & get_table() const
Returns table that would be written to output.
Definition: io/json.hpp:697
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: io/json.hpp:804
void enable_include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
Definition: io/json.hpp:776
bool is_enabled_include_nulls() const
Whether to output nulls as 'null'.
Definition: io/json.hpp:718
void enable_lines(bool val)
Enables/Disables JSON lines for records format.
Definition: io/json.hpp:783
void set_na_rep(std::string val)
Sets string to use for null entries.
Definition: io/json.hpp:769
static json_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create json_writer_options.
json_writer_options()=default
Default constructor.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: io/json.hpp:797
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: io/json.hpp:690
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: io/json.hpp:790
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: io/json.hpp:739
void set_table(table_view tbl)
Sets table to be written to output.
Definition: io/json.hpp:755
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: io/json.hpp:746
bool is_enabled_lines() const
Whether to use JSON lines for records format.
Definition: io/json.hpp:725
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: io/json.hpp:732
std::optional< table_metadata > const & get_metadata() const
Returns metadata information.
Definition: io/json.hpp:704
std::string const & get_na_rep() const
Returns string to use for null entries.
Definition: io/json.hpp:711
void set_metadata(table_metadata metadata)
Sets metadata.
Definition: io/json.hpp:762
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
A set of cudf::column's of the same size.
Definition: table.hpp:40
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:93
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads a JSON dataset into a set of columns.
json_recovery_mode_t
Control the error recovery behavior of the json parser.
Definition: io/json.hpp:60
@ RECOVER_WITH_NULL
Recovers from an error, replacing invalid records with null.
@ FAIL
Does not recover from an error when encountering an invalid format.
compression_type
Compression algorithms.
Definition: io/types.hpp:56
@ AUTO
Automatically detect or select compression format.
void write_json(json_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Writes a set of columns to JSON format.
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
cuDF interfaces
Definition: aggregation.hpp:34
Allows specifying the target types for nested JSON data via json_reader_options' set_dtypes method.
Definition: io/json.hpp:45
data_type type
The type that this column should be converted to.
Definition: io/json.hpp:49
std::map< std::string, schema_element > child_types
Allows specifying this column's child columns target type.
Definition: io/json.hpp:54
Destination information for write interfaces.
Definition: io/types.hpp:489
Source information for read interfaces.
Definition: io/types.hpp:314
Table metadata returned by IO readers.
Definition: io/types.hpp:257
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:269
Class definitions for (mutable)_table_view
Type declarations for libcudf.