io/json.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "types.hpp"
20 
22 #include <cudf/types.hpp>
23 
25 #include <rmm/resource_ref.hpp>
26 
27 #include <map>
28 #include <string>
29 #include <utility>
30 #include <variant>
31 #include <vector>
32 
33 namespace CUDF_EXPORT cudf {
34 namespace io {
41 class json_reader_options_builder;
42 
52 
56  std::map<std::string, schema_element> child_types;
57 };
58 
63  FAIL,
65 };
66 
91  source_info _source;
92 
93  // Data types of the column; empty to infer dtypes
94  std::variant<std::vector<data_type>,
95  std::map<std::string, data_type>,
96  std::map<std::string, schema_element>>
97  _dtypes;
98  // Specify the compression format of the source or infer from file extension
99  compression_type _compression = compression_type::AUTO;
100 
101  // Read the file as a json object per line
102  bool _lines = false;
103  // Parse mixed types as a string column
104  bool _mixed_types_as_string = false;
105  // Delimiter separating records in JSON lines
106  char _delimiter = '\n';
107  // Prune columns on read, selected based on the _dtypes option
108  bool _prune_columns = false;
109 
110  // Bytes to skip from the start
111  size_t _byte_range_offset = 0;
112  // Bytes to read; always reads complete rows
113  size_t _byte_range_size = 0;
114 
115  // Whether to parse dates as DD/MM versus MM/DD
116  bool _dayfirst = false;
117 
118  // Whether to use the legacy reader
119  bool _legacy = false;
120 
121  // Whether to keep the quote characters of string values
122  bool _keep_quotes = false;
123 
124  // Normalize single quotes
125  bool _normalize_single_quotes = false;
126 
127  // Normalize unquoted spaces and tabs
128  bool _normalize_whitespace = false;
129 
130  // Whether to recover after an invalid JSON line
131  json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
132 
// Constructor from source info; `src` is moved into the `_source` member.
138  explicit json_reader_options(source_info src) : _source{std::move(src)} {}
139 
141 
142  public:
// Default constructor; every option keeps its in-class default value.
148  json_reader_options() = default;
149 
157 
// Returns source info.
163  [[nodiscard]] source_info const& get_source() const { return _source; }
164 
// Returns data types of the columns.
// The variant holds whichever form was last set: a vector of data_type, a map from
// column name to data_type, or a map from column name to schema_element.
170  [[nodiscard]] std::variant<std::vector<data_type>,
171  std::map<std::string, data_type>,
172  std::map<std::string, schema_element>> const&
173  get_dtypes() const
174  {
175  return _dtypes;
176  }
177 
// Returns compression format of the source.
183  [[nodiscard]] compression_type get_compression() const { return _compression; }
184 
// Returns number of bytes to skip from source start.
190  [[nodiscard]] size_t get_byte_range_offset() const { return _byte_range_offset; }
191 
// Returns number of bytes to read.
197  [[nodiscard]] size_t get_byte_range_size() const { return _byte_range_size; }
198 
204  [[nodiscard]] size_t get_byte_range_size_with_padding() const
205  {
206  if (_byte_range_size == 0) {
207  return 0;
208  } else {
209  return _byte_range_size + get_byte_range_padding();
210  }
211  }
212 
218  [[nodiscard]] size_t get_byte_range_padding() const
219  {
220  auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
221 
222  auto const max_row_bytes = 16 * 1024; // 16KB
223  auto const column_bytes = 64;
224  auto const base_padding = 1024; // 1KB
225 
226  if (num_columns == 0) {
227  // Use flat size if the number of columns is not known
228  return max_row_bytes;
229  }
230 
231  // Expand the size based on the number of columns, if available
232  return base_padding + num_columns * column_bytes;
233  }
234 
// Returns delimiter separating records in JSON lines.
240  [[nodiscard]] char get_delimiter() const { return _delimiter; }
241 
// Whether to read the file as a json object per line.
247  [[nodiscard]] bool is_enabled_lines() const { return _lines; }
248 
// Whether to parse mixed types as a string column.
254  [[nodiscard]] bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
255 
// Whether to prune columns on read, selected based on the set_dtypes option.
266  [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }
267 
// Whether to parse dates as DD/MM versus MM/DD.
273  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
274 
// Whether the reader should keep quotes of string values.
280  [[nodiscard]] bool is_enabled_keep_quotes() const { return _keep_quotes; }
281 
// Whether the reader should normalize single quotes around strings.
287  [[nodiscard]] bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
288 
// Whether the reader should normalize unquoted whitespace characters.
294  [[nodiscard]] bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
295 
// Queries the JSON reader's behavior on invalid JSON lines.
301  [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
302 
// Set data types for columns to be read, given as a vector of data_type.
308  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
309 
// Set data types for columns to be read, given as a map from column name to data_type.
315  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
316 
// Set data types for a potentially nested column hierarchy.
322  void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); }
323 
// Set the compression type.
329  void set_compression(compression_type comp_type) { _compression = comp_type; }
330 
// Set number of bytes to skip from source start.
336  void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; }
337 
// Set number of bytes to read.
343  void set_byte_range_size(size_t size) { _byte_range_size = size; }
344 
350  void set_delimiter(char delimiter)
351  {
352  switch (delimiter) {
353  case '{':
354  case '[':
355  case '}':
356  case ']':
357  case ',':
358  case ':':
359  case '"':
360  case '\'':
361  case '\\':
362  case ' ':
363  case '\t':
364  case '\r': CUDF_FAIL("Unsupported delimiter character.", std::invalid_argument); break;
365  }
366  _delimiter = delimiter;
367  }
368 
// Set whether to read the file as a json object per line.
374  void enable_lines(bool val) { _lines = val; }
375 
// Set whether to parse mixed types as a string column.
382  void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
383 
// Set whether to prune columns on read, selected based on the set_dtypes option.
393  void enable_prune_columns(bool val) { _prune_columns = val; }
394 
// Set whether to parse dates as DD/MM versus MM/DD.
400  void enable_dayfirst(bool val) { _dayfirst = val; }
401 
// Set whether the reader should keep quotes of string values.
408  void enable_keep_quotes(bool val) { _keep_quotes = val; }
409 
// Set whether the reader should enable normalization of single quotes around strings.
416  void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
417 
// Set whether the reader should enable normalization of unquoted whitespace.
424  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
425 
// Specifies the JSON reader's behavior on invalid JSON lines.
431  void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }
432 };
433 
438  json_reader_options options;
439 
440  public:
// Default constructor.
446  explicit json_reader_options_builder() = default;
447 
// Constructor from source info; `src` is moved into the wrapped options object.
453  explicit json_reader_options_builder(source_info src) : options{std::move(src)} {}
454 
// Set data types for columns to be read, given as a vector of data_type.
// Returns this builder so that calls can be chained.
461  json_reader_options_builder& dtypes(std::vector<data_type> types)
462  {
463  options._dtypes = std::move(types);
464  return *this;
465  }
466 
// Set data types for columns to be read, given as a map from column name to data_type.
// Returns this builder so that calls can be chained.
473  json_reader_options_builder& dtypes(std::map<std::string, data_type> types)
474  {
475  options._dtypes = std::move(types);
476  return *this;
477  }
478 
// Set data types for columns to be read, given as a map from column name to schema_element.
// Returns this builder so that calls can be chained.
485  json_reader_options_builder& dtypes(std::map<std::string, schema_element> types)
486  {
487  options._dtypes = std::move(types);
488  return *this;
489  }
490 
498  {
499  options._compression = comp_type;
500  return *this;
501  }
502 
510  {
511  options._byte_range_offset = offset;
512  return *this;
513  }
514 
522  {
523  options._byte_range_size = size;
524  return *this;
525  }
526 
534  {
535  options.set_delimiter(delimiter);
536  return *this;
537  }
538 
546  {
547  options._lines = val;
548  return *this;
549  }
550 
559  {
560  options._mixed_types_as_string = val;
561  return *this;
562  }
563 
575  {
576  options._prune_columns = val;
577  return *this;
578  }
579 
587  {
588  options._dayfirst = val;
589  return *this;
590  }
591 
600  {
601  options._keep_quotes = val;
602  return *this;
603  }
604 
613  {
614  options._normalize_single_quotes = val;
615  return *this;
616  }
617 
626  {
627  options._normalize_whitespace = val;
628  return *this;
629  }
630 
638  {
639  options._recovery_mode = val;
640  return *this;
641  }
642 
// Conversion operator yielding the builder's json_reader_options member as an rvalue reference.
646  operator json_reader_options&&() { return std::move(options); }
647 
// Move the json_reader_options member out once it is built.
655  json_reader_options&& build() { return std::move(options); }
656 };
657 
676  json_reader_options options,
679  // end of group
681 
692 
697  // Specify the sink to use for writer output
698  sink_info _sink;
699  // Set of columns to output
700  table_view _table;
701  // string to use for null entries
702  std::string _na_rep = "";
703  // Indicates whether to output nulls as 'null' or exclude the field
704  bool _include_nulls = false;
705  // Indicates whether to use JSON lines for records format
706  bool _lines = false;
707  // maximum number of rows to write in each chunk (limits memory use)
708  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
709  // string to use for values != 0 in INT8 types (default 'true')
710  std::string _true_value = std::string{"true"};
711  // string to use for values == 0 in INT8 types (default 'false')
712  std::string _false_value = std::string{"false"};
713  // Names of all columns; if empty, writer will generate column names
714  std::optional<table_metadata> _metadata; // Optional column names
715 
723  : _sink(std::move(sink)), _table(std::move(table)), _rows_per_chunk(table.num_rows())
724  {
725  }
726 
728 
729  public:
// Default constructor; every option keeps its in-class default value.
735  explicit json_writer_options() = default;
736 
746 
// Returns sink used for writer output.
752  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
753 
// Returns table that would be written to output.
759  [[nodiscard]] table_view const& get_table() const { return _table; }
760 
// Returns metadata information; the optional is empty when no metadata has been set.
766  [[nodiscard]] std::optional<table_metadata> const& get_metadata() const { return _metadata; }
767 
// Returns string to use for null entries.
773  [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
774 
// Whether to output nulls as 'null'.
780  [[nodiscard]] bool is_enabled_include_nulls() const { return _include_nulls; }
781 
// Whether to use JSON lines for records format.
787  [[nodiscard]] bool is_enabled_lines() const { return _lines; }
788 
// Returns maximum number of rows to process for each file write.
794  [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; }
795 
// Returns string used for values != 0 in INT8 types.
801  [[nodiscard]] std::string const& get_true_value() const { return _true_value; }
802 
// Returns string used for values == 0 in INT8 types.
808  [[nodiscard]] std::string const& get_false_value() const { return _false_value; }
809 
810  // Setter
811 
// Sets table to be written to output.
817  void set_table(table_view tbl) { _table = tbl; }
818 
// Sets metadata.
824  void set_metadata(table_metadata metadata) { _metadata = std::move(metadata); }
825 
// Sets string to use for null entries.
831  void set_na_rep(std::string val) { _na_rep = std::move(val); }
832 
// Enables/Disables output of nulls as 'null'.
838  void enable_include_nulls(bool val) { _include_nulls = val; }
839 
// Enables/Disables JSON lines for records format.
845  void enable_lines(bool val) { _lines = val; }
846 
// Sets maximum number of rows to process for each file write.
852  void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; }
853 
// Sets string used for values != 0 in INT8 types.
859  void set_true_value(std::string val) { _true_value = std::move(val); }
860 
// Sets string used for values == 0 in INT8 types.
866  void set_false_value(std::string val) { _false_value = std::move(val); }
867 };
868 
873  json_writer_options options;
874 
875  public:
// Default constructor.
881  explicit json_writer_options_builder() = default;
882 
890  : options{sink, table}
891  {
892  }
893 
901  {
902  options._table = tbl;
903  return *this;
904  }
905 
913  {
914  options._metadata = std::move(metadata);
915  return *this;
916  }
917 
925  {
926  options._na_rep = std::move(val);
927  return *this;
928  };
929 
937  {
938  options._include_nulls = val;
939  return *this;
940  }
941 
949  {
950  options._lines = val;
951  return *this;
952  }
953 
961  {
962  options._rows_per_chunk = val;
963  return *this;
964  }
965 
973  {
974  options._true_value = std::move(val);
975  return *this;
976  }
977 
985  {
986  options._false_value = std::move(val);
987  return *this;
988  }
989 
// Conversion operator yielding the builder's json_writer_options member as an rvalue reference.
993  operator json_writer_options&&() { return std::move(options); }
994 
// Move the json_writer_options member out once it is built.
1002  json_writer_options&& build() { return std::move(options); }
1003 };
1004 
1022 void write_json(json_writer_options const& options,
1024  // end of group
1026 } // namespace io
1027 } // namespace CUDF_EXPORT cudf
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
Builds settings to use for read_json().
Definition: io/json.hpp:437
json_reader_options_builder & normalize_single_quotes(bool val)
Set whether the reader should normalize single quotes around strings.
Definition: io/json.hpp:612
json_reader_options_builder & keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
Definition: io/json.hpp:599
json_reader_options_builder & normalize_whitespace(bool val)
Set whether the reader should normalize unquoted whitespace.
Definition: io/json.hpp:625
json_reader_options_builder & dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:586
json_reader_options_builder & recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:637
json_reader_options_builder & delimiter(char delimiter)
Set delimiter separating records in JSON lines.
Definition: io/json.hpp:533
json_reader_options_builder & prune_columns(bool val)
Set whether to prune columns on read, selected based on the dtypes option.
Definition: io/json.hpp:574
json_reader_options_builder & lines(bool val)
Set whether to read the file as a json object per line.
Definition: io/json.hpp:545
json_reader_options_builder & dtypes(std::vector< data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:461
json_reader_options && build()
Moves the json_reader_options member out once it is built.
Definition: io/json.hpp:655
json_reader_options_builder & mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
Definition: io/json.hpp:558
json_reader_options_builder & compression(compression_type comp_type)
Set the compression type.
Definition: io/json.hpp:497
json_reader_options_builder(source_info src)
Constructor from source info.
Definition: io/json.hpp:453
json_reader_options_builder & byte_range_size(size_type size)
Set number of bytes to read.
Definition: io/json.hpp:521
json_reader_options_builder & dtypes(std::map< std::string, schema_element > types)
Set data types for columns to be read.
Definition: io/json.hpp:485
json_reader_options_builder & byte_range_offset(size_type offset)
Set number of bytes to skip from source start.
Definition: io/json.hpp:509
json_reader_options_builder()=default
Default constructor.
json_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:473
Input arguments to the read_json interface.
Definition: io/json.hpp:90
void enable_mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
Definition: io/json.hpp:382
void set_compression(compression_type comp_type)
Set the compression type.
Definition: io/json.hpp:329
void set_dtypes(std::vector< data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:308
void enable_normalize_single_quotes(bool val)
Set whether the reader should enable normalization of single quotes around strings.
Definition: io/json.hpp:416
void enable_prune_columns(bool val)
Set whether to prune columns on read, selected based on the set_dtypes option.
Definition: io/json.hpp:393
bool is_enabled_keep_quotes() const
Whether the reader should keep quotes of string values.
Definition: io/json.hpp:280
void enable_normalize_whitespace(bool val)
Set whether the reader should enable normalization of unquoted whitespace.
Definition: io/json.hpp:424
size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
Definition: io/json.hpp:190
source_info const & get_source() const
Returns source info.
Definition: io/json.hpp:163
void set_dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
Definition: io/json.hpp:315
bool is_enabled_prune_columns() const
Whether to prune columns on read, selected based on the set_dtypes option.
Definition: io/json.hpp:266
char get_delimiter() const
Returns delimiter separating records in JSON lines.
Definition: io/json.hpp:240
bool is_enabled_lines() const
Whether to read the file as a json object per line.
Definition: io/json.hpp:247
bool is_enabled_mixed_types_as_string() const
Whether to parse mixed types as a string column.
Definition: io/json.hpp:254
json_reader_options()=default
Default constructor.
void enable_dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:400
std::variant< std::vector< data_type >, std::map< std::string, data_type >, std::map< std::string, schema_element > > const & get_dtypes() const
Returns data types of the columns.
Definition: io/json.hpp:173
size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
Definition: io/json.hpp:204
void set_recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:431
bool is_enabled_normalize_whitespace() const
Whether the reader should normalize unquoted whitespace characters.
Definition: io/json.hpp:294
void set_delimiter(char delimiter)
Set delimiter separating records in JSON lines.
Definition: io/json.hpp:350
void set_byte_range_offset(size_t offset)
Set number of bytes to skip from source start.
Definition: io/json.hpp:336
void enable_lines(bool val)
Set whether to read the file as a json object per line.
Definition: io/json.hpp:374
void enable_keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
Definition: io/json.hpp:408
bool is_enabled_normalize_single_quotes() const
Whether the reader should normalize single quotes around strings.
Definition: io/json.hpp:287
compression_type get_compression() const
Returns compression format of the source.
Definition: io/json.hpp:183
void set_dtypes(std::map< std::string, schema_element > types)
Set data types for a potentially nested column hierarchy.
Definition: io/json.hpp:322
size_t get_byte_range_size() const
Returns number of bytes to read.
Definition: io/json.hpp:197
json_recovery_mode_t recovery_mode() const
Queries the JSON reader's behavior on invalid JSON lines.
Definition: io/json.hpp:301
static json_reader_options_builder builder(source_info src)
Creates a json_reader_options_builder which will build json_reader_options.
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
Definition: io/json.hpp:273
size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
Definition: io/json.hpp:218
void set_byte_range_size(size_t size)
Set number of bytes to read.
Definition: io/json.hpp:343
Builder to build options for write_json()
Definition: io/json.hpp:872
json_writer_options_builder & include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
Definition: io/json.hpp:936
json_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
Definition: io/json.hpp:900
json_writer_options_builder()=default
Default constructor.
json_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
Definition: io/json.hpp:960
json_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: io/json.hpp:972
json_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: io/json.hpp:984
json_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
Definition: io/json.hpp:889
json_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
Definition: io/json.hpp:924
json_writer_options_builder & metadata(table_metadata metadata)
Sets optional metadata (with column names).
Definition: io/json.hpp:912
json_writer_options && build()
Moves the json_writer_options member out once it is built.
Definition: io/json.hpp:1002
json_writer_options_builder & lines(bool val)
Enables/Disables JSON lines for records format.
Definition: io/json.hpp:948
Settings to use for write_json().
Definition: io/json.hpp:696
table_view const & get_table() const
Returns table that would be written to output.
Definition: io/json.hpp:759
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
Definition: io/json.hpp:866
void enable_include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
Definition: io/json.hpp:838
bool is_enabled_include_nulls() const
Whether to output nulls as 'null'.
Definition: io/json.hpp:780
void enable_lines(bool val)
Enables/Disables JSON lines for records format.
Definition: io/json.hpp:845
void set_na_rep(std::string val)
Sets string to use for null entries.
Definition: io/json.hpp:831
static json_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create json_writer_options.
json_writer_options()=default
Default constructor.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
Definition: io/json.hpp:859
sink_info const & get_sink() const
Returns sink used for writer output.
Definition: io/json.hpp:752
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
Definition: io/json.hpp:852
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
Definition: io/json.hpp:801
void set_table(table_view tbl)
Sets table to be written to output.
Definition: io/json.hpp:817
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
Definition: io/json.hpp:808
bool is_enabled_lines() const
Whether to use JSON lines for records format.
Definition: io/json.hpp:787
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
Definition: io/json.hpp:794
std::optional< table_metadata > const & get_metadata() const
Returns metadata information.
Definition: io/json.hpp:766
std::string const & get_na_rep() const
Returns string to use for null entries.
Definition: io/json.hpp:773
void set_metadata(table_metadata metadata)
Sets metadata.
Definition: io/json.hpp:824
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
A set of cudf::column's of the same size.
Definition: table.hpp:41
size_type num_rows() const noexcept
Returns the number of rows.
Definition: table.hpp:94
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Reads a JSON dataset into a set of columns.
json_recovery_mode_t
Control the error recovery behavior of the json parser.
Definition: io/json.hpp:62
@ RECOVER_WITH_NULL
Recovers from an error, replacing invalid records with null.
@ FAIL
Does not recover from an error when encountering an invalid format.
compression_type
Compression algorithms.
Definition: io/types.hpp:57
void write_json(json_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to JSON format.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
cuDF interfaces
Definition: aggregation.hpp:35
Allows specifying the target types for nested JSON data via json_reader_options' set_dtypes method.
Definition: io/json.hpp:47
data_type type
The type that this column should be converted to.
Definition: io/json.hpp:51
std::map< std::string, schema_element > child_types
Allows specifying this column's child columns target type.
Definition: io/json.hpp:56
Destination information for write interfaces.
Definition: io/types.hpp:512
Source information for read interfaces.
Definition: io/types.hpp:337
Table metadata returned by IO readers.
Definition: io/types.hpp:277
Table with table metadata used by io readers to return the metadata by value.
Definition: io/types.hpp:292
Class definitions for (mutable)_table_view
Type declarations for libcudf.