21 #include <cudf/detail/utilities/visitor_overload.hpp>
33 namespace CUDF_EXPORT
cudf {
41 class json_reader_options_builder;
98 std::variant<std::vector<data_type>,
99 std::map<std::string, data_type>,
100 std::map<std::string, schema_element>,
114 bool _mixed_types_as_string =
false;
116 char _delimiter =
'\n';
118 bool _prune_columns =
false;
120 bool _experimental =
false;
123 size_t _byte_range_offset = 0;
125 size_t _byte_range_size = 0;
128 bool _dayfirst =
false;
131 bool _keep_quotes =
false;
134 bool _normalize_single_quotes =
false;
137 bool _normalize_whitespace =
false;
145 bool _strict_validation =
false;
147 bool _allow_numeric_leading_zeros =
true;
149 bool _allow_nonnumeric_numbers =
true;
151 bool _allow_unquoted_control_chars =
true;
153 std::vector<std::string> _na_values;
222 if (_byte_range_size == 0) {
225 return _byte_range_size + get_byte_range_padding();
236 auto const num_columns =
237 std::visit(cudf::detail::visitor_overload{
238 [](
auto const& dtypes) {
return dtypes.size(); },
239 [](
schema_element const& dtypes) {
return dtypes.child_types.size(); }},
242 auto const max_row_bytes = 16 * 1024;
243 auto const column_bytes = 64;
244 auto const base_padding = 1024;
246 if (num_columns == 0) {
248 return max_row_bytes;
252 return base_padding + num_columns * column_bytes;
348 return _allow_numeric_leading_zeros;
371 return _allow_unquoted_control_chars;
/**
 * @brief Returns additional values to recognize as null values.
 *
 * @return Const reference to the stored `_na_values` vector (no copy is made)
 */
379 [[nodiscard]] std::vector<std::string>
const&
get_na_values()
const {
return _na_values; }
/**
 * @brief Set data types for columns to be read.
 *
 * @param types Vector of data types; taken by value and moved into `_dtypes`
 */
386 void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
/**
 * @brief Set data types for columns to be read.
 *
 * @param types Map from string key to data type (presumably keyed by column name -- confirm);
 *              taken by value and moved into `_dtypes`
 */
393 void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
/**
 * @brief Set data types for a potentially nested column hierarchy.
 *
 * @param types Map from string key to schema element (presumably keyed by column name --
 *              confirm); taken by value and moved into `_dtypes`
 */
400 void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); }
450 case '\r':
CUDF_FAIL(
"Unsupported delimiter character.", std::invalid_argument);
break;
452 _delimiter = delimiter;
546 CUDF_EXPECTS(_strict_validation,
"Strict validation must be enabled for this to work.");
547 _allow_numeric_leading_zeros = val;
560 CUDF_EXPECTS(_strict_validation,
"Strict validation must be enabled for this to work.");
561 _allow_nonnumeric_numbers = val;
575 CUDF_EXPECTS(_strict_validation,
"Strict validation must be enabled for this to work.");
576 _allow_unquoted_control_chars = val;
/**
 * @brief Sets additional values to recognize as null values.
 *
 * @param vals Vector of strings; taken by value and moved into `_na_values`
 */
584 void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); }
616 options._dtypes = std::move(types);
628 options._dtypes = std::move(types);
640 options._dtypes = std::move(types);
664 options._compression = comp_type;
676 options._byte_range_offset = offset;
688 options._byte_range_size = size;
712 options._lines = val;
725 options._mixed_types_as_string = val;
741 options._prune_columns = val;
756 options._experimental = val;
768 options._dayfirst = val;
781 options._keep_quotes = val;
794 options._normalize_single_quotes = val;
807 options._normalize_whitespace = val;
819 options._recovery_mode = val;
952 size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
956 std::string _na_rep =
"";
958 bool _include_nulls =
false;
962 std::string _true_value = std::string{
"true"};
964 std::string _false_value = std::string{
"false"};
966 std::optional<table_metadata> _metadata;
/**
 * @brief Returns metadata information.
 *
 * @return Const reference to the stored optional `_metadata`
 */
1018 [[nodiscard]] std::optional<table_metadata>
const&
get_metadata()
const {
return _metadata; }
/**
 * @brief Returns the string to use for null entries.
 *
 * @return Const reference to the stored `_na_rep` string
 */
1025 [[nodiscard]] std::string
const&
get_na_rep()
const {
return _na_rep; }
1156 : options{sink,
table}
1168 options._table = tbl;
1180 options._compression = comptype;
1192 options._metadata = std::move(metadata);
1204 options._na_rep = std::move(val);
1216 options._include_nulls = val;
1228 options._lines = val;
1240 options._rows_per_chunk = val;
1252 options._true_value = std::move(val);
1264 options._false_value = std::move(val);
Indicator for the logical data type of an element in a column.
Builds settings to use for read_json().
json_reader_options_builder & normalize_single_quotes(bool val)
Set whether the reader should normalize single quotes around strings.
json_reader_options_builder & nonnumeric_numbers(bool val)
Set whether specific unquoted number values are valid JSON. The values are NaN, +INF,...
json_reader_options_builder & keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
json_reader_options_builder & normalize_whitespace(bool val)
Set whether the reader should normalize unquoted whitespace.
json_reader_options_builder & numeric_leading_zeros(bool val)
Set whether leading zeros are allowed in numeric values. Strict validation must be enabled for this to work.
json_reader_options_builder & dtypes(schema_element types)
Set data types for columns to be read.
json_reader_options_builder & dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
json_reader_options_builder & recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
json_reader_options_builder & na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
json_reader_options_builder & delimiter(char delimiter)
Set delimiter separating records in JSON lines.
json_reader_options_builder & prune_columns(bool val)
Set whether to prune columns on read, selected based on the dtypes option.
json_reader_options_builder & experimental(bool val)
Set whether to enable experimental features.
json_reader_options_builder & lines(bool val)
Set whether to read the file as a json object per line.
json_reader_options_builder & dtypes(std::vector< data_type > types)
Set data types for columns to be read.
json_reader_options && build()
Move json_reader_options member once it's built.
json_reader_options_builder & mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
json_reader_options_builder & unquoted_control_chars(bool val)
Set whether chars >= 0 and < 32 are allowed in a quoted string without some form of escaping....
json_reader_options_builder & compression(compression_type comp_type)
Set the compression type.
json_reader_options_builder(source_info src)
Constructor from source info.
json_reader_options_builder & strict_validation(bool val)
Set whether json validation should be strict or not.
json_reader_options_builder & byte_range_size(size_type size)
Set number of bytes to read.
json_reader_options_builder & dtypes(std::map< std::string, schema_element > types)
Set data types for columns to be read.
json_reader_options_builder & byte_range_offset(size_type offset)
Set number of bytes to skip from source start.
json_reader_options_builder()=default
Default constructor.
json_reader_options_builder & dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
Input arguments to the read_json interface.
bool is_allowed_nonnumeric_numbers() const
Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,...
void enable_mixed_types_as_string(bool val)
Set whether to parse mixed types as a string column. Also enables forcing to read a struct as string ...
void set_compression(compression_type comp_type)
Set the compression type.
void set_dtypes(std::vector< data_type > types)
Set data types for columns to be read.
void allow_unquoted_control_chars(bool val)
Set whether characters greater than or equal to 0 and less than 32 are allowed in a quoted string without some form of escaping.
void enable_normalize_single_quotes(bool val)
Set whether the reader should enable normalization of single quotes around strings.
bool is_allowed_numeric_leading_zeros() const
Whether leading zeros are allowed in numeric values.
void enable_prune_columns(bool val)
Set whether to prune columns on read, selected based on the set_dtypes option.
bool is_enabled_keep_quotes() const
Whether the reader should keep quotes of string values.
void set_dtypes(schema_element types)
Set data types for a potentially nested column hierarchy.
void enable_normalize_whitespace(bool val)
Set whether the reader should enable normalization of unquoted whitespace.
void allow_nonnumeric_numbers(bool val)
Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,...
size_t get_byte_range_offset() const
Returns number of bytes to skip from source start.
source_info const & get_source() const
Returns source info.
void enable_experimental(bool val)
Set whether to enable experimental features.
void set_dtypes(std::map< std::string, data_type > types)
Set data types for columns to be read.
bool is_enabled_prune_columns() const
Whether to prune columns on read, selected based on the set_dtypes option.
char get_delimiter() const
Returns delimiter separating records in JSON lines.
bool is_enabled_lines() const
Whether to read the file as a json object per line.
void allow_numeric_leading_zeros(bool val)
Set whether leading zeros are allowed in numeric values. Strict validation must be enabled for this to work.
void set_strict_validation(bool val)
Set whether strict validation is enabled or not.
bool is_enabled_mixed_types_as_string() const
Whether to parse mixed types as a string column.
json_reader_options()=default
Default constructor.
void set_na_values(std::vector< std::string > vals)
Sets additional values to recognize as null values.
void enable_dayfirst(bool val)
Set whether to parse dates as DD/MM versus MM/DD.
size_t get_byte_range_size_with_padding() const
Returns number of bytes to read with padding.
void set_recovery_mode(json_recovery_mode_t val)
Specifies the JSON reader's behavior on invalid JSON lines.
bool is_enabled_normalize_whitespace() const
Whether the reader should normalize unquoted whitespace characters.
bool is_strict_validation() const
Whether json validation should be enforced strictly or not.
void set_delimiter(char delimiter)
Set delimiter separating records in JSON lines.
void set_byte_range_offset(size_t offset)
Set number of bytes to skip from source start.
void enable_lines(bool val)
Set whether to read the file as a json object per line.
dtype_variant const & get_dtypes() const
Returns data types of the columns.
void enable_keep_quotes(bool val)
Set whether the reader should keep quotes of string values.
bool is_enabled_normalize_single_quotes() const
Whether the reader should normalize single quotes around strings.
compression_type get_compression() const
Returns compression format of the source.
void set_dtypes(std::map< std::string, schema_element > types)
Set data types for a potentially nested column hierarchy.
size_t get_byte_range_size() const
Returns number of bytes to read.
std::variant< std::vector< data_type >, std::map< std::string, data_type >, std::map< std::string, schema_element >, schema_element > dtype_variant
Variant type holding dtypes information for the columns.
bool is_enabled_experimental() const
Whether to enable experimental features.
json_recovery_mode_t recovery_mode() const
Queries the JSON reader's behavior on invalid JSON lines.
static json_reader_options_builder builder(source_info src)
Create json_reader_options_builder which will build json_reader_options.
bool is_enabled_dayfirst() const
Whether to parse dates as DD/MM versus MM/DD.
size_t get_byte_range_padding() const
Returns number of bytes to pad when reading.
bool is_allowed_unquoted_control_chars() const
Whether characters greater than or equal to 0 and less than 32 are allowed in a quoted string without some form of escaping.
std::vector< std::string > const & get_na_values() const
Returns additional values to recognize as null values.
void set_byte_range_size(size_t size)
Set number of bytes to read.
Builder to build options for write_json()
json_writer_options_builder & compression(compression_type comptype)
Sets compression type of output sink.
json_writer_options_builder & include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
json_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
json_writer_options_builder()=default
Default constructor.
json_writer_options_builder & rows_per_chunk(int val)
Sets maximum number of rows to process for each file write.
json_writer_options_builder & true_value(std::string val)
Sets string used for values != 0 in INT8 types.
json_writer_options_builder & false_value(std::string val)
Sets string used for values == 0 in INT8 types.
json_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
json_writer_options_builder & na_rep(std::string val)
Sets string to use for null entries.
json_writer_options_builder & metadata(table_metadata metadata)
Sets optional metadata (with column names).
json_writer_options && build()
Move json_writer_options member once it's built.
json_writer_options_builder & lines(bool val)
Enables/Disables JSON lines for records format.
Settings to use for write_json().
void set_compression(compression_type comptype)
Sets compression type to be used.
compression_type get_compression() const
Returns compression type used for sink.
table_view const & get_table() const
Returns table that would be written to output.
void set_false_value(std::string val)
Sets string used for values == 0 in INT8 types.
void enable_include_nulls(bool val)
Enables/Disables output of nulls as 'null'.
bool is_enabled_include_nulls() const
Whether to output nulls as 'null'.
void enable_lines(bool val)
Enables/Disables JSON lines for records format.
void set_na_rep(std::string val)
Sets string to use for null entries.
static json_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create json_writer_options.
json_writer_options()=default
Default constructor.
void set_true_value(std::string val)
Sets string used for values != 0 in INT8 types.
sink_info const & get_sink() const
Returns sink used for writer output.
void set_rows_per_chunk(size_type val)
Sets maximum number of rows to process for each file write.
std::string const & get_true_value() const
Returns string used for values != 0 in INT8 types.
void set_table(table_view tbl)
Sets table to be written to output.
std::string const & get_false_value() const
Returns string used for values == 0 in INT8 types.
bool is_enabled_lines() const
Whether to use JSON lines for records format.
size_type get_rows_per_chunk() const
Returns maximum number of rows to process for each file write.
std::optional< table_metadata > const & get_metadata() const
Returns metadata information.
std::string const & get_na_rep() const
Returns string to use for null entries.
void set_metadata(table_metadata metadata)
Sets metadata.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
size_type num_rows() const noexcept
Returns the number of rows.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads a JSON dataset into a set of columns.
json_recovery_mode_t
Control the error recovery behavior of the json parser.
@ RECOVER_WITH_NULL
Recovers from an error, replacing invalid records with null.
@ FAIL
Does not recover from an error when encountering an invalid format.
compression_type
Compression algorithms.
void write_json(json_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to JSON format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
int32_t size_type
Row index type for columns and tables.
Allows specifying the target types for nested JSON data via json_reader_options' set_dtypes method.
std::optional< std::vector< std::string > > column_order
Allows specifying the order of the columns.
data_type type
The type that this column should be converted to.
std::map< std::string, schema_element > child_types
Allows specifying this column's child columns target type.
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.