22 #include <unordered_map>
26 namespace CUDF_EXPORT
cudf {
35 namespace CUDF_EXPORT
cudf {
103 DELTA_LENGTH_BYTE_ARRAY,
133 size_t num_failed_bytes,
134 size_t num_skipped_bytes,
135 size_t num_compressed_output_bytes)
136 : _num_compressed_bytes(num_compressed_bytes),
137 _num_failed_bytes(num_failed_bytes),
138 _num_skipped_bytes(num_skipped_bytes),
139 _num_compressed_output_bytes(num_compressed_output_bytes)
151 _num_compressed_bytes += other._num_compressed_bytes;
152 _num_failed_bytes += other._num_failed_bytes;
153 _num_skipped_bytes += other._num_skipped_bytes;
154 _num_compressed_output_bytes += other._num_compressed_output_bytes;
189 return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes();
202 return static_cast<double>(num_compressed_bytes()) / _num_compressed_output_bytes;
206 std::size_t _num_compressed_bytes = 0;
207 std::size_t _num_failed_bytes = 0;
208 std::size_t _num_skipped_bytes = 0;
209 std::size_t _num_compressed_output_bytes = 0;
242 std::optional<bool> _is_nullable = std::nullopt,
243 std::optional<bool> _is_binary = std::nullopt)
244 : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
268 std::vector<column_name_info>
275 std::vector<std::unordered_map<std::string, std::string>>
280 std::optional<size_type>
284 std::optional<size_type>
294 std::unique_ptr<table>
tbl;
305 template <
typename T>
308 using non_cv_T = std::remove_cv_t<T>;
309 return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
310 std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
311 std::is_same_v<non_cv_T, std::byte>;
322 std::optional<std::size_t> size{};
342 _filepath_sources.reserve(file_paths.size());
343 for (
auto& path : file_paths) {
344 _filepath_sources.push_back({std::move(path), std::nullopt});
355 :
source_info(std::vector<std::string>{std::move(file_path)})
365 : _type(
io_type::
FILEPATH), _num_sources(sources.size()), _filepath_sources(std::move(sources))
375 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
379 if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
380 _host_buffers.reserve(host_buffers.size());
383 std::back_inserter(_host_buffers),
385 return cudf::host_span<std::byte const>{
386 reinterpret_cast<std::byte const*>(s.data()), s.size()};
389 _host_buffers.assign(host_buffers.begin(), host_buffers.end());
398 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
403 reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
414 _num_sources(device_buffers.size()),
415 _device_buffers(device_buffers.begin(), device_buffers.end())
434 explicit source_info(std::vector<cudf::io::datasource*>
const& sources)
/**
 * @brief Accessor for the `_type` member.
 *
 * The return type is deduced with plain `auto`, so the value is returned by copy.
 *
 * @return The current value of `_type`
 */
454 [[nodiscard]]
auto type()
const {
return _type; }
/**
 * @brief Accessor for the `_filepaths` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_filepaths`
 */
466 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
/**
 * @brief Accessor for the `_host_buffers` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_host_buffers`
 */
472 [[nodiscard]]
auto const&
host_buffers()
const {
return _host_buffers; }
/**
 * @brief Accessor for the `_user_sources` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_user_sources`
 */
484 [[nodiscard]]
auto const&
user_sources()
const {
return _user_sources; }
494 void rebuild_filepaths()
497 _filepaths.reserve(_filepath_sources.size());
498 for (
auto const& source : _filepath_sources) {
499 _filepaths.push_back(source.path);
504 size_t _num_sources = 0;
505 std::vector<filepath_source> _filepath_sources;
506 std::vector<std::string> _filepaths;
507 std::vector<cudf::host_span<std::byte const>> _host_buffers;
508 std::vector<cudf::device_span<std::byte const>> _device_buffers;
509 std::vector<cudf::io::datasource*> _user_sources;
530 : _type(
io_type::
FILEPATH), _num_sinks(file_paths.size()), _filepaths(std::move(file_paths))
549 explicit sink_info(std::vector<std::vector<char>*> buffers)
550 : _type(
io_type::
HOST_BUFFER), _num_sinks(buffers.size()), _buffers(std::move(buffers))
565 explicit sink_info(std::vector<cudf::io::data_sink*>
const& user_sinks)
567 _num_sinks(user_sinks.size()),
568 _user_sinks(std::move(user_sinks))
/**
 * @brief Accessor for the `_type` member.
 *
 * The return type is deduced with plain `auto`, so the value is returned by copy.
 *
 * @return The current value of `_type`
 */
587 [[nodiscard]]
auto type()
const {
return _type; }
/**
 * @brief Accessor for the `_num_sinks` member.
 *
 * The return type is deduced with plain `auto`, so the count is returned by copy.
 *
 * @return The current value of `_num_sinks`
 */
593 [[nodiscard]]
auto num_sinks()
const {
return _num_sinks; }
/**
 * @brief Accessor for the `_filepaths` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_filepaths`
 */
599 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
/**
 * @brief Accessor for the `_buffers` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_buffers`
 */
605 [[nodiscard]]
auto const&
buffers()
const {
return _buffers; }
/**
 * @brief Accessor for the `_user_sinks` member.
 *
 * Returns a const reference rather than a copy; the reference refers to a member of this
 * object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_user_sinks`
 */
611 [[nodiscard]]
auto const&
user_sinks()
const {
return _user_sinks; }
615 size_t _num_sinks = 1;
616 std::vector<std::string> _filepaths;
617 std::vector<std::vector<char>*> _buffers;
618 std::vector<cudf::io::data_sink*> _user_sinks;
621 class table_input_metadata;
628 std::string _name =
"";
629 std::optional<bool> _nullable;
630 bool _list_column_is_map =
false;
631 bool _use_int96_timestamp =
false;
632 bool _output_as_binary =
false;
633 bool _skip_compression =
false;
634 std::optional<uint8_t> _decimal_precision;
635 std::optional<int32_t> _parquet_field_id;
636 std::optional<int32_t> _type_length;
637 std::vector<column_in_metadata> children;
656 children.push_back(child);
693 _list_column_is_map =
true;
707 _use_int96_timestamp = req;
720 _decimal_precision = precision;
733 _type_length = length;
745 _parquet_field_id = field_id;
759 _output_as_binary = binary;
760 if (_output_as_binary and children.size() == 1) {
761 children.emplace_back();
762 }
else if (!_output_as_binary and children.size() == 2) {
777 _skip_compression = skip;
793 _encoding = encoding;
/**
 * @brief Accessor for the `_name` member.
 *
 * Declared `noexcept`. Returns a const reference rather than a copy; the reference refers
 * to a member of this object, so it must not be used after the object is destroyed.
 *
 * @return Const reference to `_name`
 */
818 [[nodiscard]] std::string
const&
get_name() const noexcept {
return _name; }
/**
 * @brief Returns the value held in `_nullable`.
 *
 * `_nullable` is declared in this listing as `std::optional<bool>` with no initializer, and
 * it is read through `std::optional::value()`. That call throws
 * `std::bad_optional_access` when the optional is empty, which is why this accessor is
 * not marked `noexcept`.
 *
 * @throws std::bad_optional_access if `_nullable` holds no value
 * @return The boolean stored in `_nullable`
 */
834 [[nodiscard]]
bool nullable()
const {
return _nullable.value(); }
/**
 * @brief Accessor for the `_list_column_is_map` flag.
 *
 * Declared `noexcept`; it only reads a `bool` member.
 *
 * @return The current value of `_list_column_is_map`
 */
841 [[nodiscard]]
bool is_map() const noexcept {
return _list_column_is_map; }
858 return _decimal_precision.has_value();
893 return _parquet_field_id.has_value();
991 bool _convert_binary_to_strings{
true};
992 int32_t _type_length{0};
994 std::vector<reader_column_schema> children;
1013 children.assign(child_span.begin(), child_span.end());
1024 children.push_back(child);
1054 _convert_binary_to_strings = convert_to_string;
1066 _type_length = type_length;
1077 return _convert_binary_to_strings;
Interface class for storing the output data from the writers.
Interface class for providing input data to the readers.
schema element for reader
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
reader_column_schema & set_type_length(int32_t type_length)
Sets the length of fixed length data.
bool is_enabled_convert_binary_to_strings() const
Get whether to encode this column as binary or string data.
int32_t get_type_length() const
Get the length in bytes of this fixed length data.
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether this column should be written as binary or string data. Only valid for the following...
size_t get_num_children() const
Get the number of child objects.
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
reader_column_schema(std::span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
reader_column_schema & child(size_type i)
Get reference to a child of this column.
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
Statistics about compression performed by a writer.
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
writer_compression_statistics()=default
Default constructor.
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
statistics_freq
Column statistics granularity type for parquet/orc writers.
column_encoding
Valid encodings for use with column_in_metadata::set_encoding()
quote_style
Behavior when handling quotations in field data.
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
dictionary_policy
Control use of dictionary encoding for parquet writer.
compression_type
Compression algorithms.
io_type
Data source or destination types.
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
@ STATISTICS_NONE
No column statistics.
@ STATISTICS_PAGE
Per-page column statistics.
@ USE_DEFAULT
No encoding has been requested, use default encoding.
@ MINIMAL
Quote only fields which contain special characters.
@ NONNUMERIC
Quote all non-numeric fields.
@ ALWAYS
Use dictionary regardless of impact on compression.
@ ADAPTIVE
Use dictionary when it will not impact compression.
@ NEVER
Never use dictionary encoding.
@ XZ
XZ format, using LZMA(2) algorithm.
@ ZIP
ZIP format, using DEFLATE algorithm.
@ BZIP2
BZIP2 format, using Burrows-Wheeler transform.
@ AUTO
Automatically detect or select compression format.
@ HOST_BUFFER
Input/output is a buffer in host memory.
@ USER_IMPLEMENTED
Input/output is handled by a custom user class.
@ VOID
Input/output is nothing. No work is done. Useful for benchmarking.
@ FILEPATH
Input/output is a file path.
@ DEVICE_BUFFER
Input/output is a buffer in device memory.
cuda::std::span< T, Extent > device_span
Device span is an alias of cuda::std::span.
int32_t size_type
Row index type for columns and tables.
bool nullable(table_view const &view)
Returns true if any of the columns in the table is nullable (not the entire hierarchy).
@ ALL
All initialization steps (default behavior)
C++20 std::span with reduced feature set.
Detailed name (and optionally nullability) information for output columns.
std::optional< bool > is_nullable
Column nullability.
std::optional< bool > is_binary
Column is binary (i.e. not a list)
std::vector< column_name_info > children
Child column names.
bool operator==(column_name_info const &rhs) const
Compares two column name info structs for equality.
std::optional< int32_t > type_length
Byte width of data (for fixed length data)
std::string name
Column name.
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
Construct a column name info with a name, optional nullability, and no children.
A file path with an optional known size in bytes.
std::string path
Path or URL of the input file.
Information used while writing partitioned datasets.
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
size_type start_row
The start row of the partition.
size_type num_rows
The number of rows in the partition.
Destination information for write interfaces.
auto const & buffers() const
Get the host buffers of the input.
sink_info(std::vector< std::vector< char > * > buffers)
Construct a new sink info object for multiple host buffers.
auto const & filepaths() const
Get the filepaths of the input.
sink_info(std::string file_path)
Construct a new sink info object for a single file.
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
auto num_sinks() const
Get the number of sinks.
auto const & user_sinks() const
Get the user sinks of the input.
sink_info(size_t num_sinks)
Construct a new sink info object.
auto type() const
Get the type of the input.
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
sink_info(std::vector< std::string > file_paths)
Construct a new sink info object for multiple files.
Source information for read interfaces.
auto const & device_buffers() const
Get the device buffers of the input.
source_info()=default
Default constructor for the next-gen parquet reader.
source_info(std::vector< std::string > file_paths)
Construct a new source info object for multiple files.
auto const & filepath_sources() const
Get the filepath sources of the input.
auto const & filepaths() const
Get the filepaths of the input.
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
source_info(std::vector< filepath_source > sources)
Construct a new source info object from filepath sources with optional known sizes.
auto num_sources() const
Get the number of input sources.
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
auto const & host_buffers() const
Get the host buffers of the input.
auto type() const
Get the type of the input.
source_info(std::string file_path)
Construct a new source info object for a single file.
auto const & user_sources() const
Get the user sources of the input.
Class definition for cudf::table.
Type declarations for libcudf.