32 #include <unordered_map>
36 namespace CUDF_EXPORT
cudf {
45 namespace CUDF_EXPORT
cudf {
143 size_t num_failed_bytes,
144 size_t num_skipped_bytes,
145 size_t num_compressed_output_bytes)
146 : _num_compressed_bytes(num_compressed_bytes),
147 _num_failed_bytes(num_failed_bytes),
148 _num_skipped_bytes(num_skipped_bytes),
149 _num_compressed_output_bytes(num_compressed_output_bytes)
161 _num_compressed_bytes += other._num_compressed_bytes;
162 _num_failed_bytes += other._num_failed_bytes;
163 _num_skipped_bytes += other._num_skipped_bytes;
164 _num_compressed_output_bytes += other._num_compressed_output_bytes;
199 return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes();
212 return static_cast<double>(num_compressed_bytes()) / _num_compressed_output_bytes;
// Byte counters, all starting at zero. The visible operator+= body adds each
// counter of the other object to the matching counter here.
216 std::size_t _num_compressed_bytes = 0;  ///< Bytes in blocks that were successfully compressed
217 std::size_t _num_failed_bytes = 0;  ///< Bytes in blocks that failed to compress
218 std::size_t _num_skipped_bytes = 0;  ///< Bytes in blocks that were skipped during compression
219 std::size_t _num_compressed_output_bytes = 0;  ///< Divisor of the compression ratio; presumably the total size of the compressed output (confirm)
252 std::optional<bool> _is_nullable = std::nullopt,
253 std::optional<bool> _is_binary = std::nullopt)
254 : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
278 std::vector<column_name_info>
285 std::vector<std::unordered_map<std::string, std::string>>
293 std::unique_ptr<table>
tbl;
306 char const* data =
nullptr;
/**
 * @brief Construct a new host buffer object
 *
 * Only the pointer and the size are stored; nothing is copied or allocated
 * by this constructor.
 *
 * @param data Pointer to the buffer
 * @param size Size of the buffer
 */
315 host_buffer(
char const* data,
size_t size) : data(data), size(size) {}
325 template <
typename T>
328 using non_cv_T = std::remove_cv_t<T>;
329 return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
330 std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
331 std::is_same_v<non_cv_T, std::byte>;
369 _host_buffers.reserve(host_buffers.size());
372 std::back_inserter(_host_buffers),
374 return cudf::host_span<std::byte const>{
375 reinterpret_cast<std::byte const*>(hb.data), hb.size};
399 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
403 if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
404 _host_buffers.reserve(host_buffers.size());
407 std::back_inserter(_host_buffers),
409 return cudf::host_span<std::byte const>{
410 reinterpret_cast<std::byte const*>(s.data()), s.size()};
413 _host_buffers.assign(host_buffers.begin(), host_buffers.end());
422 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
426 reinterpret_cast<std::byte const*>(host_data.data()), host_data.size())}
455 explicit source_info(std::vector<cudf::io::datasource*>
const& sources)
/**
 * @brief Get the type of the input
 *
 * @return The stored `_type` value, returned by value
 */
475 [[nodiscard]]
auto type()
const {
return _type; }
/**
 * @brief Get the filepaths of the input
 *
 * @return Const reference to the stored vector of file path strings
 */
481 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
/**
 * @brief Get the host buffers of the input
 *
 * @return Const reference to the stored vector of `host_span<std::byte const>`
 */
487 [[nodiscard]]
auto const&
host_buffers()
const {
return _host_buffers; }
/**
 * @brief Get the user sources of the input
 *
 * @return Const reference to the stored vector of `datasource` pointers
 */
499 [[nodiscard]]
auto const&
user_sources()
const {
return _user_sources; }
// Storage for the possible kinds of input.
503 std::vector<std::string> _filepaths;  ///< File paths; returned by filepaths()
504 std::vector<cudf::host_span<std::byte const>> _host_buffers;  ///< Spans over host memory; returned by host_buffers()
505 std::vector<cudf::device_span<std::byte const>> _device_buffers;  ///< Spans over device memory
506 std::vector<cudf::io::datasource*> _user_sources;  ///< Pointers to user-implemented sources; returned by user_sources()
526 explicit sink_info(std::vector<std::string>
const& file_paths)
527 : _type(
io_type::
FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths)
546 explicit sink_info(std::vector<std::vector<char>*>
const& buffers)
562 explicit sink_info(std::vector<cudf::io::data_sink*>
const& user_sinks)
/**
 * @brief Get the I/O type of this sink
 *
 * @return The stored `_type` value, returned by value
 */
582 [[nodiscard]]
auto type()
const {
return _type; }
/**
 * @brief Get the number of sinks
 *
 * @return The stored `_num_sinks` value, returned by value
 */
588 [[nodiscard]]
auto num_sinks()
const {
return _num_sinks; }
/**
 * @brief Get the file paths of this sink
 *
 * @return Const reference to the stored vector of file path strings
 */
594 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
/**
 * @brief Get the host buffers of this sink
 *
 * @return Const reference to the stored vector of `std::vector<char>` pointers
 */
600 [[nodiscard]]
auto const&
buffers()
const {
return _buffers; }
/**
 * @brief Get the user-implemented sinks
 *
 * @return Const reference to the stored vector of `data_sink` pointers
 */
606 [[nodiscard]]
auto const&
user_sinks()
const {
return _user_sinks; }
// Storage for the possible kinds of output destination.
610 size_t _num_sinks = 1;  ///< Number of sinks; defaults to 1, and the multi-file constructor sets it to file_paths.size()
611 std::vector<std::string> _filepaths;  ///< File paths; returned by filepaths()
612 std::vector<std::vector<char>*> _buffers;  ///< Pointers to host output vectors; returned by buffers()
613 std::vector<cudf::io::data_sink*> _user_sinks;  ///< Pointers to user-implemented sinks; returned by user_sinks()
616 class table_input_metadata;
// Per-column settings. Optional members stay empty until explicitly assigned.
623 std::string _name =
"";  ///< Column name; returned by get_name()
624 std::optional<bool> _nullable;  ///< Nullability; empty until set, and nullable() throws while it is empty
625 bool _list_column_is_map =
false;  ///< Returned by is_map()
626 bool _use_int96_timestamp =
false;  ///< NOTE(review): presumably requests INT96 timestamps on write; confirm
627 bool _output_as_binary =
false;  ///< Binary-output flag; the visible setter body also adjusts `children` when it changes
628 bool _skip_compression =
false;  ///< NOTE(review): presumably disables compression for this column; confirm
629 std::optional<uint8_t> _decimal_precision;  ///< Decimal precision, empty when not set
630 std::optional<int32_t> _parquet_field_id;  ///< Parquet field id, empty when not set
631 std::optional<int32_t> _type_length;  ///< NOTE(review): presumably the byte width of fixed length data; confirm
632 std::vector<column_in_metadata> children;  ///< Metadata of the child columns
651 children.push_back(child);
688 _list_column_is_map =
true;
702 _use_int96_timestamp = req;
715 _decimal_precision = precision;
728 _type_length = length;
740 _parquet_field_id = field_id;
754 _output_as_binary = binary;
755 if (_output_as_binary and children.size() == 1) {
756 children.emplace_back();
757 }
else if (!_output_as_binary and children.size() == 2) {
772 _skip_compression = skip;
788 _encoding = encoding;
/**
 * @brief Get the name of this column
 *
 * NOTE(review): the name is returned by value from a `noexcept` function, so
 * an allocation failure while copying would terminate the program. Confirm
 * this is intended.
 *
 * @return A copy of the stored column name
 */
813 [[nodiscard]] std::string
get_name() const noexcept {
return _name; }
/**
 * @brief Get whether this column is nullable
 *
 * @throws std::bad_optional_access if the nullability has not been set,
 * because the value is read with `std::optional::value()`
 *
 * @return The stored nullability flag
 */
829 [[nodiscard]]
bool nullable()
const {
return _nullable.value(); }
/**
 * @brief Get whether this list column is flagged as a map
 *
 * @return The stored `_list_column_is_map` flag (false unless it was set)
 */
836 [[nodiscard]]
bool is_map() const noexcept {
return _list_column_is_map; }
853 return _decimal_precision.has_value();
888 return _parquet_field_id.has_value();
// Reader-side schema settings for a single column.
986 bool _convert_binary_to_strings{
true};  ///< Whether binary data is converted to strings; defaults to true
987 int32_t _type_length{0};  ///< Length in bytes of fixed length data; defaults to 0
989 std::vector<reader_column_schema> children;  ///< Schema elements of the child columns
1008 children.assign(child_span.
begin(), child_span.
end());
1019 children.push_back(child);
1049 _convert_binary_to_strings = convert_to_string;
1061 _type_length = type_length;
1072 return _convert_binary_to_strings;
constexpr iterator end() const noexcept
Returns an iterator to the element following the last element of the span.
constexpr iterator begin() const noexcept
Returns an iterator to the first element of the span.
Interface class for storing the output data from the writers.
Interface class for providing input data to the readers.
schema element for reader
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
reader_column_schema & set_type_length(int32_t type_length)
Sets the length of fixed length data.
bool is_enabled_convert_binary_to_strings() const
Get whether to encode this column as binary or string data.
int32_t get_type_length() const
Get the length in bytes of this fixed length data.
reader_column_schema(host_span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether this column should be written as binary or string data. Only valid for the following...
size_t get_num_children() const
Get the number of child objects.
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
reader_column_schema & child(size_type i)
Get reference to a child of this column.
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
Statistics about compression performed by a writer.
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
writer_compression_statistics()=default
Default constructor.
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
io_type
Data source or destination types.
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
compression_type
Compression algorithms.
quote_style
Behavior when handling quotations in field data.
column_encoding
Valid encodings for use with column_in_metadata::set_encoding()
statistics_freq
Column statistics granularity type for parquet/orc writers.
dictionary_policy
Control use of dictionary encoding for parquet writer.
@ HOST_BUFFER
Input/output is a buffer in host memory.
@ USER_IMPLEMENTED
Input/output is handled by a custom user class.
@ VOID
Input/output is nothing. No work is done. Useful for benchmarking.
@ FILEPATH
Input/output is a file path.
@ DEVICE_BUFFER
Input/output is a buffer in device memory.
@ BROTLI
BROTLI format, using LZ77 + Huffman + 2nd order context modeling.
@ XZ
XZ format, using LZMA(2) algorithm.
@ ZIP
ZIP format, using DEFLATE algorithm.
@ BZIP2
BZIP2 format, using Burrows-Wheeler transform.
@ AUTO
Automatically detect or select compression format.
@ GZIP
GZIP format, using DEFLATE algorithm.
@ MINIMAL
Quote only fields which contain special characters.
@ NONNUMERIC
Quote all non-numeric fields.
@ DELTA_BINARY_PACKED
Use DELTA_BINARY_PACKED encoding (only valid for integer columns)
@ USE_DEFAULT
No encoding has been requested, use default encoding.
@ DELTA_LENGTH_BYTE_ARRAY
@ PLAIN
Use plain encoding.
@ BYTE_STREAM_SPLIT
Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
@ STATISTICS_NONE
No column statistics.
@ STATISTICS_PAGE
Per-page column statistics.
@ ALWAYS
Use dictionary regardless of impact on compression.
@ ADAPTIVE
Use dictionary when it will not impact compression.
@ NEVER
Never use dictionary encoding.
int32_t size_type
Row index type for columns and tables.
bool nullable(table_view const &view)
Returns true if any of the columns in the table is nullable (not the entire hierarchy).
Device version of C++20 std::span with reduced feature set.
C++20 std::span with reduced feature set.
Detailed name (and optionally nullability) information for output columns.
std::optional< bool > is_nullable
Column nullability.
std::optional< bool > is_binary
Column is binary (i.e. not a list)
std::vector< column_name_info > children
Child column names.
bool operator==(column_name_info const &rhs) const
Compares two column name info structs for equality.
std::optional< int32_t > type_length
Byte width of data (for fixed length data)
std::string name
Column name.
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
Construct a column name info with a name, optional nullability, and no children.
Non-owning view of a host memory buffer.
host_buffer(char const *data, size_t size)
Construct a new host buffer object.
Information used while writing partitioned datasets.
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
size_type start_row
The start row of the partition.
size_type num_rows
The number of rows in the partition.
Destination information for write interfaces.
auto const & buffers() const
Get the host buffers of the output.
sink_info(std::vector< std::vector< char > * > const &buffers)
Construct a new sink info object for multiple host buffers.
sink_info(std::string const &file_path)
Construct a new sink info object for a single file.
auto const & filepaths() const
Get the filepaths of the output.
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
auto num_sinks() const
Get the number of sinks.
auto const & user_sinks() const
Get the user sinks of the output.
sink_info(std::vector< std::string > const &file_paths)
Construct a new sink info object for multiple files.
sink_info(size_t num_sinks)
Construct a new sink info object.
auto type() const
Get the type of the output.
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
Source information for read interfaces.
auto const & device_buffers() const
Get the device buffers of the input.
source_info(char const *host_data, size_t size)
Construct a new source info object for a single buffer.
auto const & filepaths() const
Get the filepaths of the input.
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
source_info(std::string const &file_path)
Construct a new source info object for a single file.
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
auto const & host_buffers() const
Get the host buffers of the input.
auto type() const
Get the type of the input.
source_info(std::vector< std::string > const &file_paths)
Construct a new source info object for multiple files.
auto const & user_sources() const
Get the user sources of the input.
source_info(std::vector< host_buffer > const &host_buffers)
Construct a new source info object for multiple buffers in host memory.
Class definition for cudf::table.
Type declarations for libcudf.