Go to the documentation of this file.
26 #include <cudf/utilities/span.hpp>
32 #include <unordered_map>
38 class RandomAccessFile;
124 size_t num_compressed_output_bytes)
128 _num_compressed_output_bytes(num_compressed_output_bytes)
140 _num_compressed_bytes += other._num_compressed_bytes;
141 _num_failed_bytes += other._num_failed_bytes;
142 _num_skipped_bytes += other._num_skipped_bytes;
143 _num_compressed_output_bytes += other._num_compressed_output_bytes;
195 std::size_t _num_compressed_bytes = 0;
196 std::size_t _num_failed_bytes = 0;
197 std::size_t _num_skipped_bytes = 0;
198 std::size_t _num_compressed_output_bytes = 0;
232 std::vector<column_name_info>
235 std::vector<std::unordered_map<std::string, std::string>>
244 std::unique_ptr<table>
tbl;
276 template <
typename T>
279 using non_cv_T = std::remove_cv_t<T>;
280 return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||
281 std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||
282 std::is_same_v<non_cv_T, std::byte>;
289 std::vector<std::shared_ptr<arrow::io::RandomAccessFile>>
_files;
298 explicit source_info(std::vector<std::string>
const& file_paths) : _filepaths(file_paths) {}
305 explicit source_info(std::string
const& file_path) : _filepaths({file_path}) {}
319 std::back_inserter(_host_buffers),
321 return cudf::host_span<std::byte const>{
322 reinterpret_cast<std::byte const*>(hb.data), hb.size};
346 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
350 if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
351 _host_buffers.reserve(host_buffers.size());
354 std::back_inserter(_host_buffers),
356 return cudf::host_span<std::byte const>{
357 reinterpret_cast<std::byte const*>(s.data()), s.size()};
360 _host_buffers.assign(host_buffers.begin(), host_buffers.end());
369 template <
typename T, CUDF_ENABLE_IF(is_
byte_like_type<std::remove_cv_t<T>>())>
373 reinterpret_cast<std::byte const*
>(host_data.
data()), host_data.
size())}
383 : _type(
io_type::DEVICE_BUFFER), _device_buffers(device_buffers.begin(), device_buffers.end())
393 : _type(
io_type::DEVICE_BUFFER), _device_buffers({{d_buffer}})
402 explicit source_info(std::vector<cudf::io::datasource*>
const& sources)
422 [[nodiscard]]
auto type()
const {
return _type; }
428 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
434 [[nodiscard]]
auto const&
host_buffers()
const {
return _host_buffers; }
446 [[nodiscard]]
auto const&
files()
const {
return _files; }
452 [[nodiscard]]
auto const&
user_sources()
const {
return _user_sources; }
455 io_type _type = io_type::FILEPATH;
456 std::vector<std::string> _filepaths;
457 std::vector<cudf::host_span<std::byte const>> _host_buffers;
458 std::vector<cudf::device_span<std::byte const>> _device_buffers;
459 std::vector<cudf::io::datasource*> _user_sources;
479 explicit sink_info(std::vector<std::string>
const& file_paths)
480 : _type(
io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths)
490 : _type(
io_type::FILEPATH), _filepaths({file_path})
499 explicit sink_info(std::vector<std::vector<char>*>
const& buffers)
500 : _type(
io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(buffers)
508 explicit sink_info(std::vector<char>* buffer) : _type(
io_type::HOST_BUFFER), _buffers({buffer}) {}
515 explicit sink_info(std::vector<cudf::io::data_sink*>
const& user_sinks)
535 [[nodiscard]]
auto type()
const {
return _type; }
541 [[nodiscard]]
auto num_sinks()
const {
return _num_sinks; }
547 [[nodiscard]]
auto const&
filepaths()
const {
return _filepaths; }
553 [[nodiscard]]
auto const&
buffers()
const {
return _buffers; }
559 [[nodiscard]]
auto const&
user_sinks()
const {
return _user_sinks; }
563 size_t _num_sinks = 1;
564 std::vector<std::string> _filepaths;
565 std::vector<std::vector<char>*> _buffers;
566 std::vector<cudf::io::data_sink*> _user_sinks;
569 class table_input_metadata;
576 std::string _name =
"";
577 std::optional<bool> _nullable;
578 bool _list_column_is_map =
false;
579 bool _use_int96_timestamp =
false;
580 bool _output_as_binary =
false;
581 std::optional<uint8_t> _decimal_precision;
582 std::optional<int32_t> _parquet_field_id;
583 std::vector<column_in_metadata> children;
601 children.push_back(child);
638 _list_column_is_map =
true;
652 _use_int96_timestamp = req;
665 _decimal_precision = precision;
677 _parquet_field_id = field_id;
691 _output_as_binary = binary;
716 [[nodiscard]] std::string
get_name() const noexcept {
return _name; }
732 [[nodiscard]]
bool nullable()
const {
return _nullable.value(); }
739 [[nodiscard]]
bool is_map() const noexcept {
return _list_column_is_map; }
756 return _decimal_precision.has_value();
775 return _parquet_field_id.has_value();
849 bool _convert_binary_to_strings{
true};
851 std::vector<reader_column_schema> children;
870 children.assign(child_span.
begin(), child_span.
end());
881 children.push_back(child);
911 _convert_binary_to_strings = convert_to_string;
922 return _convert_binary_to_strings;
auto num_skipped_bytes() const noexcept
Returns the number of bytes in blocks that were skipped during compression.
@ ADAPTIVE
Use dictionary when it will not impact compression.
partition_info(size_type start_row, size_type num_rows)
Construct a new partition_info.
compression_type
Compression algorithms.
@ MINIMAL
Quote only fields which contain special characters.
statistics_freq
Column statistics granularity type for parquet/orc writers.
source_info(std::vector< host_buffer > const &host_buffers)
Construct a new source info object for multiple buffers in host memory.
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
Constructor with initial values.
reader_column_schema const & child(size_type i) const
Get const reference to a child of this column.
Destination information for write interfaces.
schema element for reader
int32_t size_type
Row index type for columns and tables.
@ NEVER
Never use dictionary encoding.
auto type() const
Get the type of the input.
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
Construct a new source info object for multiple buffers in host memory.
sink_info(std::string const &file_path)
Construct a new sink info object for a single file.
auto const & device_buffers() const
Get the device buffers of the input.
C++20 std::span with reduced feature set.
Type declarations for libcudf.
constexpr auto is_byte_like_type()
Returns true if the type is byte-like, meaning it is reasonable to pass as a pointer to bytes.
sink_info(size_t num_sinks)
Construct a new sink info object.
writer_compression_statistics()=default
Default constructor.
constexpr iterator end() const noexcept
Returns an iterator to the element following the last element of the span.
@ FILEPATH
Input/output is a file path.
A set of cudf::column's of the same size.
std::string name
Column name.
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
Adds the values from another writer_compression_statistics object.
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
Construct a new sink info object for multiple user-implemented sinks.
reader_column_schema & child(size_type i)
Get reference to a child of this column.
auto compression_ratio() const noexcept
Returns the compression ratio for the successfully compressed blocks.
bool nullable(table_view const &view)
Returns True if any of the columns in the table is nullable (not entire hierarchy).
Non-owning view of a host memory buffer.
size_t size
Size of the buffer.
reader_column_schema(size_type number_of_children)
Construct a new reader column schema object.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
auto const & host_buffers() const
Get the host buffers of the input.
auto const & filepaths() const
Get the filepaths of the input.
auto const & filepaths() const
Get the filepaths of the input.
auto const & files() const
Get the input files.
sink_info(std::vector< char > *buffer)
Construct a new sink info object for a single host buffer.
sink_info(std::vector< std::vector< char > * > const &buffers)
Construct a new sink info object for multiple host buffers.
Source information for read interfaces.
auto num_total_input_bytes() const noexcept
Returns the total size of compression inputs.
constexpr pointer data() const noexcept
Returns a pointer to the beginning of the sequence.
Interface class for storing the output data from the writers.
source_info(std::vector< cudf::io::datasource * > const &sources)
Construct a new source info object for multiple user-implemented sources.
std::vector< column_name_info > children
Child column names.
source_info(char const *host_data, size_t size)
Construct a new source info object for a single buffer.
@ STATISTICS_COLUMN
Full column and offset indices. Implies STATISTICS_ROWGROUP.
A set of cudf::column_view's of the same size.
@ STATISTICS_PAGE
Per-page column statistics.
auto num_sinks() const
Get the number of sinks.
source_info(cudf::device_span< std::byte const > d_buffer)
Construct a new source info object from a device buffer.
source_info(std::vector< std::string > const &file_paths)
Construct a new source info object for multiple files.
sink_info(std::vector< std::string > const &file_paths)
Construct a new sink info object for multiple files.
sink_info(class cudf::io::data_sink *user_sink)
Construct a new sink info object for a single user-implemented sink.
size_t get_num_children() const
Get the number of child objects.
std::vector< std::shared_ptr< arrow::io::RandomAccessFile > > _files
Input files.
auto const & user_sources() const
Get the user sources of the input.
Statistics about compression performed by a writer.
bool is_enabled_convert_binary_to_strings() const
Get whether to encode this column as binary or string data.
dictionary_policy
Control use of dictionary encoding for parquet writer.
@ STATISTICS_NONE
No column statistics.
Information used while writing partitioned datasets.
size_type num_rows
The number of rows in the partition.
source_info(std::string const &file_path)
Construct a new source info object for a single file.
Class definition for cudf::table.
Detailed name information for output columns.
auto type() const
Get the type of the input.
auto const & user_sinks() const
Get the user sinks of the output.
reader_column_schema(host_span< reader_column_schema > const &child_span)
Construct a new reader column schema object with a span defining the children.
source_info(cudf::host_span< T > host_data)
Construct a new source info object for a single buffer.
column_name_info(std::string const &_name)
Construct a column name info with a name and no children.
Device version of C++20 std::span with reduced feature set.
Interface class for providing input data to the readers.
auto num_compressed_bytes() const noexcept
Returns the number of bytes in blocks that were successfully compressed.
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
Specifies whether this column should be written as binary or string data. Only valid for the following...
auto const & buffers() const
Get the host buffers of the output.
reader_column_schema & add_child(reader_column_schema const &child)
Add the children metadata of this column.
auto num_failed_bytes() const noexcept
Returns the number of bytes in blocks that failed to compress.
constexpr size_type size() const noexcept
Returns the number of elements in the span.
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
Construct a new source info object for multiple buffers in device memory.
char const * data
Pointer to the buffer.
size_type start_row
The start row of the partition.
constexpr iterator begin() const noexcept
Returns an iterator to the first element of the span.
host_buffer(char const *data, size_t size)
Construct a new host buffer object.
source_info(cudf::io::datasource *source)
Construct a new source info object for a single user-implemented source.
@ ALWAYS
Use dictionary regardless of impact on compression.
quote_style
Behavior when handling quotations in field data.
io_type
Data source or destination types.