19 #include <cudf/io/detail/orc.hpp>
23 #include <cudf/utilities/export.hpp>
29 #include <unordered_map>
33 namespace CUDF_EXPORT
cudf {
// Names of the columns to read; holds no value until a list is assigned
// (via set_columns() or the builder's columns()).
57 std::optional<std::vector<std::string>> _columns;
// Stripes to read, one inner list per input source. The setters visible in
// this file reject combining a non-empty stripe list with skip_rows/num_rows.
60 std::vector<std::vector<size_type>> _stripes;
// Number of rows to skip from the start; defaults to 0 (skip nothing).
62 int64_t _skip_rows = 0;
// Number of rows to read; holds no value until set_num_rows() assigns one.
64 std::optional<int64_t> _num_rows;
// Whether to use the row index to speed up reading; enabled by default.
67 bool _use_index =
true;
// Whether to use numpy-compatible dtypes; enabled by default.
70 bool _use_np_dtypes =
true;
// Timestamp type to which timestamp columns will be cast; initialized to
// type_id::EMPTY.
72 data_type _timestamp_type{type_id::EMPTY};
// Fully qualified names of the columns that should be read as 128-bit Decimal.
75 std::vector<std::string> _decimal128_columns;
/**
 * @brief Returns the names of the columns to read, if set.
 *
 * @return Const reference to the `_columns` member (an optional list of names);
 * the optional holds no value when no column list has been assigned
 */
114 [[nodiscard]]
auto const&
get_columns()
const {
return _columns; }
/**
 * @brief Returns the stripes to read for each input source.
 *
 * @return Const reference to the `_stripes` member, a vector holding one vector
 * of stripe indices per input source
 */
121 [[nodiscard]]
auto const&
get_stripes()
const {
return _stripes; }
/**
 * @brief Returns the number of rows to read.
 *
 * @return Const reference to the optional `_num_rows` member; it holds no value
 * when a row count has not been assigned
 */
136 [[nodiscard]] std::optional<int64_t>
const&
get_num_rows()
const {
return _num_rows; }
166 return _decimal128_columns;
/**
 * @brief Sets the names of the columns to read.
 *
 * @param col_names Vector of column names; taken by value and moved into
 * `_columns`, so callers can pass an rvalue to avoid a copy
 */
176 void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
190 CUDF_EXPECTS(stripes.empty() or (_skip_rows == 0),
"Can't set stripes along with skip_rows");
191 CUDF_EXPECTS(stripes.empty() or not _num_rows.has_value(),
192 "Can't set stripes along with num_rows");
193 _stripes = std::move(stripes);
206 CUDF_EXPECTS(rows >= 0,
"skip_rows cannot be negative");
207 CUDF_EXPECTS(rows == 0 or _stripes.empty(),
"Can't set both skip_rows along with stripes");
221 CUDF_EXPECTS(nrows >= 0,
"num_rows cannot be negative");
222 CUDF_EXPECTS(_stripes.empty(),
"Can't set both num_rows and stripes");
254 _decimal128_columns = std::move(val);
287 options._columns = std::move(col_names);
335 options._use_index = use;
347 options._use_np_dtypes = use;
359 options._timestamp_type = type;
371 options._decimal128_columns = std::move(val);
475 std::size_t chunk_read_limit,
476 std::size_t pass_read_limit,
497 std::size_t chunk_read_limit,
498 std::size_t pass_read_limit,
516 std::size_t chunk_read_limit,
547 std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
593 std::optional<table_input_metadata> _metadata;
595 std::map<std::string, std::string> _user_data;
597 std::shared_ptr<writer_compression_statistics> _compression_stats;
599 bool _enable_dictionary_sort =
true;
610 : _sink(std::move(sink)), _table(std::move(
table))
684 auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
685 return unaligned_stride - unaligned_stride % 8;
719 return _compression_stats;
759 CUDF_EXPECTS(size_bytes >= 64 << 10,
"64KB is the minimum stripe size");
760 _stripe_size_bytes = size_bytes;
775 CUDF_EXPECTS(size_rows >= 512,
"Maximum stripe size cannot be smaller than 512");
776 _stripe_size_rows = size_rows;
790 CUDF_EXPECTS(stride >= 512,
"Row index stride cannot be smaller than 512");
791 _row_index_stride = stride;
815 _user_data = std::move(metadata);
825 _compression_stats = std::move(comp_stats);
868 options._compression = comp;
885 options._stats_freq = val;
933 options._table = tbl;
945 options._metadata = std::move(meta);
957 options._user_data = std::move(metadata);
968 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
970 options._compression_stats = comp_stats;
982 options._enable_dictionary_sort = val;
1039 std::optional<table_input_metadata> _metadata;
1041 std::map<std::string, std::string> _user_data;
1043 std::shared_ptr<writer_compression_statistics> _compression_stats;
1045 bool _enable_dictionary_sort =
true;
1115 auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
1116 return unaligned_stride - unaligned_stride % 8;
1143 return _compression_stats;
1183 CUDF_EXPECTS(size_bytes >= 64 << 10,
"64KB is the minimum stripe size");
1184 _stripe_size_bytes = size_bytes;
1199 CUDF_EXPECTS(size_rows >= 512,
"maximum stripe size cannot be smaller than 512");
1200 _stripe_size_rows = size_rows;
1214 CUDF_EXPECTS(stride >= 512,
"Row index stride cannot be smaller than 512");
1215 _row_index_stride = stride;
1232 _user_data = std::move(metadata);
1242 _compression_stats = std::move(comp_stats);
1282 options._compression = comp;
1299 options._stats_freq = val;
1347 options._metadata = std::move(meta);
1358 std::map<std::string, std::string> metadata)
1360 options._user_data = std::move(metadata);
1371 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
1373 options._compression_stats = comp_stats;
1385 options._enable_dictionary_sort = val;
Indicator for the logical data type of an element in a column.
The chunked orc reader class to read an ORC file iteratively into a series of tables,...
chunked_orc_reader(std::size_t chunk_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from output size limits along with other ORC reader options.
bool has_next() const
Check if there is any data in the given data sources that has not yet been read.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits along with other ORC reader options.
~chunked_orc_reader()
Destructor, destroying the internal reader instance.
table_with_metadata read_chunk() const
Read a chunk of rows in the given data sources.
chunked_orc_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, size_type output_row_granularity, orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct the reader from input/output size limits, output row granularity, along with other ORC read...
chunked_orc_reader()
Default constructor, this should never be used.
Builds settings to use for write_orc_chunked().
chunked_orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
chunked_orc_writer_options && build()
Moves the chunked_orc_writer_options member once it's built.
chunked_orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
chunked_orc_writer_options_builder()=default
Default constructor.
chunked_orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
chunked_orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
chunked_orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
chunked_orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
chunked_orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
chunked_orc_writer_options_builder(sink_info const &sink)
Constructor from sink.
chunked_orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
chunked_orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
Settings to use for write_orc_chunked().
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
chunked_orc_writer_options()=default
Default constructor.
void metadata(table_input_metadata meta)
Sets associated metadata.
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
sink_info const & get_sink() const
Returns sink info.
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
auto get_row_index_stride() const
Returns the row index stride.
void set_row_index_stride(size_type stride)
Sets the row index stride.
statistics_freq get_statistics_freq() const
Returns granularity of statistics collection.
void set_compression(compression_type comp)
Sets compression type.
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
auto const & get_metadata() const
Returns associated metadata.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
compression_type get_compression() const
Returns compression type.
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
static chunked_orc_writer_options_builder builder(sink_info const &sink)
Create builder to create chunked_orc_writer_options.
Chunked ORC writer class that writes an ORC file in a chunked/streamed form.
~orc_chunked_writer()
Virtual destructor, added so we don't leak detail types.
orc_chunked_writer()
Default constructor, this should never be used. This is added just to satisfy cython.
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
std::unique_ptr< orc::detail::writer > writer
Unique pointer to impl writer class.
orc_chunked_writer & write(table_view const &table)
Writes table to output.
void close()
Finishes the chunked/streamed write process.
Builds settings to use for read_orc().
orc_reader_options_builder & use_index(bool use)
Enable/Disable use of row index to speed-up reading.
orc_reader_options_builder & decimal128_columns(std::vector< std::string > val)
Columns that should be read as 128-bit Decimal.
orc_reader_options_builder & use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
orc_reader_options_builder & skip_rows(int64_t rows)
Sets number of rows to skip from the start.
orc_reader_options_builder()=default
Default constructor.
orc_reader_options_builder(source_info src)
Constructor from source info.
orc_reader_options_builder & stripes(std::vector< std::vector< size_type >> stripes)
Sets list of individual stripes to read per source.
orc_reader_options_builder & num_rows(int64_t nrows)
Sets number of rows to read.
orc_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to read.
orc_reader_options && build()
Moves the orc_reader_options member once it's built.
orc_reader_options_builder & timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Settings to use for read_orc().
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
orc_reader_options()=default
Default constructor.
void enable_use_np_dtypes(bool use)
Enable/Disable use of numpy-compatible dtypes.
void set_num_rows(int64_t nrows)
Sets number of rows to read.
auto const & get_stripes() const
Returns vector of vectors, stripes to read for each input source.
void set_decimal128_columns(std::vector< std::string > val)
Set columns that should be read as 128-bit Decimal.
void set_skip_rows(int64_t rows)
Sets number of rows to skip from the start.
void enable_use_index(bool use)
Enable/Disable use of row index to speed-up reading.
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to read.
void set_stripes(std::vector< std::vector< size_type >> stripes)
Sets list of stripes to read for each input source.
data_type get_timestamp_type() const
Returns timestamp type to which timestamp column will be cast.
auto const & get_columns() const
Returns names of the columns to read, if set.
static orc_reader_options_builder builder(source_info src)
Creates orc_reader_options_builder which will build orc_reader_options.
std::optional< int64_t > const & get_num_rows() const
Returns number of rows to read.
source_info const & get_source() const
Returns source info.
bool is_enabled_use_np_dtypes() const
Whether to use numpy-compatible dtypes.
bool is_enabled_use_index() const
Whether to use row index to speed-up reading.
std::vector< std::string > const & get_decimal128_columns() const
Returns fully qualified names of columns that should be read as 128-bit Decimal.
void set_timestamp_type(data_type type)
Sets timestamp type to which timestamp column will be cast.
Builds settings to use for write_orc().
orc_writer_options_builder & table(table_view tbl)
Sets table to be written to output.
orc_writer_options_builder & row_index_stride(size_type val)
Sets the row index stride.
orc_writer_options_builder & enable_statistics(statistics_freq val)
Choose granularity of column statistics to be written.
orc_writer_options_builder & metadata(table_input_metadata meta)
Sets associated metadata.
orc_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
orc_writer_options && build()
Moves the orc_writer_options member once it's built.
orc_writer_options_builder()=default
Default constructor.
orc_writer_options_builder & key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
orc_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
orc_writer_options_builder & stripe_size_rows(size_type val)
Sets the maximum number of rows in output stripes.
orc_writer_options_builder & enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
orc_writer_options_builder & compression(compression_type comp)
Sets compression type.
orc_writer_options_builder & stripe_size_bytes(size_t val)
Sets the maximum stripe size, in bytes.
Settings to use for write_orc().
void enable_statistics(statistics_freq val)
Choose granularity of statistics collection.
auto const & get_metadata() const
Returns associated metadata.
std::map< std::string, std::string > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
bool is_enabled_statistics() const
Whether writing column statistics is enabled/disabled.
auto get_stripe_size_bytes() const
Returns maximum stripe size, in bytes.
void set_stripe_size_rows(size_type size_rows)
Sets the maximum stripe size, in rows.
void set_key_value_metadata(std::map< std::string, std::string > metadata)
Sets Key-Value footer metadata.
auto get_stripe_size_rows() const
Returns maximum stripe size, in rows.
table_view get_table() const
Returns table to be written to output.
void set_metadata(table_input_metadata meta)
Sets associated metadata.
statistics_freq get_statistics_freq() const
Returns frequency of statistics collection.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
auto get_row_index_stride() const
Returns the row index stride.
void set_table(table_view tbl)
Sets table to be written to output.
orc_writer_options()=default
Default constructor.
void set_compression(compression_type comp)
Sets compression type.
void set_enable_dictionary_sort(bool val)
Sets whether string dictionaries should be sorted.
void set_row_index_stride(size_type stride)
Sets the row index stride.
compression_type get_compression() const
Returns compression type.
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create orc_writer_options.
bool get_enable_dictionary_sort() const
Returns whether string dictionaries should be sorted.
void set_stripe_size_bytes(size_t size_bytes)
Sets the maximum stripe size, in bytes.
sink_info const & get_sink() const
Returns sink info.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_stripe_size_rows
1M rows default orc stripe rows
constexpr size_type default_row_index_stride
10K rows default orc row index stride
table_with_metadata read_orc(orc_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Reads an ORC dataset into a set of columns.
constexpr size_t default_stripe_size_bytes
64MB default orc stripe size
statistics_freq
Column statistics granularity type for parquet/orc writers.
compression_type
Compression algorithms.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
@ STATISTICS_NONE
No column statistics.
@ STATISTICS_PAGE
Per-page column statistics.
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to ORC format.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
int32_t size_type
Row index type for columns and tables.
cuDF-IO API type definitions
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.