Aggregation Factories#
- group aggregation_factories
Enums
-
enum class rank_percentage : int32_t#
Whether the returned rank should be expressed as a percentage, and if so, which percentage normalization to apply.
Values:
-
enumerator NONE#
rank
-
enumerator ZERO_NORMALIZED#
rank / count
-
enumerator ONE_NORMALIZED#
(rank - 1) / (count - 1)
-
enum class udf_type : bool#
Type of code in the user defined function string.
Values:
-
enumerator CUDA#
-
enumerator PTX#
Functions
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_sum_aggregation()# Factory to create a SUM aggregation
- Returns:
A SUM aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_product_aggregation()# Factory to create a PRODUCT aggregation
- Returns:
A PRODUCT aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_min_aggregation()# Factory to create a MIN aggregation
- Returns:
A MIN aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_max_aggregation()# Factory to create a MAX aggregation
- Returns:
A MAX aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE)# Factory to create a COUNT aggregation.
- Parameters:
null_handling – Indicates if null values will be counted
- Returns:
A COUNT aggregation object
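As a minimal usage sketch (not part of this reference), a COUNT aggregation is typically attached to a groupby request; here `keys` (a table_view) and `vals` (a column_view) are assumed placeholders and error handling is omitted:
// Sketch: count values per group, excluding nulls.
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests(1);
requests[0].values = vals;
requests[0].aggregations.push_back(
  cudf::make_count_aggregation<cudf::groupby_aggregation>(cudf::null_policy::EXCLUDE));
auto [unique_keys, results] = gb.aggregate(requests);
// results[0].results[0] holds one count per group.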
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_any_aggregation()# Factory to create an ANY aggregation
- Returns:
An ANY aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_all_aggregation()# Factory to create an ALL aggregation
- Returns:
An ALL aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_histogram_aggregation()# Factory to create a HISTOGRAM aggregation
- Returns:
A HISTOGRAM aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_sum_of_squares_aggregation()# Factory to create a SUM_OF_SQUARES aggregation
- Returns:
A SUM_OF_SQUARES aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_mean_aggregation()# Factory to create a MEAN aggregation
- Returns:
A MEAN aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_m2_aggregation()# Factory to create an M2 aggregation.
An M2 aggregation is the sum of squares of differences from the mean. That is:
M2 = SUM((x - MEAN) * (x - MEAN))
This aggregation produces the intermediate values that are used to compute variance and standard deviation across multiple discrete sets. See #Parallel_algorithm for more detail.
- Returns:
An M2 aggregation object
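To make the formula concrete, the following host-side illustration (plain C++ for explanation only, not a libcudf call; it assumes a non-empty input) computes M2 directly from its definition:
#include <vector>
// Illustration only: M2 = SUM((x - MEAN) * (x - MEAN)) over a non-empty set.
double m2_of(std::vector<double> const& xs)
{
  double mean = 0.0;
  for (double x : xs) { mean += x; }
  mean /= static_cast<double>(xs.size());
  double m2 = 0.0;
  for (double x : xs) { m2 += (x - mean) * (x - mean); }
  return m2;
}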
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1)# Factory to create a VARIANCE aggregation.
- Parameters:
ddof – Delta degrees of freedom. The divisor used in the calculation of variance is N - ddof, where N is the population size.
- Throws:
cudf::logic_error – if the input type is a chrono or compound type
- Returns:
A VARIANCE aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_std_aggregation(size_type ddof = 1)# Factory to create a STD aggregation.
- Parameters:
ddof – Delta degrees of freedom. The divisor used in the calculation of the standard deviation is N - ddof, where N is the population size.
- Throws:
cudf::logic_error – if the input type is a chrono or compound type
- Returns:
A STD aggregation object
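As a hedged usage sketch, a STD aggregation can be passed to a reduction; `col` is a placeholder for an existing numeric column_view, and the exact cudf::reduce overload may differ between versions:
// Sketch: standard deviation of a column via reduction, with ddof = 1.
auto std_agg = cudf::make_std_aggregation<cudf::reduce_aggregation>(1);
auto result  = cudf::reduce(col, *std_agg, cudf::data_type{cudf::type_id::FLOAT64});
// `result` is a std::unique_ptr<cudf::scalar> holding sqrt(variance).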
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_median_aggregation()# Factory to create a MEDIAN aggregation
- Returns:
A MEDIAN aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const &quantiles, interpolation interp = interpolation::LINEAR)# Factory to create a QUANTILE aggregation.
- Parameters:
quantiles – The desired quantiles
interp – The desired interpolation
- Returns:
A QUANTILE aggregation object
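As a brief sketch, the quantiles vector and interpolation are the only inputs to this factory; for example, requesting the quartiles for a groupby might look like the following (whether QUANTILE is supported depends on the operation the Base type targets):
// Sketch: request the 25th, 50th, and 75th percentiles with linear interpolation.
auto quartiles_agg = cudf::make_quantile_aggregation<cudf::groupby_aggregation>(
  {0.25, 0.5, 0.75}, cudf::interpolation::LINEAR);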
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_argmax_aggregation()# Factory to create an ARGMAX aggregation.
ARGMAX returns the index of the maximum element.
- Returns:
An ARGMAX aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_argmin_aggregation()# Factory to create an ARGMIN aggregation.
ARGMIN returns the index of the minimum element.
- Returns:
An ARGMIN aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE)# Factory to create a NUNIQUE aggregation.
NUNIQUE returns the number of unique elements.
- Parameters:
null_handling – Indicates if null values will be counted
- Returns:
A NUNIQUE aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_nth_element_aggregation(size_type n, null_policy null_handling = null_policy::INCLUDE)# Factory to create an NTH_ELEMENT aggregation.
NTH_ELEMENT returns the nth element of the group/series.
If n is not within the range [-group_size, group_size), the result for the respective group will be null. Negative indices [-group_size, -1] correspond to indices [0, group_size-1] respectively, where group_size is the size of each group.
- Parameters:
n – Index of the nth element in each group
null_handling – Indicates whether to include/exclude nulls during indexing
- Returns:
An NTH_ELEMENT aggregation object
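To illustrate the index convention described above, the following helper (illustrative only, not part of libcudf) maps n to the selected position within a group, with out-of-range values marked as producing a null result:
// Illustration: map `n` to a position within a group of size `group_size`.
// Returns -1 to denote a null result when n is outside [-group_size, group_size).
cudf::size_type effective_index(cudf::size_type n, cudf::size_type group_size)
{
  if (n < -group_size || n >= group_size) { return -1; }
  return n < 0 ? n + group_size : n;  // e.g. n = -1 maps to group_size - 1
}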
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_row_number_aggregation()# Factory to create a ROW_NUMBER aggregation
- Returns:
A ROW_NUMBER aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_ewma_aggregation(double const center_of_mass, ewm_history history)# Factory to create an EWMA aggregation.
EWMA returns a non-nullable column with the same type as the input, whose values are the exponentially weighted moving average of the input sequence. Let these values be known as the y_i.
EWMA aggregations are parameterized by a center of mass (com), which affects the contribution of the previous values (y_0 … y_{i-1}) in computing the y_i.
EWMA aggregations are also parameterized by a history, cudf::ewm_history. Special consideration has to be given to the mathematical treatment of the first value of the input sequence. There are two approaches: one considers the first value of the sequence to be the exponentially weighted moving average of some infinite history of data, and the other takes the first value to be the only datapoint known. These assumptions lead to two different formulas for the y_i; ewm_history selects which one is used.
EWMA aggregations have special null handling. Nulls have two effects. The first is to propagate forward the last valid value as far as it has been computed; this could be thought of as the nulls not affecting the average in any way. The second effect changes the way the y_i are computed. Since a moving average is conceptually designed to weight contributing values by their recency, nulls ought to count as valid periods even though they do not change the average. For example, if the input sequence is {1, NULL, 3}, then when computing y_2 one should weigh y_0 as if it occurs two periods before y_2 rather than just one.
- Parameters:
center_of_mass – The center of mass
history – Which assumption to make about the first value
- Returns:
An EWMA aggregation object
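For intuition, one common way to write the infinite-history case is the recursive form below, with alpha = 1 / (1 + center_of_mass); this is explanatory host-side C++ only, and the finite-history variant and the null handling described above are omitted:
#include <vector>
// Illustration only: EWMA assuming the first value summarizes an infinite history,
// i.e. y_0 = x_0 and y_i = (1 - alpha) * y_{i-1} + alpha * x_i.
std::vector<double> ewma_sketch(std::vector<double> const& x, double center_of_mass)
{
  std::vector<double> y(x.size());
  if (x.empty()) { return y; }
  double const alpha = 1.0 / (1.0 + center_of_mass);
  y[0] = x[0];
  for (std::size_t i = 1; i < x.size(); ++i) {
    y[i] = (1.0 - alpha) * y[i - 1] + alpha * x[i];
  }
  return y;
}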
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_rank_aggregation(rank_method method, order column_order = order::ASCENDING, null_policy null_handling = null_policy::EXCLUDE, null_order null_precedence = null_order::AFTER, rank_percentage percentage = rank_percentage::NONE)# Factory to create a RANK aggregation.
RANK returns a column of size_type or double "ranks" (see the note below on how the data type is determined) for a given rank method and column order. If nulls are excluded, the rank will be null for those rows; otherwise a non-nullable column is returned. A double precision column is returned only when percentage!=NONE and the rank method is AVERAGE.
This aggregation only works with "scan" algorithms. The input column into the grouped or ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. If rows are ordered by more than one column, the orderby input column should be a struct column containing the ordering columns.
Note:
This method can work faster when the rows are presorted by the group keys and order_by columns. Although the groupby object does not require the order_by column to be sorted, the groupby rank scan aggregation does require the order_by column to be sorted if the keys are sorted.
RANK aggregations are not compatible with exclusive scans.
All rank methods except the AVERAGE method with percentage!=NONE return a size_type column. For the AVERAGE method with percentage!=NONE, the return type is a double column.
Example: Consider a motor-racing statistics dataset, containing the following columns:
  1. venue:  (STRING) Location of the race event
  2. driver: (STRING) Name of the car driver (abbreviated to 3 characters)
  3. time:   (INT32)  Time taken to complete the circuit
For the following presorted data:
  [ // venue,        driver,               time
    { "silverstone", "HAM" ("hamilton"),   15823},
    { "silverstone", "LEC" ("leclerc"),    15827},
    { "silverstone", "BOT" ("bottas"),     15834},  // <-- Tied for 3rd place.
    { "silverstone", "NOR" ("norris"),     15834},  // <-- Tied for 3rd place.
    { "silverstone", "RIC" ("ricciardo"),  15905},
    { "monza",       "RIC" ("ricciardo"),  12154},
    { "monza",       "NOR" ("norris"),     12156},  // <-- Tied for 2nd place.
    { "monza",       "BOT" ("bottas"),     12156},  // <-- Tied for 2nd place.
    { "monza",       "LEC" ("leclerc"),    12201},
    { "monza",       "PER" ("perez"),      12203}
  ]
A grouped rank aggregation scan with:
  groupby column      : venue
  input orderby column: time
Produces the following rank column for each method:
  first:   { 1, 2, 3,   4,   5, 1, 2,   3,   4, 5}
  average: { 1, 2, 3.5, 3.5, 5, 1, 2.5, 2.5, 4, 5}
  min:     { 1, 2, 3,   3,   5, 1, 2,   2,   4, 5}
  max:     { 1, 2, 4,   4,   5, 1, 3,   3,   4, 5}
  dense:   { 1, 2, 3,   3,   4, 1, 2,   2,   3, 4}
This corresponds to the following grouping and `driver` rows:
  { "HAM", "LEC", "BOT", "NOR", "RIC",  "RIC", "NOR", "BOT", "LEC", "PER" }
    <----------silverstone----------->|<-------------monza-------------->
min rank for each percentage type:
  NONE:            { 1,    2,    3,    3,    5,    1,    2,    2,    4,    5 }
  ZERO_NORMALIZED: { 0.16, 0.33, 0.50, 0.50, 0.83, 0.16, 0.33, 0.33, 0.66, 0.83 }
  ONE_NORMALIZED:  { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 }
where count corresponds to the number of rows in the group. See cudf::rank_percentage.
- Parameters:
method – The ranking method used for tie breaking (same values)
column_order – The desired sort order for ranking
null_handling – Flag to include nulls during ranking. If nulls are not included, the corresponding rank will be null.
null_precedence – The desired order of null compared to other elements for column
percentage – enum to denote the type of conversion of ranks to percentage in range (0,1]
- Returns:
A RANK aggregation object
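A hedged usage sketch of the grouped rank scan from the example above; `venue_keys` and `time` are placeholder names for presorted inputs, and the groupby constructor arguments shown are illustrative:
// Sketch: grouped MIN rank with ONE_NORMALIZED percentage, as in the example.
cudf::groupby::groupby gb(venue_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);
std::vector<cudf::groupby::scan_request> requests(1);
requests[0].values = time;  // orderby column
requests[0].aggregations.push_back(
  cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
    cudf::rank_method::MIN,
    cudf::order::ASCENDING,
    cudf::null_policy::EXCLUDE,
    cudf::null_order::AFTER,
    cudf::rank_percentage::ONE_NORMALIZED));
auto [group_keys, results] = gb.scan(requests);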
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE)# Factory to create a COLLECT_LIST aggregation.
COLLECT_LIST returns a list column of all included elements in the group/series.
If null_handling is set to EXCLUDE, null elements are dropped from each of the list rows.
- Parameters:
null_handling – Indicates whether to include/exclude nulls in list elements
- Returns:
A COLLECT_LIST aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL)# Factory to create a COLLECT_SET aggregation.
COLLECT_SET returns a lists column of all included elements in the group/series. Within each list, duplicate entries are dropped so that each entry appears only once.
If null_handling is set to EXCLUDE, null elements are dropped from each of the list rows.
- Parameters:
null_handling – Indicates whether to include/exclude nulls during collection
nulls_equal – Flag to specify whether null entries within each list should be considered equal.
nans_equal – Flag to specify whether NaN values in floating point column should be considered equal.
- Returns:
A COLLECT_SET aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_lag_aggregation(size_type offset)# Factory to create a LAG aggregation.
- Parameters:
offset – The number of rows to lag the input
- Returns:
A LAG aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_lead_aggregation(size_type offset)# Factory to create a LEAD aggregation.
- Parameters:
offset – The number of rows to lead the input
- Returns:
A LEAD aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_udf_aggregation(udf_type type, std::string const &user_defined_aggregator, data_type output_type)# Factory to create an aggregation based on a UDF string in PTX or CUDA.
- Parameters:
type – [in] either udf_type::PTX or udf_type::CUDA
user_defined_aggregator – [in] A string containing the aggregator code
output_type – [in] expected output type
- Returns:
An aggregation containing a user-defined aggregator string
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> host_udf)# Factory to create a HOST_UDF aggregation.
- Parameters:
host_udf – An instance of a class derived from host_udf_base to perform the aggregation
- Returns:
A HOST_UDF aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_merge_lists_aggregation()# Factory to create a MERGE_LISTS aggregation.
Given a lists column, this aggregation merges all the lists corresponding to the same key value into one list. It is designed specifically to merge the partial results of multiple (distributed) groupby COLLECT_LIST aggregations into a final COLLECT_LIST result. As such, it requires the input lists column to be non-nullable (the child column containing list entries is not subject to this requirement).
- Returns:
A MERGE_LISTS aggregation object
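A hedged sketch of the distributed pattern this factory targets: each worker produces a groupby COLLECT_LIST result, the partial keys and lists are concatenated, and a second groupby with MERGE_LISTS combines them (`partial_keys` and `partial_lists` are placeholder names):
// Sketch: merge concatenated partial COLLECT_LIST results into final lists.
cudf::groupby::groupby gb(partial_keys);
std::vector<cudf::groupby::aggregation_request> requests(1);
requests[0].values = partial_lists;  // non-nullable lists column of partial results
requests[0].aggregations.push_back(
  cudf::make_merge_lists_aggregation<cudf::groupby_aggregation>());
auto [merged_keys, merged] = gb.aggregate(requests);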
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL)# Factory to create a MERGE_SETS aggregation.
Given a lists column, this aggregation first merges all the lists corresponding to the same key value into one list, then drops all duplicate entries in each list, producing a lists column containing non-repeated entries.
This aggregation is designed specifically to merge the partial results of multiple (distributed) groupby COLLECT_LIST or COLLECT_SET aggregations into a final COLLECT_SET result. As such, it requires the input lists column to be non-nullable (the child column containing list entries is not subject to this requirement).
In practice, the input (partial results) to this aggregation should be generated by (distributed) COLLECT_LIST aggregations, not COLLECT_SET, to avoid unnecessarily removing duplicate entries from the partial results.
- Parameters:
nulls_equal – Flag to specify whether nulls within each list should be considered equal during dropping duplicate list entries.
nans_equal – Flag to specify whether NaN values in floating point column should be considered equal during dropping duplicate list entries.
- Returns:
A MERGE_SETS aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_merge_m2_aggregation()# Factory to create a MERGE_M2 aggregation.
Merges the results of M2 aggregations on independent sets into a new M2 value equivalent to the result of a single M2 aggregation performed across all of the sets at once. This aggregation is only valid on structs whose members are the results of the COUNT_VALID, MEAN, and M2 aggregations on the same sets. The output of this aggregation is a struct containing the merged COUNT_VALID, MEAN, and M2 aggregations.
The input M2 aggregation values are expected to be all non-negative numbers, since they were output from an M2 aggregation.
- Returns:
A MERGE_M2 aggregation object
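For intuition, merging two (COUNT_VALID, MEAN, M2) partials follows the standard pairwise combination used in parallel variance algorithms; the host-side sketch below is for explanation only and is not the libcudf implementation:
// Illustration only: combine two partial (count, mean, M2) triples.
struct m2_partial { double count; double mean; double m2; };
m2_partial merge_m2(m2_partial const& a, m2_partial const& b)
{
  double const n     = a.count + b.count;
  double const delta = b.mean - a.mean;
  double const mean  = (a.count * a.mean + b.count * b.mean) / n;
  double const m2    = a.m2 + b.m2 + delta * delta * a.count * b.count / n;
  return {n, mean, m2};
}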
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_merge_histogram_aggregation()# Factory to create a MERGE_HISTOGRAM aggregation.
Merges the results of HISTOGRAM aggregations on independent sets into a new HISTOGRAM value equivalent to the result of a single HISTOGRAM aggregation performed across all of the sets at once.
- Returns:
A MERGE_HISTOGRAM aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1)# Factory to create a COVARIANCE aggregation.
Compute covariance between two columns. The input columns are the child columns of a non-nullable struct column.
- Parameters:
min_periods – Minimum number of non-null observations required to produce a result
ddof – Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is the number of non-null observations.
- Returns:
A COVARIANCE aggregation object
-
template<typename Base = aggregation>
std::unique_ptr<Base> make_correlation_aggregation(correlation_type type, size_type min_periods = 1)# Factory to create a CORRELATION aggregation.
Compute correlation coefficient between two columns. The input columns are the child columns of a non-nullable struct column.
- Parameters:
type – correlation_type
min_periods – Minimum number of non-null observations required to produce a result
- Returns:
A CORRELATION aggregation object
-
template<typename Base>
std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000)# Factory to create a TDIGEST aggregation.
Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values. The input aggregation values are expected to be fixed-width numeric types.
The tdigest column produced is of the following structure:
struct {
  // centroids for the digest
  list {
    struct {
      double    // mean
      double    // weight
    },
    …
  }
  // these are from the input stream, not the centroids. they are used
  // during the percentile_approx computation near the beginning or
  // end of the quantiles
  double        // min
  double        // max
}
Each output row is a single tdigest. The length of the row is the “size” of the tdigest, each element of which represents a weighted centroid (mean, weight).
- Parameters:
max_centroids – Parameter controlling the compression level and the accuracy of subsequent queries on the output tdigest data. max_centroids places an upper bound on the size of the computed tdigests: a value of 1000 will result in a tdigest containing no more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
- Returns:
A TDIGEST aggregation object
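A hedged usage sketch: build per-group tdigests with a groupby TDIGEST aggregation; `keys` and `vals` are placeholders, and the follow-up percentile query named in the comment is an assumption whose API may differ by version:
// Sketch: one tdigest per group, capped at 1000 centroids each.
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests(1);
requests[0].values = vals;
requests[0].aggregations.push_back(
  cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000));
auto [group_keys, results] = gb.aggregate(requests);
// The resulting tdigest column can then feed an approximate percentile query
// (e.g. cudf::percentile_approx) or be combined later with MERGE_TDIGEST.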
-
template<typename Base>
std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000)# Factory to create a MERGE_TDIGEST aggregation.
Merges the results from a previous aggregation created by make_tdigest_aggregation or make_merge_tdigest_aggregation to produce a new tdigest (https://arxiv.org/pdf/1902.04023.pdf) column.
The tdigest column produced is of the following structure:
struct {
  // centroids for the digest
  list {
    struct {
      double    // mean
      double    // weight
    },
    …
  }
  // these are from the input stream, not the centroids. they are used
  // during the percentile_approx computation near the beginning or
  // end of the quantiles
  double        // min
  double        // max
}
Each output row is a single tdigest. The length of the row is the “size” of the tdigest, each element of which represents a weighted centroid (mean, weight).
- Parameters:
max_centroids – Parameter controlling the compression level and the accuracy of subsequent queries on the output tdigest data. max_centroids places an upper bound on the size of the computed tdigests: a value of 1000 will result in a tdigest containing no more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
- Returns:
A MERGE_TDIGEST aggregation object
-
struct host_udf_base#
- #include <host_udf.hpp>
The interface for host-based UDF implementations.
An implementation of a host-based UDF needs to be derived from this base class, defining its own versions of the required functions. In particular:
The derived class is required to implement the get_empty_output, operator(), is_equal, and clone functions.
If necessary, the derived class can also override do_hash to compute the hash for its instance, and get_required_data to selectively access the input data as well as intermediate data provided by libcudf.
Example of such an implementation:
struct my_udf_aggregation : cudf::host_udf_base {
  my_udf_aggregation() = default;

  // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
  // and the result from groupby `MAX` aggregation.
  [[nodiscard]] data_attribute_set_t get_required_data() const override
  {
    return {groupby_data_attribute::GROUPED_VALUES,
            groupby_data_attribute::GROUP_OFFSETS,
            cudf::make_max_aggregation<cudf::groupby_aggregation>()};
  }

  [[nodiscard]] output_t get_empty_output(
    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
    [[maybe_unused]] rmm::cuda_stream_view stream,
    [[maybe_unused]] rmm::device_async_resource_ref mr) const override
  {
    // This UDF aggregation always returns a column of type INT32.
    return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
  }

  [[nodiscard]] output_t operator()(input_map_t const& input,
                                    rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr) const override
  {
    // Perform UDF computation using the input data and return the result.
  }

  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
  {
    // Check if the other object is also instance of this class.
    return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
  }

  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
  {
    return std::make_unique<my_udf_aggregation>();
  }
};
Public Types
-
enum class groupby_data_attribute : int32_t#
Define the possible data needed for groupby aggregations.
Note that only sort-based groupby aggregations are supported.
Values:
-
enumerator INPUT_VALUES#
The input values column.
-
enumerator GROUPED_VALUES#
The input values grouped according to the input keys, for which the values within each group maintain their original order.
-
enumerator SORTED_GROUPED_VALUES#
The input values grouped according to the input keys and sorted within each group.
-
enumerator NUM_GROUPS#
The number of groups (i.e., number of distinct keys).
-
enumerator GROUP_OFFSETS#
The offsets separating groups.
-
enumerator GROUP_LABELS#
Group labels (which are the same as group indices).
-
using data_attribute_set_t = std::unordered_set<data_attribute, data_attribute::hash, data_attribute::equal_to>#
Set of attributes for the input data that is needed for computing the aggregation.
-
using input_data_t = std::variant<column_view, size_type, device_span<size_type const>>#
Hold all possible types of the data that is passed to the derived class for executing the aggregation.
-
using input_map_t = std::unordered_map<data_attribute, input_data_t, data_attribute::hash, data_attribute::equal_to>#
Input to the aggregation, mapping from each data attribute to its actual data.
Public Functions
-
inline virtual data_attribute_set_t get_required_data() const#
Return a set of attributes for the data that is needed for computing the aggregation.
The derived class should return the attributes corresponding to only the data that it needs to avoid unnecessary computation performed in libcudf. If this function is not overridden, an empty set is returned. That means all the data attributes (except results from other aggregations in groupby) will be needed.
- Returns:
A set of data_attribute
-
virtual output_t get_empty_output(std::optional<data_type> output_dtype, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0#
Get the output when the input values column is empty.
This is called in libcudf when the input values column is empty. In such situations libcudf tries to generate the output directly without unnecessarily evaluating the intermediate data.
- Parameters:
output_dtype – The expected output data type
stream – The CUDA stream to use for any kernel launches
mr – Device memory resource to use for any allocations
- Returns:
The output result of the aggregation when the input values column is empty
-
virtual output_t operator()(input_map_t const &input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0#
Perform the main computation for the host-based UDF.
- Parameters:
input – The input data needed for performing all computation
stream – The CUDA stream to use for any kernel launches
mr – Device memory resource to use for any allocations
- Returns:
The output result of the aggregation
-
inline virtual std::size_t do_hash() const#
Computes hash value of the class’s instance.
- Returns:
The hash value of the instance
-
virtual bool is_equal(host_udf_base const &other) const = 0#
Compares two instances of the derived class for equality.
- Parameters:
other – The other derived class’s instance to compare with
- Returns:
True if the two instances are equal
-
virtual std::unique_ptr<host_udf_base> clone() const = 0#
Clones the instance.
A class derived from host_udf_base should not store too much data, so that its instances remain lightweight for efficient cloning.
- Returns:
A new instance cloned from this
-
struct data_attribute#
- #include <host_udf.hpp>
Describe possible data that may be needed in the derived class for its operations.
Such data can be either intermediate data, such as sorted values or group labels, or the results of other aggregations.
Each derived host-based UDF class may need a different set of data. It is inefficient to evaluate and pass down all possible data at once from libcudf. Instead, the derived class can define the subset of data that it needs, and libcudf will evaluate and pass down only the data requested from that set.
Public Types
-
using value_type = std::variant<groupby_data_attribute, std::unique_ptr<aggregation>>#
Hold all possible data types for the input of the aggregation in the derived class.
Public Functions
-
data_attribute() = default#
Default constructor.
-
data_attribute(data_attribute&&) = default#
Move constructor.
-
template<typename T>
inline data_attribute(T value_)# Construct a new data attribute from an aggregation attribute.
- Parameters:
value_ – An aggregation attribute
-
template<typename T>
inline data_attribute(std::unique_ptr<T> value_)# Construct a new data attribute from another aggregation request.
- Parameters:
value_ – An aggregation request
-
data_attribute(data_attribute const &other)#
Copy constructor.
- Parameters:
other – The other data attribute to copy from
Public Members
-
value_type value#
The actual data attribute, wrapped by this struct because a wrapper is needed to define the hash and equal_to functors.
-
struct equal_to#
- #include <host_udf.hpp>
Equality comparison functor for data_attribute.
Public Functions
-
bool operator()(data_attribute const &lhs, data_attribute const &rhs) const#
Check if two data attributes are equal.
- Parameters:
lhs – The left-hand side data attribute
rhs – The right-hand side data attribute
- Returns:
True if the two data attributes are equal
-
struct hash#
- #include <host_udf.hpp>
Hash functor for data_attribute.
Public Functions
-
std::size_t operator()(data_attribute const &attr) const#
Compute the hash value of a data attribute.
- Parameters:
attr – The data attribute to hash
- Returns:
The hash value of the data attribute
-
class aggregation#
- #include <aggregation.hpp>
Abstract base class for specifying the desired aggregation in an aggregation_request.
All aggregations must derive from this class to implement the pure virtual functions and potentially encapsulate additional information needed to compute the aggregation.
Subclassed by cudf::groupby_aggregation, cudf::groupby_scan_aggregation, cudf::reduce_aggregation, cudf::rolling_aggregation, cudf::scan_aggregation, cudf::segmented_reduce_aggregation
Public Types
-
enum Kind#
Possible aggregation operations.
Values:
-
enumerator SUM#
sum reduction
-
enumerator PRODUCT#
product reduction
-
enumerator MIN#
min reduction
-
enumerator MAX#
max reduction
-
enumerator COUNT_VALID#
count number of valid elements
-
enumerator COUNT_ALL#
count number of elements
-
enumerator ANY#
any reduction
-
enumerator ALL#
all reduction
-
enumerator SUM_OF_SQUARES#
sum of squares reduction
-
enumerator MEAN#
arithmetic mean reduction
-
enumerator M2#
sum of squares of differences from the mean
-
enumerator VARIANCE#
variance
-
enumerator STD#
standard deviation
-
enumerator MEDIAN#
median reduction
-
enumerator QUANTILE#
compute specified quantile(s)
-
enumerator ARGMAX#
Index of max element.
-
enumerator ARGMIN#
Index of min element.
-
enumerator NUNIQUE#
count number of unique elements
-
enumerator NTH_ELEMENT#
get the nth element
-
enumerator ROW_NUMBER#
get row-number of current index (relative to rolling window)
-
enumerator EWMA#
get exponential weighted moving average at current index
-
enumerator RANK#
get rank of current index
-
enumerator COLLECT_LIST#
collect values into a list
-
enumerator COLLECT_SET#
collect values into a list without duplicate entries
-
enumerator LEAD#
window function, accesses row at specified offset following current row
-
enumerator LAG#
window function, accesses row at specified offset preceding current row
-
enumerator PTX#
PTX based UDF aggregation.
-
enumerator CUDA#
CUDA based UDF aggregation.
-
enumerator HOST_UDF#
host based UDF aggregation
-
enumerator MERGE_LISTS#
merge multiple lists values into one list
-
enumerator MERGE_SETS#
merge multiple lists values into one list then drop duplicate entries
-
enumerator MERGE_M2#
merge partial values of M2 aggregation
-
enumerator COVARIANCE#
covariance between two sets of elements
-
enumerator CORRELATION#
correlation between two sets of elements
-
enumerator TDIGEST#
create a tdigest from a set of input values
-
enumerator MERGE_TDIGEST#
create a tdigest by merging multiple tdigests together
-
enumerator HISTOGRAM#
compute frequency of each element
-
enumerator MERGE_HISTOGRAM#
merge partial values of HISTOGRAM aggregation
Public Functions
-
inline aggregation(aggregation::Kind a)#
Construct a new aggregation object.
- Parameters:
a – aggregation::Kind enum value
-
inline virtual bool is_equal(aggregation const &other) const#
Compares two aggregation objects for equality.
- Parameters:
other – The other aggregation to compare with
- Returns:
True if the two aggregations are equal
-
inline virtual size_t do_hash() const#
Computes the hash value of the aggregation.
- Returns:
The hash value of the aggregation
-
virtual std::unique_ptr<aggregation> clone() const = 0#
Clones the aggregation object.
- Returns:
A copy of the aggregation object
-
virtual std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(data_type col_type, cudf::detail::simple_aggregations_collector &collector) const = 0#
Get the simple aggregations that this aggregation requires to compute.
- Parameters:
col_type – The type of the column to aggregate
collector – The collector visitor pattern to use to collect the simple aggregations
- Returns:
Vector of pre-requisite simple aggregations
-
class rolling_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for rolling_window specific aggregation usage.
As an example, rolling_window will only accept rolling_aggregation inputs, and the appropriate derived classes (sum_aggregation, mean_aggregation, etc) derive from this interface to represent these valid options.
-
class groupby_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for groupby specific aggregation usage.
-
class groupby_scan_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for groupby specific scan usage.
-
class reduce_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for reduction usage.
-
class scan_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for scan usage.
-
class segmented_reduce_aggregation : public virtual cudf::aggregation#
- #include <aggregation.hpp>
Derived class intended for segmented reduction usage.