aggregation.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/types.hpp>
10 #include <cudf/utilities/export.hpp>
11 
12 #include <functional>
13 #include <memory>
14 #include <vector>
15 
25 namespace CUDF_EXPORT cudf {
/// Tie-breaker method to use for ranking the column.
38 enum class rank_method : int32_t {
/// stable sort order ranking (no ties)
39  FIRST,
/// mean of first in the group
40  AVERAGE,
/// min of first in the group
41  MIN,
/// max of first in the group
42  MAX,
/// rank always increases by 1 between groups
43  DENSE
44 };
45 
/// Whether the returned rank should be a percentage and, if so, which type of percentage
/// normalization is used.
///
/// NOTE(review): this rendered listing skips source lines 53-54. The page's symbol index also
/// documents ZERO_NORMALIZED ("rank / count") and ONE_NORMALIZED ("(rank - 1) / (count - 1)");
/// presumably they are declared on the skipped lines - confirm names and order in the header.
51 enum class rank_percentage : int32_t {
52  NONE,
55 };
56 
/// Bitwise operations to use for BITWISE_AGG aggregations on numeric columns.
60 enum class bitwise_op : int32_t {
/// bitwise AND operation
61  AND,
/// bitwise OR operation
62  OR,
/// bitwise XOR operation
63  XOR
64 };
65 
/// Abstract base class for specifying the desired aggregation in an aggregation_request.
///
/// NOTE(review): this is a rendered listing with gaps in its line numbering. Items that the
/// page's symbol index documents but that are not visible here (the `kind` data member,
/// "The aggregation to perform", and a number of additional `Kind` enumerators) are
/// presumably declared on the skipped lines - confirm against the real header.
74 class aggregation {
75  public:
/// Possible aggregation operations.
79  enum Kind : int32_t {
80  SUM = 0,
/// min reduction
83  MIN,
/// max reduction
84  MAX,
/// any reduction
87  ANY,
/// all reduction
88  ALL,
/// arithmetic mean reduction
90  MEAN,
/// sum of squares of differences from the mean
91  M2,
/// standard deviation
93  STD,
/// window function, accesses row at specified offset preceding current row
106  LAG,
/// PTX based UDF aggregation.
107  PTX,
// Last enumerator; is_valid() treats only values strictly below INVALID as valid.
121  INVALID
122  };
123 
/// Default constructor.
///
/// Initializes `kind` to Kind::INVALID and then unconditionally invokes CUDF_FAIL, so this
/// constructor is not meant to be executed.
// NOTE(review): presumably present so that the virtually-derived subclasses can be declared
// without naming a kind - confirm the intent.
130  aggregation() : kind{Kind::INVALID}
131  {
132  CUDF_FAIL("No-parameter aggregation constructor should never be called");
133  }
134 
/// Construct a new aggregation object from a given aggregation kind.
///
/// After storing `kind_`, CUDF_EXPECTS checks is_valid() and reports "Invalid aggregation kind"
/// when the check fails.
140  aggregation(Kind kind_) : kind{kind_} { CUDF_EXPECTS(is_valid(), "Invalid aggregation kind"); }
// Virtual destructor: the class is used polymorphically (see the virtual members below).
142  virtual ~aggregation() = default;
143 
/// Checks if the aggregation is valid.
///
/// @return true when `kind` is non-negative and strictly less than Kind::INVALID.
150  [[nodiscard]] bool is_valid() const { return kind >= 0 && kind < Kind::INVALID; }
151 
/// Compares two aggregation objects for equality.
///
/// This base implementation compares only the `kind` of the two objects.
/// @param other The aggregation to compare against.
/// @return true when both objects have the same `kind`.
158  [[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; }
159 
/// Computes the hash value of the aggregation.
///
/// This base implementation hashes `kind` with std::hash<int>.
165  [[nodiscard]] virtual size_t do_hash() const { return std::hash<int>{}(kind); }
166 
/// Clones the aggregation object.
///
/// Pure virtual: every concrete aggregation must provide its own implementation.
/// @return A std::unique_ptr<aggregation> owning the copy.
172  [[nodiscard]] virtual std::unique_ptr<aggregation> clone() const = 0;
173 };
174 
/// Derived class intended for rolling_window specific aggregation usage.
/// Inherits virtually from aggregation and adds no members of its own.
178 class rolling_aggregation : public virtual aggregation {};
179 
/// Derived class intended for groupby specific aggregation usage.
/// Inherits virtually from aggregation and adds no members of its own.
183 class groupby_aggregation : public virtual aggregation {};
184 
/// Derived class intended for groupby specific scan usage.
/// Inherits virtually from aggregation and adds no members of its own.
188 class groupby_scan_aggregation : public virtual aggregation {};
189 
/// Derived class intended for reduction usage.
/// Inherits virtually from aggregation and adds no members of its own.
193 class reduce_aggregation : public virtual aggregation {};
194 
/// Derived class intended for scan usage.
/// Inherits virtually from aggregation and adds no members of its own.
198 class scan_aggregation : public virtual aggregation {};
199 
/// Derived class intended for segmented reduction usage.
/// Inherits virtually from aggregation and adds no members of its own.
203 class segmented_reduce_aggregation : public virtual aggregation {};
204 
/// Type of correlation method.
206 enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };
/// Type of treatment of EWM input values' first value.
208 enum class ewm_history : int32_t { INFINITE, FINITE };
209 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a SUM aggregation - no brief is shown for it in this listing.
212 template <typename Base = aggregation>
213 std::unique_ptr<Base> make_sum_aggregation();
214 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a SUM_WITH_OVERFLOW aggregation ("sum reduction with
// overflow detection") - confirm against the header's own documentation.
217 template <typename Base = aggregation>
218 std::unique_ptr<Base> make_sum_with_overflow_aggregation();
219 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a PRODUCT aggregation ("product reduction") - confirm.
222 template <typename Base = aggregation>
223 std::unique_ptr<Base> make_product_aggregation();
224 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a MIN aggregation ("min reduction") - confirm.
227 template <typename Base = aggregation>
228 std::unique_ptr<Base> make_min_aggregation();
229 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a MAX aggregation ("max reduction") - confirm.
232 template <typename Base = aggregation>
233 std::unique_ptr<Base> make_max_aggregation();
234 
/// Factory to create a COUNT aggregation.
///
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::EXCLUDE.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
241 template <typename Base = aggregation>
242 std::unique_ptr<Base> make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE);
243 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates an ANY aggregation ("any reduction") - confirm.
246 template <typename Base = aggregation>
247 std::unique_ptr<Base> make_any_aggregation();
248 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates an ALL aggregation ("all reduction") - confirm.
251 template <typename Base = aggregation>
252 std::unique_ptr<Base> make_all_aggregation();
253 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a HISTOGRAM aggregation ("compute frequency of each
// element") - confirm.
256 template <typename Base = aggregation>
257 std::unique_ptr<Base> make_histogram_aggregation();
258 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a SUM_OF_SQUARES aggregation ("sum of squares reduction")
// - confirm.
261 template <typename Base = aggregation>
262 std::unique_ptr<Base> make_sum_of_squares_aggregation();
263 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a MEAN aggregation ("arithmetic mean reduction") - confirm.
266 template <typename Base = aggregation>
267 std::unique_ptr<Base> make_mean_aggregation();
268 
/// Factory to create a M2 aggregation (M2: sum of squares of differences from the mean).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
281 template <typename Base = aggregation>
282 std::unique_ptr<Base> make_m2_aggregation();
283 
/// Factory to create a VARIANCE aggregation.
///
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of freedom - confirm.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
293 template <typename Base = aggregation>
294 std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1);
295 
/// Factory to create a STD (standard deviation) aggregation.
///
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of freedom - confirm.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
305 template <typename Base = aggregation>
306 std::unique_ptr<Base> make_std_aggregation(size_type ddof = 1);
307 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a MEDIAN aggregation ("median reduction") - confirm.
310 template <typename Base = aggregation>
311 std::unique_ptr<Base> make_median_aggregation();
312 
/// Factory to create a QUANTILE aggregation (computes the specified quantile(s)).
///
/// @param quantiles The requested quantiles.
/// @param interp Interpolation method to use when the desired quantile lies between two data
///               points; defaults to interpolation::LINEAR.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
320 template <typename Base = aggregation>
321 std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& quantiles,
322  interpolation interp = interpolation::LINEAR);
323 
/// Factory to create an ARGMAX aggregation (ARGMAX: index of max element).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
330 template <typename Base = aggregation>
331 std::unique_ptr<Base> make_argmax_aggregation();
332 
/// Factory to create an ARGMIN aggregation (ARGMIN: index of min element).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
339 template <typename Base = aggregation>
340 std::unique_ptr<Base> make_argmin_aggregation();
341 
/// Factory to create a NUNIQUE aggregation (NUNIQUE: count number of unique elements).
///
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::EXCLUDE.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
349 template <typename Base = aggregation>
350 std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE);
351 
/// Factory to create a NTH_ELEMENT aggregation (NTH_ELEMENT: get the nth element).
///
/// @param n Which element to select.
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::INCLUDE.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
366 template <typename Base = aggregation>
367 std::unique_ptr<Base> make_nth_element_aggregation(
368  size_type n, null_policy null_handling = null_policy::INCLUDE);
369 
/// Factory declaration returning a std::unique_ptr<Base>; `Base` defaults to `aggregation`.
// NOTE(review): presumably creates a ROW_NUMBER aggregation ("get row-number of current index
// (relative to rolling window)") - confirm.
372 template <typename Base = aggregation>
373 std::unique_ptr<Base> make_row_number_aggregation();
374 
/// Factory to create an EWMA aggregation (EWMA: exponential weighted moving average at the
/// current index).
///
/// @param center_of_mass No default; callers must supply it.
/// @param history Type of treatment of the EWM input values' first value; no default.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
408 template <typename Base = aggregation>
409 std::unique_ptr<Base> make_ewma_aggregation(double const center_of_mass, ewm_history history);
410 
/// Factory to create a RANK aggregation (RANK: get rank of current index).
///
/// @param method Tie-breaker method to use for ranking the column.
/// @param column_order Sort order; defaults to order::ASCENDING.
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::EXCLUDE.
/// @param null_precedence How nulls compare against other values; defaults to null_order::AFTER.
/// @param percentage Percentage normalization of the rank; defaults to rank_percentage::NONE.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
483 template <typename Base = aggregation>
484 std::unique_ptr<Base> make_rank_aggregation(rank_method method,
485  order column_order = order::ASCENDING,
486  null_policy null_handling = null_policy::EXCLUDE,
487  null_order null_precedence = null_order::AFTER,
488  rank_percentage percentage = rank_percentage::NONE);
489 
/// Factory to create a COLLECT_LIST aggregation (COLLECT_LIST: collect values into a list).
///
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::INCLUDE.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
501 template <typename Base = aggregation>
502 std::unique_ptr<Base> make_collect_list_aggregation(
503  null_policy null_handling = null_policy::INCLUDE);
504 
/// Factory to create a COLLECT_SET aggregation (COLLECT_SET: collect values into a list
/// without duplicate entries).
///
/// @param null_handling Whether to include or exclude nulls; defaults to null_policy::INCLUDE.
/// @param nulls_equal Whether two nulls are considered equal; defaults to null_equality::EQUAL.
/// @param nans_equal Whether floating-point NaN values are considered equal; defaults to
///                   nan_equality::ALL_EQUAL.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
521 template <typename Base = aggregation>
522 std::unique_ptr<Base> make_collect_set_aggregation(
523  null_policy null_handling = null_policy::INCLUDE,
524  null_equality nulls_equal = null_equality::EQUAL,
525  nan_equality nans_equal = nan_equality::ALL_EQUAL);
526 
/// Factory to create a LAG aggregation (LAG: window function, accesses row at specified offset
/// preceding current row).
///
/// @param offset The row offset.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
533 template <typename Base = aggregation>
534 std::unique_ptr<Base> make_lag_aggregation(size_type offset);
535 
/// Factory to create a LEAD aggregation (LEAD: window function, accesses row at specified
/// offset following current row).
///
/// @param offset The row offset.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
542 template <typename Base = aggregation>
543 std::unique_ptr<Base> make_lead_aggregation(size_type offset);
544 
/// Factory to create an aggregation based on a UDF for PTX or CUDA.
///
/// @param type Source language of the user defined function.
/// @param user_defined_aggregator The UDF, passed as a string.
/// @param output_type Logical data type given for the output.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
554 template <typename Base = aggregation>
555 std::unique_ptr<Base> make_udf_aggregation(udf_source_type type,
556  std::string const& user_defined_aggregator,
557  data_type output_type);
558 
559 // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
// `host_udf_base` is the fundamental interface for host-based UDF implementation; only an
// incomplete type is needed here because it is used solely as a std::unique_ptr parameter.
560 class host_udf_base;
561 
/// Factory to create a HOST_UDF aggregation (HOST_UDF: host based UDF aggregation).
///
/// @param host_udf The host UDF instance; passed as a std::unique_ptr by value, so ownership
///                 moves into the call.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
568 template <typename Base = aggregation>
569 std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> host_udf);
570 
/// Factory to create a MERGE_LISTS aggregation (MERGE_LISTS: merge multiple lists values into
/// one list).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
582 template <typename Base = aggregation>
583 std::unique_ptr<Base> make_merge_lists_aggregation();
584 
/// Factory to create a MERGE_SETS aggregation (MERGE_SETS: merge multiple lists values into
/// one list then drop duplicate entries).
///
/// @param nulls_equal Whether two nulls are considered equal; defaults to null_equality::EQUAL.
/// @param nans_equal Whether floating-point NaN values are considered equal; defaults to
///                   nan_equality::ALL_EQUAL.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
607 template <typename Base = aggregation>
608 std::unique_ptr<Base> make_merge_sets_aggregation(
609  null_equality nulls_equal = null_equality::EQUAL,
610  nan_equality nans_equal = nan_equality::ALL_EQUAL);
611 
/// Factory to create a MERGE_M2 aggregation (MERGE_M2: merge partial values of M2 aggregation).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
626 template <typename Base = aggregation>
627 std::unique_ptr<Base> make_merge_m2_aggregation();
628 
/// Factory to create a MERGE_HISTOGRAM aggregation (MERGE_HISTOGRAM: merge partial values of
/// HISTOGRAM aggregation).
///
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
637 template <typename Base = aggregation>
638 std::unique_ptr<Base> make_merge_histogram_aggregation();
639 
/// Factory to create a COVARIANCE aggregation (COVARIANCE: covariance between two sets of
/// elements).
///
/// @param min_periods Defaults to 1.
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of freedom - confirm.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
650 template <typename Base = aggregation>
651 std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
652 
/// Factory to create a CORRELATION aggregation (CORRELATION: correlation between two sets of
/// elements).
///
/// @param type The correlation method to use (see correlation_type).
/// @param min_periods Defaults to 1.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
663 template <typename Base = aggregation>
664 std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
665  size_type min_periods = 1);
666 
/// Factory to create a TDIGEST aggregation (TDIGEST: create a tdigest from a set of input
/// values).
///
/// `Base` has no default template argument here and cannot be deduced from the parameters, so
/// callers must name it explicitly.
///
/// @param max_centroids Defaults to 1000.
/// @return A std::unique_ptr<Base>.
701 template <typename Base>
702 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
703 
/// Factory to create a MERGE_TDIGEST aggregation (MERGE_TDIGEST: create a tdigest by merging
/// multiple tdigests together).
///
/// `Base` has no default template argument here and cannot be deduced from the parameters, so
/// callers must name it explicitly.
///
/// @param max_centroids Defaults to 1000.
/// @return A std::unique_ptr<Base>.
739 template <typename Base>
740 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
741 
/// Factory to create a BITWISE_AGG aggregation (BITWISE_AGG: bitwise aggregation on numeric
/// columns).
///
/// `Base` has no default template argument here and cannot be deduced from the parameters, so
/// callers must name it explicitly.
///
/// @param op The bitwise operation to use (AND, OR or XOR).
/// @return A std::unique_ptr<Base>.
748 template <typename Base>
749 std::unique_ptr<Base> make_bitwise_aggregation(bitwise_op op);
750 
/// Factory to create a TOP_K aggregation (TOP_K: top k elements in a group).
///
/// @param k Number of elements requested.
/// @param topk_order Sort order; defaults to order::DESCENDING.
/// @return A std::unique_ptr<Base>; `Base` defaults to `aggregation`.
761 template <typename Base = aggregation>
762 std::unique_ptr<Base> make_top_k_aggregation(size_type k, order topk_order = order::DESCENDING);
763 
772  // end of group
774 } // namespace CUDF_EXPORT cudf
Abstract base class for specifying the desired aggregation in an aggregation_request.
Definition: aggregation.hpp:74
bool is_valid() const
Checks if the aggregation is valid, i.e. it was constructed with a valid value for the aggregation kind.
aggregation()
Default constructor.
aggregation(Kind kind_)
Construct a new aggregation object from a given aggregation kind.
virtual bool is_equal(aggregation const &other) const
Compares two aggregation objects for equality.
virtual size_t do_hash() const
Computes the hash value of the aggregation.
Kind
Possible aggregation operations.
Definition: aggregation.hpp:79
@ PRODUCT
product reduction
Definition: aggregation.hpp:82
@ ALL
all reduction
Definition: aggregation.hpp:88
@ TOP_K
top k elements in a group
@ M2
sum of squares of differences from the mean
Definition: aggregation.hpp:91
@ TDIGEST
create a tdigest from a set of input values
@ MEAN
arithmetic mean reduction
Definition: aggregation.hpp:90
@ MERGE_M2
merge partial values of M2 aggregation
@ BITWISE_AGG
bitwise aggregation on numeric columns
@ PTX
PTX based UDF aggregation.
@ MERGE_SETS
merge multiple lists values into one list then drop duplicate entries
@ MEDIAN
median reduction
Definition: aggregation.hpp:94
@ NUNIQUE
count number of unique elements
Definition: aggregation.hpp:98
@ MERGE_HISTOGRAM
merge partial values of HISTOGRAM aggregation
@ ARGMIN
Index of min element.
Definition: aggregation.hpp:97
@ VARIANCE
variance
Definition: aggregation.hpp:92
@ CORRELATION
correlation between two sets of elements
@ STD
standard deviation
Definition: aggregation.hpp:93
@ QUANTILE
compute specified quantile(s)
Definition: aggregation.hpp:95
@ COVARIANCE
covariance between two sets of elements
@ MAX
max reduction
Definition: aggregation.hpp:84
@ MIN
min reduction
Definition: aggregation.hpp:83
@ COLLECT_SET
collect values into a list without duplicate entries
@ LAG
window function, accesses row at specified offset preceding current row
@ CUDA
CUDA based UDF aggregation.
@ LEAD
window function, accesses row at specified offset following current row
@ SUM_OF_SQUARES
sum of squares reduction
Definition: aggregation.hpp:89
@ NTH_ELEMENT
get the nth element
Definition: aggregation.hpp:99
@ EWMA
get exponential weighted moving average at current index
@ MERGE_LISTS
merge multiple lists values into one list
@ MERGE_TDIGEST
create a tdigest by merging multiple tdigests together
@ HOST_UDF
host based UDF aggregation
@ ANY
any reduction
Definition: aggregation.hpp:87
@ COLLECT_LIST
collect values into a list
@ COUNT_VALID
count number of valid elements
Definition: aggregation.hpp:85
@ ROW_NUMBER
get row-number of current index (relative to rolling window)
@ SUM_WITH_OVERFLOW
sum reduction with overflow detection
Definition: aggregation.hpp:81
@ ARGMAX
Index of max element.
Definition: aggregation.hpp:96
@ HISTOGRAM
compute frequency of each element
@ RANK
get rank of current index
@ COUNT_ALL
count number of elements
Definition: aggregation.hpp:86
virtual std::unique_ptr< aggregation > clone() const =0
Clones the aggregation object.
Kind kind
The aggregation to perform.
Indicator for the logical data type of an element in a column.
Definition: types.hpp:277
Derived class intended for groupby specific aggregation usage.
Derived class intended for groupby specific scan usage.
The fundamental interface for host-based UDF implementation.
Definition: host_udf.hpp:39
Derived class intended for reduction usage.
Derived class intended for rolling_window specific aggregation usage.
Derived class intended for scan usage.
Derived class intended for segmented reduction usage.
std::unique_ptr< Base > make_bitwise_aggregation(bitwise_op op)
Factory to create a BITWISE_AGG aggregation.
std::unique_ptr< Base > make_top_k_aggregation(size_type k, order topk_order=order::DESCENDING)
Factory to create a TOP_K aggregation.
std::unique_ptr< Base > make_median_aggregation()
correlation_type
Type of correlation method.
std::unique_ptr< Base > make_host_udf_aggregation(std::unique_ptr< host_udf_base > host_udf)
Factory to create a HOST_UDF aggregation.
std::unique_ptr< Base > make_lag_aggregation(size_type offset)
Factory to create a LAG aggregation.
std::unique_ptr< Base > make_tdigest_aggregation(int max_centroids=1000)
Factory to create a TDIGEST aggregation.
rank_percentage
Whether the returned rank should be a percentage and, if so, which type of percentage normalization to use.
Definition: aggregation.hpp:51
std::unique_ptr< Base > make_covariance_aggregation(size_type min_periods=1, size_type ddof=1)
Factory to create a COVARIANCE aggregation.
std::unique_ptr< Base > make_std_aggregation(size_type ddof=1)
Factory to create a STD aggregation.
std::unique_ptr< Base > make_correlation_aggregation(correlation_type type, size_type min_periods=1)
Factory to create a CORRELATION aggregation.
std::unique_ptr< Base > make_merge_sets_aggregation(null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
Factory to create a MERGE_SETS aggregation.
std::unique_ptr< Base > make_variance_aggregation(size_type ddof=1)
Factory to create a VARIANCE aggregation.
std::unique_ptr< Base > make_lead_aggregation(size_type offset)
Factory to create a LEAD aggregation.
std::unique_ptr< Base > make_any_aggregation()
std::unique_ptr< Base > make_nunique_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a NUNIQUE aggregation.
std::unique_ptr< Base > make_max_aggregation()
std::unique_ptr< Base > make_sum_with_overflow_aggregation()
std::unique_ptr< Base > make_histogram_aggregation()
std::unique_ptr< Base > make_rank_aggregation(rank_method method, order column_order=order::ASCENDING, null_policy null_handling=null_policy::EXCLUDE, null_order null_precedence=null_order::AFTER, rank_percentage percentage=rank_percentage::NONE)
Factory to create a RANK aggregation.
std::unique_ptr< Base > make_row_number_aggregation()
std::unique_ptr< Base > make_merge_histogram_aggregation()
Factory to create a MERGE_HISTOGRAM aggregation.
std::unique_ptr< Base > make_count_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a COUNT aggregation.
bitwise_op
Bitwise operations to use for BITWISE_AGG aggregations on numeric columns.
Definition: aggregation.hpp:60
ewm_history
Type of treatment of EWM input values' first value.
std::unique_ptr< Base > make_collect_list_aggregation(null_policy null_handling=null_policy::INCLUDE)
Factory to create a COLLECT_LIST aggregation.
std::unique_ptr< Base > make_argmax_aggregation()
Factory to create an ARGMAX aggregation.
std::unique_ptr< Base > make_sum_aggregation()
std::unique_ptr< Base > make_all_aggregation()
std::unique_ptr< Base > make_m2_aggregation()
Factory to create a M2 aggregation.
std::unique_ptr< Base > make_merge_m2_aggregation()
Factory to create a MERGE_M2 aggregation.
std::unique_ptr< Base > make_sum_of_squares_aggregation()
std::unique_ptr< Base > make_min_aggregation()
bool is_valid_aggregation(data_type source, aggregation::Kind kind)
Indicate if an aggregation is supported for a source datatype.
std::unique_ptr< Base > make_product_aggregation()
std::unique_ptr< Base > make_nth_element_aggregation(size_type n, null_policy null_handling=null_policy::INCLUDE)
Factory to create a NTH_ELEMENT aggregation.
std::unique_ptr< Base > make_ewma_aggregation(double const center_of_mass, ewm_history history)
Factory to create an EWMA aggregation.
std::unique_ptr< Base > make_udf_aggregation(udf_source_type type, std::string const &user_defined_aggregator, data_type output_type)
Factory to create an aggregation based on UDF for PTX or CUDA.
std::unique_ptr< Base > make_merge_lists_aggregation()
Factory to create a MERGE_LISTS aggregation.
std::unique_ptr< Base > make_collect_set_aggregation(null_policy null_handling=null_policy::INCLUDE, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
Factory to create a COLLECT_SET aggregation.
std::unique_ptr< Base > make_argmin_aggregation()
Factory to create an ARGMIN aggregation.
std::unique_ptr< Base > make_quantile_aggregation(std::vector< double > const &quantiles, interpolation interp=interpolation::LINEAR)
Factory to create a QUANTILE aggregation.
std::unique_ptr< Base > make_mean_aggregation()
std::unique_ptr< Base > make_merge_tdigest_aggregation(int max_centroids=1000)
Factory to create a MERGE_TDIGEST aggregation.
@ ONE_NORMALIZED
(rank - 1) / (count - 1)
@ ZERO_NORMALIZED
rank / count
@ OR
bitwise OR operation
@ AND
bitwise AND operation
@ XOR
bitwise XOR operation
std::unique_ptr< table > quantiles(table_view const &input, std::vector< double > const &q, interpolation interp=interpolation::NEAREST, cudf::sorted is_input_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the rows of the input corresponding to the requested quantiles.
rank_method
Tie-breaker method to use for ranking the column.
Definition: aggregation.hpp:38
@ DENSE
rank always increases by 1 between groups
@ AVERAGE
mean of first in the group
@ MAX
max of first in the group
@ FIRST
stable sort order ranking (no ties)
@ MIN
min of first in the group
std::unique_ptr< cudf::column > is_valid(cudf::column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a column of type_id::BOOL8 elements where for every element in input true indicates the value...
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:143
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:182
null_order
Indicates how null values compare against all other values.
Definition: types.hpp:148
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:140
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:115
order
Indicates the order in which elements should be sorted.
Definition: types.hpp:107
interpolation
Interpolation method to use when the desired quantile lies between two data points i and j.
Definition: types.hpp:181
udf_source_type
Indicates the source language of a user defined function (UDF) to be used in JIT APIs.
Definition: types.hpp:266
nan_equality
Enum to consider different elements (of floating point types) holding NaN value as equal or unequal.
Definition: types.hpp:132
cuDF interfaces
Definition: host_udf.hpp:26
Type declarations for libcudf.