aggregation.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/types.hpp>
20 #include <cudf/utilities/export.hpp>
21 
22 #include <functional>
23 #include <memory>
24 #include <vector>
25 
35 namespace CUDF_EXPORT cudf {
42 // Forward declarations of the internal helper types that aggregation's virtual interface takes by reference.
43 namespace detail {
44 class simple_aggregations_collector;  // parameter type of aggregation::get_simple_aggregations()
45 class aggregation_finalizer;  // parameter type of aggregation::finalize()
46 } // namespace detail
47 
54 enum class rank_method : int32_t {  // Tie-breaker method to use for ranking the column.
55  FIRST,  ///< stable sort order ranking (no ties)
56  AVERAGE,  ///< mean of first in the group
57  MIN,  ///< min of first in the group
58  MAX,  ///< max of first in the group
59  DENSE  ///< rank always increases by 1 between groups
60 };
61 
67 enum class rank_percentage : int32_t {  // Whether the returned rank should be a percentage, and which percentage normalization to use.
68  NONE,  // NOTE(review): listing lines 69-70 are absent here. The ZERO_NORMALIZED ("rank / count") and ONE_NORMALIZED ("(rank - 1) / (count - 1)") entries described in this page's index presumably belong to this enum; their order is not visible here. Confirm against the original header.
71 };
72 
76 enum class bitwise_op : int32_t {  // Bitwise operations to use for BITWISE_AGG aggregations on numeric columns.
77  AND,  ///< bitwise AND operation
78  OR,  ///< bitwise OR operation
79  XOR  ///< bitwise XOR operation
80 };
81 
90 class aggregation {  // Abstract base class for specifying the desired aggregation in an aggregation_request.
91  public:
95  enum Kind {  // Possible aggregation operations. NOTE(review): the listing numbers inside this enum are not contiguous (97, 100-101, 104-105, 107, 109-120 and 123-133 are absent), so enumerators described in this page's index (PRODUCT, COUNT_VALID, COUNT_ALL, MEAN, ...) appear to have been dropped from this listing. Confirm names and order against the original header before relying on enumerator values.
96  SUM,  ///< sum reduction
98  MIN,  ///< min reduction
99  MAX,  ///< max reduction
102  ANY,  ///< any reduction
103  ALL,  ///< all reduction
106  M2,  ///< sum of squares of differences from the mean
108  STD,  ///< standard deviation
121  LAG,  ///< window function, accesses row at specified offset preceding current row
122  PTX,  ///< PTX based UDF aggregation
134  BITWISE_AGG  // no description for this enumerator is visible on this page; see make_bitwise_aggregation() and bitwise_op
135  };
136 
137  aggregation() = delete;  // Default construction is disabled. NOTE(review): listing lines 139-145 are absent; the `aggregation(aggregation::Kind a)` constructor and the `Kind kind` member ("The aggregation to perform.") named in this page's index are presumably declared there — confirm.
138 
146  virtual ~aggregation() = default;  // virtual, so derived aggregations can be destroyed through an `aggregation` pointer
147 
154  [[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; }  // Compares two aggregation objects for equality; this base version compares only `kind`.
155 
161  [[nodiscard]] virtual size_t do_hash() const { return std::hash<int>{}(kind); }  // Computes the hash value of the aggregation; this base version hashes only `kind`.
162 
168  [[nodiscard]] virtual std::unique_ptr<aggregation> clone() const = 0;  // Clones the aggregation object; the caller owns the returned copy.
169 
170  // override functions for compound aggregations
178  virtual std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(  // Get the simple aggregations that this aggregation requires to compute.
179  data_type col_type, cudf::detail::simple_aggregations_collector& collector) const = 0;
180 
187  virtual void finalize(cudf::detail::aggregation_finalizer& finalizer) const = 0;  // Compute the aggregation after pre-requisite simple aggregations have been computed.
188 };
189 
197 class rolling_aggregation : public virtual aggregation {  // Derived class intended for rolling_window specific aggregation usage.
198  public:
199  ~rolling_aggregation() override = default;
200 
201  protected:
204  using aggregation::aggregation;  // inherits the base-class constructors; being protected, they are reachable only from derived classes
205 };
206 
210 class groupby_aggregation : public virtual aggregation {  // Derived class intended for groupby specific aggregation usage.
211  public:
212  ~groupby_aggregation() override = default;
213 
214  protected:  // NOTE(review): listing line 215 is absent, so this section looks empty here; presumably a protected constructor is declared on that line, as in reduce_aggregation — confirm.
216 };
217 
221 class groupby_scan_aggregation : public virtual aggregation {  // Derived class intended for groupby specific scan usage.
222  public:
223  ~groupby_scan_aggregation() override = default;
224 
225  protected:  // NOTE(review): listing line 226 is absent, so this section looks empty here; presumably a protected constructor is declared on that line, as in scan_aggregation — confirm.
227 };
228 
232 class reduce_aggregation : public virtual aggregation {  // Derived class intended for reduction usage.
233  public:
234  ~reduce_aggregation() override = default;
235 
236  protected:
237  reduce_aggregation() {}  // protected: only derived classes can invoke this constructor
238 };
239 
243 class scan_aggregation : public virtual aggregation {  // Derived class intended for scan usage.
244  public:
245  ~scan_aggregation() override = default;
246 
247  protected:
248  scan_aggregation() {}  // protected: only derived classes can invoke this constructor
249 };
250 
255  public:  // NOTE(review): the class header (listing line 254) is absent. The destructor name below and the index entry "Derived class intended for segmented reduction usage." indicate this is the body of segmented_reduce_aggregation, presumably deriving from `aggregation` like its siblings — confirm against the original header.
256  ~segmented_reduce_aggregation() override = default;
257 
258  protected:  // NOTE(review): listing line 259 is absent; presumably a protected constructor is declared there — confirm.
260 };
261 
263 enum class udf_type : bool { CUDA, PTX };  // Type of code in the user defined function string.
265 enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };  // Type of correlation method.
267 enum class ewm_history : int32_t { INFINITE, FINITE };  // Type of treatment of EWM input values' first value.
268 
271 template <typename Base = aggregation>  // In all factories below, `Base` selects the interface type of the returned pointer (defaults to `aggregation`). No brief is shown for this one; by name, presumably the factory for a SUM ("sum reduction") aggregation — confirm.
272 std::unique_ptr<Base> make_sum_aggregation();
273 
276 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a PRODUCT ("product reduction") aggregation — confirm.
277 std::unique_ptr<Base> make_product_aggregation();
278 
281 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a MIN ("min reduction") aggregation — confirm.
282 std::unique_ptr<Base> make_min_aggregation();
283 
286 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a MAX ("max reduction") aggregation — confirm.
287 std::unique_ptr<Base> make_max_aggregation();
288 
295 template <typename Base = aggregation>  // Factory to create a COUNT aggregation. `null_handling` defaults to null_policy::EXCLUDE; presumably it selects between COUNT_VALID and COUNT_ALL — confirm.
296 std::unique_ptr<Base> make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE);
297 
300 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for an ANY ("any reduction") aggregation — confirm.
301 std::unique_ptr<Base> make_any_aggregation();
302 
305 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for an ALL ("all reduction") aggregation — confirm.
306 std::unique_ptr<Base> make_all_aggregation();
307 
310 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a HISTOGRAM ("compute frequency of each element") aggregation — confirm.
311 std::unique_ptr<Base> make_histogram_aggregation();
312 
315 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a SUM_OF_SQUARES ("sum of squares reduction") aggregation — confirm.
316 std::unique_ptr<Base> make_sum_of_squares_aggregation();
317 
320 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a MEAN ("arithmetic mean reduction") aggregation — confirm.
321 std::unique_ptr<Base> make_mean_aggregation();
322 
335 template <typename Base = aggregation>  // Factory to create an M2 aggregation (M2: sum of squares of differences from the mean).
336 std::unique_ptr<Base> make_m2_aggregation();
337 
347 template <typename Base = aggregation>  // Factory to create a VARIANCE aggregation. `ddof` defaults to 1; presumably the delta degrees of freedom — confirm.
348 std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1);
349 
359 template <typename Base = aggregation>  // Factory to create a STD (standard deviation) aggregation. `ddof` defaults to 1; presumably the delta degrees of freedom — confirm.
360 std::unique_ptr<Base> make_std_aggregation(size_type ddof = 1);
361 
364 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a MEDIAN ("median reduction") aggregation — confirm.
365 std::unique_ptr<Base> make_median_aggregation();
366 
374 template <typename Base = aggregation>  // Factory to create a QUANTILE aggregation (computes the specified quantile(s)).
375 std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& quantiles,
376  interpolation interp = interpolation::LINEAR);  // `interp`: interpolation method to use when the desired quantile lies between two data points; defaults to LINEAR
377 
384 template <typename Base = aggregation>  // Factory to create an ARGMAX aggregation (index of max element).
385 std::unique_ptr<Base> make_argmax_aggregation();
386 
393 template <typename Base = aggregation>  // Factory to create an ARGMIN aggregation (index of min element).
394 std::unique_ptr<Base> make_argmin_aggregation();
395 
403 template <typename Base = aggregation>  // Factory to create a NUNIQUE aggregation (count number of unique elements). `null_handling` defaults to null_policy::EXCLUDE.
404 std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE);
405 
420 template <typename Base = aggregation>  // Factory to create a NTH_ELEMENT aggregation (get the nth element). `null_handling` defaults to null_policy::INCLUDE.
421 std::unique_ptr<Base> make_nth_element_aggregation(
422  size_type n, null_policy null_handling = null_policy::INCLUDE);
423 
426 template <typename Base = aggregation>  // No brief shown; by name, presumably the factory for a ROW_NUMBER ("get row-number of current index (relative to rolling window)") aggregation — confirm.
427 std::unique_ptr<Base> make_row_number_aggregation();
428 
462 template <typename Base = aggregation>  // Factory to create an EWMA aggregation (exponential weighted moving average at current index). Neither parameter has a default.
463 std::unique_ptr<Base> make_ewma_aggregation(double const center_of_mass, ewm_history history);
464 
537 template <typename Base = aggregation>  // Factory to create a RANK aggregation (get rank of current index). Only `method` is mandatory.
538 std::unique_ptr<Base> make_rank_aggregation(rank_method method,  // tie-breaker method to use for ranking the column
539  order column_order = order::ASCENDING,
540  null_policy null_handling = null_policy::EXCLUDE,
541  null_order null_precedence = null_order::AFTER,
542  rank_percentage percentage = rank_percentage::NONE);
543 
555 template <typename Base = aggregation>  // Factory to create a COLLECT_LIST aggregation (collect values into a list). `null_handling` defaults to null_policy::INCLUDE.
556 std::unique_ptr<Base> make_collect_list_aggregation(
557  null_policy null_handling = null_policy::INCLUDE);
558 
575 template <typename Base = aggregation>  // Factory to create a COLLECT_SET aggregation (collect values into a list without duplicate entries).
576 std::unique_ptr<Base> make_collect_set_aggregation(
577  null_policy null_handling = null_policy::INCLUDE,
578  null_equality nulls_equal = null_equality::EQUAL,  // whether two nulls are considered equal
579  nan_equality nans_equal = nan_equality::ALL_EQUAL);  // whether floating-point elements holding NaN are considered equal
580 
587 template <typename Base = aggregation>  // Factory to create a LAG aggregation (window function, accesses row at specified offset preceding current row).
588 std::unique_ptr<Base> make_lag_aggregation(size_type offset);
589 
596 template <typename Base = aggregation>  // Factory to create a LEAD aggregation (window function, accesses row at specified offset following current row).
597 std::unique_ptr<Base> make_lead_aggregation(size_type offset);
598 
608 template <typename Base = aggregation>  // Factory to create an aggregation based on a UDF for PTX or CUDA.
609 std::unique_ptr<Base> make_udf_aggregation(udf_type type,  // type of code in the user defined function string
610  std::string const& user_defined_aggregator,  // NOTE(review): uses std::string, but no <string> include is visible in this listing — confirm it is provided by another included header
611  data_type output_type);
612 
613 // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
614 class host_udf_base;  // the fundamental interface for host-based UDF implementation; only declared here, not defined
615 
622 template <typename Base = aggregation>  // Factory to create a HOST_UDF aggregation. Takes `host_udf` by value as a unique_ptr, so ownership of the object passes into the call.
623 std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> host_udf);
624 
636 template <typename Base = aggregation>  // Factory to create a MERGE_LISTS aggregation (merge multiple lists values into one list).
637 std::unique_ptr<Base> make_merge_lists_aggregation();
638 
661 template <typename Base = aggregation>  // Factory to create a MERGE_SETS aggregation (merge multiple lists values into one list then drop duplicate entries).
662 std::unique_ptr<Base> make_merge_sets_aggregation(
663  null_equality nulls_equal = null_equality::EQUAL,  // whether two nulls are considered equal
664  nan_equality nans_equal = nan_equality::ALL_EQUAL);  // whether floating-point elements holding NaN are considered equal
665 
680 template <typename Base = aggregation>  // Factory to create a MERGE_M2 aggregation (merge partial values of M2 aggregation).
681 std::unique_ptr<Base> make_merge_m2_aggregation();
682 
691 template <typename Base = aggregation>  // Factory to create a MERGE_HISTOGRAM aggregation (merge partial values of HISTOGRAM aggregation).
692 std::unique_ptr<Base> make_merge_histogram_aggregation();
693 
704 template <typename Base = aggregation>  // Factory to create a COVARIANCE aggregation (covariance between two sets of elements). Both parameters default to 1.
705 std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
706 
717 template <typename Base = aggregation>  // Factory to create a CORRELATION aggregation. NOTE(review): listing line 718 is absent; per this page's index the missing declarator is `std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,` — confirm against the original header.
719  size_type min_periods = 1);
720 
755 template <typename Base>  // Factory to create a TDIGEST aggregation (create a tdigest from a set of input values). Unlike the factories above, `Base` has no default here and must be given explicitly.
756 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
757 
793 template <typename Base>  // Factory to create a MERGE_TDIGEST aggregation (create a tdigest by merging multiple tdigests together). `Base` must be given explicitly.
794 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
795 
802 template <typename Base>  // Factory to create a BITWISE_AGG aggregation; `op` selects the bitwise operation (AND, OR or XOR). `Base` must be given explicitly.
803 std::unique_ptr<Base> make_bitwise_aggregation(bitwise_op op);
804 
813  // end of group
815 } // namespace CUDF_EXPORT cudf
Abstract base class for specifying the desired aggregation in an aggregation_request.
Definition: aggregation.hpp:90
virtual std::vector< std::unique_ptr< aggregation > > get_simple_aggregations(data_type col_type, cudf::detail::simple_aggregations_collector &collector) const =0
Get the simple aggregations that this aggregation requires to compute.
virtual void finalize(cudf::detail::aggregation_finalizer &finalizer) const =0
Compute the aggregation after pre-requisite simple aggregations have been computed.
Kind
Possible aggregation operations.
Definition: aggregation.hpp:95
@ PRODUCT
product reduction
Definition: aggregation.hpp:97
@ ALL
all reduction
@ M2
sum of squares of differences from the mean
@ TDIGEST
create a tdigest from a set of input values
@ MEAN
arithmetic mean reduction
@ MERGE_M2
merge partial values of M2 aggregation
@ PTX
PTX based UDF aggregation.
@ MERGE_SETS
merge multiple list values into one list, then drop duplicate entries
@ MEDIAN
median reduction
@ NUNIQUE
count number of unique elements
@ MERGE_HISTOGRAM
merge partial values of HISTOGRAM aggregation
@ ARGMIN
Index of min element.
@ CORRELATION
correlation between two sets of elements
@ STD
standard deviation
@ QUANTILE
compute specified quantile(s)
@ COVARIANCE
covariance between two sets of elements
@ MAX
max reduction
Definition: aggregation.hpp:99
@ MIN
min reduction
Definition: aggregation.hpp:98
@ COLLECT_SET
collect values into a list without duplicate entries
@ LAG
window function, accesses row at specified offset preceding current row
@ CUDA
CUDA based UDF aggregation.
@ LEAD
window function, accesses row at specified offset following current row
@ SUM_OF_SQUARES
sum of squares reduction
@ SUM
sum reduction
Definition: aggregation.hpp:96
@ NTH_ELEMENT
get the nth element
@ EWMA
get exponential weighted moving average at current index
@ MERGE_LISTS
merge multiple list values into one list
@ MERGE_TDIGEST
create a tdigest by merging multiple tdigests together
@ HOST_UDF
host based UDF aggregation
@ ANY
any reduction
@ COLLECT_LIST
collect values into a list
@ COUNT_VALID
count number of valid elements
@ ROW_NUMBER
get row-number of current index (relative to rolling window)
@ ARGMAX
Index of max element.
@ HISTOGRAM
compute frequency of each element
@ RANK
get rank of current index
@ COUNT_ALL
count number of elements
virtual bool is_equal(aggregation const &other) const
Compares two aggregation objects for equality.
aggregation(aggregation::Kind a)
Construct a new aggregation object.
virtual size_t do_hash() const
Computes the hash value of the aggregation.
virtual std::unique_ptr< aggregation > clone() const =0
Clones the aggregation object.
Kind kind
The aggregation to perform.
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
Derived class intended for groupby specific aggregation usage.
Derived class intended for groupby specific scan usage.
The fundamental interface for host-based UDF implementation.
Definition: host_udf.hpp:50
Derived class intended for reduction usage.
Derived class intended for rolling_window specific aggregation usage.
Derived class intended for scan usage.
Derived class intended for segmented reduction usage.
std::unique_ptr< Base > make_bitwise_aggregation(bitwise_op op)
Factory to create a BITWISE_AGG aggregation.
std::unique_ptr< Base > make_median_aggregation()
correlation_type
Type of correlation method.
std::unique_ptr< Base > make_host_udf_aggregation(std::unique_ptr< host_udf_base > host_udf)
Factory to create a HOST_UDF aggregation.
std::unique_ptr< Base > make_lag_aggregation(size_type offset)
Factory to create a LAG aggregation.
std::unique_ptr< Base > make_tdigest_aggregation(int max_centroids=1000)
Factory to create a TDIGEST aggregation.
rank_percentage
Whether the returned rank should be a percentage and, if so, which type of percentage normalization to apply.
Definition: aggregation.hpp:67
std::unique_ptr< Base > make_covariance_aggregation(size_type min_periods=1, size_type ddof=1)
Factory to create a COVARIANCE aggregation.
std::unique_ptr< Base > make_std_aggregation(size_type ddof=1)
Factory to create a STD aggregation.
std::unique_ptr< Base > make_correlation_aggregation(correlation_type type, size_type min_periods=1)
Factory to create a CORRELATION aggregation.
std::unique_ptr< Base > make_merge_sets_aggregation(null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
Factory to create a MERGE_SETS aggregation.
std::unique_ptr< Base > make_variance_aggregation(size_type ddof=1)
Factory to create a VARIANCE aggregation.
std::unique_ptr< Base > make_lead_aggregation(size_type offset)
Factory to create a LEAD aggregation.
std::unique_ptr< Base > make_any_aggregation()
std::unique_ptr< Base > make_nunique_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a NUNIQUE aggregation.
std::unique_ptr< Base > make_max_aggregation()
std::unique_ptr< Base > make_histogram_aggregation()
std::unique_ptr< Base > make_rank_aggregation(rank_method method, order column_order=order::ASCENDING, null_policy null_handling=null_policy::EXCLUDE, null_order null_precedence=null_order::AFTER, rank_percentage percentage=rank_percentage::NONE)
Factory to create a RANK aggregation.
std::unique_ptr< Base > make_udf_aggregation(udf_type type, std::string const &user_defined_aggregator, data_type output_type)
Factory to create an aggregation based on a UDF for PTX or CUDA.
std::unique_ptr< Base > make_row_number_aggregation()
std::unique_ptr< Base > make_merge_histogram_aggregation()
Factory to create a MERGE_HISTOGRAM aggregation.
std::unique_ptr< Base > make_count_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a COUNT aggregation.
bitwise_op
Bitwise operations to use for BITWISE_AGG aggregations on numeric columns.
Definition: aggregation.hpp:76
ewm_history
Type of treatment of EWM input values' first value.
std::unique_ptr< Base > make_collect_list_aggregation(null_policy null_handling=null_policy::INCLUDE)
Factory to create a COLLECT_LIST aggregation.
std::unique_ptr< Base > make_argmax_aggregation()
Factory to create an ARGMAX aggregation.
std::unique_ptr< Base > make_sum_aggregation()
std::unique_ptr< Base > make_all_aggregation()
std::unique_ptr< Base > make_m2_aggregation()
Factory to create an M2 aggregation.
std::unique_ptr< Base > make_merge_m2_aggregation()
Factory to create a MERGE_M2 aggregation.
std::unique_ptr< Base > make_sum_of_squares_aggregation()
std::unique_ptr< Base > make_min_aggregation()
bool is_valid_aggregation(data_type source, aggregation::Kind kind)
Indicate if an aggregation is supported for a source datatype.
std::unique_ptr< Base > make_product_aggregation()
std::unique_ptr< Base > make_nth_element_aggregation(size_type n, null_policy null_handling=null_policy::INCLUDE)
Factory to create a NTH_ELEMENT aggregation.
udf_type
Type of code in the user defined function string.
std::unique_ptr< Base > make_ewma_aggregation(double const center_of_mass, ewm_history history)
Factory to create an EWMA aggregation.
std::unique_ptr< Base > make_merge_lists_aggregation()
Factory to create a MERGE_LISTS aggregation.
std::unique_ptr< Base > make_collect_set_aggregation(null_policy null_handling=null_policy::INCLUDE, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
Factory to create a COLLECT_SET aggregation.
std::unique_ptr< Base > make_argmin_aggregation()
Factory to create an ARGMIN aggregation.
std::unique_ptr< Base > make_quantile_aggregation(std::vector< double > const &quantiles, interpolation interp=interpolation::LINEAR)
Factory to create a QUANTILE aggregation.
std::unique_ptr< Base > make_mean_aggregation()
std::unique_ptr< Base > make_merge_tdigest_aggregation(int max_centroids=1000)
Factory to create a MERGE_TDIGEST aggregation.
@ ONE_NORMALIZED
(rank - 1) / (count - 1)
@ ZERO_NORMALIZED
rank / count
@ OR
bitwise OR operation
@ AND
bitwise AND operation
@ XOR
bitwise XOR operation
std::unique_ptr< table > quantiles(table_view const &input, std::vector< double > const &q, interpolation interp=interpolation::NEAREST, cudf::sorted is_input_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the rows of the input corresponding to the requested quantiles.
rank_method
Tie-breaker method to use for ranking the column.
Definition: aggregation.hpp:54
@ DENSE
rank always increases by 1 between groups
@ AVERAGE
mean of first in the group
@ MAX
max of first in the group
@ FIRST
stable sort order ranking (no ties)
@ MIN
min of first in the group
null_order
Indicates how null values compare against all other values.
Definition: types.hpp:159
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:151
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:126
order
Indicates the order in which elements should be sorted.
Definition: types.hpp:118
interpolation
Interpolation method to use when the desired quantile lies between two data points i and j.
Definition: types.hpp:192
nan_equality
Enum to consider different elements (of floating point types) holding NaN value as equal or unequal.
Definition: types.hpp:143
cuDF interfaces
Definition: host_udf.hpp:37
Type declarations for libcudf.