aggregation.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2022, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/types.hpp>
20 
21 #include <functional>
22 #include <memory>
23 #include <vector>
24 
34 namespace cudf {
// Forward declarations of the two detail-namespace helper types that appear
// only as reference parameters in aggregation::get_simple_aggregations() and
// aggregation::finalize(); their definitions are not part of this listing.
41 // forward declaration
42 namespace detail {
43 class simple_aggregations_collector;
44 class aggregation_finalizer;
45 } // namespace detail
46 
/// Tie-breaker method to use for ranking the column.
53 enum class rank_method : int32_t {
54  FIRST,  ///< stable sort order ranking (no ties)
55  AVERAGE,  ///< NOTE(review): not described in this listing; presumably the mean rank of the tied group - confirm
56  MIN,  ///< min of first in the group
57  MAX,  ///< max of first in the group
58  DENSE  ///< NOTE(review): not described in this listing; presumably gap-free ranks across groups - confirm
59 };
60 
/// Whether the returned rank should be a percentage and, if so, which type of
/// percentage normalization is requested.
// NOTE(review): listing line 69 is absent from this extract (numbering jumps
// from 68 to 70), so an enumerator may be missing here - confirm against the
// real header.
66 enum class rank_percentage : int32_t {
67  NONE,  ///< rank
68  ZERO_NORMALIZED,  ///< NOTE(review): not described in this listing - confirm the normalization it selects
70 };
71 
/// Abstract base class for specifying the desired aggregation in an
/// aggregation_request.
///
/// The class cannot be default-constructed and is abstract: clone(),
/// get_simple_aggregations() and finalize() are pure virtual.
// NOTE(review): this extract has holes. The `kind` data member ("the
// aggregation to perform", listing line 124) and several Kind enumerators
// (e.g. PRODUCT, COUNT_VALID, COUNT_ALL, VARIANCE, ...) are referenced by the
// page's index but their lines are not present below.
80 class aggregation {
81  public:
/// Possible aggregation operations.
85  enum Kind {
86  SUM,  ///< sum reduction
88  MIN,  ///< min reduction
89  MAX,  ///< max reduction
92  ANY,  ///< any reduction
93  ALL,  ///< all reduction
95  MEAN,  ///< arithmetic mean reduction
96  M2,  ///< sum of squares of differences from the mean
98  STD,  ///< standard deviation
110  LAG,  ///< window function, accesses row at specified offset preceding current row
111  PTX,  ///< PTX UDF based reduction
120  };
121 
// Default construction is disabled.
122  aggregation() = delete;
// Virtual destructor so derived aggregations can be destroyed through a base pointer.
125  virtual ~aggregation() = default;
126 
/// Returns true when `other` has the same `kind` as this aggregation.
/// Only `kind` is compared here; the function is virtual, so derived classes may override it.
127  [[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; }
/// Returns the std::hash<int> of `kind`.
128  [[nodiscard]] virtual size_t do_hash() const { return std::hash<int>{}(kind); }
/// Returns a newly allocated aggregation owned by the caller (pure virtual).
129  [[nodiscard]] virtual std::unique_ptr<aggregation> clone() const = 0;
130 
131  // override functions for compound aggregations
/// Pure virtual. Takes the column's data_type and a simple_aggregations_collector
/// and returns a vector of owned aggregations.
132  virtual std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
133  data_type col_type, cudf::detail::simple_aggregations_collector& collector) const = 0;
/// Pure virtual. Receives an aggregation_finalizer by reference; returns nothing.
134  virtual void finalize(cudf::detail::aggregation_finalizer& finalizer) const = 0;
135 };
136 
/// Derived class intended for rolling_window specific aggregation usage.
/// Inherits virtually from aggregation.
// NOTE(review): listing lines 149-150 are absent from this extract, so the
// protected section appears empty here - confirm against the real header.
144 class rolling_aggregation : public virtual aggregation {
145  public:
146  ~rolling_aggregation() override = default;
147 
148  protected:
151 };
152 
/// Derived class intended for groupby specific aggregation usage.
/// Inherits virtually from aggregation.
// NOTE(review): listing line 161 is absent from this extract, so the
// protected section appears empty here - confirm against the real header.
156 class groupby_aggregation : public virtual aggregation {
157  public:
158  ~groupby_aggregation() override = default;
159 
160  protected:
162 };
163 
/// Derived class intended for groupby specific scan usage.
/// Inherits virtually from aggregation.
// NOTE(review): listing line 172 is absent from this extract, so the
// protected section appears empty here - confirm against the real header.
167 class groupby_scan_aggregation : public virtual aggregation {
168  public:
169  ~groupby_scan_aggregation() override = default;
170 
171  protected:
173 };
174 
/// Derived class intended for reduction usage.
/// Inherits virtually from aggregation.
178 class reduce_aggregation : public virtual aggregation {
179  public:
180  ~reduce_aggregation() override = default;
181 
182  protected:
// Protected, empty default constructor: only derived classes can invoke it.
183  reduce_aggregation() {}
184 };
185 
/// Derived class intended for scan usage.
/// Inherits virtually from aggregation.
189 class scan_aggregation : public virtual aggregation {
190  public:
191  ~scan_aggregation() override = default;
192 
193  protected:
// Protected, empty default constructor: only derived classes can invoke it.
194  scan_aggregation() {}
195 };
196 
/// Derived class intended for segmented reduction usage.
// NOTE(review): the class header for segmented_reduce_aggregation (listing
// line 200) is absent from this extract, as is listing line 205; the lines
// below are only the surviving part of the class body. Presumably the header
// mirrors the sibling classes (`: public virtual aggregation`) - confirm
// against the real header before relying on this fragment.
201  public:
202  ~segmented_reduce_aggregation() override = default;
203 
204  protected:
206 };
207 
/// Flavor of user-defined function passed to make_udf_aggregation(): CUDA or PTX.
/// The underlying type is bool, so exactly two enumerators fit.
208 enum class udf_type : bool { CUDA, PTX };
/// Correlation variant selected via the `type` argument of make_correlation_aggregation().
209 enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };
210 
/// Factory to create a SUM aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
212 template <typename Base = aggregation>
213 std::unique_ptr<Base> make_sum_aggregation();
214 
/// Factory to create a PRODUCT aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
216 template <typename Base = aggregation>
217 std::unique_ptr<Base> make_product_aggregation();
218 
/// Factory to create a MIN aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
220 template <typename Base = aggregation>
221 std::unique_ptr<Base> make_min_aggregation();
222 
/// Factory to create a MAX aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
224 template <typename Base = aggregation>
225 std::unique_ptr<Base> make_max_aggregation();
226 
/// Factory to create a COUNT aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::EXCLUDE (exclude null elements).
232 template <typename Base = aggregation>
233 std::unique_ptr<Base> make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE);
234 
/// Factory to create an ANY aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
236 template <typename Base = aggregation>
237 std::unique_ptr<Base> make_any_aggregation();
238 
/// Factory to create an ALL aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
240 template <typename Base = aggregation>
241 std::unique_ptr<Base> make_all_aggregation();
242 
/// Factory to create a SUM_OF_SQUARES aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
244 template <typename Base = aggregation>
245 std::unique_ptr<Base> make_sum_of_squares_aggregation();
246 
/// Factory to create a MEAN (arithmetic mean) aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
248 template <typename Base = aggregation>
249 std::unique_ptr<Base> make_mean_aggregation();
250 
/// Factory to create an M2 aggregation (sum of squares of differences from the mean).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
262 template <typename Base = aggregation>
263 std::unique_ptr<Base> make_m2_aggregation();
264 
/// Factory to create a VARIANCE aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of
///        freedom used as the divisor adjustment - confirm against the implementation.
273 template <typename Base = aggregation>
274 std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1);
275 
/// Factory to create a STD (standard deviation) aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of
///        freedom used as the divisor adjustment - confirm against the implementation.
284 template <typename Base = aggregation>
285 std::unique_ptr<Base> make_std_aggregation(size_type ddof = 1);
286 
/// Factory to create a MEDIAN aggregation.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
288 template <typename Base = aggregation>
289 std::unique_ptr<Base> make_median_aggregation();
290 
/// Factory to create a QUANTILE aggregation (compute specified quantile(s)).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param quantiles The requested quantiles.
/// @param interp Interpolation method to use when the desired quantile lies
///        between two data points; defaults to interpolation::LINEAR.
297 template <typename Base = aggregation>
298 std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& quantiles,
299  interpolation interp = interpolation::LINEAR);
300 
/// Factory to create an argmax aggregation (index of max element).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
306 template <typename Base = aggregation>
307 std::unique_ptr<Base> make_argmax_aggregation();
308 
/// Factory to create an argmin aggregation (index of min element).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
314 template <typename Base = aggregation>
315 std::unique_ptr<Base> make_argmin_aggregation();
316 
/// Factory to create a nunique aggregation (count number of unique elements).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::EXCLUDE (exclude null elements).
323 template <typename Base = aggregation>
324 std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE);
325 
/// Factory to create an nth_element aggregation (get the nth element).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param n Which element to get. NOTE(review): indexing base and handling of
///        negative values are not shown in this listing - confirm.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::INCLUDE.
339 template <typename Base = aggregation>
340 std::unique_ptr<Base> make_nth_element_aggregation(
341  size_type n, null_policy null_handling = null_policy::INCLUDE);
342 
/// Factory to create a ROW_NUMBER aggregation (row-number of the current index,
/// relative to the rolling window).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
344 template <typename Base = aggregation>
345 std::unique_ptr<Base> make_row_number_aggregation();
346 
/// Factory to create a RANK aggregation (get rank of current index).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param method Tie-breaker method to use for ranking the column (no default).
/// @param column_order Order in which elements should be sorted; defaults to
///        order::ASCENDING.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::EXCLUDE.
/// @param null_precedence How null values compare against all other values;
///        defaults to null_order::AFTER.
/// @param percentage Whether the returned rank should be a percentage and which
///        normalization to use; defaults to rank_percentage::NONE.
418 template <typename Base = aggregation>
419 std::unique_ptr<Base> make_rank_aggregation(rank_method method,
420  order column_order = order::ASCENDING,
421  null_policy null_handling = null_policy::EXCLUDE,
422  null_order null_precedence = null_order::AFTER,
423  rank_percentage percentage = rank_percentage::NONE);
424 
/// Factory to create a COLLECT_LIST aggregation (collect values into a list).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::INCLUDE.
435 template <typename Base = aggregation>
436 std::unique_ptr<Base> make_collect_list_aggregation(
437  null_policy null_handling = null_policy::INCLUDE);
438 
/// Factory to create a COLLECT_SET aggregation (collect values into a list
/// without duplicate entries).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param null_handling Whether to include or exclude nulls; defaults to
///        null_policy::INCLUDE.
/// @param nulls_equal Whether two nulls are considered equal; defaults to
///        null_equality::EQUAL.
/// @param nans_equal Whether floating-point NaN values are considered equal;
///        defaults to nan_equality::UNEQUAL.
454 template <typename Base = aggregation>
455 std::unique_ptr<Base> make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
456  null_equality nulls_equal = null_equality::EQUAL,
457  nan_equality nans_equal = nan_equality::UNEQUAL);
458 
/// Factory to create a LAG aggregation (window function that accesses the row
/// at the specified offset preceding the current row).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param offset The row offset.
460 template <typename Base = aggregation>
461 std::unique_ptr<Base> make_lag_aggregation(size_type offset);
462 
/// Factory to create a LEAD aggregation (window function that accesses the row
/// at the specified offset following the current row).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param offset The row offset.
464 template <typename Base = aggregation>
465 std::unique_ptr<Base> make_lead_aggregation(size_type offset);
466 
/// Factory to create an aggregation based on a UDF for PTX or CUDA.
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param type Whether the UDF is CUDA or PTX (see udf_type).
/// @param user_defined_aggregator The UDF, passed as a string.
/// @param output_type The data_type passed alongside the UDF; presumably the
///        type of the aggregation's result - confirm against the implementation.
// NOTE(review): std::string is used here, but <string> is not among the
// #include lines visible in this listing; it may only be reachable
// transitively - confirm.
476 template <typename Base = aggregation>
477 std::unique_ptr<Base> make_udf_aggregation(udf_type type,
478  std::string const& user_defined_aggregator,
479  data_type output_type);
480 
/// Factory to create a MERGE_LISTS aggregation (merge multiple list values into one list).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
490 template <typename Base = aggregation>
491 std::unique_ptr<Base> make_merge_lists_aggregation();
492 
/// Factory to create a MERGE_SETS aggregation (merge multiple list values into
/// one list, then drop duplicate entries).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param nulls_equal Whether two nulls are considered equal; defaults to
///        null_equality::EQUAL.
/// @param nans_equal Whether floating-point NaN values are considered equal;
///        defaults to nan_equality::UNEQUAL.
514 template <typename Base = aggregation>
515 std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL,
516  nan_equality nans_equal = nan_equality::UNEQUAL);
517 
/// Factory to create a MERGE_M2 aggregation (merge partial values of M2 aggregation).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
530 template <typename Base = aggregation>
531 std::unique_ptr<Base> make_merge_m2_aggregation();
532 
/// Factory to create a COVARIANCE aggregation (covariance between two sets of elements).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param min_periods Defaults to 1. NOTE(review): presumably the minimum number
///        of observations needed to produce a result - confirm.
/// @param ddof Defaults to 1. NOTE(review): presumably the delta degrees of
///        freedom - confirm.
542 template <typename Base = aggregation>
543 std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
544 
/// Factory to create a CORRELATION aggregation (correlation between two sets of elements).
/// @tparam Base Type of the returned pointer; defaults to `aggregation`.
/// @param type Which correlation_type to use (PEARSON, KENDALL or SPEARMAN); no default.
/// @param min_periods Defaults to 1. NOTE(review): presumably the minimum number
///        of observations needed to produce a result - confirm.
554 template <typename Base = aggregation>
555 std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
556  size_type min_periods = 1);
557 
/// Factory to create a TDIGEST aggregation (create a tdigest from a set of input values).
/// @tparam Base Type of the returned pointer. Unlike most factories in this
///         header, `Base` has no default here and must be given explicitly.
/// @param max_centroids Defaults to 1000. NOTE(review): presumably an upper
///        bound on the number of centroids in the digest - confirm.
592 template <typename Base>
593 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
594 
/// Factory to create a MERGE_TDIGEST aggregation (create a tdigest by merging
/// multiple tdigests together).
/// @tparam Base Type of the returned pointer. Unlike most factories in this
///         header, `Base` has no default here and must be given explicitly.
/// @param max_centroids Defaults to 1000. NOTE(review): presumably an upper
///        bound on the number of centroids in the merged digest - confirm.
630 template <typename Base>
631 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
632  // end of group
634 } // namespace cudf
cudf::rank_method
rank_method
Tie-breaker method to use for ranking the column.
Definition: aggregation.hpp:53
cudf::scan_aggregation
Derived class intended for scan usage.
Definition: aggregation.hpp:189
cudf::make_variance_aggregation
std::unique_ptr< Base > make_variance_aggregation(size_type ddof=1)
Factory to create a VARIANCE aggregation.
cudf::quantiles
std::unique_ptr< table > quantiles(table_view const &input, std::vector< double > const &q, interpolation interp=interpolation::NEAREST, cudf::sorted is_input_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the rows of the input corresponding to the requested quantiles.
cudf::make_m2_aggregation
std::unique_ptr< Base > make_m2_aggregation()
Factory to create an M2 aggregation.
cudf::aggregation::kind
Kind kind
The aggregation to perform.
Definition: aggregation.hpp:124
cudf::rank_method::MIN
@ MIN
min of first in the group
cudf::size_type
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
cudf::reduce_aggregation
Derived class intended for reduction usage.
Definition: aggregation.hpp:178
cudf::null_policy
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:123
cudf::aggregation::NTH_ELEMENT
@ NTH_ELEMENT
get the nth element
Definition: aggregation.hpp:104
cudf::aggregation::SUM
@ SUM
sum reduction
Definition: aggregation.hpp:86
cudf::make_argmax_aggregation
std::unique_ptr< Base > make_argmax_aggregation()
Factory to create an argmax aggregation.
cudf::make_min_aggregation
std::unique_ptr< Base > make_min_aggregation()
Factory to create a MIN aggregation.
types.hpp
Type declarations for libcudf.
cudf::interpolation
interpolation
Interpolation method to use when the desired quantile lies between two data points i and j.
Definition: types.hpp:189
cudf::aggregation::MIN
@ MIN
min reduction
Definition: aggregation.hpp:88
cudf::make_row_number_aggregation
std::unique_ptr< Base > make_row_number_aggregation()
Factory to create a ROW_NUMBER aggregation.
cudf::make_merge_lists_aggregation
std::unique_ptr< Base > make_merge_lists_aggregation()
Factory to create a MERGE_LISTS aggregation.
cudf::aggregation::STD
@ STD
standard deviation
Definition: aggregation.hpp:98
cudf::make_sum_aggregation
std::unique_ptr< Base > make_sum_aggregation()
Factory to create a SUM aggregation.
cudf::aggregation::COUNT_VALID
@ COUNT_VALID
count number of valid elements
Definition: aggregation.hpp:90
cudf::aggregation::ARGMAX
@ ARGMAX
Index of max element.
Definition: aggregation.hpp:101
cudf::aggregation::PRODUCT
@ PRODUCT
product reduction
Definition: aggregation.hpp:87
cudf::aggregation::COLLECT_LIST
@ COLLECT_LIST
collect values into a list
Definition: aggregation.hpp:107
cudf::rank_method::MAX
@ MAX
max of first in the group
cudf::segmented_reduce_aggregation
Derived class intended for segmented reduction usage.
Definition: aggregation.hpp:200
cudf::aggregation::VARIANCE
@ VARIANCE
variance
Definition: aggregation.hpp:97
cudf::aggregation::CUDA
@ CUDA
CUDA UDF based reduction.
Definition: aggregation.hpp:112
cudf::make_lag_aggregation
std::unique_ptr< Base > make_lag_aggregation(size_type offset)
Factory to create a LAG aggregation.
cudf::aggregation::ANY
@ ANY
any reduction
Definition: aggregation.hpp:92
cudf::make_any_aggregation
std::unique_ptr< Base > make_any_aggregation()
Factory to create an ANY aggregation.
cudf::groupby_aggregation
Derived class intended for groupby specific aggregation usage.
Definition: aggregation.hpp:156
cudf::make_sum_of_squares_aggregation
std::unique_ptr< Base > make_sum_of_squares_aggregation()
Factory to create a SUM_OF_SQUARES aggregation.
cudf::aggregation::ARGMIN
@ ARGMIN
Index of min element.
Definition: aggregation.hpp:102
cudf::make_product_aggregation
std::unique_ptr< Base > make_product_aggregation()
Factory to create a PRODUCT aggregation.
cudf::aggregation::RANK
@ RANK
get rank of current index
Definition: aggregation.hpp:106
cudf::null_order
null_order
Indicates how null values compare against all other values.
Definition: types.hpp:156
cudf::groupby_scan_aggregation
Derived class intended for groupby specific scan usage.
Definition: aggregation.hpp:167
cudf::aggregation::CORRELATION
@ CORRELATION
correlation between two sets of elements
Definition: aggregation.hpp:117
cudf::make_covariance_aggregation
std::unique_ptr< Base > make_covariance_aggregation(size_type min_periods=1, size_type ddof=1)
Factory to create a COVARIANCE aggregation.
cudf::aggregation::MERGE_SETS
@ MERGE_SETS
merge multiple list values into one list, then drop duplicate entries
Definition: aggregation.hpp:114
cudf::aggregation::MEAN
@ MEAN
arithmetic mean reduction
Definition: aggregation.hpp:95
cudf::nan_equality
nan_equality
Enum to consider different elements (of floating point types) holding NaN value as equal or unequal.
Definition: types.hpp:140
cudf::aggregation::MEDIAN
@ MEDIAN
median reduction
Definition: aggregation.hpp:99
cudf::make_nunique_aggregation
std::unique_ptr< Base > make_nunique_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a nunique aggregation.
cudf::nan_equality::ALL_EQUAL
@ ALL_EQUAL
All NaNs compare equal, regardless of sign.
cudf::rank_percentage::NONE
@ NONE
rank
cudf::make_merge_m2_aggregation
std::unique_ptr< Base > make_merge_m2_aggregation()
Factory to create a MERGE_M2 aggregation.
cudf::make_merge_tdigest_aggregation
std::unique_ptr< Base > make_merge_tdigest_aggregation(int max_centroids=1000)
Factory to create a MERGE_TDIGEST aggregation.
cudf::aggregation::QUANTILE
@ QUANTILE
compute specified quantile(s)
Definition: aggregation.hpp:100
cudf::aggregation::MERGE_M2
@ MERGE_M2
merge partial values of M2 aggregation
Definition: aggregation.hpp:115
cudf::make_all_aggregation
std::unique_ptr< Base > make_all_aggregation()
Factory to create an ALL aggregation.
cudf::aggregation::MERGE_LISTS
@ MERGE_LISTS
merge multiple list values into one list
Definition: aggregation.hpp:113
cudf::aggregation::LEAD
@ LEAD
window function, accesses row at specified offset following current row
Definition: aggregation.hpp:109
cudf::make_tdigest_aggregation
std::unique_ptr< Base > make_tdigest_aggregation(int max_centroids=1000)
Factory to create a TDIGEST aggregation.
cudf::make_mean_aggregation
std::unique_ptr< Base > make_mean_aggregation()
Factory to create a MEAN aggregation.
cudf::make_rank_aggregation
std::unique_ptr< Base > make_rank_aggregation(rank_method method, order column_order=order::ASCENDING, null_policy null_handling=null_policy::EXCLUDE, null_order null_precedence=null_order::AFTER, rank_percentage percentage=rank_percentage::NONE)
Factory to create a RANK aggregation.
cudf::make_merge_sets_aggregation
std::unique_ptr< Base > make_merge_sets_aggregation(null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::UNEQUAL)
Factory to create a MERGE_SETS aggregation.
cudf::aggregation::LAG
@ LAG
window function, accesses row at specified offset preceding current row
Definition: aggregation.hpp:110
cudf::make_count_aggregation
std::unique_ptr< Base > make_count_aggregation(null_policy null_handling=null_policy::EXCLUDE)
Factory to create a COUNT aggregation.
cudf::data_type
Indicator for the logical data type of an element in a column.
Definition: types.hpp:240
cudf::rank_percentage
rank_percentage
Whether the returned rank should be a percentage and, if so, which type of percentage normalization to use.
Definition: aggregation.hpp:66
cudf::aggregation::MERGE_TDIGEST
@ MERGE_TDIGEST
create a tdigest by merging multiple tdigests together
Definition: aggregation.hpp:119
cudf::make_nth_element_aggregation
std::unique_ptr< Base > make_nth_element_aggregation(size_type n, null_policy null_handling=null_policy::INCLUDE)
Factory to create an nth_element aggregation.
cudf::make_collect_list_aggregation
std::unique_ptr< Base > make_collect_list_aggregation(null_policy null_handling=null_policy::INCLUDE)
Factory to create a COLLECT_LIST aggregation.
cudf
cuDF interfaces
Definition: aggregation.hpp:34
cudf::rolling_aggregation
Derived class intended for rolling_window specific aggregation usage.
Definition: aggregation.hpp:144
cudf::aggregation::SUM_OF_SQUARES
@ SUM_OF_SQUARES
sum of squares reduction
Definition: aggregation.hpp:94
cudf::make_std_aggregation
std::unique_ptr< Base > make_std_aggregation(size_type ddof=1)
Factory to create a STD aggregation.
cudf::aggregation::M2
@ M2
sum of squares of differences from the mean
Definition: aggregation.hpp:96
cudf::aggregation::COLLECT_SET
@ COLLECT_SET
collect values into a list without duplicate entries
Definition: aggregation.hpp:108
cudf::aggregation::PTX
@ PTX
PTX UDF based reduction.
Definition: aggregation.hpp:111
cudf::null_policy::EXCLUDE
@ EXCLUDE
exclude null elements
cudf::aggregation::TDIGEST
@ TDIGEST
create a tdigest from a set of input values
Definition: aggregation.hpp:118
cudf::make_correlation_aggregation
std::unique_ptr< Base > make_correlation_aggregation(correlation_type type, size_type min_periods=1)
Factory to create a CORRELATION aggregation.
cudf::make_max_aggregation
std::unique_ptr< Base > make_max_aggregation()
Factory to create a MAX aggregation.
cudf::rank_method::FIRST
@ FIRST
stable sort order ranking (no ties)
cudf::make_median_aggregation
std::unique_ptr< Base > make_median_aggregation()
Factory to create a MEDIAN aggregation.
cudf::null_equality
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:148
cudf::aggregation::MAX
@ MAX
max reduction
Definition: aggregation.hpp:89
cudf::aggregation::ROW_NUMBER
@ ROW_NUMBER
get row-number of current index (relative to rolling window)
Definition: aggregation.hpp:105
cudf::make_argmin_aggregation
std::unique_ptr< Base > make_argmin_aggregation()
Factory to create an argmin aggregation.
cudf::aggregation
Abstract base class for specifying the desired aggregation in an aggregation_request.
Definition: aggregation.hpp:80
cudf::make_lead_aggregation
std::unique_ptr< Base > make_lead_aggregation(size_type offset)
Factory to create a LEAD aggregation.
cudf::make_quantile_aggregation
std::unique_ptr< Base > make_quantile_aggregation(std::vector< double > const &quantiles, interpolation interp=interpolation::LINEAR)
Factory to create a QUANTILE aggregation.
cudf::aggregation::COUNT_ALL
@ COUNT_ALL
count number of elements
Definition: aggregation.hpp:91
cudf::aggregation::Kind
Kind
Possible aggregation operations.
Definition: aggregation.hpp:85
cudf::make_udf_aggregation
std::unique_ptr< Base > make_udf_aggregation(udf_type type, std::string const &user_defined_aggregator, data_type output_type)
Factory to create an aggregation based on a UDF for PTX or CUDA.
cudf::make_collect_set_aggregation
std::unique_ptr< Base > make_collect_set_aggregation(null_policy null_handling=null_policy::INCLUDE, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::UNEQUAL)
Factory to create a COLLECT_SET aggregation.
cudf::aggregation::NUNIQUE
@ NUNIQUE
count number of unique elements
Definition: aggregation.hpp:103
cudf::aggregation::COVARIANCE
@ COVARIANCE
covariance between two sets of elements
Definition: aggregation.hpp:116
cudf::aggregation::ALL
@ ALL
all reduction
Definition: aggregation.hpp:93
cudf::order
order
Indicates the order in which elements should be sorted.
Definition: types.hpp:115