Files
file	aggregation.hpp
	Representation for specifying desired aggregations from aggregation-based APIs, e.g., groupby, reductions, rolling, etc.

Classes
struct	cudf::host_udf_base
	The interface for host-based UDF implementation. More...

class	cudf::aggregation
	Abstract base class for specifying the desired aggregation in an `aggregation_request`. More...

class	cudf::rolling_aggregation
	Derived class intended for rolling_window specific aggregation usage. More...

class	cudf::groupby_aggregation
	Derived class intended for groupby specific aggregation usage. More...

class	cudf::groupby_scan_aggregation
	Derived class intended for groupby specific scan usage. More...

class	cudf::reduce_aggregation
	Derived class intended for reduction usage. More...

class	cudf::scan_aggregation
	Derived class intended for scan usage. More...

class	cudf::segmented_reduce_aggregation
	Derived class intended for segmented reduction usage. More...

Enumerations
enum class	cudf::rank_percentage : int32_t { cudf::NONE , cudf::ZERO_NORMALIZED , cudf::ONE_NORMALIZED }
	Whether returned rank should be percentage or not and mention the type of percentage normalization. More...

enum class	cudf::udf_type : bool { CUDA , PTX }
	Type of code in the user defined function string.

enum class	cudf::correlation_type : int32_t { PEARSON , KENDALL , SPEARMAN }
	Type of correlation method.

enum class	cudf::ewm_history : int32_t { INFINITE , FINITE }
	Type of treatment of EWM input values' first value.

Functions
template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_sum_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_product_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_min_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_max_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_count_aggregation (null_policy null_handling=null_policy::EXCLUDE)
	Factory to create a COUNT aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_any_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_all_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_histogram_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_sum_of_squares_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_mean_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_m2_aggregation ()
	Factory to create an M2 aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_variance_aggregation (size_type ddof=1)
	Factory to create a VARIANCE aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_std_aggregation (size_type ddof=1)
	Factory to create a STD aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_median_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_quantile_aggregation (std::vector< double > const &quantiles, interpolation interp=interpolation::LINEAR)
	Factory to create a QUANTILE aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_argmax_aggregation ()
	Factory to create an ARGMAX aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_argmin_aggregation ()
	Factory to create an ARGMIN aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_nunique_aggregation (null_policy null_handling=null_policy::EXCLUDE)
	Factory to create a NUNIQUE aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_nth_element_aggregation (size_type n, null_policy null_handling=null_policy::INCLUDE)
	Factory to create a NTH_ELEMENT aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_row_number_aggregation ()

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_ewma_aggregation (double const center_of_mass, ewm_history history)
	Factory to create an EWMA aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_rank_aggregation (rank_method method, order column_order=order::ASCENDING, null_policy null_handling=null_policy::EXCLUDE, null_order null_precedence=null_order::AFTER, rank_percentage percentage=rank_percentage::NONE)
	Factory to create a RANK aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_collect_list_aggregation (null_policy null_handling=null_policy::INCLUDE)
	Factory to create a COLLECT_LIST aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_collect_set_aggregation (null_policy null_handling=null_policy::INCLUDE, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
	Factory to create a COLLECT_SET aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_lag_aggregation (size_type offset)
	Factory to create a LAG aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_lead_aggregation (size_type offset)
	Factory to create a LEAD aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_udf_aggregation (udf_type type, std::string const &user_defined_aggregator, data_type output_type)
	Factory to create an aggregation based on UDF for PTX or CUDA. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_host_udf_aggregation (std::unique_ptr< host_udf_base > host_udf)
	Factory to create a HOST_UDF aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_merge_lists_aggregation ()
	Factory to create a MERGE_LISTS aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_merge_sets_aggregation (null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL)
	Factory to create a MERGE_SETS aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_merge_m2_aggregation ()
	Factory to create a MERGE_M2 aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_merge_histogram_aggregation ()
	Factory to create a MERGE_HISTOGRAM aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_covariance_aggregation (size_type min_periods=1, size_type ddof=1)
	Factory to create a COVARIANCE aggregation. More...

template<typename Base = aggregation>
std::unique_ptr< Base >	cudf::make_correlation_aggregation (correlation_type type, size_type min_periods=1)
	Factory to create a CORRELATION aggregation. More...

template<typename Base >
std::unique_ptr< Base >	cudf::make_tdigest_aggregation (int max_centroids=1000)
	Factory to create a TDIGEST aggregation. More...

template<typename Base >
std::unique_ptr< Base >	cudf::make_merge_tdigest_aggregation (int max_centroids=1000)
	Factory to create a MERGE_TDIGEST aggregation. More...

Detailed Description

Enumeration Type Documentation

◆ rank_percentage

enum cudf::rank_percentage : int32_t

strong

Whether returned rank should be percentage or not and mention the type of percentage normalization.

Enumerator
NONE	rank
ZERO_NORMALIZED	rank / count
ONE_NORMALIZED	(rank - 1) / (count - 1)

Definition at line 67 of file aggregation.hpp.

Function Documentation

◆ make_all_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_all_aggregation ( )

Factory to create an ALL aggregation

Returns: An ALL aggregation object

◆ make_any_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_any_aggregation ( )

Factory to create an ANY aggregation

Returns: An ANY aggregation object

◆ make_argmax_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_argmax_aggregation ( )

Factory to create an ARGMAX aggregation.

ARGMAX returns the index of the maximum element.

Returns: An ARGMAX aggregation object

◆ make_argmin_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_argmin_aggregation ( )

Factory to create an ARGMIN aggregation.

ARGMIN returns the index of the minimum element.

Returns: An ARGMIN aggregation object

◆ make_collect_list_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_collect_list_aggregation ( null_policy null_handling = null_policy::INCLUDE )

Factory to create a COLLECT_LIST aggregation.

COLLECT_LIST returns a list column of all included elements in the group/series.

If null_handling is set to EXCLUDE, null elements are dropped from each of the list rows.

Parameters

null_handling Indicates whether to include/exclude nulls in list elements

Returns: A COLLECT_LIST aggregation object

◆ make_collect_set_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_collect_set_aggregation	(	null_policy	null_handling = `null_policy::INCLUDE`,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		nan_equality	nans_equal = `nan_equality::ALL_EQUAL`
	)

Factory to create a COLLECT_SET aggregation.

COLLECT_SET returns a lists column of all included elements in the group/series. Within each list, the duplicated entries are dropped out such that each entry appears only once.

If null_handling is set to EXCLUDE, null elements are dropped from each of the list rows.

Parameters

null_handling	Indicates whether to include/exclude nulls during collection
nulls_equal	Flag to specify whether null entries within each list should be considered equal.
nans_equal	Flag to specify whether NaN values in floating point column should be considered equal.

Returns: A COLLECT_SET aggregation object

◆ make_correlation_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_correlation_aggregation	(	correlation_type	type,
		size_type	min_periods = `1`
	)

Factory to create a CORRELATION aggregation.

Compute correlation coefficient between two columns. The input columns are child columns of a non-nullable struct column.

Parameters

type	correlation_type
min_periods	Minimum number of non-null observations required to produce a result

Returns: A CORRELATION aggregation object

◆ make_count_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_count_aggregation ( null_policy null_handling = null_policy::EXCLUDE )

Factory to create a COUNT aggregation.

Parameters

null_handling Indicates if null values will be counted

Returns: A COUNT aggregation object

◆ make_covariance_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_covariance_aggregation	(	size_type	min_periods = `1`,
		size_type	ddof = `1`
	)

Factory to create a COVARIANCE aggregation.

Compute covariance between two columns. The input columns are child columns of a non-nullable struct column.

Parameters

min_periods	Minimum number of non-null observations required to produce a result
ddof	Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is the number of non-null observations.

Returns: A COVARIANCE aggregation object

◆ make_ewma_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_ewma_aggregation	(	double const	center_of_mass,
		ewm_history	history
	)

Factory to create an EWMA aggregation.

EWMA returns a non-nullable column with the same type as the input, whose values are the exponentially weighted moving average of the input sequence. Let these values be known as the y_i.

EWMA aggregations are parameterized by a center of mass (com) which affects the contribution of the previous values (y_0 ... y_{i-1}) in computing the y_i.

EWMA aggregations are also parameterized by a history cudf::ewm_history. Special considerations have to be given to the mathematical treatment of the first value of the input sequence. There are two approaches to this, one which considers the first value of the sequence to be the exponential weighted moving average of some infinite history of data, and one which takes the first value to be the only datapoint known. These assumptions lead to two different formulas for the y_i. ewm_history selects which.

EWMA aggregations have special null handling. Nulls have two effects. The first is to propagate forward the last valid value as far as it has been computed. This could be thought of as the nulls not affecting the average in any way. The second effect changes the way the y_i are computed. Since a moving average is conceptually designed to weight contributing values by their recency, nulls ought to count as valid periods even though they do not change the average. For example, if the input sequence is {1, NULL, 3} then when computing y_2 one should weigh y_0 as if it occurs two periods before y_2 rather than just one.

Parameters

center_of_mass	the center of mass.
history	which assumption to make about the first value

Returns: An EWMA aggregation object

◆ make_histogram_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_histogram_aggregation ( )

Factory to create a HISTOGRAM aggregation

Returns: A HISTOGRAM aggregation object

◆ make_host_udf_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_host_udf_aggregation ( std::unique_ptr< host_udf_base > host_udf )

Factory to create a HOST_UDF aggregation.

Parameters

host_udf An instance of a class derived from host_udf_base to perform aggregation

Returns: A HOST_UDF aggregation object

◆ make_lag_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_lag_aggregation ( size_type offset )

Factory to create a LAG aggregation.

Parameters

offset The number of rows to lag the input

Returns: A LAG aggregation object

◆ make_lead_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_lead_aggregation ( size_type offset )

Factory to create a LEAD aggregation.

Parameters

offset The number of rows to lead the input

Returns: A LEAD aggregation object

◆ make_m2_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_m2_aggregation ( )

Factory to create an M2 aggregation.

An M2 aggregation is the sum of squares of differences from the mean. That is: M2 = SUM((x - MEAN) * (x - MEAN)).

This aggregation produces the intermediate values that are used to compute variance and standard deviation across multiple discrete sets. See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm for more detail.

Returns: An M2 aggregation object

◆ make_max_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_max_aggregation ( )

Factory to create a MAX aggregation

Returns: A MAX aggregation object

◆ make_mean_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_mean_aggregation ( )

Factory to create a MEAN aggregation

Returns: A MEAN aggregation object

◆ make_median_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_median_aggregation ( )

Factory to create a MEDIAN aggregation

Returns: A MEDIAN aggregation object

◆ make_merge_histogram_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_merge_histogram_aggregation ( )

Factory to create a MERGE_HISTOGRAM aggregation.

Merges the results of HISTOGRAM aggregations on independent sets into a new HISTOGRAM value equivalent to if a single HISTOGRAM aggregation was done across all of the sets at once.

Returns: A MERGE_HISTOGRAM aggregation object

◆ make_merge_lists_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_merge_lists_aggregation ( )

Factory to create a MERGE_LISTS aggregation.

Given a lists column, this aggregation merges all the lists corresponding to the same key value into one list. It is designed specifically to merge the partial results of multiple (distributed) groupby COLLECT_LIST aggregations into a final COLLECT_LIST result. As such, it requires the input lists column to be non-nullable (the child column containing list entries is not subjected to this requirement).

Returns: A MERGE_LISTS aggregation object

◆ make_merge_m2_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_merge_m2_aggregation ( )

Factory to create a MERGE_M2 aggregation.

Merges the results of M2 aggregations on independent sets into a new M2 value equivalent to if a single M2 aggregation was done across all of the sets at once. This aggregation is only valid on structs whose members are the result of the COUNT_VALID, MEAN, and M2 aggregations on the same sets. The output of this aggregation is a struct containing the merged COUNT_VALID, MEAN, and M2 aggregations.

The input M2 aggregation values are expected to be all non-negative numbers, since they were output from M2 aggregation.

Returns: A MERGE_M2 aggregation object

◆ make_merge_sets_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_merge_sets_aggregation	(	null_equality	nulls_equal = `null_equality::EQUAL`,
		nan_equality	nans_equal = `nan_equality::ALL_EQUAL`
	)

Factory to create a MERGE_SETS aggregation.

Given a lists column, this aggregation firstly merges all the lists corresponding to the same key value into one list, then it drops all the duplicate entries in each list, producing a lists column containing non-repeated entries.

This aggregation is designed specifically to merge the partial results of multiple (distributed) groupby COLLECT_LIST or COLLECT_SET aggregations into a final COLLECT_SET result. As such, it requires the input lists column to be non-nullable (the child column containing list entries is not subjected to this requirement).

In practice, the input (partial results) to this aggregation should be generated by (distributed) COLLECT_LIST aggregations, not COLLECT_SET, to avoid unnecessarily removing duplicate entries for the partial results.

Parameters

nulls_equal	Flag to specify whether nulls within each list should be considered equal during dropping duplicate list entries.
nans_equal	Flag to specify whether NaN values in floating point column should be considered equal during dropping duplicate list entries.

Returns: A MERGE_SETS aggregation object

◆ make_merge_tdigest_aggregation()

template<typename Base >

std::unique_ptr<Base> cudf::make_merge_tdigest_aggregation ( int max_centroids = 1000 )

Factory to create a MERGE_TDIGEST aggregation.

Merges the results from a previous aggregation resulting from a make_tdigest_aggregation or make_merge_tdigest_aggregation to produce a new tdigest (https://arxiv.org/pdf/1902.04023.pdf) column.

The tdigest column produced is of the following structure:

struct {
  // centroids for the digest
  list {
    struct {
      double    // mean
      double    // weight
    },
    ...
  }
  // these are from the input stream, not the centroids. they are used
  // during the percentile_approx computation near the beginning or
  // end of the quantiles
  double       // min
  double       // max
}

Each output row is a single tdigest. The length of the row is the "size" of the tdigest, each element of which represents a weighted centroid (mean, weight).

Parameters

max_centroids Parameter controlling compression level and accuracy on subsequent queries on the output tdigest data. max_centroids places an upper bound on the size of the computed tdigests: A value of 1000 will result in a tdigest containing no more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.

Returns: A MERGE_TDIGEST aggregation object

◆ make_min_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_min_aggregation ( )

Factory to create a MIN aggregation

Returns: A MIN aggregation object

◆ make_nth_element_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_nth_element_aggregation	(	size_type	n,
		null_policy	null_handling = `null_policy::INCLUDE`
	)

Factory to create a NTH_ELEMENT aggregation.

NTH_ELEMENT returns the n'th element of the group/series.

If n is not within the range [-group_size, group_size), the result of the respective group will be null. Negative indices [-group_size, -1] correspond to indices [0, group_size-1] respectively, where group_size is the size of each group.

Parameters

n	index of nth element in each group
null_handling	Indicates to include/exclude nulls during indexing

Returns: A NTH_ELEMENT aggregation object

◆ make_nunique_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_nunique_aggregation ( null_policy null_handling = null_policy::EXCLUDE )

Factory to create a NUNIQUE aggregation.

NUNIQUE returns the number of unique elements.

Parameters

null_handling Indicates if null values will be counted

Returns: A NUNIQUE aggregation object

◆ make_product_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_product_aggregation ( )

Factory to create a PRODUCT aggregation

Returns: A PRODUCT aggregation object

◆ make_quantile_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_quantile_aggregation	(	std::vector< double > const &	quantiles,
		interpolation	interp = `interpolation::LINEAR`
	)

Factory to create a QUANTILE aggregation.

Parameters

quantiles	The desired quantiles
interp	The desired interpolation

Returns: A QUANTILE aggregation object

◆ make_rank_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_rank_aggregation	(	rank_method	method,
		order	column_order = `order::ASCENDING`,
		null_policy	null_handling = `null_policy::EXCLUDE`,
		null_order	null_precedence = `null_order::AFTER`,
		rank_percentage	percentage = `rank_percentage::NONE`
	)

Factory to create a RANK aggregation.

RANK returns a column of size_type or double "ranks" (see note 3 below for how the data type is determined) for a given rank method and column order. If nulls are excluded, the rank will be null for those rows, otherwise a non-nullable column is returned. Double precision column is returned only when percentage!=NONE and when rank method is average.

This aggregation only works with "scan" algorithms. The input column into the group or ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. If rows are ordered by more than one column, the orderby input column should be a struct column containing the ordering columns.

Note:

1. This method could work faster with the rows that are presorted by the group keys and order_by columns. Though groupby object does not require order_by column to be sorted, groupby rank scan aggregation does require the order_by column to be sorted if the keys are sorted.
2. RANK aggregations are not compatible with exclusive scans.
3. All rank methods except the AVERAGE method and percentage!=NONE return a size_type column. For the AVERAGE method and percentage!=NONE, the return type is a double column.

Example: Consider a motor-racing statistics dataset, containing the following columns:
  1. venue:  (STRING) Location of the race event
  2. driver: (STRING) Name of the car driver (abbreviated to 3 characters)
  3. time:   (INT32)  Time taken to complete the circuit
 
For the following presorted data:
 
 [ //      venue,           driver,           time
   {   "silverstone",  "HAM" ("hamilton"),   15823},
   {   "silverstone",  "LEC" ("leclerc"),    15827},
   {   "silverstone",  "BOT" ("bottas"),     15834},  // <-- Tied for 3rd place.
   {   "silverstone",  "NOR" ("norris"),     15834},  // <-- Tied for 3rd place.
   {   "silverstone",  "RIC" ("ricciardo"),  15905},
   {      "monza",     "RIC" ("ricciardo"),  12154},
   {      "monza",     "NOR" ("norris"),     12156},  // <-- Tied for 2nd place.
   {      "monza",     "BOT" ("bottas"),     12156},  // <-- Tied for 2nd place.
   {      "monza",     "LEC" ("leclerc"),    12201},
   {      "monza",     "PER" ("perez"),      12203}
 ]
 
A grouped rank aggregation scan with:
  groupby column      : venue
  input orderby column: time
Produces the following rank column for each method:
first:   {   1,     2,     3,     4,     5,      1,     2,     3,     4,     5}
average: {   1,     2,   3.5,   3.5,     5,      1,   2.5,   2.5,     4,     5}
min:     {   1,     2,     3,     3,     5,      1,     2,     2,     4,     5}
max:     {   1,     2,     4,     4,     5,      1,     3,     3,     4,     5}
dense:   {   1,     2,     3,     3,     4,      1,     2,     2,     3,     4}
This corresponds to the following grouping and `driver` rows:
         { "HAM", "LEC", "BOT", "NOR", "RIC",  "RIC", "NOR", "BOT", "LEC", "PER" }
           <----------silverstone----------->|<-------------monza-------------->
 
min rank for each percentage type:
NONE:             {   1,      2,     3,     3,     5,      1,     2,     2,     4,     5 }
ZERO_NORMALIZED : { 0.16,  0.33,  0.50,  0.50,  0.83,   0.16,  0.33,  0.33,  0.66,  0.83 }
ONE_NORMALIZED:   { 0.00,  0.25,  0.50,  0.50,  1.00,   0.00,  0.25,  0.25,  0.75,  1.00 }
where count corresponds to the number of rows in the group. @see cudf::rank_percentage

Parameters

method	The ranking method used for tie breaking (same values)
column_order	The desired sort order for ranking
null_handling	flag to include nulls during ranking. If nulls are not included, the corresponding rank will be null.
null_precedence	The desired order of null compared to other elements for column
percentage	enum to denote the type of conversion of ranks to percentage in range (0,1]

Returns: A RANK aggregation object

◆ make_row_number_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_row_number_aggregation ( )

Factory to create a ROW_NUMBER aggregation

Returns: A ROW_NUMBER aggregation object

◆ make_std_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_std_aggregation ( size_type ddof = 1 )

Factory to create a STD aggregation.

Parameters

ddof	Delta degrees of freedom. The divisor used in calculation of `std` is `N - ddof`, where `N` is the population size.

Exceptions

cudf::logic_error if input type is chrono or compound types.

Returns: A STD aggregation object

◆ make_sum_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_sum_aggregation ( )

Factory to create a SUM aggregation

Returns: A SUM aggregation object

◆ make_sum_of_squares_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_sum_of_squares_aggregation ( )

Factory to create a SUM_OF_SQUARES aggregation

Returns: A SUM_OF_SQUARES aggregation object

◆ make_tdigest_aggregation()

template<typename Base >

std::unique_ptr<Base> cudf::make_tdigest_aggregation ( int max_centroids = 1000 )

Factory to create a TDIGEST aggregation.

Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values. The input aggregation values are expected to be fixed-width numeric types.

The tdigest column produced is of the following structure:

struct {
  // centroids for the digest
  list {
    struct {
      double    // mean
      double    // weight
    },
    ...
  }
  // these are from the input stream, not the centroids. they are used
  // during the percentile_approx computation near the beginning or
  // end of the quantiles
  double       // min
  double       // max
}

Each output row is a single tdigest. The length of the row is the "size" of the tdigest, each element of which represents a weighted centroid (mean, weight).

Parameters

max_centroids Parameter controlling compression level and accuracy on subsequent queries on the output tdigest data. max_centroids places an upper bound on the size of the computed tdigests: A value of 1000 will result in a tdigest containing no more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.

Returns: A TDIGEST aggregation object

◆ make_udf_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_udf_aggregation	(	udf_type	type,
		std::string const &	user_defined_aggregator,
		data_type	output_type
	)

Factory to create an aggregation based on UDF for PTX or CUDA.

Parameters

[in]	type	either udf_type::PTX or udf_type::CUDA
[in]	user_defined_aggregator	A string containing the aggregator code
[in]	output_type	expected output type

Returns: An aggregation containing a user-defined aggregator string

◆ make_variance_aggregation()

template<typename Base = aggregation>

std::unique_ptr<Base> cudf::make_variance_aggregation ( size_type ddof = 1 )

Factory to create a VARIANCE aggregation.

Parameters

ddof	Delta degrees of freedom. The divisor used in calculation of `variance` is `N - ddof`, where `N` is the population size.

Exceptions

cudf::logic_error if input type is chrono or compound types.

Returns: A VARIANCE aggregation object

Files

Classes

Enumerations

Functions

Detailed Description

Enumeration Type Documentation

◆ rank_percentage

Function Documentation

◆ make_all_aggregation()

◆ make_any_aggregation()

◆ make_argmax_aggregation()

◆ make_argmin_aggregation()

◆ make_collect_list_aggregation()

◆ make_collect_set_aggregation()

◆ make_correlation_aggregation()

◆ make_count_aggregation()

◆ make_covariance_aggregation()

◆ make_ewma_aggregation()

◆ make_histogram_aggregation()

◆ make_host_udf_aggregation()

◆ make_lag_aggregation()

◆ make_lead_aggregation()

◆ make_m2_aggregation()

◆ make_max_aggregation()

◆ make_mean_aggregation()

◆ make_median_aggregation()

◆ make_merge_histogram_aggregation()

◆ make_merge_lists_aggregation()

◆ make_merge_m2_aggregation()

◆ make_merge_sets_aggregation()

◆ make_merge_tdigest_aggregation()

◆ make_min_aggregation()

◆ make_nth_element_aggregation()

◆ make_nunique_aggregation()

◆ make_product_aggregation()

◆ make_quantile_aggregation()

◆ make_rank_aggregation()

◆ make_row_number_aggregation()

◆ make_std_aggregation()

◆ make_sum_aggregation()

◆ make_sum_of_squares_aggregation()

◆ make_tdigest_aggregation()

◆ make_udf_aggregation()

◆ make_variance_aggregation()