Files
file	range_window_bounds.hpp

file	rolling.hpp

Classes
struct	cudf::range_window_bounds
	Abstraction for window boundary sizes, to be used with `grouped_range_rolling_window()`. More...

struct	cudf::bounded_closed
	Strongly typed wrapper for bounded closed rolling windows. More...

struct	cudf::bounded_open
	Strongly typed wrapper for bounded open rolling windows. More...

struct	cudf::unbounded
	Strongly typed wrapper for unbounded rolling windows. More...

struct	cudf::current_row
	Strongly typed wrapper for current_row rolling windows. More...

struct	cudf::rolling_request
	A request for a rolling aggregation on a column. More...

struct	cudf::window_bounds
	Abstraction for window boundary sizes. More...

Typedefs
using	cudf::range_window_type = std::variant< unbounded, current_row, bounded_closed, bounded_open >
	The type of the range-based rolling window endpoint.

Functions
std::pair< std::unique_ptr< column >, std::unique_ptr< column > >	cudf::make_range_windows (table_view const &group_keys, column_view const &orderby, order order, null_order null_order, range_window_type preceding, range_window_type following, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Constructs preceding and following columns given window range specifications. More...

std::unique_ptr< column >	cudf::rolling_window (column_view const &input, size_type preceding_window, size_type following_window, size_type min_periods, rolling_aggregation const &agg, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::rolling_window (column_view const &input, column_view const &default_outputs, size_type preceding_window, size_type following_window, size_type min_periods, rolling_aggregation const &agg, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::grouped_rolling_window (table_view const &group_keys, column_view const &input, size_type preceding_window, size_type following_window, size_type min_periods, rolling_aggregation const &aggr, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a grouping-aware, fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::grouped_rolling_window (table_view const &group_keys, column_view const &input, window_bounds preceding_window, window_bounds following_window, size_type min_periods, rolling_aggregation const &aggr, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a grouping-aware, fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::grouped_rolling_window (table_view const &group_keys, column_view const &input, column_view const &default_outputs, size_type preceding_window, size_type following_window, size_type min_periods, rolling_aggregation const &aggr, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a grouping-aware, fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::grouped_rolling_window (table_view const &group_keys, column_view const &input, column_view const &default_outputs, window_bounds preceding_window, window_bounds following_window, size_type min_periods, rolling_aggregation const &aggr, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a grouping-aware, fixed-size rolling window function to the values in a column. More...

std::unique_ptr< column >	cudf::grouped_range_rolling_window (table_view const &group_keys, column_view const &orderby_column, cudf::order const &order, column_view const &input, range_window_bounds const &preceding, range_window_bounds const &following, size_type min_periods, rolling_aggregation const &aggr, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a grouping-aware, value range-based rolling window function to the values in a column. More...

std::unique_ptr< table >	cudf::grouped_range_rolling_window (table_view const &group_keys, column_view const &orderby, order order, null_order null_order, range_window_type preceding, range_window_type following, host_span< rolling_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Apply a grouping-aware range-based rolling window function to a sequence of columns. More...

std::unique_ptr< column >	cudf::rolling_window (column_view const &input, column_view const &preceding_window, column_view const &following_window, size_type min_periods, rolling_aggregation const &agg, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Applies a variable-size rolling window function to the values in a column. More...

bool	cudf::is_valid_rolling_aggregation (data_type source, aggregation::Kind kind)
	Indicate if a rolling aggregation is supported for a source datatype. More...

Detailed Description

Function Documentation

◆ grouped_range_rolling_window() [1/2]

std::unique_ptr<table> cudf::grouped_range_rolling_window	(	table_view const &	group_keys,
		column_view const &	orderby,
		order	order,
		null_order	null_order,
		range_window_type	preceding,
		range_window_type	following,
		host_span< rolling_request const >	requests,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Apply a grouping-aware range-based rolling window function to a sequence of columns.

Parameters

group_keys	Possibly empty table of sorted keys defining groups.
orderby	Column defining window ranges. Must be sorted. If `group_keys` is non-empty, must be sorted groupwise.
order	Sort order of the `orderby` column.
null_order	Null sort order in the sorted `orderby` column.
preceding	Type of the preceding window.
following	Type of the following window.
requests	Columns to aggregate and the aggregation for each column.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: A table of results, one column per input request.

◆ grouped_range_rolling_window() [2/2]

std::unique_ptr<column> cudf::grouped_range_rolling_window	(	table_view const &	group_keys,
		column_view const &	orderby_column,
		cudf::order const &	order,
		column_view const &	input,
		range_window_bounds const &	preceding,
		range_window_bounds const &	following,
		size_type	min_periods,
		rolling_aggregation const &	aggr,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a grouping-aware, value range-based rolling window function to the values in a column.

This function aggregates rows in a window around each element of a specified input column. The window is determined based on the values of an ordered orderby column, and on the values of a preceding and following scalar representing an inclusive range of orderby column values.

The elements of the input column are grouped into distinct groups (e.g. the result of a groupby), determined by the corresponding values of the columns under group_keys. The window-aggregation cannot cross the group boundaries.
Within a group, with all rows sorted by the orderby column, the aggregation window for a row at index i is determined as follows: a) If orderby is ASCENDING, aggregation window for row i includes all input rows at index j such that:
(orderby[i] - preceding) <= orderby[j] <= orderby[i] + following

b) If orderby is DESCENDING, aggregation window for row i includes all input rows at index j such that:
(orderby[i] + preceding) >= orderby[j] >= orderby[i] - following

Note: This method requires that the rows are presorted by the group keys and orderby column values.

The window intervals are specified as scalar values appropriate for the orderby column. Currently, only the following combinations of orderby column type and range types are supported:

If orderby column is a TIMESTAMP, the preceding/following windows are specified in terms of DURATION scalars of the same resolution. E.g. For orderby column of type TIMESTAMP_SECONDS, the intervals may only be DURATION_SECONDS. Durations of higher resolution (e.g. DURATION_NANOSECONDS) or lower (e.g. DURATION_DAYS) cannot be used.
If the orderby column is an integral type (e.g. INT32), the preceding/following should be the exact same type (INT32).

Example: Consider a motor-racing statistics dataset, containing the following columns:
  1. driver_name:   (STRING) Name of the car driver
  2. num_overtakes: (INT32)  Number of times the driver overtook another car in a lap
  3. lap_number:    (INT32)  The number of the lap
 
The `grouped_range_rolling_window()` function allows one to calculate the total number of overtakes
each driver made within any 3 lap window of each entry:
  1. Group/partition the dataset by `driver_name` (This is the group_keys argument.)
  2. Sort each group by the `lap_number` (i.e. This is the orderby_column.)
  3. Calculate the SUM(num_overtakes) over a window (preceding=1, following=1)
 
For the following input:
 
 [ // driver_name,  num_overtakes,  lap_number
   {   "bottas",        1,            1        },
   {   "hamilton",      2,            1        },
   {   "bottas",        2,            2        },
   {   "bottas",        1,            3        },
   {   "hamilton",      3,            1        },
   {   "hamilton",      8,            2        },
   {   "bottas",        5,            7        },
   {   "bottas",        6,            8        },
   {   "hamilton",      4,            4        }
 ]
 
Partitioning (grouping) by `driver_name`, and ordering by `lap_number` yields the following
`num_overtakes` vector (with 2 groups, one for each distinct `driver_name`):
 
lap_number:      [ 1,  2,  3,  7,  8,   1,  1,   2,  4 ]
num_overtakes:   [ 1,  2,  1,  5,  6,   2,  3,   8,  4 ]
                   <-----bottas------>|<----hamilton--->
 
The SUM aggregation is applied, with 1 preceding, and 1 following, with a minimum of 1
period. The aggregation window is thus 3 (laps) wide, yielding the following output column:
 
 Results:        [ 3,  4,  3,  11, 11,  13, 13,  13,  4 ]

Note: The number of rows participating in each window might vary, based on the index within the group, the orderby column value, and min_periods. For example:

results[0] considers 2 values, because it is at the beginning of its group, and has no preceding values.
results[5] considers 3 values, despite being at the beginning of its group. It must include 2 following values, based on its orderby_column value.

Each aggregation operation cannot cross group boundaries.

The type of the returned column depends on the input column type T, and the aggregation:

COUNT returns INT32 columns
MIN/MAX returns T columns
SUM returns the promoted type for T. Sum on INT32 yields INT64.
MEAN returns FLOAT64 columns
COLLECT returns columns of type LIST<T>.

LEAD/LAG/ROW_NUMBER are undefined for range queries.

Parameters

[in]	group_keys	The (pre-sorted) grouping columns
[in]	orderby_column	The (pre-sorted) order-by column, for range comparisons
[in]	order	The order (ASCENDING/DESCENDING) in which the order-by column is sorted
[in]	input	The input column (to be aggregated)
[in]	preceding	The interval value in the backward direction
[in]	following	The interval value in the forward direction
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	aggr	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

◆ grouped_rolling_window() [1/4]

std::unique_ptr<column> cudf::grouped_rolling_window	(	table_view const &	group_keys,
		column_view const &	input,
		column_view const &	default_outputs,
		size_type	preceding_window,
		size_type	following_window,
		size_type	min_periods,
		rolling_aggregation const &	aggr,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a grouping-aware, fixed-size rolling window function to the values in a column.

Like rolling_window(), this function aggregates values in a window around each element of a specified input column. It differs from rolling_window() in that elements of the input column are grouped into distinct groups (e.g. the result of a groupby). The window aggregation cannot cross the group boundaries. For a row i of input, the group is determined from the corresponding (i.e. i-th) values of the columns under group_keys.

Note: This method requires that the rows are presorted by the group_key values.

Example: Consider a user-sales dataset, where the rows look as follows:
{ "user_id", sales_amt, day }
 
The `grouped_rolling_window()` method enables windowing queries such as grouping a dataset by
`user_id`, and summing up the `sales_amt` column over a window of 3 rows (2 preceding (including
current row), 1 row following).
 
In this example,
   1. `group_keys == [ user_id ]`
   2. `input == sales_amt`
The data are grouped by `user_id`, and ordered by `day`-string. The aggregation
(SUM) is then calculated for a window of 3 values around (and including) each row.
 
For the following input:
 
 [ // user,  sales_amt
   { "user1",   10      },
   { "user2",   20      },
   { "user1",   20      },
   { "user1",   10      },
   { "user2",   30      },
   { "user2",   80      },
   { "user1",   50      },
   { "user1",   60      },
   { "user2",   40      }
 ]
 
Partitioning (grouping) by `user_id` yields the following `sales_amt` vector
(with 2 groups, one for each distinct `user_id`):
 
   [ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
     <-------user1-------->|<------user2------->
 
The SUM aggregation is applied with 2 preceding rows (including the current row) and 1 following
row, with a minimum of 1 period. The aggregation window is thus 3 rows wide,
yielding the following column:
 
   [ 30, 40,  80, 120, 110,  50, 130, 150, 120 ]
 
Note: The SUMs calculated at the group boundaries (i.e. indices 0, 4, 5, and 8)
consider only 2 values each, in spite of the window-size being 3.
Each aggregation operation cannot cross group boundaries.

The returned column for op == COUNT always has INT32 type. All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Note: preceding_window and following_window may have negative values. This yields windows where the current row might not be included at all. For instance, consider a window defined as (preceding=3, following=-1). This produces a window spanning from 2 (i.e. 3-1) rows preceding the current row to 1 row preceding the current row. For the example above, the window for row#3 is:

[ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
       <----->   ^
       window    |
            current_row

Similarly, preceding could have a negative value, indicating that the window begins at a position after the current row. It differs slightly from the semantics for following, because preceding includes the current row. Therefore:

preceding=1 => Window starts at the current row.
preceding=0 => Window starts at 1 past the current row.
preceding=-1 => Window starts at 2 past the current row. Etc.

Parameters

[in]	group_keys	The (pre-sorted) grouping columns
[in]	input	The input column (to be aggregated)
[in]	preceding_window	The static rolling window size in the backward direction (for positive values), or forward direction (for negative values)
[in]	following_window	The static rolling window size in the forward direction (for positive values), or backward direction (for negative values)
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	aggr	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

Parameters

default_outputs A column of per-row default values to be returned instead of nulls. Used for LEAD()/LAG(), if the row offset crosses the boundaries of the column or group.

◆ grouped_rolling_window() [2/4]

std::unique_ptr<column> cudf::grouped_rolling_window	(	table_view const &	group_keys,
		column_view const &	input,
		column_view const &	default_outputs,
		window_bounds	preceding_window,
		window_bounds	following_window,
		size_type	min_periods,
		rolling_aggregation const &	aggr,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a grouping-aware, fixed-size rolling window function to the values in a column.

Like rolling_window(), this function aggregates values in a window around each element of a specified input column. It differs from rolling_window() in that elements of the input column are grouped into distinct groups (e.g. the result of a groupby). The window aggregation cannot cross the group boundaries. For a row i of input, the group is determined from the corresponding (i.e. i-th) values of the columns under group_keys.

Note: This method requires that the rows are presorted by the group_key values.

Example: Consider a user-sales dataset, where the rows look as follows:
{ "user_id", sales_amt, day }
 
The `grouped_rolling_window()` method enables windowing queries such as grouping a dataset by
`user_id`, and summing up the `sales_amt` column over a window of 3 rows (2 preceding (including
current row), 1 row following).
 
In this example,
   1. `group_keys == [ user_id ]`
   2. `input == sales_amt`
The data are grouped by `user_id`, and ordered by `day`-string. The aggregation
(SUM) is then calculated for a window of 3 values around (and including) each row.
 
For the following input:
 
 [ // user,  sales_amt
   { "user1",   10      },
   { "user2",   20      },
   { "user1",   20      },
   { "user1",   10      },
   { "user2",   30      },
   { "user2",   80      },
   { "user1",   50      },
   { "user1",   60      },
   { "user2",   40      }
 ]
 
Partitioning (grouping) by `user_id` yields the following `sales_amt` vector
(with 2 groups, one for each distinct `user_id`):
 
   [ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
     <-------user1-------->|<------user2------->
 
The SUM aggregation is applied with 2 preceding rows (including the current row) and 1 following
row, with a minimum of 1 period. The aggregation window is thus 3 rows wide,
yielding the following column:
 
   [ 30, 40,  80, 120, 110,  50, 130, 150, 120 ]
 
Note: The SUMs calculated at the group boundaries (i.e. indices 0, 4, 5, and 8)
consider only 2 values each, in spite of the window-size being 3.
Each aggregation operation cannot cross group boundaries.

The returned column for op == COUNT always has INT32 type. All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Note: preceding_window and following_window may have negative values. This yields windows where the current row might not be included at all. For instance, consider a window defined as (preceding=3, following=-1). This produces a window spanning from 2 (i.e. 3-1) rows preceding the current row to 1 row preceding the current row. For the example above, the window for row#3 is:

[ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
       <----->   ^
       window    |
            current_row

Similarly, preceding could have a negative value, indicating that the window begins at a position after the current row. It differs slightly from the semantics for following, because preceding includes the current row. Therefore:

preceding=1 => Window starts at the current row.
preceding=0 => Window starts at 1 past the current row.
preceding=-1 => Window starts at 2 past the current row. Etc.

Parameters

[in]	group_keys	The (pre-sorted) grouping columns
[in]	input	The input column (to be aggregated)
[in]	preceding_window	The static rolling window size in the backward direction (for positive values), or forward direction (for negative values)
[in]	following_window	The static rolling window size in the forward direction (for positive values), or backward direction (for negative values)
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	aggr	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

Parameters

default_outputs A column of per-row default values to be returned instead of nulls. Used for LEAD()/LAG(), if the row offset crosses the boundaries of the column or group.

◆ grouped_rolling_window() [3/4]

std::unique_ptr<column> cudf::grouped_rolling_window	(	table_view const &	group_keys,
		column_view const &	input,
		size_type	preceding_window,
		size_type	following_window,
		size_type	min_periods,
		rolling_aggregation const &	aggr,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a grouping-aware, fixed-size rolling window function to the values in a column.

Like rolling_window(), this function aggregates values in a window around each element of a specified input column. It differs from rolling_window() in that elements of the input column are grouped into distinct groups (e.g. the result of a groupby). The window aggregation cannot cross the group boundaries. For a row i of input, the group is determined from the corresponding (i.e. i-th) values of the columns under group_keys.

Note: This method requires that the rows are presorted by the group_key values.

Example: Consider a user-sales dataset, where the rows look as follows:
{ "user_id", sales_amt, day }
 
The `grouped_rolling_window()` method enables windowing queries such as grouping a dataset by
`user_id`, and summing up the `sales_amt` column over a window of 3 rows (2 preceding (including
current row), 1 row following).
 
In this example,
   1. `group_keys == [ user_id ]`
   2. `input == sales_amt`
The data are grouped by `user_id`, and ordered by `day`-string. The aggregation
(SUM) is then calculated for a window of 3 values around (and including) each row.
 
For the following input:
 
 [ // user,  sales_amt
   { "user1",   10      },
   { "user2",   20      },
   { "user1",   20      },
   { "user1",   10      },
   { "user2",   30      },
   { "user2",   80      },
   { "user1",   50      },
   { "user1",   60      },
   { "user2",   40      }
 ]
 
Partitioning (grouping) by `user_id` yields the following `sales_amt` vector
(with 2 groups, one for each distinct `user_id`):
 
   [ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
     <-------user1-------->|<------user2------->
 
The SUM aggregation is applied with 2 preceding rows (including the current row) and 1 following
row, with a minimum of 1 period. The aggregation window is thus 3 rows wide,
yielding the following column:
 
   [ 30, 40,  80, 120, 110,  50, 130, 150, 120 ]
 
Note: The SUMs calculated at the group boundaries (i.e. indices 0, 4, 5, and 8)
consider only 2 values each, in spite of the window-size being 3.
Each aggregation operation cannot cross group boundaries.

The returned column for op == COUNT always has INT32 type. All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Note: preceding_window and following_window may have negative values. This yields windows where the current row might not be included at all. For instance, consider a window defined as (preceding=3, following=-1). This produces a window spanning from 2 (i.e. 3-1) rows preceding the current row to 1 row preceding the current row. For the example above, the window for row#3 is:

[ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
       <----->   ^
       window    |
            current_row

Similarly, preceding could have a negative value, indicating that the window begins at a position after the current row. It differs slightly from the semantics for following, because preceding includes the current row. Therefore:

preceding=1 => Window starts at the current row.
preceding=0 => Window starts at 1 past the current row.
preceding=-1 => Window starts at 2 past the current row. Etc.

Parameters

[in]	group_keys	The (pre-sorted) grouping columns
[in]	input	The input column (to be aggregated)
[in]	preceding_window	The static rolling window size in the backward direction (for positive values), or forward direction (for negative values)
[in]	following_window	The static rolling window size in the forward direction (for positive values), or backward direction (for negative values)
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	aggr	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

◆ grouped_rolling_window() [4/4]

std::unique_ptr<column> cudf::grouped_rolling_window	(	table_view const &	group_keys,
		column_view const &	input,
		window_bounds	preceding_window,
		window_bounds	following_window,
		size_type	min_periods,
		rolling_aggregation const &	aggr,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a grouping-aware, fixed-size rolling window function to the values in a column.

Like rolling_window(), this function aggregates values in a window around each element of a specified input column. It differs from rolling_window() in that elements of the input column are grouped into distinct groups (e.g. the result of a groupby). The window aggregation cannot cross the group boundaries. For a row i of input, the group is determined from the corresponding (i.e. i-th) values of the columns under group_keys.

Note: This method requires that the rows are presorted by the group_key values.

Example: Consider a user-sales dataset, where the rows look as follows:
{ "user_id", sales_amt, day }
 
The `grouped_rolling_window()` method enables windowing queries such as grouping a dataset by
`user_id`, and summing up the `sales_amt` column over a window of 3 rows (2 preceding (including
current row), 1 row following).
 
In this example,
   1. `group_keys == [ user_id ]`
   2. `input == sales_amt`
The data are grouped by `user_id`, and ordered by `day`-string. The aggregation
(SUM) is then calculated for a window of 3 values around (and including) each row.
 
For the following input:
 
 [ // user,  sales_amt
   { "user1",   10      },
   { "user2",   20      },
   { "user1",   20      },
   { "user1",   10      },
   { "user2",   30      },
   { "user2",   80      },
   { "user1",   50      },
   { "user1",   60      },
   { "user2",   40      }
 ]
 
Partitioning (grouping) by `user_id` yields the following `sales_amt` vector
(with 2 groups, one for each distinct `user_id`):
 
   [ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
     <-------user1-------->|<------user2------->
 
The SUM aggregation is applied with 2 preceding rows (including the current row) and 1 following
row, with a minimum of 1 period. The aggregation window is thus 3 rows wide,
yielding the following column:
 
   [ 30, 40,  80, 120, 110,  50, 130, 150, 120 ]
 
Note: The SUMs calculated at the group boundaries (i.e. indices 0, 4, 5, and 8)
consider only 2 values each, in spite of the window-size being 3.
Each aggregation operation cannot cross group boundaries.

The returned column for op == COUNT always has INT32 type. All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Note: preceding_window and following_window may have negative values. This yields windows where the current row might not be included at all. For instance, consider a window defined as (preceding=3, following=-1). This produces a window spanning from 2 (i.e. 3-1) rows preceding the current row to 1 row preceding the current row. For the example above, the window for row#3 is:

[ 10,  20,  10,  50,  60,  20,  30,  80,  40 ]
       <----->   ^
       window    |
            current_row

Similarly, preceding could have a negative value, indicating that the window begins at a position after the current row. It differs slightly from the semantics for following, because preceding includes the current row. Therefore:

preceding=1 => Window starts at the current row.
preceding=0 => Window starts at 1 past the current row.
preceding=-1 => Window starts at 2 past the current row. Etc.

Parameters

[in]	group_keys	The (pre-sorted) grouping columns
[in]	input	The input column (to be aggregated)
[in]	preceding_window	The static rolling window size in the backward direction (for positive values), or forward direction (for negative values)
[in]	following_window	The static rolling window size in the forward direction (for positive values), or backward direction (for negative values)
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	aggr	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

◆ is_valid_rolling_aggregation()

bool cudf::is_valid_rolling_aggregation	(	data_type	source,
		aggregation::Kind	kind
	)

Indicate if a rolling aggregation is supported for a source datatype.

Parameters

source	Type of the column to perform the aggregation on.
kind	The kind of the aggregation.

Returns: true if the aggregation is supported.

◆ make_range_windows()

std::pair<std::unique_ptr<column>, std::unique_ptr<column> > cudf::make_range_windows	(	table_view const &	group_keys,
		column_view const &	orderby,
		order	order,
		null_order	null_order,
		range_window_type	preceding,
		range_window_type	following,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Constructs preceding and following columns given window range specifications.

Parameters

group_keys	Possibly empty table of sorted keys defining groups.
orderby	Column defining window ranges. Must be sorted. If `group_keys` is non-empty, must be sorted groupwise.
order	Sort order of the `orderby` column.
null_order	Null sort order in the sorted `orderby` column. Applies groupwise if `group_keys` is non-empty.
preceding	Type of the preceding window.
following	Type of the following window.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: pair of preceding and following columns that define the window bounds for each row, suitable for passing to rolling_window.

◆ rolling_window() [1/3]

std::unique_ptr<column> cudf::rolling_window	(	column_view const &	input,
		column_view const &	default_outputs,
		size_type	preceding_window,
		size_type	following_window,
		size_type	min_periods,
		rolling_aggregation const &	agg,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a fixed-size rolling window function to the values in a column.

This function aggregates values in a window around each element i of the input column, and invalidates the bit mask for element i if there are not enough observations. The window size is static (the same for each element). This matches Pandas' API for DataFrame.rolling with a few notable differences:

instead of the center flag it uses a two-part window to allow for more flexible windows. The total window size = preceding_window + following_window. Element i uses elements [i-preceding_window+1, i+following_window] to do the window computation.
instead of storing NA/NaN for output rows that do not meet the minimum number of observations this function updates the valid bitmask of the column to indicate which elements are valid.

Note: Windows near the endpoints of the input are automatically clamped to be in-bounds.

Notes on return column types:

The returned column for count aggregation always has INT32 type.
The returned column for VARIANCE/STD aggregations always has FLOAT64 type.
All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Parameters

[in]	input	The input column
[in]	preceding_window	The static rolling window size in the backward direction
[in]	following_window	The static rolling window size in the forward direction
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	agg	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

Parameters

default_outputs A column of per-row default values to be returned instead of nulls. Used for LEAD()/LAG(), if the row offset crosses the boundaries of the column.

◆ rolling_window() [2/3]

std::unique_ptr<column> cudf::rolling_window	(	column_view const &	input,
		column_view const &	preceding_window,
		column_view const &	following_window,
		size_type	min_periods,
		rolling_aggregation const &	agg,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a variable-size rolling window function to the values in a column.

This function aggregates values in a window around each element i of the input column, and invalidates the bit mask for element i if there are not enough observations. The window size is dynamic (varying for each element). This matches Pandas' API for DataFrame.rolling with a few notable differences:

instead of the center flag it uses a two-part window to allow for more flexible windows. The total window size = preceding_window + following_window. Element i uses elements [i-preceding_window+1, i+following_window] to do the window computation.
instead of storing NA/NaN for output rows that do not meet the minimum number of observations this function updates the valid bitmask of the column to indicate which elements are valid.
support for dynamic rolling windows, i.e. window size can be specified for each element using an additional array.

The returned column for count aggregation always has INT32 type. All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Note: All entries in preceding_window and following_window must produce window extents that are in-bounds for the input. That is, for all i, it is required that the set of rows defined by the interval [i - preceding_window[i] + 1, ..., i + following_window[i] + 1) is a subset of [0, input.size()).

Exceptions

cudf::logic_error if window column type is not INT32

Parameters

[in]	input	The input column
[in]	preceding_window	A non-nullable column of INT32 window sizes in the backward direction. `preceding_window[i]` specifies preceding window size for element `i`.
[in]	following_window	A non-nullable column of INT32 window sizes in the forward direction. `following_window[i]` specifies following window size for element `i`.
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	agg	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

◆ rolling_window() [3/3]

std::unique_ptr<column> cudf::rolling_window	(	column_view const &	input,
		size_type	preceding_window,
		size_type	following_window,
		size_type	min_periods,
		rolling_aggregation const &	agg,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Applies a fixed-size rolling window function to the values in a column.

This function aggregates values in a window around each element i of the input column, and invalidates the bit mask for element i if there are not enough observations. The window size is static (the same for each element). This matches Pandas' API for DataFrame.rolling with a few notable differences:

instead of the center flag it uses a two-part window to allow for more flexible windows. The total window size = preceding_window + following_window. Element i uses elements [i-preceding_window+1, i+following_window] to do the window computation.
instead of storing NA/NaN for output rows that do not meet the minimum number of observations this function updates the valid bitmask of the column to indicate which elements are valid.

Note: Windows near the endpoints of the input are automatically clamped to be in-bounds.

Notes on return column types:

The returned column for count aggregation always has INT32 type.
The returned column for VARIANCE/STD aggregations always has FLOAT64 type.
All other operators return a column of the same type as the input. Therefore it is suggested to convert integer column types (especially low-precision integers) to FLOAT32 or FLOAT64 before doing a rolling MEAN.

Parameters

[in]	input	The input column
[in]	preceding_window	The static rolling window size in the backward direction
[in]	following_window	The static rolling window size in the forward direction
[in]	min_periods	Minimum number of observations in window required to have a value, otherwise element `i` is null.
[in]	agg	The rolling window aggregation type (SUM, MAX, MIN, etc.)
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned column's device memory

Returns: A nullable output column containing the rolling window results

Files

Classes

Typedefs

Functions

Detailed Description

Function Documentation

◆ grouped_range_rolling_window() [1/2]

◆ grouped_range_rolling_window() [2/2]

◆ grouped_rolling_window() [1/4]

◆ grouped_rolling_window() [2/4]

◆ grouped_rolling_window() [3/4]

◆ grouped_rolling_window() [4/4]

◆ is_valid_rolling_aggregation()

◆ make_range_windows()

◆ rolling_window() [1/3]

◆ rolling_window() [2/3]

◆ rolling_window() [3/3]