groupby.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/aggregation.hpp>
10 #include <cudf/replace.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
15 #include <cudf/utilities/span.hpp>
16 
17 #include <rmm/cuda_stream_view.hpp>
18 
19 #include <memory>
20 #include <utility>
21 #include <vector>
22 
23 namespace CUDF_EXPORT cudf {
25 namespace groupby {
// Forward declaration only: groupby (below) holds this type through a
// std::unique_ptr member and returns a reference to it from helper(); the
// definition of sort_groupby_helper is not part of this listing.
26 namespace detail {
27 namespace sort {
28 struct sort_groupby_helper;
29 
30 } // namespace sort
31 } // namespace detail
32 
51  std::vector<std::unique_ptr<groupby_aggregation>> aggregations;
52 };
53 
// Request for groupby aggregation(s) for scanning a column.
// NOTE(review): the member index for this file places `column_view values`
// ("The elements to aggregate.") at groupby.hpp:65, a line this listing omits.
64 struct scan_request {
66  std::vector<std::unique_ptr<groupby_scan_aggregation>> aggregations;  // Desired aggregations.
67 };
68 
78  std::vector<std::unique_ptr<column>> results{};
79 };
80 
// Groups values by keys and computes aggregations on those groups.
// NOTE(review): this listing omits some numbered lines, so several member
// declarations below appear without their complete parameter lists. The full
// signatures are given in the member index that follows the listing.
84 class groupby {
85  public:
    // Default construction, copy, and move are all deleted; only the
    // destructor is declared.
86  groupby() = delete;
87  ~groupby();
88  groupby(groupby const&) = delete;
89  groupby(groupby&&) = delete;
90  groupby& operator=(groupby const&) = delete;
91  groupby& operator=(groupby&&) = delete;
92 
    // Construct a groupby object with the specified keys.
    // Every argument after `keys` is optional: null_handling defaults to
    // null_policy::EXCLUDE, keys_are_sorted to sorted::NO, and both
    // column_order and null_precedence to empty vectors.
116  explicit groupby(table_view const& keys,
117  null_policy null_handling = null_policy::EXCLUDE,
118  sorted keys_are_sorted = sorted::NO,
119  std::vector<order> const& column_order = {},
120  std::vector<null_order> const& null_precedence = {});
121 
    // Performs grouped aggregations on the specified values.
    // (Parameter lines are omitted from this listing; per the member index the
    // parameters are `requests`, `stream`, and `mr`.)
176  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
    // Performs grouped scans on the specified values.
    // (Parameter lines are omitted from this listing; per the member index the
    // parameters are `requests`, `stream`, and `mr`.)
232  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
236 
    // Performs grouped shifts for specified values.
    // (The `offsets` parameter, a host_span<size_type const>, and the trailing
    // `stream` / `mr` parameters sit on lines omitted from this listing.)
288  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> shift(
289  table_view const& values,
291  std::vector<std::reference_wrapper<scalar const>> const& fill_values,
294 
    // The grouped data corresponding to a groupby operation on a set of values.
303  struct groups {
304  std::unique_ptr<table> keys;  // Table of grouped keys.
305  std::vector<size_type> offsets;  // Group offsets.
306  std::unique_ptr<table> values;  // Table of grouped values.
307  };
308 
    // NOTE(review): the member index lists `groups get_groups(values, stream, mr)`
    // ("Get the grouped keys and values corresponding to a groupby operation on
    // a set of values."); its declaration falls in the lines omitted here.
325 
    // Performs grouped replace nulls on value.
    // (The trailing `stream` / `mr` parameters sit on lines omitted from this
    // listing.)
362  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
363  table_view const& values,
364  host_span<cudf::replace_policy const> replace_policies,
367 
368  private:
    // NOTE(review): the members below share names, types, and defaults with the
    // constructor parameters, so they presumably store those arguments; the
    // constructor definition is not visible here, so confirm the mapping.
369  table_view _keys;
370  null_policy _include_null_keys{null_policy::EXCLUDE};
372  sorted _keys_are_sorted{sorted::NO};
373  std::vector<order> _column_order{};
375  std::vector<null_order> _null_precedence{};
    // Owning pointer to the forward-declared sort helper.
378  std::unique_ptr<detail::sort::sort_groupby_helper>
379  _helper;
381 
    // Returns a reference to a sort_groupby_helper; how and when the helper is
    // created is defined outside this header.
388  detail::sort::sort_groupby_helper& helper();
389 
    // Private counterparts of the public aggregate/scan entry points. Each takes
    // the span of requests and a stream; the final parameter line of each
    // declaration is omitted from this listing.
394  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
395  host_span<aggregation_request const> requests,
396  rmm::cuda_stream_view stream,
398 
399  // Sort-based groupby
400  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
401  host_span<aggregation_request const> requests,
402  rmm::cuda_stream_view stream,
404 
405  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
406  host_span<scan_request const> requests,
407  rmm::cuda_stream_view stream,
409 };
411 } // namespace groupby
412 } // namespace CUDF_EXPORT cudf
Representation for specifying desired aggregations from aggregation-based APIs, e.g., groupby, reductions, rolling, etc.
A non-owning, immutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
Groups values by keys and computes aggregations on those groups.
Definition: groupby.hpp:84
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > scan(host_span< scan_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped scans on the specified values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > replace_nulls(table_view const &values, host_span< cudf::replace_policy const > replace_policies, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped null replacement on the specified values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > shift(table_view const &values, host_span< size_type const > offsets, std::vector< std::reference_wrapper< scalar const >> const &fill_values, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped shifts for specified values.
groupby(table_view const &keys, null_policy null_handling=null_policy::EXCLUDE, sorted keys_are_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={})
Construct a groupby object with the specified keys.
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > aggregate(host_span< aggregation_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped aggregations on the specified values.
groups get_groups(cudf::table_view values={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Get the grouped keys and values corresponding to a groupby operation on a set of values.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
column view class definitions
std::unique_ptr< table > sort(table_view const &input, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs a lexicographic sort of the rows of a table.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:115
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:156
cuDF interfaces
Definition: host_udf.hpp:26
APIs for spans.
Request for groupby aggregation(s) to perform on a column.
Definition: groupby.hpp:49
std::vector< std::unique_ptr< groupby_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:51
column_view values
The elements to aggregate.
Definition: groupby.hpp:50
The result(s) of an aggregation_request
Definition: groupby.hpp:76
The grouped data corresponding to a groupby operation on a set of values.
Definition: groupby.hpp:303
std::unique_ptr< table > keys
Table of grouped keys.
Definition: groupby.hpp:304
std::vector< size_type > offsets
Group Offsets.
Definition: groupby.hpp:305
std::unique_ptr< table > values
Table of grouped values.
Definition: groupby.hpp:306
Request for groupby aggregation(s) for scanning a column.
Definition: groupby.hpp:64
column_view values
The elements to aggregate.
Definition: groupby.hpp:65
std::vector< std::unique_ptr< groupby_scan_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:66
C++20 std::span with reduced feature set.
Definition: span.hpp:182
Class definitions for (mutable)_table_view
Type declarations for libcudf.