groupby.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/aggregation.hpp>
10 #include <cudf/replace.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
15 #include <cudf/utilities/span.hpp>
16 
17 #include <rmm/cuda_stream_view.hpp>
18 
19 #include <memory>
20 #include <utility>
21 #include <vector>
22 
23 namespace CUDF_EXPORT cudf {
25 namespace groupby {
// Implementation-detail namespaces. Only a forward declaration lives here;
// no definition of sort_groupby_helper is visible in this listing.
26 namespace detail {
27 namespace sort {
// Incomplete type: the groupby class below holds it through a
// std::unique_ptr member (_helper) and exposes it via helper().
28 struct sort_groupby_helper;
29 
30 } // namespace sort
31 } // namespace detail
32 
51  std::vector<std::unique_ptr<groupby_aggregation>> aggregations;
52 };
53 
// Request for groupby aggregation(s) for scanning a column.
// NOTE(review): this listing omits original line 65; the symbol index at the
// end of the page records a `column_view values` member ("The elements to
// aggregate.") defined on that line.
64 struct scan_request {
// Desired aggregations, each held by owning pointer.
66  std::vector<std::unique_ptr<groupby_scan_aggregation>> aggregations;
67 };
68 
78  std::vector<std::unique_ptr<column>> results{};
79 };
80 
// Groups values by keys and computes aggregations on those groups.
//
// NOTE(review): this is a rendered listing with gaps. Several member
// declarations below are missing their trailing parameter lines; the complete
// signatures are reproduced in the symbol index at the end of the page.
84 class groupby {
85  public:
// Default construction, copy and move (construction and assignment) are all
// deleted; the destructor is declared here and defined elsewhere.
86  groupby() = delete;
87  ~groupby();
88  groupby(groupby const&) = delete;
89  groupby(groupby&&) = delete;
90  groupby& operator=(groupby const&) = delete;
91  groupby& operator=(groupby&&) = delete;
92 
// Constructs a groupby object with the specified keys.
// Every argument after `keys` is defaulted: nulls are excluded, keys are
// treated as not sorted, and both ordering vectors are empty.
116  explicit groupby(table_view const& keys,
117  null_policy null_handling = null_policy::EXCLUDE,
118  sorted keys_are_sorted = sorted::NO,
119  std::vector<order> const& column_order = {},
120  std::vector<null_order> const& null_precedence = {});
121 
// Performs grouped aggregations on the specified values.
// (Parameter lines are missing from this listing.)
176  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
// Performs grouped scans on the specified values.
// (Parameter lines are missing from this listing.)
232  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
236 
// Performs grouped shifts for specified values.
// The symbol index shows an additional `host_span<size_type const> offsets`
// parameter between `values` and `fill_values`, plus defaulted stream and
// memory-resource parameters, none of which appear in the lines below.
288  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> shift(
289  table_view const& values,
291  std::vector<std::reference_wrapper<scalar const>> const& fill_values,
294 
// The grouped data corresponding to a groupby operation on a set of values.
303  struct groups {
// Table of grouped keys.
304  std::unique_ptr<table> keys;
// Group offsets.
305  std::vector<size_type> offsets;
// Table of grouped values.
306  std::unique_ptr<table> values;
307  };
308 
// NOTE(review): the symbol index lists a `get_groups(...)` member returning
// `groups`; its declaration line is not present in this listing and
// presumably sits in the gap around here - confirm against the real header.
325 
// Performs grouped null replacement on the given values, using one replace
// policy per entry of `replace_policies` (presumably one per value column -
// TODO confirm). Trailing parameter lines are missing from this listing.
362  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
363  table_view const& values,
364  host_span<cudf::replace_policy const> replace_policies,
367 
368  private:
// State, with in-class defaults that mirror the constructor's default
// arguments. `_keys` is a view type, so presumably non-owning - TODO confirm.
369  table_view _keys;
370  null_policy _include_null_keys{null_policy::EXCLUDE};
372  sorted _keys_are_sorted{sorted::NO};
373  std::vector<order> _column_order{};
375  std::vector<null_order> _null_precedence{};
// Owning pointer to the forward-declared sort helper.
378  std::unique_ptr<detail::sort::sort_groupby_helper>
379  _helper;
381 
// Accessor returning a reference to a sort_groupby_helper; presumably the one
// owned by `_helper` - confirm in the implementation file.
388  detail::sort::sort_groupby_helper& helper();
389 
// Internal entry point taking the aggregation requests and a stream.
// (Trailing parameter line is missing from this listing.)
394  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
395  host_span<aggregation_request const> requests,
396  rmm::cuda_stream_view stream,
398 
399  // Sort-based groupby
// Sort-based counterparts for aggregation and scan requests respectively.
// (Trailing parameter lines are missing from this listing.)
400  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
401  host_span<aggregation_request const> requests,
402  rmm::cuda_stream_view stream,
404 
405  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
406  host_span<scan_request const> requests,
407  rmm::cuda_stream_view stream,
409 };
410 
425  std::unique_ptr<groupby_aggregation> aggregation;
426 };
427 
474  public:
475  streaming_groupby() = delete;
477  streaming_groupby(streaming_groupby const&) = delete;
478  streaming_groupby& operator=(streaming_groupby const&) = delete;
479 
482 
487  streaming_groupby& operator=(streaming_groupby&&) noexcept;
488 
502  explicit streaming_groupby(host_span<size_type const> key_indices,
504  size_type max_distinct_keys,
505  null_policy null_handling = null_policy::EXCLUDE);
506 
520  void aggregate(table_view const& data, rmm::cuda_stream_view stream = cudf::get_default_stream());
521 
538  void merge(streaming_groupby const& other,
539  rmm::cuda_stream_view stream = cudf::get_default_stream());
540 
556  [[nodiscard]] std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> finalize(
557  rmm::cuda_stream_view stream = cudf::get_default_stream(),
559 
567  [[nodiscard]] size_type distinct_keys() const noexcept;
568 
569  private:
570  struct impl;
571  std::unique_ptr<impl> _impl;
572 
573  void do_aggregate(table_view const& data, rmm::cuda_stream_view stream);
574  void do_merge(streaming_groupby const& other, rmm::cuda_stream_view stream);
575  [[nodiscard]] std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> do_finalize(
576  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const;
577 };
578 
// Returns true if streaming_groupby supports the given value type and
// aggregation kind combination. Marked [[nodiscard]]: the boolean result is
// the only output.
591 [[nodiscard]] bool is_streaming_groupby_supported(data_type values_type, aggregation::Kind kind);
592 
594 } // namespace groupby
595 } // namespace CUDF_EXPORT cudf
Representation for specifying desired aggregations from aggregation-based APIs, e....
Abstract base class for specifying the desired aggregation in an aggregation_request.
Definition: aggregation.hpp:74
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
Indicator for the logical data type of an element in a column.
Definition: types.hpp:278
Groups values by keys and computes aggregations on those groups.
Definition: groupby.hpp:84
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > scan(host_span< scan_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped scans on the specified values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > replace_nulls(table_view const &values, host_span< cudf::replace_policy const > replace_policies, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped null replacement on the given values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > shift(table_view const &values, host_span< size_type const > offsets, std::vector< std::reference_wrapper< scalar const >> const &fill_values, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped shifts for specified values.
groupby(table_view const &keys, null_policy null_handling=null_policy::EXCLUDE, sorted keys_are_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={})
Constructs a groupby object with the specified keys.
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > aggregate(host_span< aggregation_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped aggregations on the specified values.
groups get_groups(cudf::table_view values={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Get the grouped keys and values corresponding to a groupby operation on a set of values.
Stateful streaming groupby that accumulates partial aggregates across batches.
Definition: groupby.hpp:473
streaming_groupby(streaming_groupby &&) noexcept
Move constructor.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
A set of cudf::column's of the same size.
Definition: table.hpp:29
Column view class definitions.
bool is_streaming_groupby_supported(data_type values_type, aggregation::Kind kind)
Returns true if streaming_groupby supports the given value type and aggregation kind combination.
std::unique_ptr< cudf::table > merge(std::vector< table_view > const &tables_to_merge, std::vector< cudf::size_type > const &key_cols, std::vector< cudf::order > const &column_order, std::vector< cudf::null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Merge a set of sorted tables.
std::unique_ptr< table > sort(table_view const &input, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs a lexicographic sort of the rows of a table.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
device_async_resource_ref get_current_device_resource_ref()
cuda::mr::resource_ref< cuda::mr::device_accessible > device_async_resource_ref
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:116
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:157
cuDF interfaces
Definition: host_udf.hpp:26
APIs for spans.
Request for groupby aggregation(s) to perform on a column.
Definition: groupby.hpp:49
std::vector< std::unique_ptr< groupby_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:51
column_view values
The elements to aggregate.
Definition: groupby.hpp:50
The result(s) of an aggregation_request
Definition: groupby.hpp:76
The grouped data corresponding to a groupby operation on a set of values.
Definition: groupby.hpp:303
std::unique_ptr< table > keys
Table of grouped keys.
Definition: groupby.hpp:304
std::vector< size_type > offsets
Group Offsets.
Definition: groupby.hpp:305
std::unique_ptr< table > values
Table of grouped values.
Definition: groupby.hpp:306
Request for groupby aggregation(s) for scanning a column.
Definition: groupby.hpp:64
column_view values
The elements to aggregate.
Definition: groupby.hpp:65
std::vector< std::unique_ptr< groupby_scan_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:66
Request for a single streaming groupby aggregation on a column.
Definition: groupby.hpp:423
std::unique_ptr< groupby_aggregation > aggregation
Desired aggregation.
Definition: groupby.hpp:425
size_type column_index
Index of the value column.
Definition: groupby.hpp:424
C++20 std::span with reduced feature set.
Definition: span.hpp:184
Class definitions for (mutable)_table_view
Type declarations for libcudf.