groupby.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/aggregation.hpp>
21 #include <cudf/replace.hpp>
23 #include <cudf/types.hpp>
24 #include <cudf/utilities/export.hpp>
25 #include <cudf/utilities/span.hpp>
26 
27 #include <rmm/cuda_stream_view.hpp>
29 #include <rmm/resource_ref.hpp>
30 
31 #include <memory>
32 #include <utility>
33 #include <vector>
34 
35 namespace CUDF_EXPORT cudf {
37 namespace groupby {
// Forward declaration of the sort-based helper type. The groupby class in this
// header only holds it through a std::unique_ptr member and returns it by
// reference from a private accessor, so a declaration is all this header needs.
38 namespace detail {
39 namespace sort {
40 class sort_groupby_helper;
41 
42 } // namespace sort
43 } // namespace detail
44 
63  std::vector<std::unique_ptr<groupby_aggregation>> aggregations;
64 };
65 
// Request for groupby aggregation(s) for scanning a column.
// NOTE(review): this listing skips source line 77. The index text at the end of
// the page attributes a `column_view values` member ("The elements to
// aggregate.") to that line -- confirm against the real header.
76 struct scan_request {
 // Desired aggregations (owned scan-aggregation objects).
78  std::vector<std::unique_ptr<groupby_scan_aggregation>> aggregations;
79 };
80 
90  std::vector<std::unique_ptr<column>> results{};
91 };
92 
// Groups values by keys and computes aggregations on those groups.
//
// NOTE(review): this is a rendered source listing. Lines that carried
// documentation comments or hyperlinks are absent, so several declarations
// below show only part of their parameter list. The comments describe only what
// is visible here or stated in the page's index text.
96 class groupby {
97  public:
 // No default construction, copying or moving: all of these are deleted.
 // The destructor is declared here and defined elsewhere.
98  groupby() = delete;
99  ~groupby();
100  groupby(groupby const&) = delete;
101  groupby(groupby&&) = delete;
102  groupby& operator=(groupby const&) = delete;
103  groupby& operator=(groupby&&) = delete;
104 
 // Construct a groupby object with the specified keys.
 //
 // keys            - table whose rows act as the grouping keys.
 // null_handling   - defaults to null_policy::EXCLUDE.
 // keys_are_sorted - defaults to sorted::NO.
 // column_order    - defaults to an empty vector.
 // null_precedence - defaults to an empty vector.
128  explicit groupby(table_view const& keys,
129  null_policy null_handling = null_policy::EXCLUDE,
130  sorted keys_are_sorted = sorted::NO,
131  std::vector<order> const& column_order = {},
132  std::vector<null_order> const& null_precedence = {});
133 
 // Performs grouped aggregations on the specified values.
 // Returns a pair: a table and one aggregation_result per request.
 // (Parameter lines are not present in this listing.)
187  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
190 
 // Performs grouped aggregations on the specified values; this overload
 // additionally takes a CUDA stream. (Other parameter lines are not present
 // in this listing.)
196  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
198  rmm::cuda_stream_view stream,
 // Performs grouped scans on the specified values.
 // (Parameter lines are not present in this listing.)
251  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
254 
 // Performs grouped shifts for specified values.
 // Returns a pair of tables. (Some parameter lines are not present in this
 // listing.)
305  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> shift(
306  table_view const& values,
308  std::vector<std::reference_wrapper<scalar const>> const& fill_values,
310 
 // The grouped data corresponding to a groupby operation on a set of values.
319  struct groups {
320  std::unique_ptr<table> keys;  // Table of grouped keys.
321  std::vector<size_type> offsets;  // Group offsets.
322  std::unique_ptr<table> values;  // Table of grouped values.
323  };
324 
 // NOTE(review): source line 339 is listed without text. The page index places
 // the declaration of get_groups() (returning `groups`) in this class --
 // confirm against the real header.
339 
 // Performs grouped null replacement on the given values, one replace_policy
 // per entry of replace_policies. Returns a pair of tables.
 // (The final parameter line is not present in this listing.)
375  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
376  table_view const& values,
377  host_span<cudf::replace_policy const> replace_policies,
379 
380  private:
 // State of the groupby object. The names mirror the constructor's
 // parameters; presumably each stores the corresponding argument -- the
 // constructor body is not visible here.
381  table_view _keys;
382  null_policy _include_null_keys{null_policy::EXCLUDE};
384  sorted _keys_are_sorted{sorted::NO};
385  std::vector<order> _column_order{};
387  std::vector<null_order> _null_precedence{};
 // Owning pointer to the sort-based helper (type is only forward-declared in
 // this header).
390  std::unique_ptr<detail::sort::sort_groupby_helper>
391  _helper;
393 
 // Accessor returning the sort-based helper by reference.
 // NOTE(review): whether it creates _helper on first use is not visible here.
400  detail::sort::sort_groupby_helper& helper();
401 
 // Internal entry point taking aggregation requests and a stream; same return
 // shape as aggregate(). (The final parameter line is not present in this
 // listing.)
406  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
407  host_span<aggregation_request const> requests,
408  rmm::cuda_stream_view stream,
410 
411  // Sort-based groupby
412  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
413  host_span<aggregation_request const> requests,
414  rmm::cuda_stream_view stream,
416 
 // Sort-based counterpart for scan requests. (The final parameter line is not
 // present in this listing.)
417  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
418  host_span<scan_request const> requests,
419  rmm::cuda_stream_view stream,
421 };
423 } // namespace groupby
424 } // namespace CUDF_EXPORT cudf
Representation for specifying desired aggregations from aggregation-based APIs, e.g., groupby, reductions, rolling, etc.
A non-owning, immutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
Groups values by keys and computes aggregations on those groups.
Definition: groupby.hpp:96
groups get_groups(cudf::table_view values={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Get the grouped keys and values corresponding to a groupby operation on a set of values.
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > aggregate(host_span< aggregation_request const > requests, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs grouped aggregations on the specified values.
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > aggregate(host_span< aggregation_request const > requests, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs grouped aggregations on the specified values.
groupby(table_view const &keys, null_policy null_handling=null_policy::EXCLUDE, sorted keys_are_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={})
Construct a groupby object with the specified keys
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > replace_nulls(table_view const &values, host_span< cudf::replace_policy const > replace_policies, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs grouped null replacement on the specified values.
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > scan(host_span< scan_request const > requests, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs grouped scans on the specified values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > shift(table_view const &values, host_span< size_type const > offsets, std::vector< std::reference_wrapper< scalar const >> const &fill_values, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs grouped shifts for specified values.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
column view class definitions
std::unique_ptr< table > sort(table_view const &input, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs a lexicographic sort of the rows of a table.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:126
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:167
cuDF interfaces
Definition: aggregation.hpp:35
APIs for spans.
Request for groupby aggregation(s) to perform on a column.
Definition: groupby.hpp:61
std::vector< std::unique_ptr< groupby_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:63
column_view values
The elements to aggregate.
Definition: groupby.hpp:62
The result(s) of an aggregation_request
Definition: groupby.hpp:88
The grouped data corresponding to a groupby operation on a set of values.
Definition: groupby.hpp:319
std::unique_ptr< table > keys
Table of grouped keys.
Definition: groupby.hpp:320
std::vector< size_type > offsets
Group Offsets.
Definition: groupby.hpp:321
std::unique_ptr< table > values
Table of grouped values.
Definition: groupby.hpp:322
Request for groupby aggregation(s) for scanning a column.
Definition: groupby.hpp:76
column_view values
The elements to aggregate.
Definition: groupby.hpp:77
std::vector< std::unique_ptr< groupby_scan_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:78
C++20 std::span with reduced feature set.
Definition: span.hpp:231
Class definitions for (mutable)_table_view
Type declarations for libcudf.