groupby.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/aggregation.hpp>
21 #include <cudf/replace.hpp>
23 #include <cudf/types.hpp>
24 #include <cudf/utilities/export.hpp>
26 #include <cudf/utilities/span.hpp>
27 
28 #include <rmm/cuda_stream_view.hpp>
29 
30 #include <memory>
31 #include <utility>
32 #include <vector>
33 
34 namespace CUDF_EXPORT cudf {
36 namespace groupby {
// Forward declaration of the sort-based helper type. Only the name is introduced
// here; no definition of sort_groupby_helper is visible in this listing.
// The groupby class further down holds one through a std::unique_ptr member
// (_helper) and exposes it through a private helper() accessor.
37 namespace detail {
38 namespace sort {
39 struct sort_groupby_helper;
40 
41 } // namespace sort
42 } // namespace detail
43 
62  std::vector<std::unique_ptr<groupby_aggregation>> aggregations;
63 };
64 
// Request for groupby aggregation(s) for scanning a column.
// NOTE(review): this listing skips source line 76. The definition index at the
// end of the page places a `column_view values` member ("The elements to
// aggregate.") on that line, so the struct shown here is incomplete.
75 struct scan_request {
    // Desired aggregations.
77  std::vector<std::unique_ptr<groupby_scan_aggregation>> aggregations;
78 };
79 
89  std::vector<std::unique_ptr<column>> results{};
90 };
91 
// Groups values by keys and computes aggregations on those groups.
//
// NOTE(review): this is a rendered listing with gaps. Several declarations
// below are cut off after their first parameters; the complete signatures
// appear only in the tooltip text that follows the listing.
95 class groupby {
96  public:
    // Default construction, copy, and move are all deleted. The destructor is
    // declared here without an inline body.
97  groupby() = delete;
98  ~groupby();
99  groupby(groupby const&) = delete;
100  groupby(groupby&&) = delete;
101  groupby& operator=(groupby const&) = delete;
102  groupby& operator=(groupby&&) = delete;
103 
    // Construct a groupby object with the specified keys.
    // Defaults: null_handling = null_policy::EXCLUDE, keys_are_sorted = sorted::NO,
    // and empty column_order / null_precedence vectors.
127  explicit groupby(table_view const& keys,
128  null_policy null_handling = null_policy::EXCLUDE,
129  sorted keys_are_sorted = sorted::NO,
130  std::vector<order> const& column_order = {},
131  std::vector<null_order> const& null_precedence = {});
132 
    // Performs grouped aggregations on the specified values.
    // (Parameter list is cut off in this listing.)
187  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
    // Performs grouped scans on the specified values.
    // (Parameter list is cut off in this listing.)
243  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
247 
    // Performs grouped shifts for specified values.
    // NOTE(review): source line 301 is skipped here; the tooltip signature shows a
    // `host_span<size_type const> offsets` parameter between values and fill_values.
299  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> shift(
300  table_view const& values,
302  std::vector<std::reference_wrapper<scalar const>> const& fill_values,
305 
    // The grouped data corresponding to a groupby operation on a set of values.
314  struct groups {
      // Table of grouped keys.
315  std::unique_ptr<table> keys;
      // Group offsets.
316  std::vector<size_type> offsets;
      // Table of grouped values.
317  std::unique_ptr<table> values;
318  };
319 
    // NOTE(review): the tooltip text lists a `groups get_groups(...)` member ("Get the
    // grouped keys and values corresponding to a groupby operation on a set of
    // values."); its declaration is not shown in this listing and presumably sits in
    // the skipped lines around here.
336 
    // Performs grouped replace nulls on value.
    // (Trailing parameters are cut off in this listing.)
373  std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
374  table_view const& values,
375  host_span<cudf::replace_policy const> replace_policies,
378 
379  private:
    // Private state. The names and types mirror the constructor's parameters;
    // presumably each member stores the matching argument (constructor body not
    // visible here, so confirm).
380  table_view _keys;
    // In-class default: null_policy::EXCLUDE.
381  null_policy _include_null_keys{null_policy::EXCLUDE};
    // In-class default: sorted::NO.
383  sorted _keys_are_sorted{sorted::NO};
384  std::vector<order> _column_order{};
386  std::vector<null_order> _null_precedence{};
    // Owning pointer to the sort-based helper, whose type is only forward-declared
    // in this header.
389  std::unique_ptr<detail::sort::sort_groupby_helper>
390  _helper;
392 
    // Returns a reference to the sort-based helper.
    // NOTE(review): presumably creates _helper on first use; the definition is
    // not visible here, so confirm.
399  detail::sort::sort_groupby_helper& helper();
400 
    // Internal entry point taking aggregation requests and a CUDA stream.
    // (Remaining parameters are cut off in this listing.)
405  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
406  host_span<aggregation_request const> requests,
407  rmm::cuda_stream_view stream,
409 
410  // Sort-based groupby
    // Same request/stream parameters and return type as dispatch_aggregation.
411  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
412  host_span<aggregation_request const> requests,
413  rmm::cuda_stream_view stream,
415 
    // Counterpart of sort_aggregate that takes scan_request inputs.
416  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
417  host_span<scan_request const> requests,
418  rmm::cuda_stream_view stream,
420 };
422 } // namespace groupby
423 } // namespace CUDF_EXPORT cudf
Representation for specifying desired aggregations from aggregation-based APIs, e.g., groupby, reductions, rolling, etc.
A non-owning, immutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
Groups values by keys and computes aggregations on those groups.
Definition: groupby.hpp:95
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > scan(host_span< scan_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped scans on the specified values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > replace_nulls(table_view const &values, host_span< cudf::replace_policy const > replace_policies, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped null replacement on the given values.
std::pair< std::unique_ptr< table >, std::unique_ptr< table > > shift(table_view const &values, host_span< size_type const > offsets, std::vector< std::reference_wrapper< scalar const >> const &fill_values, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped shifts for specified values.
groupby(table_view const &keys, null_policy null_handling=null_policy::EXCLUDE, sorted keys_are_sorted=sorted::NO, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={})
Construct a groupby object with the specified keys
std::pair< std::unique_ptr< table >, std::vector< aggregation_result > > aggregate(host_span< aggregation_request const > requests, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs grouped aggregations on the specified values.
groups get_groups(cudf::table_view values={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Get the grouped keys and values corresponding to a groupby operation on a set of values.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
column view class definitions
std::unique_ptr< table > sort(table_view const &input, std::vector< order > const &column_order={}, std::vector< null_order > const &null_precedence={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs a lexicographic sort of the rows of a table.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:126
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:167
cuDF interfaces
Definition: host_udf.hpp:39
APIs for spans.
Request for groupby aggregation(s) to perform on a column.
Definition: groupby.hpp:60
std::vector< std::unique_ptr< groupby_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:62
column_view values
The elements to aggregate.
Definition: groupby.hpp:61
The result(s) of an aggregation_request
Definition: groupby.hpp:87
The grouped data corresponding to a groupby operation on a set of values.
Definition: groupby.hpp:314
std::unique_ptr< table > keys
Table of grouped keys.
Definition: groupby.hpp:315
std::vector< size_type > offsets
Group Offsets.
Definition: groupby.hpp:316
std::unique_ptr< table > values
Table of grouped values.
Definition: groupby.hpp:317
Request for groupby aggregation(s) for scanning a column.
Definition: groupby.hpp:75
column_view values
The elements to aggregate.
Definition: groupby.hpp:76
std::vector< std::unique_ptr< groupby_scan_aggregation > > aggregations
Desired aggregations.
Definition: groupby.hpp:77
C++20 std::span with reduced feature set.
Definition: span.hpp:194
Class definitions for (mutable)_table_view
Type declarations for libcudf.