approx_distinct_count.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <cuda/std/span>
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <memory>
20 
21 namespace CUDF_EXPORT cudf {
22 
29 // Forward declarations
// XXHash_64 is declared here without a body: a class template taking a single
// type parameter (Key), placed in namespace hashing::detail.
// NOTE(review): presumably supplied as the `Hasher` template-template argument
// declared in namespace detail below — confirm against the full header.
30 namespace hashing::detail {
31 template <typename Key>
32 struct XXHash_64;
33 }
34 
35 namespace detail {
36 template <template <typename> class Hasher>
38 }
39 
83  public:
84  using impl_type =
86 
98  std::int32_t precision = 12,
99  null_policy null_handling = null_policy::EXCLUDE,
100  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
102 
121  approx_distinct_count(cuda::std::span<cuda::std::byte> sketch_span,
122  std::int32_t precision,
123  null_policy null_handling = null_policy::EXCLUDE,
124  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
126 
128 
130  approx_distinct_count& operator=(approx_distinct_count const&) = delete;
138 
146 
159  void merge(approx_distinct_count const& other,
161 
175  void merge(cuda::std::span<cuda::std::byte> sketch_span,
177 
184  [[nodiscard]] std::size_t estimate(
186 
196  [[nodiscard]] cuda::std::span<cuda::std::byte> sketch() noexcept;
197 
207  [[nodiscard]] cuda::std::span<cuda::std::byte const> sketch() const noexcept;
208 
214  [[nodiscard]] null_policy null_handling() const noexcept;
215 
221  [[nodiscard]] nan_policy nan_handling() const noexcept;
222 
228  [[nodiscard]] std::int32_t precision() const noexcept;
229 
230  private:
231  std::unique_ptr<impl_type> _impl;
232 };
233 
236 } // namespace CUDF_EXPORT cudf
Object-oriented HyperLogLog sketch for approximate distinct counting.
std::size_t estimate(rmm::cuda_stream_view stream=cudf::get_default_stream()) const
Estimates the approximate number of distinct rows in the sketch.
void merge(cuda::std::span< cuda::std::byte > sketch_span, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges a sketch from raw bytes into this sketch.
void merge(approx_distinct_count const &other, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges another sketch into this sketch.
approx_distinct_count(approx_distinct_count &&)=default
Default move constructor.
approx_distinct_count & operator=(approx_distinct_count &&)=default
Move assignment operator.
approx_distinct_count(cuda::std::span< cuda::std::byte > sketch_span, std::int32_t precision, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from serialized sketch bytes.
cuda::std::span< cuda::std::byte > sketch() noexcept
Gets the raw sketch bytes for serialization or external merging.
void add(table_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream())
Adds rows from a table to the sketch.
approx_distinct_count(table_view const &input, std::int32_t precision=12, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from a table.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:115
nan_policy
Enum specifying whether a NaN floating-point value is treated as a null or a non-null element.
Definition: types.hpp:123
cuDF interfaces
Definition: host_udf.hpp:26
Class definitions for table_view and mutable_table_view.
Type declarations for libcudf.