approx_distinct_count.hpp
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <cuda/std/span>
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <memory>
20 
21 namespace CUDF_EXPORT cudf {
22 
23 // Forward declarations
24 namespace hashing::detail {
25 template <typename Key>
26 struct XXHash_64;  // declaration only: class template with a single type parameter `Key`; no definition is provided here
27 }  // namespace hashing::detail
28 
29 namespace detail {
30 template <template <typename> class Hasher>
32 }
33 
77  public:
78  using impl_type =
80 
92  std::int32_t precision = 12,
93  null_policy null_handling = null_policy::EXCLUDE,
94  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
96 
115  approx_distinct_count(cuda::std::span<cuda::std::byte> sketch_span,
116  std::int32_t precision,
117  null_policy null_handling = null_policy::EXCLUDE,
118  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
120 
122 
124  approx_distinct_count& operator=(approx_distinct_count const&) = delete;
132 
140 
153  void merge(approx_distinct_count const& other,
155 
169  void merge(cuda::std::span<cuda::std::byte> sketch_span,
171 
178  [[nodiscard]] std::size_t estimate(
180 
190  [[nodiscard]] cuda::std::span<cuda::std::byte> sketch() noexcept;
191 
201  [[nodiscard]] cuda::std::span<cuda::std::byte const> sketch() const noexcept;
202 
208  [[nodiscard]] null_policy null_handling() const noexcept;
209 
215  [[nodiscard]] nan_policy nan_handling() const noexcept;
216 
222  [[nodiscard]] std::int32_t precision() const noexcept;
223 
224  private:
225  std::unique_ptr<impl_type> _impl;
226 };
227 
228 } // namespace CUDF_EXPORT cudf
Object-oriented HyperLogLog sketch for approximate distinct counting.
std::size_t estimate(rmm::cuda_stream_view stream=cudf::get_default_stream()) const
Estimates the approximate number of distinct rows in the sketch.
void merge(cuda::std::span< cuda::std::byte > sketch_span, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges a sketch from raw bytes into this sketch.
void merge(approx_distinct_count const &other, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges another sketch into this sketch.
approx_distinct_count(approx_distinct_count &&)=default
Default move constructor.
approx_distinct_count & operator=(approx_distinct_count &&)=default
Move assignment operator.
approx_distinct_count(cuda::std::span< cuda::std::byte > sketch_span, std::int32_t precision, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from serialized sketch bytes.
cuda::std::span< cuda::std::byte > sketch() noexcept
Gets the raw sketch bytes for serialization or external merging.
void add(table_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream())
Adds rows from a table to the sketch.
approx_distinct_count(table_view const &input, std::int32_t precision=12, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from a table.
A set of cudf::column_view objects of the same size.
Definition: table_view.hpp:189
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:115
nan_policy
Enum to treat NaN floating point value as null or non-null element.
Definition: types.hpp:123
cuDF interfaces
Definition: host_udf.hpp:26
Class definitions for (mutable)_table_view
Type declarations for libcudf.