approx_distinct_count.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 #include <cudf/types.hpp>
11 #include <cudf/utilities/export.hpp>
12 
13 #include <rmm/cuda_stream_view.hpp>
14 
15 #include <cuda/std/span>
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <memory>
20 
21 namespace CUDF_EXPORT cudf {
22 
29 // Forward declarations
30 namespace hashing::detail {
31 template <typename Key>
32 struct XXHash_64;  // declaration only: a class template over a key type, no body given here
33 }  // namespace hashing::detail
34 
35 namespace detail {
36 template <template <typename> class Hasher>
38 }
39 
83  public:
84  using impl_type =
86 
100  double value;
101 
106  explicit constexpr desired_standard_error(double v) : value{v} {}  // explicit, so a bare double never converts implicitly; v is stored in `value` as-is (no range check at this point)
107  };
108 
120  std::int32_t precision = 12,
121  null_policy null_handling = null_policy::EXCLUDE,
122  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
124 
147  null_policy null_handling = null_policy::EXCLUDE,
148  nan_policy nan_handling = nan_policy::NAN_IS_NULL,
150 
166  approx_distinct_count(cuda::std::span<cuda::std::byte> sketch_span,
167  std::int32_t precision,
168  null_policy null_handling = null_policy::EXCLUDE,
169  nan_policy nan_handling = nan_policy::NAN_IS_NULL);  // non-owning form: the sketch operates on the caller-allocated bytes in sketch_span; precision has no default in this overload
170 
172 
174  approx_distinct_count& operator=(approx_distinct_count const&) = delete;  // copy assignment is disabled; move assignment is the supported way to transfer a sketch
182 
190 
203  void merge(approx_distinct_count const& other,
205 
219  void merge(cuda::std::span<cuda::std::byte const> sketch_span,
221 
228  [[nodiscard]] std::size_t estimate(
230 
240  [[nodiscard]] cuda::std::span<cuda::std::byte> sketch() noexcept;  // mutable view of the raw sketch bytes, for serialization or external merging
241 
251  [[nodiscard]] cuda::std::span<cuda::std::byte const> sketch() const noexcept;  // const overload: same raw sketch bytes, exposed as a read-only view
252 
258  [[nodiscard]] null_policy null_handling() const noexcept;  // NOTE(review): presumably the null policy passed at construction -- confirm in the implementation
259 
265  [[nodiscard]] nan_policy nan_handling() const noexcept;  // NOTE(review): presumably the NaN policy passed at construction -- confirm in the implementation
266 
272  [[nodiscard]] std::int32_t precision() const noexcept;  // NOTE(review): presumably the precision this sketch was built with (given directly or derived from a desired_standard_error) -- confirm
273 
282  [[nodiscard]] double standard_error() const noexcept;  // NOTE(review): presumably the error implied by the sketch's precision, not necessarily the value originally requested -- confirm
283 
290  [[nodiscard]] static std::size_t sketch_bytes(std::int32_t precision);  // static and not noexcept; NOTE(review): presumably the storage size to allocate for the span-based constructor -- confirm
291 
297  [[nodiscard]] static std::size_t sketch_alignment();  // static; NOTE(review): presumably the alignment required of user-allocated sketch storage -- confirm
298 
299  private:
300  std::unique_ptr<impl_type> _impl;  // uniquely-owned implementation object; impl_type is the alias declared in the public section
301 };
302 
305 } // namespace CUDF_EXPORT cudf
Object-oriented HyperLogLog sketch for approximate distinct counting.
approx_distinct_count(table_view const &input, desired_standard_error error, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from a table with specified standard error.
std::size_t estimate(rmm::cuda_stream_view stream=cudf::get_default_stream()) const
Estimates the approximate number of distinct rows in the sketch.
void merge(approx_distinct_count const &other, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges another sketch into this sketch.
approx_distinct_count(approx_distinct_count &&)=default
Default move constructor.
approx_distinct_count & operator=(approx_distinct_count &&)=default
Move assignment operator.
approx_distinct_count(cuda::std::span< cuda::std::byte > sketch_span, std::int32_t precision, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL)
Constructs a non-owning sketch that operates on user-allocated storage.
void merge(cuda::std::span< cuda::std::byte const > sketch_span, rmm::cuda_stream_view stream=cudf::get_default_stream())
Merges a sketch from raw bytes into this sketch.
cuda::std::span< cuda::std::byte > sketch() noexcept
Gets the raw sketch bytes for serialization or external merging.
void add(table_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream())
Adds rows from a table to the sketch.
approx_distinct_count(table_view const &input, std::int32_t precision=12, null_policy null_handling=null_policy::EXCLUDE, nan_policy nan_handling=nan_policy::NAN_IS_NULL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs an approximate distinct count sketch from a table with specified precision.
A set of cudf::column_view objects of the same size.
Definition: table_view.hpp:189
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
null_policy
Enum to specify whether to include nulls or exclude nulls.
Definition: types.hpp:115
nan_policy
Enum to specify whether NaN floating-point values are treated as null or non-null elements.
Definition: types.hpp:123
cuDF interfaces
Definition: host_udf.hpp:26
Strong type wrapper for the desired standard error constructor parameter.
double value
The requested standard error value (must be positive)
constexpr desired_standard_error(double v)
Constructs a desired_standard_error with the given value.
Class definitions for table_view and mutable_table_view.
Type declarations for libcudf.