hdbscan.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
9 
10 #include <raft/core/handle.hpp>
11 
12 #include <rmm/device_uvector.hpp>
13 
14 #include <cstddef>
15 
16 namespace ML {
17 namespace HDBSCAN {
18 namespace Common {
19 
28 template <typename value_idx, typename value_t>
30  public:
37  CondensedHierarchy(const raft::handle_t& handle_, size_t n_leaves_);
38 
50  CondensedHierarchy(const raft::handle_t& handle_,
51  size_t n_leaves_,
52  int n_edges_,
53  value_idx* parents_,
54  value_idx* children_,
55  value_t* lambdas_,
56  value_idx* sizes_);
57 
70  CondensedHierarchy(const raft::handle_t& handle_,
71  size_t n_leaves_,
72  int n_edges_,
73  int n_clusters_,
74  rmm::device_uvector<value_idx>&& parents_,
75  rmm::device_uvector<value_idx>&& children_,
76  rmm::device_uvector<value_t>&& lambdas_,
77  rmm::device_uvector<value_idx>&& sizes_);
96  void condense(value_idx* full_parents,
97  value_idx* full_children,
98  value_t* full_lambdas,
99  value_idx* full_sizes,
100  value_idx size = -1);
101 
103 
104  value_idx* get_parents() { return parents.data(); }
105  value_idx* get_children() { return children.data(); }
106  value_t* get_lambdas() { return lambdas.data(); }
107  value_idx* get_sizes() { return sizes.data(); }
108  value_idx get_n_edges() { return n_edges; }
109  int get_n_clusters() { return n_clusters; }
110  value_idx get_n_leaves() const { return n_leaves; }
111 
112  private:
113  const raft::handle_t& handle;
114 
115  rmm::device_uvector<value_idx> parents;
116  rmm::device_uvector<value_idx> children;
117  rmm::device_uvector<value_t> lambdas;
118  rmm::device_uvector<value_idx> sizes;
119 
120  size_t n_edges;
121  size_t n_leaves;
122  int n_clusters;
123  value_idx root_cluster;
124 };
125 
128 
130  public:
131  int min_samples = 5;
134 
136 
137  bool allow_single_cluster = false;
138 
139  float alpha = 1.0;
140 };
141 
143 
152  // not directly using cuvs::neighbors::nn_descent::index_params to distinguish HDBSCAN-exposed NN
153  // Descent parameters
154  size_t graph_degree = 64;
156  size_t max_iterations = 20;
157  float termination_threshold = 0.0001;
158 };
159 
183  size_t overlap_factor = 2;
190  size_t n_clusters = 1;
192 };
193 } // namespace graph_build_params
194 
196  public:
200 };
201 
208 template <typename value_idx, typename value_t>
210  public:
224  robust_single_linkage_output(const raft::handle_t& handle_,
225  int n_leaves_,
226  value_idx* labels_,
227  value_idx* children_,
228  value_idx* sizes_,
229  value_t* deltas_,
230  value_idx* mst_src_,
231  value_idx* mst_dst_,
232  value_t* mst_weights_)
233  : handle(handle_),
234  n_leaves(n_leaves_),
235  n_clusters(-1),
236  labels(labels_),
237  children(children_),
238  sizes(sizes_),
239  deltas(deltas_),
240  mst_src(mst_src_),
241  mst_dst(mst_dst_),
242  mst_weights(mst_weights_)
243  {
244  }
245 
246  int get_n_leaves() const { return n_leaves; }
247  int get_n_clusters() const { return n_clusters; }
248  value_idx* get_labels() { return labels; }
249  value_idx* get_children() { return children; }
250  value_idx* get_sizes() { return sizes; }
251  value_t* get_deltas() { return deltas; }
252  value_idx* get_mst_src() { return mst_src; }
253  value_idx* get_mst_dst() { return mst_dst; }
254  value_t* get_mst_weights() { return mst_weights; }
255 
260  void set_n_clusters(int n_clusters_) { n_clusters = n_clusters_; }
261 
262  protected:
263  const raft::handle_t& get_handle() { return handle; }
264 
265  const raft::handle_t& handle;
266 
267  int n_leaves;
269 
270  value_idx* labels; // size n_leaves
271 
272  // Dendrogram
273  value_idx* children; // size n_leaves * 2
274  value_idx* sizes; // size n_leaves
275  value_t* deltas; // size n_leaves
276 
277  // MST (size n_leaves - 1).
278  value_idx* mst_src;
279  value_idx* mst_dst;
280  value_t* mst_weights;
281 };
282 
298 template <typename value_idx, typename value_t>
299 class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
300  public:
301  hdbscan_output(const raft::handle_t& handle_,
302  int n_leaves_,
303  value_idx* labels_,
304  value_t* probabilities_,
305  value_idx* children_,
306  value_idx* sizes_,
307  value_t* deltas_,
308  value_idx* mst_src_,
309  value_idx* mst_dst_,
310  value_t* mst_weights_)
311  : robust_single_linkage_output<value_idx, value_t>(
312  handle_, n_leaves_, labels_, children_, sizes_, deltas_, mst_src_, mst_dst_, mst_weights_),
313  probabilities(probabilities_),
314  stabilities(0, handle_.get_stream()),
315  condensed_tree(handle_, n_leaves_),
316  inverse_label_map(0, handle_.get_stream())
317  {
318  }
319 
320  // Using getters here, making the members private and forcing
321  // consistent state with the constructor. This should make
322  // it much easier to use / debug.
323  value_t* get_probabilities() { return probabilities; }
324  value_t* get_stabilities() { return stabilities.data(); }
325  value_idx* get_inverse_label_map() { return inverse_label_map.data(); }
326  // internal function
327  rmm::device_uvector<value_idx>& _get_inverse_label_map() { return inverse_label_map; }
328 
334  void set_n_clusters(int n_clusters_)
335  {
337  stabilities.resize(n_clusters_,
339  }
340 
342 
343  private:
344  value_t* probabilities; // size n_leaves
345  // inversely maps normalized labels to pre-normalized labels
346  // used for out-of-sample prediction
347  rmm::device_uvector<value_idx> inverse_label_map; // size n_clusters
348 
349  // Size not known ahead of time. Initialize
350  // with `initialize_stabilities()` method.
351  rmm::device_uvector<value_t> stabilities;
352 
353  // Use condensed hierarchy to wrap
354  // condensed tree outputs since we do not
355  // know the size ahead of time.
357 };
358 
359 template class CondensedHierarchy<int64_t, float>;
360 
368 template <typename value_idx, typename value_t>
370  public:
371  PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n, value_t* core_dists_)
372  : handle(handle_),
373  exemplar_idx(0, handle.get_stream()),
374  exemplar_label_offsets(0, handle.get_stream()),
375  n_selected_clusters(0),
376  selected_clusters(0, handle.get_stream()),
377  deaths(0, handle.get_stream()),
378  core_dists(core_dists_),
379  index_into_children(0, handle.get_stream()),
380  n_exemplars(0),
381  n_rows(m),
382  n_cols(n)
383  {
384  }
385  size_t n_rows;
386  size_t n_cols;
387 
388  // Using getters here, making the members private and forcing
389  // consistent state with the constructor. This should make
390  // it much easier to use / debug.
391  value_idx get_n_exemplars() { return n_exemplars; }
392  value_idx get_n_selected_clusters() { return n_selected_clusters; }
393  value_idx* get_exemplar_idx() { return exemplar_idx.data(); }
394  value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
395  value_idx* get_selected_clusters() { return selected_clusters.data(); }
396  value_t* get_deaths() { return deaths.data(); }
397  value_t* get_core_dists() { return core_dists; }
398  value_idx* get_index_into_children() { return index_into_children.data(); }
399 
408  void allocate(const raft::handle_t& handle,
409  value_idx n_exemplars_,
410  value_idx n_selected_clusters_,
411  value_idx n_edges_);
412 
418  void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
419  {
420  deaths.resize(n_clusters_, handle.get_stream());
421  }
422 
423  private:
424  const raft::handle_t& handle;
425  rmm::device_uvector<value_idx> exemplar_idx;
426  rmm::device_uvector<value_idx> exemplar_label_offsets;
427  value_idx n_exemplars;
428  value_idx n_selected_clusters;
429  rmm::device_uvector<value_idx> selected_clusters;
430  rmm::device_uvector<value_t> deaths;
431  value_t* core_dists;
432  rmm::device_uvector<value_idx> index_into_children;
433 };
434 
435 template class PredictionData<int64_t, float>;
436 
437 void generate_prediction_data(const raft::handle_t& handle,
438  CondensedHierarchy<int64_t, float>& condensed_tree,
439  int64_t* labels,
440  int64_t* inverse_label_map,
441  int n_selected_clusters,
442  PredictionData<int64_t, float>& prediction_data);
443 
444 }; // namespace Common
445 }; // namespace HDBSCAN
446 
469 void hdbscan(const raft::handle_t& handle,
470  const float* X,
471  size_t m,
472  size_t n,
476  float* core_dists);
477 
478 void build_condensed_hierarchy(const raft::handle_t& handle,
479  const int64_t* children,
480  const float* delta,
481  const int64_t* sizes,
482  int min_cluster_size,
483  int n_leaves,
485 
486 void _extract_clusters(const raft::handle_t& handle,
487  size_t n_leaves,
488  int n_edges,
489  int64_t* parents,
490  int64_t* children,
491  float* lambdas,
492  int64_t* sizes,
493  int64_t* labels,
494  float* probabilities,
495  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
496  bool allow_single_cluster,
497  int64_t max_cluster_size,
498  float cluster_selection_epsilon);
499 
501  const raft::handle_t& handle,
504  const float* X,
506  float* membership_vec,
507  size_t batch_size = 4096);
508 
509 void compute_membership_vector(const raft::handle_t& handle,
512  const float* X,
513  const float* points_to_predict,
514  size_t n_prediction_points,
515  int min_samples,
517  float* membership_vec,
518  size_t batch_size = 4096);
519 
520 void out_of_sample_predict(const raft::handle_t& handle,
523  const float* X,
524  int64_t* labels,
525  const float* points_to_predict,
526  size_t n_prediction_points,
528  int min_samples,
529  int64_t* out_labels,
530  float* out_probabilities);
531 
532 namespace HDBSCAN::HELPER {
533 
545 void compute_core_dists(const raft::handle_t& handle,
546  const float* X,
547  float* core_dists,
548  size_t m,
549  size_t n,
551  int min_samples);
552 
566 void compute_inverse_label_map(const raft::handle_t& handle,
568  size_t n_leaves,
569  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
570  rmm::device_uvector<int64_t>& inverse_label_map,
571  bool allow_single_cluster,
572  int64_t max_cluster_size,
573  float cluster_selection_epsilon);
574 
575 } // namespace HDBSCAN::HELPER
576 } // END namespace ML
Definition: hdbscan.hpp:29
value_idx * get_sizes()
Definition: hdbscan.hpp:107
value_t * get_lambdas()
Definition: hdbscan.hpp:106
value_idx get_n_leaves() const
Definition: hdbscan.hpp:110
value_idx get_n_edges()
Definition: hdbscan.hpp:108
value_idx * get_children()
Definition: hdbscan.hpp:105
int get_n_clusters()
Definition: hdbscan.hpp:109
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_)
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, int n_clusters_, rmm::device_uvector< value_idx > &&parents_, rmm::device_uvector< value_idx > &&children_, rmm::device_uvector< value_t > &&lambdas_, rmm::device_uvector< value_idx > &&sizes_)
value_idx * get_parents()
Definition: hdbscan.hpp:104
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, value_idx *parents_, value_idx *children_, value_t *lambdas_, value_idx *sizes_)
void condense(value_idx *full_parents, value_idx *full_children, value_t *full_lambdas, value_idx *full_sizes, value_idx size=-1)
Definition: hdbscan.hpp:195
CLUSTER_SELECTION_METHOD cluster_selection_method
Definition: hdbscan.hpp:197
graph_build_params::graph_build_params build_params
Definition: hdbscan.hpp:199
GRAPH_BUILD_ALGO build_algo
Definition: hdbscan.hpp:198
Definition: hdbscan.hpp:369
value_t * get_core_dists()
Definition: hdbscan.hpp:397
void allocate(const raft::handle_t &handle, value_idx n_exemplars_, value_idx n_selected_clusters_, value_idx n_edges_)
value_t * get_deaths()
Definition: hdbscan.hpp:396
value_idx get_n_selected_clusters()
Definition: hdbscan.hpp:392
PredictionData(const raft::handle_t &handle_, value_idx m, value_idx n, value_t *core_dists_)
Definition: hdbscan.hpp:371
value_idx * get_exemplar_label_offsets()
Definition: hdbscan.hpp:394
value_idx * get_index_into_children()
Definition: hdbscan.hpp:398
void set_n_clusters(const raft::handle_t &handle, value_idx n_clusters_)
Definition: hdbscan.hpp:418
value_idx * get_exemplar_idx()
Definition: hdbscan.hpp:393
value_idx * get_selected_clusters()
Definition: hdbscan.hpp:395
value_idx get_n_exemplars()
Definition: hdbscan.hpp:391
size_t n_rows
Definition: hdbscan.hpp:385
size_t n_cols
Definition: hdbscan.hpp:386
float alpha
Definition: hdbscan.hpp:139
float cluster_selection_epsilon
Definition: hdbscan.hpp:135
int max_cluster_size
Definition: hdbscan.hpp:133
bool allow_single_cluster
Definition: hdbscan.hpp:137
int min_cluster_size
Definition: hdbscan.hpp:132
int min_samples
Definition: hdbscan.hpp:131
Definition: hdbscan.hpp:299
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:341
value_t * get_stabilities()
Definition: hdbscan.hpp:324
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:327
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:334
value_t * get_probabilities()
Definition: hdbscan.hpp:323
hdbscan_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_t *probabilities_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:301
value_idx * get_inverse_label_map()
Definition: hdbscan.hpp:325
const raft::handle_t & get_handle()
Definition: hdbscan.hpp:263
value_idx * children
Definition: hdbscan.hpp:273
value_idx * labels
Definition: hdbscan.hpp:270
int get_n_leaves() const
Definition: hdbscan.hpp:246
robust_single_linkage_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:224
int get_n_clusters() const
Definition: hdbscan.hpp:247
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:260
value_t * mst_weights
Definition: hdbscan.hpp:280
value_idx * get_sizes()
Definition: hdbscan.hpp:250
value_t * deltas
Definition: hdbscan.hpp:275
const raft::handle_t & handle
Definition: hdbscan.hpp:265
value_idx * get_mst_src()
Definition: hdbscan.hpp:252
value_t * get_mst_weights()
Definition: hdbscan.hpp:254
value_idx * get_mst_dst()
Definition: hdbscan.hpp:253
value_idx * get_labels()
Definition: hdbscan.hpp:248
value_idx * sizes
Definition: hdbscan.hpp:274
value_t * get_deltas()
Definition: hdbscan.hpp:251
int n_clusters
Definition: hdbscan.hpp:268
value_idx * mst_src
Definition: hdbscan.hpp:278
value_idx * mst_dst
Definition: hdbscan.hpp:279
value_idx * get_children()
Definition: hdbscan.hpp:249
Definition: params.hpp:23
CLUSTER_SELECTION_METHOD
Definition: hdbscan.hpp:126
@ EOM
Definition: hdbscan.hpp:126
@ LEAF
Definition: hdbscan.hpp:126
void generate_prediction_data(const raft::handle_t &handle, CondensedHierarchy< int64_t, float > &condensed_tree, int64_t *labels, int64_t *inverse_label_map, int n_selected_clusters, PredictionData< int64_t, float > &prediction_data)
GRAPH_BUILD_ALGO
Definition: hdbscan.hpp:127
@ NN_DESCENT
Definition: hdbscan.hpp:127
@ BRUTE_FORCE_KNN
Definition: hdbscan.hpp:127
void compute_inverse_label_map(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree, size_t n_leaves, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, rmm::device_uvector< int64_t > &inverse_label_map, bool allow_single_cluster, int64_t max_cluster_size, float cluster_selection_epsilon)
Compute the map from final, normalize labels to the labels in the CondensedHierarchy.
void compute_core_dists(const raft::handle_t &handle, const float *X, float *core_dists, size_t m, size_t n, ML::distance::DistanceType metric, int min_samples)
Compute the core distances for each point in the training matrix.
DistanceType
Definition: distance_type.hpp:10
Definition: dbscan.hpp:18
void hdbscan(const raft::handle_t &handle, const float *X, size_t m, size_t n, ML::distance::DistanceType metric, HDBSCAN::Common::HDBSCANParams ¶ms, HDBSCAN::Common::hdbscan_output< int64_t, float > &out, float *core_dists)
void build_condensed_hierarchy(const raft::handle_t &handle, const int64_t *children, const float *delta, const int64_t *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree)
void compute_membership_vector(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree, HDBSCAN::Common::PredictionData< int64_t, float > &prediction_data, const float *X, const float *points_to_predict, size_t n_prediction_points, int min_samples, ML::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
void _extract_clusters(const raft::handle_t &handle, size_t n_leaves, int n_edges, int64_t *parents, int64_t *children, float *lambdas, int64_t *sizes, int64_t *labels, float *probabilities, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, bool allow_single_cluster, int64_t max_cluster_size, float cluster_selection_epsilon)
void out_of_sample_predict(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree, HDBSCAN::Common::PredictionData< int64_t, float > &prediction_data, const float *X, int64_t *labels, const float *points_to_predict, size_t n_prediction_points, ML::distance::DistanceType metric, int min_samples, int64_t *out_labels, float *out_probabilities)
void compute_all_points_membership_vectors(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree, HDBSCAN::Common::PredictionData< int64_t, float > &prediction_data, const float *X, ML::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
nn_descent_params_hdbscan nn_descent_params
Definition: hdbscan.hpp:191