hdbscan.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <raft/core/handle.hpp>
20 
21 #include <rmm/device_uvector.hpp>
22 
23 #include <cuvs/distance/distance.hpp>
24 
25 #include <cstddef>
26 
27 namespace ML {
28 namespace HDBSCAN {
29 namespace Common {
30 
39 template <typename value_idx, typename value_t>
41  public:
48  CondensedHierarchy(const raft::handle_t& handle_, size_t n_leaves_);
49 
61  CondensedHierarchy(const raft::handle_t& handle_,
62  size_t n_leaves_,
63  int n_edges_,
64  value_idx* parents_,
65  value_idx* children_,
66  value_t* lambdas_,
67  value_idx* sizes_);
68 
81  CondensedHierarchy(const raft::handle_t& handle_,
82  size_t n_leaves_,
83  int n_edges_,
84  int n_clusters_,
85  rmm::device_uvector<value_idx>&& parents_,
86  rmm::device_uvector<value_idx>&& children_,
87  rmm::device_uvector<value_t>&& lambdas_,
88  rmm::device_uvector<value_idx>&& sizes_);
107  void condense(value_idx* full_parents,
108  value_idx* full_children,
109  value_t* full_lambdas,
110  value_idx* full_sizes,
111  value_idx size = -1);
112 
114 
115  value_idx* get_parents() { return parents.data(); }
116  value_idx* get_children() { return children.data(); }
117  value_t* get_lambdas() { return lambdas.data(); }
118  value_idx* get_sizes() { return sizes.data(); }
119  value_idx get_n_edges() { return n_edges; }
120  int get_n_clusters() { return n_clusters; }
121  value_idx get_n_leaves() const { return n_leaves; }
122 
123  private:
124  const raft::handle_t& handle;
125 
126  rmm::device_uvector<value_idx> parents;
127  rmm::device_uvector<value_idx> children;
128  rmm::device_uvector<value_t> lambdas;
129  rmm::device_uvector<value_idx> sizes;
130 
131  size_t n_edges;
132  size_t n_leaves;
133  int n_clusters;
134  value_idx root_cluster;
135 };
136 
138 
140  public:
141  int min_samples = 5;
144 
146 
147  bool allow_single_cluster = false;
148 
149  float alpha = 1.0;
150 };
151 
153  public:
155 };
156 
163 template <typename value_idx, typename value_t>
165  public:
179  robust_single_linkage_output(const raft::handle_t& handle_,
180  int n_leaves_,
181  value_idx* labels_,
182  value_idx* children_,
183  value_idx* sizes_,
184  value_t* deltas_,
185  value_idx* mst_src_,
186  value_idx* mst_dst_,
187  value_t* mst_weights_)
188  : handle(handle_),
189  n_leaves(n_leaves_),
190  n_clusters(-1),
191  labels(labels_),
192  children(children_),
193  sizes(sizes_),
194  deltas(deltas_),
195  mst_src(mst_src_),
196  mst_dst(mst_dst_),
197  mst_weights(mst_weights_)
198  {
199  }
200 
201  int get_n_leaves() const { return n_leaves; }
202  int get_n_clusters() const { return n_clusters; }
203  value_idx* get_labels() { return labels; }
204  value_idx* get_children() { return children; }
205  value_idx* get_sizes() { return sizes; }
206  value_t* get_deltas() { return deltas; }
207  value_idx* get_mst_src() { return mst_src; }
208  value_idx* get_mst_dst() { return mst_dst; }
209  value_t* get_mst_weights() { return mst_weights; }
210 
215  void set_n_clusters(int n_clusters_) { n_clusters = n_clusters_; }
216 
217  protected:
218  const raft::handle_t& get_handle() { return handle; }
219 
220  const raft::handle_t& handle;
221 
222  int n_leaves;
224 
225  value_idx* labels; // size n_leaves
226 
227  // Dendrogram
228  value_idx* children; // size n_leaves * 2
229  value_idx* sizes; // size n_leaves
230  value_t* deltas; // size n_leaves
231 
232  // MST (size n_leaves - 1).
233  value_idx* mst_src;
234  value_idx* mst_dst;
235  value_t* mst_weights;
236 };
237 
253 template <typename value_idx, typename value_t>
254 class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
255  public:
256  hdbscan_output(const raft::handle_t& handle_,
257  int n_leaves_,
258  value_idx* labels_,
259  value_t* probabilities_,
260  value_idx* children_,
261  value_idx* sizes_,
262  value_t* deltas_,
263  value_idx* mst_src_,
264  value_idx* mst_dst_,
265  value_t* mst_weights_)
266  : robust_single_linkage_output<value_idx, value_t>(
267  handle_, n_leaves_, labels_, children_, sizes_, deltas_, mst_src_, mst_dst_, mst_weights_),
268  probabilities(probabilities_),
269  stabilities(0, handle_.get_stream()),
270  condensed_tree(handle_, n_leaves_),
271  inverse_label_map(0, handle_.get_stream())
272  {
273  }
274 
275  // Using getters here, making the members private and forcing
276  // consistent state with the constructor. This should make
277  // it much easier to use / debug.
278  value_t* get_probabilities() { return probabilities; }
279  value_t* get_stabilities() { return stabilities.data(); }
280  value_idx* get_inverse_label_map() { return inverse_label_map.data(); }
281  // internal function
282  rmm::device_uvector<value_idx>& _get_inverse_label_map() { return inverse_label_map; }
283 
289  void set_n_clusters(int n_clusters_)
290  {
292  stabilities.resize(n_clusters_,
294  }
295 
297 
298  private:
299  value_t* probabilities; // size n_leaves
300  // inversely maps normalized labels to pre-normalized labels
301  // used for out-of-sample prediction
302  rmm::device_uvector<value_idx> inverse_label_map; // size n_clusters
303 
304  // Size not known ahead of time. Initialize
305  // with `initialize_stabilities()` method.
306  rmm::device_uvector<value_t> stabilities;
307 
308  // Use condensed hierarchy to wrap
309  // condensed tree outputs since we do not
310  // know the size ahead of time.
312 };
313 
314 template class CondensedHierarchy<int, float>;
315 
323 template <typename value_idx, typename value_t>
325  public:
326  PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n, value_t* core_dists_)
327  : handle(handle_),
328  exemplar_idx(0, handle.get_stream()),
329  exemplar_label_offsets(0, handle.get_stream()),
330  n_selected_clusters(0),
331  selected_clusters(0, handle.get_stream()),
332  deaths(0, handle.get_stream()),
333  core_dists(core_dists_),
334  index_into_children(0, handle.get_stream()),
335  n_exemplars(0),
336  n_rows(m),
337  n_cols(n)
338  {
339  }
340  size_t n_rows;
341  size_t n_cols;
342 
343  // Using getters here, making the members private and forcing
344  // consistent state with the constructor. This should make
345  // it much easier to use / debug.
346  value_idx get_n_exemplars() { return n_exemplars; }
347  value_idx get_n_selected_clusters() { return n_selected_clusters; }
348  value_idx* get_exemplar_idx() { return exemplar_idx.data(); }
349  value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
350  value_idx* get_selected_clusters() { return selected_clusters.data(); }
351  value_t* get_deaths() { return deaths.data(); }
352  value_t* get_core_dists() { return core_dists; }
353  value_idx* get_index_into_children() { return index_into_children.data(); }
354 
363  void allocate(const raft::handle_t& handle,
364  value_idx n_exemplars_,
365  value_idx n_selected_clusters_,
366  value_idx n_edges_);
367 
373  void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
374  {
375  deaths.resize(n_clusters_, handle.get_stream());
376  }
377 
378  private:
379  const raft::handle_t& handle;
380  rmm::device_uvector<value_idx> exemplar_idx;
381  rmm::device_uvector<value_idx> exemplar_label_offsets;
382  value_idx n_exemplars;
383  value_idx n_selected_clusters;
384  rmm::device_uvector<value_idx> selected_clusters;
385  rmm::device_uvector<value_t> deaths;
386  value_t* core_dists;
387  rmm::device_uvector<value_idx> index_into_children;
388 };
389 
390 template class PredictionData<int, float>;
391 
392 void generate_prediction_data(const raft::handle_t& handle,
393  CondensedHierarchy<int, float>& condensed_tree,
394  int* labels,
395  int* inverse_label_map,
396  int n_selected_clusters,
397  PredictionData<int, float>& prediction_data);
398 
399 }; // namespace Common
400 }; // namespace HDBSCAN
401 
424 void hdbscan(const raft::handle_t& handle,
425  const float* X,
426  size_t m,
427  size_t n,
428  cuvs::distance::DistanceType metric,
431  float* core_dists);
432 
433 void build_condensed_hierarchy(const raft::handle_t& handle,
434  const int* children,
435  const float* delta,
436  const int* sizes,
437  int min_cluster_size,
438  int n_leaves,
440 
441 void _extract_clusters(const raft::handle_t& handle,
442  size_t n_leaves,
443  int n_edges,
444  int* parents,
445  int* children,
446  float* lambdas,
447  int* sizes,
448  int* labels,
449  float* probabilities,
450  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
451  bool allow_single_cluster,
452  int max_cluster_size,
453  float cluster_selection_epsilon);
454 
456  const raft::handle_t& handle,
459  const float* X,
460  cuvs::distance::DistanceType metric,
461  float* membership_vec,
462  size_t batch_size = 4096);
463 
464 void compute_membership_vector(const raft::handle_t& handle,
467  const float* X,
468  const float* points_to_predict,
469  size_t n_prediction_points,
470  int min_samples,
471  cuvs::distance::DistanceType metric,
472  float* membership_vec,
473  size_t batch_size = 4096);
474 
475 void out_of_sample_predict(const raft::handle_t& handle,
478  const float* X,
479  int* labels,
480  const float* points_to_predict,
481  size_t n_prediction_points,
482  cuvs::distance::DistanceType metric,
483  int min_samples,
484  int* out_labels,
485  float* out_probabilities);
486 
487 namespace HDBSCAN::HELPER {
488 
500 void compute_core_dists(const raft::handle_t& handle,
501  const float* X,
502  float* core_dists,
503  size_t m,
504  size_t n,
505  cuvs::distance::DistanceType metric,
506  int min_samples);
507 
521 void compute_inverse_label_map(const raft::handle_t& handle,
523  size_t n_leaves,
524  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
525  rmm::device_uvector<int>& inverse_label_map,
526  bool allow_single_cluster,
527  int max_cluster_size,
528  float cluster_selection_epsilon);
529 
530 } // namespace HDBSCAN::HELPER
531 } // END namespace ML
Definition: hdbscan.hpp:40
value_idx * get_sizes()
Definition: hdbscan.hpp:118
value_t * get_lambdas()
Definition: hdbscan.hpp:117
value_idx get_n_leaves() const
Definition: hdbscan.hpp:121
value_idx get_n_edges()
Definition: hdbscan.hpp:119
value_idx * get_children()
Definition: hdbscan.hpp:116
int get_n_clusters()
Definition: hdbscan.hpp:120
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_)
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, int n_clusters_, rmm::device_uvector< value_idx > &&parents_, rmm::device_uvector< value_idx > &&children_, rmm::device_uvector< value_t > &&lambdas_, rmm::device_uvector< value_idx > &&sizes_)
value_idx * get_parents()
Definition: hdbscan.hpp:115
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, value_idx *parents_, value_idx *children_, value_t *lambdas_, value_idx *sizes_)
void condense(value_idx *full_parents, value_idx *full_children, value_t *full_lambdas, value_idx *full_sizes, value_idx size=-1)
Definition: hdbscan.hpp:152
CLUSTER_SELECTION_METHOD cluster_selection_method
Definition: hdbscan.hpp:154
Definition: hdbscan.hpp:324
value_t * get_core_dists()
Definition: hdbscan.hpp:352
void allocate(const raft::handle_t &handle, value_idx n_exemplars_, value_idx n_selected_clusters_, value_idx n_edges_)
value_t * get_deaths()
Definition: hdbscan.hpp:351
value_idx get_n_selected_clusters()
Definition: hdbscan.hpp:347
PredictionData(const raft::handle_t &handle_, value_idx m, value_idx n, value_t *core_dists_)
Definition: hdbscan.hpp:326
value_idx * get_exemplar_label_offsets()
Definition: hdbscan.hpp:349
value_idx * get_index_into_children()
Definition: hdbscan.hpp:353
void set_n_clusters(const raft::handle_t &handle, value_idx n_clusters_)
Definition: hdbscan.hpp:373
value_idx * get_exemplar_idx()
Definition: hdbscan.hpp:348
value_idx * get_selected_clusters()
Definition: hdbscan.hpp:350
value_idx get_n_exemplars()
Definition: hdbscan.hpp:346
size_t n_rows
Definition: hdbscan.hpp:340
size_t n_cols
Definition: hdbscan.hpp:341
float alpha
Definition: hdbscan.hpp:149
float cluster_selection_epsilon
Definition: hdbscan.hpp:145
int max_cluster_size
Definition: hdbscan.hpp:143
bool allow_single_cluster
Definition: hdbscan.hpp:147
int min_cluster_size
Definition: hdbscan.hpp:142
int min_samples
Definition: hdbscan.hpp:141
Definition: hdbscan.hpp:254
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:296
value_t * get_stabilities()
Definition: hdbscan.hpp:279
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:282
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:289
value_t * get_probabilities()
Definition: hdbscan.hpp:278
hdbscan_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_t *probabilities_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:256
value_idx * get_inverse_label_map()
Definition: hdbscan.hpp:280
const raft::handle_t & get_handle()
Definition: hdbscan.hpp:218
value_idx * children
Definition: hdbscan.hpp:228
value_idx * labels
Definition: hdbscan.hpp:225
int get_n_leaves() const
Definition: hdbscan.hpp:201
robust_single_linkage_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:179
int get_n_clusters() const
Definition: hdbscan.hpp:202
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:215
value_t * mst_weights
Definition: hdbscan.hpp:235
value_idx * get_sizes()
Definition: hdbscan.hpp:205
value_t * deltas
Definition: hdbscan.hpp:230
const raft::handle_t & handle
Definition: hdbscan.hpp:220
value_idx * get_mst_src()
Definition: hdbscan.hpp:207
value_t * get_mst_weights()
Definition: hdbscan.hpp:209
value_idx * get_mst_dst()
Definition: hdbscan.hpp:208
value_idx * get_labels()
Definition: hdbscan.hpp:203
value_idx * sizes
Definition: hdbscan.hpp:229
value_t * get_deltas()
Definition: hdbscan.hpp:206
int n_clusters
Definition: hdbscan.hpp:223
value_idx * mst_src
Definition: hdbscan.hpp:233
value_idx * mst_dst
Definition: hdbscan.hpp:234
value_idx * get_children()
Definition: hdbscan.hpp:204
Definition: params.hpp:34
void generate_prediction_data(const raft::handle_t &handle, CondensedHierarchy< int, float > &condensed_tree, int *labels, int *inverse_label_map, int n_selected_clusters, PredictionData< int, float > &prediction_data)
CLUSTER_SELECTION_METHOD
Definition: hdbscan.hpp:137
@ EOM
Definition: hdbscan.hpp:137
@ LEAF
Definition: hdbscan.hpp:137
void compute_inverse_label_map(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, size_t n_leaves, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, rmm::device_uvector< int > &inverse_label_map, bool allow_single_cluster, int max_cluster_size, float cluster_selection_epsilon)
Compute the map from final, normalize labels to the labels in the CondensedHierarchy.
void compute_core_dists(const raft::handle_t &handle, const float *X, float *core_dists, size_t m, size_t n, cuvs::distance::DistanceType metric, int min_samples)
Compute the core distances for each point in the training matrix.
Definition: dbscan.hpp:30
void build_condensed_hierarchy(const raft::handle_t &handle, const int *children, const float *delta, const int *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree)
void compute_membership_vector(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, const float *points_to_predict, size_t n_prediction_points, int min_samples, cuvs::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
void out_of_sample_predict(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, int *labels, const float *points_to_predict, size_t n_prediction_points, cuvs::distance::DistanceType metric, int min_samples, int *out_labels, float *out_probabilities)
void compute_all_points_membership_vectors(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, cuvs::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
void _extract_clusters(const raft::handle_t &handle, size_t n_leaves, int n_edges, int *parents, int *children, float *lambdas, int *sizes, int *labels, float *probabilities, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, bool allow_single_cluster, int max_cluster_size, float cluster_selection_epsilon)
void hdbscan(const raft::handle_t &handle, const float *X, size_t m, size_t n, cuvs::distance::DistanceType metric, HDBSCAN::Common::HDBSCANParams &params, HDBSCAN::Common::hdbscan_output< int, float > &out, float *core_dists)