hdbscan.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <raft/core/handle.hpp>
20 #include <raft/distance/distance_types.hpp>
21 
22 #include <rmm/device_uvector.hpp>
23 
24 #include <cstddef>
25 
26 namespace ML {
27 namespace HDBSCAN {
28 namespace Common {
29 
38 template <typename value_idx, typename value_t>
40  public:
47  CondensedHierarchy(const raft::handle_t& handle_, size_t n_leaves_);
48 
60  CondensedHierarchy(const raft::handle_t& handle_,
61  size_t n_leaves_,
62  int n_edges_,
63  value_idx* parents_,
64  value_idx* children_,
65  value_t* lambdas_,
66  value_idx* sizes_);
67 
80  CondensedHierarchy(const raft::handle_t& handle_,
81  size_t n_leaves_,
82  int n_edges_,
83  int n_clusters_,
84  rmm::device_uvector<value_idx>&& parents_,
85  rmm::device_uvector<value_idx>&& children_,
86  rmm::device_uvector<value_t>&& lambdas_,
87  rmm::device_uvector<value_idx>&& sizes_);
106  void condense(value_idx* full_parents,
107  value_idx* full_children,
108  value_t* full_lambdas,
109  value_idx* full_sizes,
110  value_idx size = -1);
111 
113 
114  value_idx* get_parents() { return parents.data(); }
115  value_idx* get_children() { return children.data(); }
116  value_t* get_lambdas() { return lambdas.data(); }
117  value_idx* get_sizes() { return sizes.data(); }
118  value_idx get_n_edges() { return n_edges; }
119  int get_n_clusters() { return n_clusters; }
120  value_idx get_n_leaves() const { return n_leaves; }
121 
122  private:
123  const raft::handle_t& handle;
124 
125  rmm::device_uvector<value_idx> parents;
126  rmm::device_uvector<value_idx> children;
127  rmm::device_uvector<value_t> lambdas;
128  rmm::device_uvector<value_idx> sizes;
129 
130  size_t n_edges;
131  size_t n_leaves;
132  int n_clusters;
133  value_idx root_cluster;
134 };
135 
137 
139  public:
140  int min_samples = 5;
143 
145 
146  bool allow_single_cluster = false;
147 
148  float alpha = 1.0;
149 };
150 
152  public:
154 };
155 
162 template <typename value_idx, typename value_t>
164  public:
178  robust_single_linkage_output(const raft::handle_t& handle_,
179  int n_leaves_,
180  value_idx* labels_,
181  value_idx* children_,
182  value_idx* sizes_,
183  value_t* deltas_,
184  value_idx* mst_src_,
185  value_idx* mst_dst_,
186  value_t* mst_weights_)
187  : handle(handle_),
188  n_leaves(n_leaves_),
189  n_clusters(-1),
190  labels(labels_),
191  children(children_),
192  sizes(sizes_),
193  deltas(deltas_),
194  mst_src(mst_src_),
195  mst_dst(mst_dst_),
196  mst_weights(mst_weights_)
197  {
198  }
199 
200  int get_n_leaves() const { return n_leaves; }
201  int get_n_clusters() const { return n_clusters; }
202  value_idx* get_labels() { return labels; }
203  value_idx* get_children() { return children; }
204  value_idx* get_sizes() { return sizes; }
205  value_t* get_deltas() { return deltas; }
206  value_idx* get_mst_src() { return mst_src; }
207  value_idx* get_mst_dst() { return mst_dst; }
208  value_t* get_mst_weights() { return mst_weights; }
209 
214  void set_n_clusters(int n_clusters_) { n_clusters = n_clusters_; }
215 
216  protected:
217  const raft::handle_t& get_handle() { return handle; }
218 
219  const raft::handle_t& handle;
220 
221  int n_leaves;
223 
224  value_idx* labels; // size n_leaves
225 
226  // Dendrogram
227  value_idx* children; // size n_leaves * 2
228  value_idx* sizes; // size n_leaves
229  value_t* deltas; // size n_leaves
230 
231  // MST (size n_leaves - 1).
232  value_idx* mst_src;
233  value_idx* mst_dst;
234  value_t* mst_weights;
235 };
236 
252 template <typename value_idx, typename value_t>
253 class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
254  public:
255  hdbscan_output(const raft::handle_t& handle_,
256  int n_leaves_,
257  value_idx* labels_,
258  value_t* probabilities_,
259  value_idx* children_,
260  value_idx* sizes_,
261  value_t* deltas_,
262  value_idx* mst_src_,
263  value_idx* mst_dst_,
264  value_t* mst_weights_)
265  : robust_single_linkage_output<value_idx, value_t>(
266  handle_, n_leaves_, labels_, children_, sizes_, deltas_, mst_src_, mst_dst_, mst_weights_),
267  probabilities(probabilities_),
268  stabilities(0, handle_.get_stream()),
269  condensed_tree(handle_, n_leaves_),
270  inverse_label_map(0, handle_.get_stream())
271  {
272  }
273 
274  // Using getters here, making the members private and forcing
275  // consistent state with the constructor. This should make
276  // it much easier to use / debug.
277  value_t* get_probabilities() { return probabilities; }
278  value_t* get_stabilities() { return stabilities.data(); }
279  value_idx* get_inverse_label_map() { return inverse_label_map.data(); }
280  // internal function
281  rmm::device_uvector<value_idx>& _get_inverse_label_map() { return inverse_label_map; }
282 
288  void set_n_clusters(int n_clusters_)
289  {
291  stabilities.resize(n_clusters_,
293  }
294 
296 
297  private:
298  value_t* probabilities; // size n_leaves
299  // inversely maps normalized labels to pre-normalized labels
300  // used for out-of-sample prediction
301  rmm::device_uvector<value_idx> inverse_label_map; // size n_clusters
302 
303  // Size not known ahead of time. Initialize
304  // with `initialize_stabilities()` method.
305  rmm::device_uvector<value_t> stabilities;
306 
307  // Use condensed hierarchy to wrap
308  // condensed tree outputs since we do not
309  // know the size ahead of time.
311 };
312 
313 template class CondensedHierarchy<int, float>;
314 
322 template <typename value_idx, typename value_t>
324  public:
325  PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n, value_t* core_dists_)
326  : handle(handle_),
327  exemplar_idx(0, handle.get_stream()),
328  exemplar_label_offsets(0, handle.get_stream()),
329  n_selected_clusters(0),
330  selected_clusters(0, handle.get_stream()),
331  deaths(0, handle.get_stream()),
332  core_dists(core_dists_),
333  index_into_children(0, handle.get_stream()),
334  n_exemplars(0),
335  n_rows(m),
336  n_cols(n)
337  {
338  }
339  size_t n_rows;
340  size_t n_cols;
341 
342  // Using getters here, making the members private and forcing
343  // consistent state with the constructor. This should make
344  // it much easier to use / debug.
345  value_idx get_n_exemplars() { return n_exemplars; }
346  value_idx get_n_selected_clusters() { return n_selected_clusters; }
347  value_idx* get_exemplar_idx() { return exemplar_idx.data(); }
348  value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
349  value_idx* get_selected_clusters() { return selected_clusters.data(); }
350  value_t* get_deaths() { return deaths.data(); }
351  value_t* get_core_dists() { return core_dists; }
352  value_idx* get_index_into_children() { return index_into_children.data(); }
353 
362  void allocate(const raft::handle_t& handle,
363  value_idx n_exemplars_,
364  value_idx n_selected_clusters_,
365  value_idx n_edges_);
366 
372  void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
373  {
374  deaths.resize(n_clusters_, handle.get_stream());
375  }
376 
377  private:
378  const raft::handle_t& handle;
379  rmm::device_uvector<value_idx> exemplar_idx;
380  rmm::device_uvector<value_idx> exemplar_label_offsets;
381  value_idx n_exemplars;
382  value_idx n_selected_clusters;
383  rmm::device_uvector<value_idx> selected_clusters;
384  rmm::device_uvector<value_t> deaths;
385  value_t* core_dists;
386  rmm::device_uvector<value_idx> index_into_children;
387 };
388 
389 template class PredictionData<int, float>;
390 
391 void generate_prediction_data(const raft::handle_t& handle,
392  CondensedHierarchy<int, float>& condensed_tree,
393  int* labels,
394  int* inverse_label_map,
395  int n_selected_clusters,
396  PredictionData<int, float>& prediction_data);
397 
398 }; // namespace Common
399 }; // namespace HDBSCAN
400 
423 void hdbscan(const raft::handle_t& handle,
424  const float* X,
425  size_t m,
426  size_t n,
427  raft::distance::DistanceType metric,
430  float* core_dists);
431 
432 void build_condensed_hierarchy(const raft::handle_t& handle,
433  const int* children,
434  const float* delta,
435  const int* sizes,
436  int min_cluster_size,
437  int n_leaves,
439 
440 void _extract_clusters(const raft::handle_t& handle,
441  size_t n_leaves,
442  int n_edges,
443  int* parents,
444  int* children,
445  float* lambdas,
446  int* sizes,
447  int* labels,
448  float* probabilities,
449  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
450  bool allow_single_cluster,
451  int max_cluster_size,
452  float cluster_selection_epsilon);
453 
455  const raft::handle_t& handle,
458  const float* X,
459  raft::distance::DistanceType metric,
460  float* membership_vec,
461  size_t batch_size = 4096);
462 
463 void compute_membership_vector(const raft::handle_t& handle,
466  const float* X,
467  const float* points_to_predict,
468  size_t n_prediction_points,
469  int min_samples,
470  raft::distance::DistanceType metric,
471  float* membership_vec,
472  size_t batch_size = 4096);
473 
474 void out_of_sample_predict(const raft::handle_t& handle,
477  const float* X,
478  int* labels,
479  const float* points_to_predict,
480  size_t n_prediction_points,
481  raft::distance::DistanceType metric,
482  int min_samples,
483  int* out_labels,
484  float* out_probabilities);
485 
486 namespace HDBSCAN::HELPER {
487 
499 void compute_core_dists(const raft::handle_t& handle,
500  const float* X,
501  float* core_dists,
502  size_t m,
503  size_t n,
504  raft::distance::DistanceType metric,
505  int min_samples);
506 
520 void compute_inverse_label_map(const raft::handle_t& handle,
522  size_t n_leaves,
523  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
524  rmm::device_uvector<int>& inverse_label_map,
525  bool allow_single_cluster,
526  int max_cluster_size,
527  float cluster_selection_epsilon);
528 
529 } // namespace HDBSCAN::HELPER
530 } // END namespace ML
Definition: hdbscan.hpp:39
value_idx * get_sizes()
Definition: hdbscan.hpp:117
value_t * get_lambdas()
Definition: hdbscan.hpp:116
value_idx get_n_leaves() const
Definition: hdbscan.hpp:120
value_idx get_n_edges()
Definition: hdbscan.hpp:118
value_idx * get_children()
Definition: hdbscan.hpp:115
int get_n_clusters()
Definition: hdbscan.hpp:119
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_)
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, int n_clusters_, rmm::device_uvector< value_idx > &&parents_, rmm::device_uvector< value_idx > &&children_, rmm::device_uvector< value_t > &&lambdas_, rmm::device_uvector< value_idx > &&sizes_)
value_idx * get_parents()
Definition: hdbscan.hpp:114
CondensedHierarchy(const raft::handle_t &handle_, size_t n_leaves_, int n_edges_, value_idx *parents_, value_idx *children_, value_t *lambdas_, value_idx *sizes_)
void condense(value_idx *full_parents, value_idx *full_children, value_t *full_lambdas, value_idx *full_sizes, value_idx size=-1)
Definition: hdbscan.hpp:151
CLUSTER_SELECTION_METHOD cluster_selection_method
Definition: hdbscan.hpp:153
Definition: hdbscan.hpp:323
value_t * get_core_dists()
Definition: hdbscan.hpp:351
void allocate(const raft::handle_t &handle, value_idx n_exemplars_, value_idx n_selected_clusters_, value_idx n_edges_)
value_t * get_deaths()
Definition: hdbscan.hpp:350
value_idx get_n_selected_clusters()
Definition: hdbscan.hpp:346
PredictionData(const raft::handle_t &handle_, value_idx m, value_idx n, value_t *core_dists_)
Definition: hdbscan.hpp:325
value_idx * get_exemplar_label_offsets()
Definition: hdbscan.hpp:348
value_idx * get_index_into_children()
Definition: hdbscan.hpp:352
void set_n_clusters(const raft::handle_t &handle, value_idx n_clusters_)
Definition: hdbscan.hpp:372
value_idx * get_exemplar_idx()
Definition: hdbscan.hpp:347
value_idx * get_selected_clusters()
Definition: hdbscan.hpp:349
value_idx get_n_exemplars()
Definition: hdbscan.hpp:345
size_t n_rows
Definition: hdbscan.hpp:339
size_t n_cols
Definition: hdbscan.hpp:340
float alpha
Definition: hdbscan.hpp:148
float cluster_selection_epsilon
Definition: hdbscan.hpp:144
int max_cluster_size
Definition: hdbscan.hpp:142
bool allow_single_cluster
Definition: hdbscan.hpp:146
int min_cluster_size
Definition: hdbscan.hpp:141
int min_samples
Definition: hdbscan.hpp:140
Definition: hdbscan.hpp:253
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:295
value_t * get_stabilities()
Definition: hdbscan.hpp:278
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:281
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:288
value_t * get_probabilities()
Definition: hdbscan.hpp:277
hdbscan_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_t *probabilities_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:255
value_idx * get_inverse_label_map()
Definition: hdbscan.hpp:279
const raft::handle_t & get_handle()
Definition: hdbscan.hpp:217
value_idx * children
Definition: hdbscan.hpp:227
value_idx * labels
Definition: hdbscan.hpp:224
int get_n_leaves() const
Definition: hdbscan.hpp:200
robust_single_linkage_output(const raft::handle_t &handle_, int n_leaves_, value_idx *labels_, value_idx *children_, value_idx *sizes_, value_t *deltas_, value_idx *mst_src_, value_idx *mst_dst_, value_t *mst_weights_)
Definition: hdbscan.hpp:178
int get_n_clusters() const
Definition: hdbscan.hpp:201
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:214
value_t * mst_weights
Definition: hdbscan.hpp:234
value_idx * get_sizes()
Definition: hdbscan.hpp:204
value_t * deltas
Definition: hdbscan.hpp:229
const raft::handle_t & handle
Definition: hdbscan.hpp:219
value_idx * get_mst_src()
Definition: hdbscan.hpp:206
value_t * get_mst_weights()
Definition: hdbscan.hpp:208
value_idx * get_mst_dst()
Definition: hdbscan.hpp:207
value_idx * get_labels()
Definition: hdbscan.hpp:202
value_idx * sizes
Definition: hdbscan.hpp:228
value_t * get_deltas()
Definition: hdbscan.hpp:205
int n_clusters
Definition: hdbscan.hpp:222
value_idx * mst_src
Definition: hdbscan.hpp:232
value_idx * mst_dst
Definition: hdbscan.hpp:233
value_idx * get_children()
Definition: hdbscan.hpp:203
Definition: params.hpp:34
void generate_prediction_data(const raft::handle_t &handle, CondensedHierarchy< int, float > &condensed_tree, int *labels, int *inverse_label_map, int n_selected_clusters, PredictionData< int, float > &prediction_data)
CLUSTER_SELECTION_METHOD
Definition: hdbscan.hpp:136
@ EOM
Definition: hdbscan.hpp:136
@ LEAF
Definition: hdbscan.hpp:136
void compute_inverse_label_map(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, size_t n_leaves, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, rmm::device_uvector< int > &inverse_label_map, bool allow_single_cluster, int max_cluster_size, float cluster_selection_epsilon)
Compute the map from final, normalize labels to the labels in the CondensedHierarchy.
void compute_core_dists(const raft::handle_t &handle, const float *X, float *core_dists, size_t m, size_t n, raft::distance::DistanceType metric, int min_samples)
Compute the core distances for each point in the training matrix.
Definition: dbscan.hpp:30
void build_condensed_hierarchy(const raft::handle_t &handle, const int *children, const float *delta, const int *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree)
void _extract_clusters(const raft::handle_t &handle, size_t n_leaves, int n_edges, int *parents, int *children, float *lambdas, int *sizes, int *labels, float *probabilities, HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method, bool allow_single_cluster, int max_cluster_size, float cluster_selection_epsilon)
void compute_all_points_membership_vectors(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, raft::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
void compute_membership_vector(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, const float *points_to_predict, size_t n_prediction_points, int min_samples, raft::distance::DistanceType metric, float *membership_vec, size_t batch_size=4096)
void out_of_sample_predict(const raft::handle_t &handle, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree, HDBSCAN::Common::PredictionData< int, float > &prediction_data, const float *X, int *labels, const float *points_to_predict, size_t n_prediction_points, raft::distance::DistanceType metric, int min_samples, int *out_labels, float *out_probabilities)
void hdbscan(const raft::handle_t &handle, const float *X, size_t m, size_t n, raft::distance::DistanceType metric, HDBSCAN::Common::HDBSCANParams &params, HDBSCAN::Common::hdbscan_output< int, float > &out, float *core_dists)