runner.h
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include "detail/condense.cuh"
9 #include "detail/extract.cuh"
10 #include "detail/reachability.cuh"
12 
13 #include <cuml/cluster/hdbscan.hpp>
15 #include <cuml/common/logger.hpp>
16 
17 #include <raft/core/device_coo_matrix.hpp>
18 #include <raft/core/device_mdspan.hpp>
19 #include <raft/core/handle.hpp>
20 #include <raft/core/kvp.hpp>
21 #include <raft/core/resource/thrust_policy.hpp>
22 #include <raft/sparse/coo.hpp>
23 #include <raft/util/cudart_utils.hpp>
24 
25 #include <rmm/device_uvector.hpp>
26 
27 #include <thrust/device_ptr.h>
28 #include <thrust/extrema.h>
29 #include <thrust/gather.h>
30 #include <thrust/scatter.h>
31 #include <thrust/transform.h>
32 
33 #include <cuvs/cluster/agglomerative.hpp>
34 
35 namespace ML {
36 namespace HDBSCAN {
37 
54 template <typename value_idx = int64_t, typename value_t = float>
55 void build_linkage(const raft::handle_t& handle,
56  const value_t* X,
57  size_t m,
58  size_t n,
61  value_t* core_dists,
63 {
64  auto stream = handle.get_stream();
65  size_t n_edges = m - 1;
66  cuvs::cluster::agglomerative::helpers::linkage_graph_params::mutual_reachability_params
67  linkage_params;
68  // (min_samples+1) is used to account for self-loops in the KNN graph
69  // and be consistent with scikit-learn-contrib.
70  if (static_cast<size_t>(params.min_samples + 1) > m) {
71  RAFT_LOG_WARN(
72  "min_samples (%d) must be less than the number of samples in X (%zu), setting min_samples to "
73  "%zu",
74  params.min_samples,
75  m,
76  m - 1);
77  linkage_params.min_samples = m;
78  } else {
79  linkage_params.min_samples = params.min_samples + 1;
80  }
81  linkage_params.alpha = params.alpha;
82  linkage_params.all_neighbors_params.metric = static_cast<cuvs::distance::DistanceType>(metric);
83  linkage_params.all_neighbors_params.overlap_factor = params.build_params.overlap_factor;
84  linkage_params.all_neighbors_params.n_clusters = params.build_params.n_clusters;
85 
87  auto brute_force_params = cuvs::neighbors::graph_build_params::brute_force_params{};
88  brute_force_params.build_params.metric = static_cast<cuvs::distance::DistanceType>(metric);
89  linkage_params.all_neighbors_params.graph_build_params = brute_force_params;
90  } else if (params.build_algo == Common::GRAPH_BUILD_ALGO::NN_DESCENT) {
91  auto nn_descent_params = cuvs::neighbors::graph_build_params::nn_descent_params{};
92  nn_descent_params.max_iterations = params.build_params.nn_descent_params.max_iterations;
93  nn_descent_params.graph_degree = params.build_params.nn_descent_params.graph_degree;
94  nn_descent_params.intermediate_graph_degree =
95  params.build_params.nn_descent_params.intermediate_graph_degree;
96  nn_descent_params.termination_threshold =
97  params.build_params.nn_descent_params.termination_threshold;
98  if (static_cast<int>(nn_descent_params.graph_degree) < linkage_params.min_samples) {
99  // linkage_params.min_samples functions as the k for the knn
100  RAFT_LOG_WARN(
101  "NN Descent graph_degree (%d) must be larger than or equal to min_samples + 1 (%zu), "
102  "setting graph_degree to min_samples + 1 and scaling intermediate_graph_degree accordingly "
103  "to 2*graph_degree",
104  static_cast<int>(nn_descent_params.graph_degree),
105  static_cast<int>(linkage_params.min_samples));
106  nn_descent_params.graph_degree = linkage_params.min_samples;
107  nn_descent_params.intermediate_graph_degree = nn_descent_params.graph_degree * 2;
108  }
109  nn_descent_params.metric = static_cast<cuvs::distance::DistanceType>(metric);
110  linkage_params.all_neighbors_params.graph_build_params = nn_descent_params;
111  } else {
112  RAFT_FAIL("Unsupported build algo. build algo must be either brute force or nn descent");
113  }
114 
115  cudaPointerAttributes attr;
116  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, X));
117  bool data_on_device = attr.type == cudaMemoryTypeDevice;
118 
119  if (data_on_device) {
121  handle,
122  raft::make_device_matrix_view<const value_t, value_idx>(X, m, n),
123  linkage_params,
124  static_cast<cuvs::distance::DistanceType>(metric),
125  raft::make_device_coo_matrix_view<value_t, value_idx, value_idx, size_t>(
126  out.get_mst_weights(),
127  raft::make_device_coordinate_structure_view(
128  out.get_mst_src(), out.get_mst_dst(), value_idx(m), value_idx(m), n_edges)),
129  raft::make_device_matrix_view<value_idx, value_idx>(out.get_children(), n_edges, 2),
130  raft::make_device_vector_view<value_t, value_idx>(out.get_deltas(), n_edges),
131  raft::make_device_vector_view<value_idx, value_idx>(out.get_sizes(), n_edges),
132  std::make_optional<raft::device_vector_view<value_t, value_idx>>(
133  raft::make_device_vector_view<value_t, value_idx>(core_dists, m)));
134  } else {
136  handle,
137  raft::make_host_matrix_view<const value_t, value_idx>(X, m, n),
138  linkage_params,
139  static_cast<cuvs::distance::DistanceType>(metric),
140  raft::make_device_coo_matrix_view<value_t, value_idx, value_idx, size_t>(
141  out.get_mst_weights(),
142  raft::make_device_coordinate_structure_view(
143  out.get_mst_src(), out.get_mst_dst(), value_idx(m), value_idx(m), n_edges)),
144  raft::make_device_matrix_view<value_idx, value_idx>(out.get_children(), n_edges, 2),
145  raft::make_device_vector_view<value_t, value_idx>(out.get_deltas(), n_edges),
146  raft::make_device_vector_view<value_idx, value_idx>(out.get_sizes(), n_edges),
147  std::make_optional<raft::device_vector_view<value_t, value_idx>>(
148  raft::make_device_vector_view<value_t, value_idx>(core_dists, m)));
149  }
150 }
151 
152 template <typename value_idx = int64_t, typename value_t = float>
153 void _fit_hdbscan(const raft::handle_t& handle,
154  const value_t* X,
155  size_t m,
156  size_t n,
159  value_idx* labels,
160  value_t* core_dists,
162 {
163  auto stream = handle.get_stream();
164  auto exec_policy = handle.get_thrust_policy();
165 
166  int min_cluster_size = params.min_cluster_size;
167 
168  RAFT_EXPECTS(params.min_samples <= m, "min_samples must be at most the number of samples in X");
169 
170  build_linkage(handle, X, m, n, metric, params, core_dists, out);
171 
176  out.get_children(),
177  out.get_deltas(),
178  out.get_sizes(),
179  min_cluster_size,
180  m,
181  out.get_condensed_tree());
182 
187  rmm::device_uvector<value_t> tree_stabilities(out.get_condensed_tree().get_n_clusters(),
188  handle.get_stream());
189 
190  rmm::device_uvector<value_idx> label_map(out.get_condensed_tree().get_n_clusters(),
191  handle.get_stream());
192  value_idx n_selected_clusters =
193  detail::Extract::extract_clusters(handle,
194  out.get_condensed_tree(),
195  m,
196  labels,
197  tree_stabilities.data(),
198  out.get_probabilities(),
199  label_map.data(),
200  params.cluster_selection_method,
202  params.allow_single_cluster,
203  static_cast<value_idx>(params.max_cluster_size),
204  params.cluster_selection_epsilon);
205 
206  out.set_n_clusters(n_selected_clusters);
207 
208  auto lambdas_ptr = thrust::device_pointer_cast(out.get_condensed_tree().get_lambdas());
209  value_t max_lambda = *(thrust::max_element(
210  exec_policy, lambdas_ptr, lambdas_ptr + out.get_condensed_tree().get_n_edges()));
211 
212  detail::Stability::get_stability_scores(handle,
213  labels,
214  tree_stabilities.data(),
215  out.get_condensed_tree().get_n_clusters(),
216  max_lambda,
217  m,
218  out.get_stabilities(),
219  label_map.data());
220 
226  thrust::transform(exec_policy,
227  labels,
228  labels + m,
229  out.get_labels(),
230  [label_map = label_map.data()] __device__(value_idx label) {
231  if (label != -1) return label_map[label];
232  return static_cast<value_idx>(-1);
233  });
234 }
235 
236 }; // end namespace HDBSCAN
237 }; // end namespace ML
Definition: hdbscan.hpp:195
Definition: hdbscan.hpp:299
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:341
value_t * get_stabilities()
Definition: hdbscan.hpp:324
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:327
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:334
value_t * get_probabilities()
Definition: hdbscan.hpp:323
value_idx * get_sizes()
Definition: hdbscan.hpp:250
value_idx * get_mst_src()
Definition: hdbscan.hpp:252
value_t * get_mst_weights()
Definition: hdbscan.hpp:254
value_idx * get_mst_dst()
Definition: hdbscan.hpp:253
value_idx * get_labels()
Definition: hdbscan.hpp:248
value_t * get_deltas()
Definition: hdbscan.hpp:251
value_idx * get_children()
Definition: hdbscan.hpp:249
Definition: params.hpp:23
@ NN_DESCENT
Definition: hdbscan.hpp:127
@ BRUTE_FORCE_KNN
Definition: hdbscan.hpp:127
void _fit_hdbscan(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, ML::distance::DistanceType metric, Common::HDBSCANParams ¶ms, value_idx *labels, value_t *core_dists, Common::hdbscan_output< value_idx, value_t > &out)
Definition: runner.h:153
void build_linkage(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, ML::distance::DistanceType metric, Common::HDBSCANParams ¶ms, value_t *core_dists, Common::robust_single_linkage_output< value_idx, value_t > &out)
Definition: runner.h:55
DistanceType
Definition: distance_type.hpp:10
void transform(const raft::handle_t &handle, const KMeansParams ¶ms, const float *centroids, const float *X, int n_samples, int n_features, float *X_new)
Transform X to a cluster-distance space.
Definition: dbscan.hpp:18
void build_condensed_hierarchy(const raft::handle_t &handle, const int64_t *children, const float *delta, const int64_t *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree)