runner.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "detail/condense.cuh"
20 #include "detail/extract.cuh"
21 #include "detail/reachability.cuh"
23 
24 #include <cuml/cluster/hdbscan.hpp>
25 #include <cuml/common/logger.hpp>
26 
27 #include <raft/cluster/detail/agglomerative.cuh> // build_dendrogram_host
28 #include <raft/cluster/detail/mst.cuh> // build_sorted_mst
29 #include <raft/core/handle.hpp>
30 #include <raft/core/kvp.hpp>
31 #include <raft/core/resource/thrust_policy.hpp>
32 #include <raft/sparse/coo.hpp>
33 #include <raft/util/cudart_utils.hpp>
34 
35 #include <rmm/device_uvector.hpp>
36 
37 #include <thrust/device_ptr.h>
38 #include <thrust/extrema.h>
39 #include <thrust/gather.h>
40 #include <thrust/scatter.h>
41 #include <thrust/transform.h>
42 
43 #include <cuvs/neighbors/reachability.hpp>
44 
45 namespace ML {
46 namespace HDBSCAN {
47 
55 template <typename value_idx, typename value_t>
57  value_t* core_dists;
58  value_idx m;
59 
61 
62  FixConnectivitiesRedOp(value_t* core_dists_, value_idx m_) : core_dists(core_dists_), m(m_){};
63 
64  typedef typename raft::KeyValuePair<value_idx, value_t> KVP;
65  DI void operator()(value_idx rit, KVP* out, const KVP& other) const
66  {
67  if (rit < m && other.value < std::numeric_limits<value_t>::max()) {
68  value_t core_dist_rit = core_dists[rit];
69  value_t core_dist_other = max(core_dist_rit, max(core_dists[other.key], other.value));
70 
71  value_t core_dist_out;
72  if (out->key > -1) {
73  core_dist_out = max(core_dist_rit, max(core_dists[out->key], out->value));
74  } else {
75  core_dist_out = out->value;
76  }
77 
78  bool smaller = core_dist_other < core_dist_out;
79  out->key = smaller ? other.key : out->key;
80  out->value = smaller ? core_dist_other : core_dist_out;
81  }
82  }
83 
84  DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) const
85  {
86  if (rit < m && a.key > -1) {
87  value_t core_dist_rit = core_dists[rit];
88  value_t core_dist_a = max(core_dist_rit, max(core_dists[a.key], a.value));
89 
90  value_t core_dist_b;
91  if (b.key > -1) {
92  core_dist_b = max(core_dist_rit, max(core_dists[b.key], b.value));
93  } else {
94  core_dist_b = b.value;
95  }
96 
97  return core_dist_a < core_dist_b ? KVP(a.key, core_dist_a) : KVP(b.key, core_dist_b);
98  }
99 
100  return b;
101  }
102 
103  DI void init(value_t* out, value_t maxVal) const { *out = maxVal; }
104  DI void init(KVP* out, value_t maxVal) const
105  {
106  out->key = -1;
107  out->value = maxVal;
108  }
109 
110  DI void init_key(value_t& out, value_idx idx) const { return; }
111  DI void init_key(KVP& out, value_idx idx) const { out.key = idx; }
112 
113  DI value_t get_value(KVP& out) const { return out.value; }
114  DI value_t get_value(value_t& out) const { return out; }
115 
116  void gather(const raft::resources& handle, value_idx* map)
117  {
118  auto tmp_core_dists = raft::make_device_vector<value_t>(handle, m);
119  thrust::gather(raft::resource::get_thrust_policy(handle),
120  map,
121  map + m,
122  core_dists,
123  tmp_core_dists.data_handle());
124  raft::copy_async(
125  core_dists, tmp_core_dists.data_handle(), m, raft::resource::get_cuda_stream(handle));
126  }
127 
128  void scatter(const raft::resources& handle, value_idx* map)
129  {
130  auto tmp_core_dists = raft::make_device_vector<value_t>(handle, m);
131  thrust::scatter(raft::resource::get_thrust_policy(handle),
132  core_dists,
133  core_dists + m,
134  map,
135  tmp_core_dists.data_handle());
136  raft::copy_async(
137  core_dists, tmp_core_dists.data_handle(), m, raft::resource::get_cuda_stream(handle));
138  }
139 };
140 
157 template <typename value_idx = int64_t, typename value_t = float>
158 void build_linkage(const raft::handle_t& handle,
159  const value_t* X,
160  size_t m,
161  size_t n,
162  cuvs::distance::DistanceType metric,
164  value_t* core_dists,
166 {
167  auto stream = handle.get_stream();
168 
172  rmm::device_uvector<value_idx> mutual_reachability_indptr(m + 1, stream);
173  // Note that (min_samples+1) is parsed while allocating space for the COO matrix and to the
174  // mutual_reachability_graph function. This was done to account for self-loops in the knn graph
175  // and be consistent with Scikit learn Contrib.
176  raft::sparse::COO<value_t, value_idx> mutual_reachability_coo(stream,
177  (params.min_samples + 1) * m * 2);
178 
179  cuvs::neighbors::reachability::mutual_reachability_graph(
180  handle,
181  raft::make_device_matrix_view<const value_t, int64_t>(X, m, n),
182  params.min_samples + 1,
183  raft::make_device_vector_view<value_idx>(mutual_reachability_indptr.data(), m + 1),
184  raft::make_device_vector_view<value_t>(core_dists, m),
185  mutual_reachability_coo,
186  metric,
187  params.alpha);
188 
193  rmm::device_uvector<value_idx> color(m, stream);
194  FixConnectivitiesRedOp<value_idx, value_t> red_op(core_dists, m);
195  // during knn graph connection
196  raft::cluster::detail::build_sorted_mst(handle,
197  X,
198  mutual_reachability_indptr.data(),
199  mutual_reachability_coo.cols(),
200  mutual_reachability_coo.vals(),
201  m,
202  n,
203  out.get_mst_src(),
204  out.get_mst_dst(),
205  out.get_mst_weights(),
206  color.data(),
207  mutual_reachability_coo.nnz,
208  red_op,
209  static_cast<raft::distance::DistanceType>(metric),
210  (size_t)10);
211 
215  size_t n_edges = m - 1;
216 
217  raft::cluster::detail::build_dendrogram_host(handle,
218  out.get_mst_src(),
219  out.get_mst_dst(),
220  out.get_mst_weights(),
221  n_edges,
222  out.get_children(),
223  out.get_deltas(),
224  out.get_sizes());
225 }
226 
227 template <typename value_idx = int64_t, typename value_t = float>
228 void _fit_hdbscan(const raft::handle_t& handle,
229  const value_t* X,
230  size_t m,
231  size_t n,
232  cuvs::distance::DistanceType metric,
234  value_idx* labels,
235  value_t* core_dists,
237 {
238  auto stream = handle.get_stream();
239  auto exec_policy = handle.get_thrust_policy();
240 
241  int min_cluster_size = params.min_cluster_size;
242 
243  build_linkage(handle, X, m, n, metric, params, core_dists, out);
244 
249  out.get_children(),
250  out.get_deltas(),
251  out.get_sizes(),
252  min_cluster_size,
253  m,
254  out.get_condensed_tree());
255 
260  rmm::device_uvector<value_t> tree_stabilities(out.get_condensed_tree().get_n_clusters(),
261  handle.get_stream());
262 
263  rmm::device_uvector<value_idx> label_map(out.get_condensed_tree().get_n_clusters(),
264  handle.get_stream());
265  value_idx n_selected_clusters =
266  detail::Extract::extract_clusters(handle,
267  out.get_condensed_tree(),
268  m,
269  labels,
270  tree_stabilities.data(),
271  out.get_probabilities(),
272  label_map.data(),
273  params.cluster_selection_method,
275  params.allow_single_cluster,
276  params.max_cluster_size,
277  params.cluster_selection_epsilon);
278 
279  out.set_n_clusters(n_selected_clusters);
280 
281  auto lambdas_ptr = thrust::device_pointer_cast(out.get_condensed_tree().get_lambdas());
282  value_t max_lambda = *(thrust::max_element(
283  exec_policy, lambdas_ptr, lambdas_ptr + out.get_condensed_tree().get_n_edges()));
284 
285  detail::Stability::get_stability_scores(handle,
286  labels,
287  tree_stabilities.data(),
288  out.get_condensed_tree().get_n_clusters(),
289  max_lambda,
290  m,
291  out.get_stabilities(),
292  label_map.data());
293 
299  thrust::transform(exec_policy,
300  labels,
301  labels + m,
302  out.get_labels(),
303  [label_map = label_map.data()] __device__(value_idx label) {
304  if (label != -1) return label_map[label];
305  return -1;
306  });
307 }
308 
309 }; // end namespace HDBSCAN
310 }; // end namespace ML
Definition: hdbscan.hpp:152
Definition: hdbscan.hpp:254
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:296
value_t * get_stabilities()
Definition: hdbscan.hpp:279
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:282
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:289
value_t * get_probabilities()
Definition: hdbscan.hpp:278
value_idx * get_sizes()
Definition: hdbscan.hpp:205
value_idx * get_mst_src()
Definition: hdbscan.hpp:207
value_t * get_mst_weights()
Definition: hdbscan.hpp:209
value_idx * get_mst_dst()
Definition: hdbscan.hpp:208
value_idx * get_labels()
Definition: hdbscan.hpp:203
value_t * get_deltas()
Definition: hdbscan.hpp:206
value_idx * get_children()
Definition: hdbscan.hpp:204
Definition: params.hpp:34
void build_linkage(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, cuvs::distance::DistanceType metric, Common::HDBSCANParams &params, value_t *core_dists, Common::robust_single_linkage_output< value_idx, value_t > &out)
Definition: runner.h:158
void _fit_hdbscan(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, cuvs::distance::DistanceType metric, Common::HDBSCANParams &params, value_idx *labels, value_t *core_dists, Common::hdbscan_output< value_idx, value_t > &out)
Definition: runner.h:228
math_t max(math_t a, math_t b)
Definition: learning_rate.h:27
void transform(const raft::handle_t &handle, const KMeansParams &params, const float *centroids, const float *X, int n_samples, int n_features, float *X_new)
Transform X to a cluster-distance space.
Definition: dbscan.hpp:30
void build_condensed_hierarchy(const raft::handle_t &handle, const int *children, const float *delta, const int *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int, float > &condensed_tree)
value_t * core_dists
Definition: runner.h:57
void gather(const raft::resources &handle, value_idx *map)
Definition: runner.h:116
DI value_t get_value(value_t &out) const
Definition: runner.h:114
value_idx m
Definition: runner.h:58
FixConnectivitiesRedOp(value_t *core_dists_, value_idx m_)
Definition: runner.h:62
DI void init_key(KVP &out, value_idx idx) const
Definition: runner.h:111
DI void init_key(value_t &out, value_idx idx) const
Definition: runner.h:110
DI FixConnectivitiesRedOp()
Definition: runner.h:60
DI value_t get_value(KVP &out) const
Definition: runner.h:113
DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) const
Definition: runner.h:84
DI void init(KVP *out, value_t maxVal) const
Definition: runner.h:104
DI void init(value_t *out, value_t maxVal) const
Definition: runner.h:103
DI void operator()(value_idx rit, KVP *out, const KVP &other) const
Definition: runner.h:65
void scatter(const raft::resources &handle, value_idx *map)
Definition: runner.h:128
raft::KeyValuePair< value_idx, value_t > KVP
Definition: runner.h:62