runner.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "detail/condense.cuh"
20 #include "detail/extract.cuh"
21 #include "detail/reachability.cuh"
23 
24 #include <cuml/cluster/hdbscan.hpp>
26 #include <cuml/common/logger.hpp>
27 
28 #include <raft/core/device_coo_matrix.hpp>
29 #include <raft/core/device_mdspan.hpp>
30 #include <raft/core/handle.hpp>
31 #include <raft/core/kvp.hpp>
32 #include <raft/core/resource/thrust_policy.hpp>
33 #include <raft/sparse/coo.hpp>
34 #include <raft/util/cudart_utils.hpp>
35 
36 #include <rmm/device_uvector.hpp>
37 
38 #include <thrust/device_ptr.h>
39 #include <thrust/extrema.h>
40 #include <thrust/gather.h>
41 #include <thrust/scatter.h>
42 #include <thrust/transform.h>
43 
44 #include <cuvs/cluster/agglomerative.hpp>
45 
46 namespace ML {
47 namespace HDBSCAN {
48 
65 template <typename value_idx = int64_t, typename value_t = float>
66 void build_linkage(const raft::handle_t& handle,
67  const value_t* X,
68  size_t m,
69  size_t n,
72  value_t* core_dists,
74 {
75  auto stream = handle.get_stream();
76  size_t n_edges = m - 1;
77  cuvs::cluster::agglomerative::helpers::linkage_graph_params::mutual_reachability_params
78  linkage_params;
79  // (min_samples+1) is used to account for self-loops in the KNN graph
80  // and be consistent with scikit-learn-contrib.
81  if (static_cast<size_t>(params.min_samples + 1) > m) {
82  RAFT_LOG_WARN(
83  "min_samples (%d) must be less than the number of samples in X (%zu), setting min_samples to "
84  "%zu",
85  params.min_samples,
86  m,
87  m - 1);
88  linkage_params.min_samples = m;
89  } else {
90  linkage_params.min_samples = params.min_samples + 1;
91  }
92  linkage_params.alpha = params.alpha;
93 
95  handle,
96  raft::make_device_matrix_view<const value_t, value_idx>(X, m, n),
97  linkage_params,
98  static_cast<cuvs::distance::DistanceType>(metric),
99  raft::make_device_coo_matrix_view<value_t, value_idx, value_idx, size_t>(
100  out.get_mst_weights(),
101  raft::make_device_coordinate_structure_view(
102  out.get_mst_src(), out.get_mst_dst(), value_idx(m), value_idx(m), n_edges)),
103  raft::make_device_matrix_view<value_idx, value_idx>(out.get_children(), n_edges, 2),
104  raft::make_device_vector_view<value_t, value_idx>(out.get_deltas(), n_edges),
105  raft::make_device_vector_view<value_idx, value_idx>(out.get_sizes(), n_edges),
106  std::make_optional<raft::device_vector_view<value_t, value_idx>>(
107  raft::make_device_vector_view<value_t, value_idx>(core_dists, m)));
108 }
109 
110 template <typename value_idx = int64_t, typename value_t = float>
111 void _fit_hdbscan(const raft::handle_t& handle,
112  const value_t* X,
113  size_t m,
114  size_t n,
117  value_idx* labels,
118  value_t* core_dists,
120 {
121  auto stream = handle.get_stream();
122  auto exec_policy = handle.get_thrust_policy();
123 
124  int min_cluster_size = params.min_cluster_size;
125 
126  RAFT_EXPECTS(params.min_samples <= m, "min_samples must be at most the number of samples in X");
127 
128  build_linkage(handle, X, m, n, metric, params, core_dists, out);
129 
134  out.get_children(),
135  out.get_deltas(),
136  out.get_sizes(),
137  min_cluster_size,
138  m,
139  out.get_condensed_tree());
140 
145  rmm::device_uvector<value_t> tree_stabilities(out.get_condensed_tree().get_n_clusters(),
146  handle.get_stream());
147 
148  rmm::device_uvector<value_idx> label_map(out.get_condensed_tree().get_n_clusters(),
149  handle.get_stream());
150  value_idx n_selected_clusters =
151  detail::Extract::extract_clusters(handle,
152  out.get_condensed_tree(),
153  m,
154  labels,
155  tree_stabilities.data(),
156  out.get_probabilities(),
157  label_map.data(),
158  params.cluster_selection_method,
160  params.allow_single_cluster,
161  static_cast<value_idx>(params.max_cluster_size),
162  params.cluster_selection_epsilon);
163 
164  out.set_n_clusters(n_selected_clusters);
165 
166  auto lambdas_ptr = thrust::device_pointer_cast(out.get_condensed_tree().get_lambdas());
167  value_t max_lambda = *(thrust::max_element(
168  exec_policy, lambdas_ptr, lambdas_ptr + out.get_condensed_tree().get_n_edges()));
169 
170  detail::Stability::get_stability_scores(handle,
171  labels,
172  tree_stabilities.data(),
173  out.get_condensed_tree().get_n_clusters(),
174  max_lambda,
175  m,
176  out.get_stabilities(),
177  label_map.data());
178 
184  thrust::transform(exec_policy,
185  labels,
186  labels + m,
187  out.get_labels(),
188  [label_map = label_map.data()] __device__(value_idx label) {
189  if (label != -1) return label_map[label];
190  return static_cast<value_idx>(-1);
191  });
192 }
193 
194 }; // end namespace HDBSCAN
195 }; // end namespace ML
Definition: hdbscan.hpp:152
Definition: hdbscan.hpp:254
CondensedHierarchy< value_idx, value_t > & get_condensed_tree()
Definition: hdbscan.hpp:296
value_t * get_stabilities()
Definition: hdbscan.hpp:279
rmm::device_uvector< value_idx > & _get_inverse_label_map()
Definition: hdbscan.hpp:282
void set_n_clusters(int n_clusters_)
Definition: hdbscan.hpp:289
value_t * get_probabilities()
Definition: hdbscan.hpp:278
value_idx * get_sizes()
Definition: hdbscan.hpp:205
value_idx * get_mst_src()
Definition: hdbscan.hpp:207
value_t * get_mst_weights()
Definition: hdbscan.hpp:209
value_idx * get_mst_dst()
Definition: hdbscan.hpp:208
value_idx * get_labels()
Definition: hdbscan.hpp:203
value_t * get_deltas()
Definition: hdbscan.hpp:206
value_idx * get_children()
Definition: hdbscan.hpp:204
Definition: params.hpp:34
void _fit_hdbscan(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, ML::distance::DistanceType metric, Common::HDBSCANParams ¶ms, value_idx *labels, value_t *core_dists, Common::hdbscan_output< value_idx, value_t > &out)
Definition: runner.h:111
void build_linkage(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, ML::distance::DistanceType metric, Common::HDBSCANParams ¶ms, value_t *core_dists, Common::robust_single_linkage_output< value_idx, value_t > &out)
Definition: runner.h:66
DistanceType
Definition: distance_type.hpp:21
void transform(const raft::handle_t &handle, const KMeansParams ¶ms, const float *centroids, const float *X, int n_samples, int n_features, float *X_new)
Transform X to a cluster-distance space.
Definition: dbscan.hpp:29
void build_condensed_hierarchy(const raft::handle_t &handle, const int64_t *children, const float *delta, const int64_t *sizes, int min_cluster_size, int n_leaves, HDBSCAN::Common::CondensedHierarchy< int64_t, float > &condensed_tree)