Attention
The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called cuVS. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.10 (October) release, and the algorithms will be removed from RAFT altogether in the 24.12 (December) release.
Clustering Model Scoring#
Adjusted Rand Index#
#include <raft/stats/adjusted_rand_index.cuh>
namespace raft::stats
-
template<typename value_t, typename math_t, typename idx_t>
double adjusted_rand_index(raft::resources const &handle, raft::device_vector_view<const value_t, idx_t> first_cluster_array, raft::device_vector_view<const value_t, idx_t> second_cluster_array)# Function to calculate Adjusted RandIndex.
- Template Parameters:
value_t – data-type for input label arrays
math_t – integral data-type used for computing n-choose-r
idx_t – Index type of matrix extent.
- Parameters:
handle – [in] the raft handle.
first_cluster_array – [in] the array of classes
second_cluster_array – [in] the array of classes
- Returns:
the Adjusted RandIndex
Completeness Score#
#include <raft/stats/completeness_score.cuh>
namespace raft::stats
-
template<typename value_t, typename idx_t>
double completeness_score(raft::resources const &handle, raft::device_vector_view<const value_t, idx_t> truth_cluster_array, raft::device_vector_view<const value_t, idx_t> pred_cluster_array, value_t lower_label_range, value_t upper_label_range)# Function to calculate the completeness score between two clusters.
- Template Parameters:
value_t – the data type
idx_t – Index type of matrix extent.
- Parameters:
handle – [in] the raft handle.
truth_cluster_array – [in] the array of truth classes of type value_t
pred_cluster_array – [in] the array of predicted classes of type value_t
lower_label_range – [in] the lower bound of the range of labels
upper_label_range – [in] the upper bound of the range of labels
- Returns:
the cluster completeness score
Cluster Dispersion#
#include <raft/stats/dispersion.cuh>
namespace raft::stats
-
template<typename value_t, typename idx_t>
value_t cluster_dispersion(raft::resources const &handle, raft::device_matrix_view<const value_t, idx_t, raft::row_major> centroids, raft::device_vector_view<const idx_t, idx_t> cluster_sizes, std::optional<raft::device_vector_view<value_t, idx_t>> global_centroid, const idx_t n_points)# Compute cluster dispersion metric. This is very useful for automatically finding the ‘k’ (in kmeans) that improves this metric. The cluster dispersion metric is defined as the square root of the sum of the squared distances between the cluster centroids and the global centroid.
- Template Parameters:
value_t – data type
idx_t – index type
- Parameters:
handle – [in] the raft handle
centroids – [in] the cluster centroids. This is assumed to be row-major and of dimension (n_clusters x dim)
cluster_sizes – [in] number of points in the dataset which belong to each cluster. This is of length n_clusters
global_centroid – [out] compute the global weighted centroid of all cluster centroids. This is of length dim. Use std::nullopt to not return it.
n_points – [in] number of points in the dataset
- Returns:
the cluster dispersion value
Rand Index#
#include <raft/stats/rand_index.cuh>
namespace raft::stats
-
template<typename value_t, typename idx_t>
double rand_index(raft::resources const &handle, raft::device_vector_view<const value_t, idx_t> first_cluster_array, raft::device_vector_view<const value_t, idx_t> second_cluster_array)# Function to calculate RandIndex. More info on the Rand index: https://en.wikipedia.org/wiki/Rand_index
- Template Parameters:
value_t – the data type
idx_t – index type
- Parameters:
handle – [in] the raft handle
first_cluster_array – [in] the array of classes of type value_t
second_cluster_array – [in] the array of classes of type value_t
- Returns:
the RandIndex value
Silhouette Score#
#include <raft/stats/silhouette_score.cuh>
namespace raft::stats
-
template<typename value_t, typename label_t, typename idx_t>
value_t silhouette_score(raft::resources const &handle, raft::device_matrix_view<const value_t, idx_t, raft::row_major> X_in, raft::device_vector_view<const label_t, idx_t> labels, std::optional<raft::device_vector_view<value_t, idx_t>> silhouette_score_per_sample, idx_t n_unique_labels, raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)# main function that returns the average silhouette score for a given set of data and its clusterings
- Template Parameters:
value_t – type of the data samples
label_t – type of the labels
idx_t – index type
- Parameters:
handle – [in] raft handle for managing expensive resources
X_in – [in] input data matrix in row-major format (nRows x nCols)
labels – [in] the array containing labels for every data sample (length: nRows)
silhouette_score_per_sample – [out] optional array populated with the silhouette score for every sample (length: nRows)
n_unique_labels – [in] number of unique labels in the labels array
metric – [in] the numerical value that maps to the type of distance metric to be used in the calculations
- Returns:
the silhouette score
-
template<typename value_t, typename label_t, typename idx_t>
value_t silhouette_score_batched(raft::resources const &handle, raft::device_matrix_view<const value_t, idx_t, raft::row_major> X, raft::device_vector_view<const label_t, idx_t> labels, std::optional<raft::device_vector_view<value_t, idx_t>> silhouette_score_per_sample, idx_t n_unique_labels, idx_t batch_size, raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)# function that returns the average silhouette score for a given set of data and its clusterings
- Template Parameters:
value_t – type of the data samples
label_t – type of the labels
idx_t – index type
- Parameters:
handle – [in] raft handle for managing expensive resources
X – [in] input data matrix in row-major format (nRows x nCols)
labels – [in] the array containing labels for every data sample (length: nRows)
silhouette_score_per_sample – [out] optional array populated with the silhouette score for every sample (length: nRows)
n_unique_labels – [in] number of unique labels in the labels array
batch_size – [in] number of samples per batch
metric – [in] the numerical value that maps to the type of distance metric to be used in the calculations
- Returns:
the silhouette score
V Measure#
#include <raft/stats/v_measure.cuh>
namespace raft::stats
-
template<typename value_t, typename idx_t>
double v_measure(raft::resources const &handle, raft::device_vector_view<const value_t, idx_t> truth_cluster_array, raft::device_vector_view<const value_t, idx_t> pred_cluster_array, value_t lower_label_range, value_t upper_label_range, double beta = 1.0)# Function to calculate the v-measure between two clusters.
- Template Parameters:
value_t – the data type
idx_t – Integer type used for addressing
- Parameters:
handle – [in] the raft handle
truth_cluster_array – [in] the array of truth classes of type value_t
pred_cluster_array – [in] the array of predicted classes of type value_t
lower_label_range – [in] the lower bound of the range of labels
upper_label_range – [in] the upper bound of the range of labels
beta – [in] v_measure parameter
- Returns:
the v-measure between the two clusters