gpu_introspection.hpp
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */
#pragma once
// Project-internal dependencies (include paths assumed here): index_type (an
// alias for uint32_t) and the raft_proto device/CUDA-check helpers used below.
#include <cuml/fil/detail/index_type.hpp>
#include <cuml/fil/detail/raft_proto/cuda_check.hpp>
#include <cuml/fil/detail/raft_proto/device_id.hpp>
#include <cuml/fil/detail/raft_proto/device_type.hpp>

#include <cuda_runtime_api.h>

#include <vector>

namespace ML {
namespace fil {
namespace detail {

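// Maximum opt-in shared memory per block (in bytes) for the given device.
// Queried once per device and cached per thread.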
inline auto get_max_shared_mem_per_block(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}

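// Number of streaming multiprocessors on the given device, cached per thread.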
inline auto get_sm_count(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}

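// Maximum number of resident threads per SM on the given device (queried on
// every call; not cached).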
inline auto get_max_threads_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value()));
  return index_type(result);
}

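// Maximum shared memory per SM (in bytes) on the given device, cached per thread.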
inline auto get_max_shared_mem_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}

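// Peak memory clock rate of the given device, in kHz.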
inline auto get_mem_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
  return index_type(result);
}

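// Peak SM (core) clock rate of the given device, in kHz.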
inline auto get_core_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
  return index_type(result);
}

/* The maximum number of bytes that can be read in a single instruction */
auto constexpr static const MAX_READ_CHUNK = index_type{128};
auto constexpr static const MAX_BLOCKS = index_type{65536};
auto constexpr static const WARP_SIZE = index_type{32};
auto constexpr static const MAX_THREADS_PER_BLOCK = index_type{256};
#ifdef __CUDACC__
#if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
  __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 1200 || __CUDA_ARCH__ == 1210
auto constexpr static const MAX_THREADS_PER_SM = index_type{1024};
#else
auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
#endif
#else
auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
#endif

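// With the values above this is 2048 / 256 = 8 resident blocks per SM, or
// 1024 / 256 = 4 on the architectures limited to 1024 threads per SM.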
auto constexpr static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;

} // namespace detail
} // namespace fil
} // namespace ML
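The helpers above are typically combined when choosing launch parameters for a kernel. Below is a minimal usage sketch, not part of this header: the helper names choose_shared_mem_budget and max_resident_threads are invented for illustration, and the include path is an assumption about where this header lives in the source tree.

// Illustrative usage sketch; not part of gpu_introspection.hpp.
// The include path below is an assumption, adjust it to your tree.
#include <cuml/fil/detail/gpu_introspection.hpp>

#include <algorithm>

namespace ML {
namespace fil {
namespace detail {

// Hypothetical helper: a per-block shared memory budget that still leaves room
// for MIN_BLOCKS_PER_SM resident blocks on each SM.
inline auto choose_shared_mem_budget(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto const per_block = get_max_shared_mem_per_block(device_id);
  auto const per_sm    = get_max_shared_mem_per_sm(device_id);
  return std::min(per_block, per_sm / MIN_BLOCKS_PER_SM);
}

// Hypothetical helper: an upper bound on simultaneously resident threads for
// the whole device, useful for sizing a grid-stride launch.
inline auto max_resident_threads(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  return get_sm_count(device_id) * get_max_threads_per_sm(device_id);
}

} // namespace detail
} // namespace fil
} // namespace ML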