gpu_introspection.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
21 
22 #include <cuda_runtime_api.h>
23 
24 #include <vector>
25 
26 namespace ML {
27 namespace fil {
28 namespace detail {
29 
32 {
33  auto thread_local cache = std::vector<int>{};
34  if (cache.size() == 0) {
35  auto device_count = int{};
36  raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
37  cache.resize(device_count);
38  for (auto dev = 0; dev < device_count; ++dev) {
40  cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
41  }
42  }
43  return index_type(cache.at(device_id.value()));
44 }
45 
47 {
48  auto thread_local cache = std::vector<int>{};
49  if (cache.size() == 0) {
50  auto device_count = int{};
51  raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
52  cache.resize(device_count);
53  for (auto dev = 0; dev < device_count; ++dev) {
55  cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
56  }
57  }
58  return index_type(cache.at(device_id.value()));
59 }
60 
62 {
63  auto result = int{};
65  cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value()));
66  return index_type(result);
67 }
68 
70 {
71  auto thread_local cache = std::vector<int>{};
72  if (cache.size() == 0) {
73  auto device_count = int{};
74  raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
75  cache.resize(device_count);
76  for (auto dev = 0; dev < device_count; ++dev) {
78  cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
79  }
80  }
81  return index_type(cache.at(device_id.value()));
82 }
83 
85 {
86  auto result = int{};
88  cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
89  return index_type(result);
90 }
91 
93 {
94  auto result = int{};
95  raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
96  return index_type(result);
97 }
98 
99 /* The maximum number of bytes that can be read in a single instruction */
100 auto constexpr static const MAX_READ_CHUNK = index_type{128};
101 auto constexpr static const MAX_BLOCKS = index_type{65536};
102 auto constexpr static const WARP_SIZE = index_type{32};
103 auto constexpr static const MAX_THREADS_PER_BLOCK = index_type{256};
104 #ifdef __CUDACC__
105 #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
106  __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 1200 || __CUDA_ARCH__ == 1210
107 auto constexpr static const MAX_THREADS_PER_SM = index_type{1024};
108 #else
109 auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
110 #endif
111 #else
112 auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
113 #endif
114 
115 auto constexpr static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
116 
117 } // namespace detail
118 } // namespace fil
119 } // namespace ML
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:46
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:61
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:84
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:69
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:92
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:30
uint32_t index_type
Definition: index_type.hpp:20
Definition: dbscan.hpp:29
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:26
detail::device_id< D > device_id
Definition: device_id.hpp:29
Definition: base.hpp:22