gpu_introspection.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
21 
22 #include <cuda_runtime_api.h>
23 
24 #include <vector>
25 
26 namespace ML {
27 namespace experimental {
28 namespace fil {
29 namespace detail {
30 
/**
 * @brief Return the maximum opt-in shared memory per block (in bytes) for the
 * given GPU device, as reported by cudaDevAttrMaxSharedMemoryPerBlockOptin.
 *
 * The attribute is queried once for every device on first use and cached in a
 * thread_local vector, so repeated calls avoid the driver round-trip.
 * NOTE(review): signature reconstructed from the documentation index; the
 * original extraction dropped it along with the cuda_check wrapper.
 */
inline auto get_max_shared_mem_per_block(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // One cache per host thread; empty until the first query populates it.
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
    }
  }
  // at() bounds-checks in case device_id is out of range for this thread's cache.
  return index_type(cache.at(device_id.value()));
}
46 
/**
 * @brief Return the number of streaming multiprocessors (SMs) on the given GPU
 * device, as reported by cudaDevAttrMultiProcessorCount.
 *
 * The attribute is queried once for every device on first use and cached in a
 * thread_local vector, so repeated calls avoid the driver round-trip.
 * NOTE(review): signature reconstructed from the documentation index; the
 * original extraction dropped it along with the cuda_check wrapper.
 */
inline auto get_sm_count(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // One cache per host thread; empty until the first query populates it.
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
    }
  }
  // at() bounds-checks in case device_id is out of range for this thread's cache.
  return index_type(cache.at(device_id.value()));
}
61 
/**
 * @brief Return the maximum number of resident threads per SM for the given
 * GPU device, as reported by cudaDevAttrMaxThreadsPerMultiProcessor.
 *
 * Unlike the cached getters in this header, this queries the attribute on
 * every call. NOTE(review): signature reconstructed from the documentation
 * index; the original extraction dropped it along with the cuda_check wrapper.
 */
inline auto get_max_threads_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value()));
  return index_type(result);
}
69 
/**
 * @brief Return the maximum shared memory per SM (in bytes) for the given GPU
 * device, as reported by cudaDevAttrMaxSharedMemoryPerMultiprocessor.
 *
 * The attribute is queried once for every device on first use and cached in a
 * thread_local vector, so repeated calls avoid the driver round-trip.
 * NOTE(review): signature reconstructed from the documentation index; the
 * original extraction dropped it along with the cuda_check wrapper.
 */
inline auto get_max_shared_mem_per_sm(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // One cache per host thread; empty until the first query populates it.
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
    }
  }
  // at() bounds-checks in case device_id is out of range for this thread's cache.
  return index_type(cache.at(device_id.value()));
}
84 
/**
 * @brief Return the peak memory clock frequency (in kHz, per the CUDA runtime
 * documentation for cudaDevAttrMemoryClockRate) of the given GPU device.
 *
 * Queried on every call; not cached. NOTE(review): signature reconstructed
 * from the documentation index; the original extraction dropped it along with
 * the cuda_check wrapper.
 */
inline auto get_mem_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
  return index_type(result);
}
92 
/**
 * @brief Return the peak SM clock frequency (in kHz, per the CUDA runtime
 * documentation for cudaDevAttrClockRate) of the given GPU device.
 *
 * Queried on every call; not cached. NOTE(review): signature reconstructed
 * from the documentation index; the original extraction dropped it.
 */
inline auto get_core_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
  return index_type(result);
}
99 
100 /* The maximum number of bytes that can be read in a single instruction */
101 auto constexpr static const MAX_READ_CHUNK = index_type{128};
102 auto constexpr static const MAX_BLOCKS = index_type{65536};
103 auto constexpr static const WARP_SIZE = index_type{32};
104 auto constexpr static const MAX_THREADS_PER_BLOCK = index_type{256};
105 #ifdef __CUDACC__
106 #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
107  __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890
108 auto constexpr static const MAX_THREADS_PER_SM = index_type{1024};
109 #else
110 auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
111 #endif
112 #else
113 auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
114 #endif
115 
116 auto constexpr static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
117 
118 } // namespace detail
119 } // namespace fil
120 } // namespace experimental
121 } // namespace ML
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:85
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:47
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:93
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:62
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:70
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:31
uint32_t index_type
Definition: index_type.hpp:21
Definition: dbscan.hpp:30
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:26
detail::device_id< D > device_id
Definition: device_id.hpp:29
Definition: base.hpp:22