11 #include <cuda_runtime_api.h>
22 auto thread_local cache = std::vector<int>{};
23 if (cache.size() == 0) {
24 auto device_count =
int{};
26 cache.resize(device_count);
27 for (
auto dev = 0; dev < device_count; ++dev) {
29 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
37 auto thread_local cache = std::vector<int>{};
38 if (cache.size() == 0) {
39 auto device_count =
int{};
41 cache.resize(device_count);
42 for (
auto dev = 0; dev < device_count; ++dev) {
44 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
54 cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor,
device_id.value()));
60 auto thread_local cache = std::vector<int>{};
61 if (cache.size() == 0) {
62 auto device_count =
int{};
64 cache.resize(device_count);
65 for (
auto dev = 0; dev < device_count; ++dev) {
67 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
77 cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate,
device_id.value()));
89 auto constexpr
static const MAX_READ_CHUNK =
index_type{128};
90 auto constexpr
static const MAX_BLOCKS =
index_type{65536};
91 auto constexpr
static const WARP_SIZE =
index_type{32};
92 auto constexpr
static const MAX_THREADS_PER_BLOCK =
index_type{256};
94 #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
95 __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 1200 || __CUDA_ARCH__ == 1210
96 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{1024};
98 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
101 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
104 auto constexpr
static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:35
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:50
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:73
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:58
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:81
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:19
uint32_t index_type
Definition: index_type.hpp:9
Definition: dbscan.hpp:18
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:15
detail::device_id< D > device_id
Definition: device_id.hpp:18