22 #include <cuda_runtime_api.h>
33 auto thread_local cache = std::vector<int>{};
34 if (cache.size() == 0) {
35 auto device_count =
int{};
37 cache.resize(device_count);
38 for (
auto dev = 0; dev < device_count; ++dev) {
40 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
48 auto thread_local cache = std::vector<int>{};
49 if (cache.size() == 0) {
50 auto device_count =
int{};
52 cache.resize(device_count);
53 for (
auto dev = 0; dev < device_count; ++dev) {
55 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
65 cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor,
device_id.value()));
71 auto thread_local cache = std::vector<int>{};
72 if (cache.size() == 0) {
73 auto device_count =
int{};
75 cache.resize(device_count);
76 for (
auto dev = 0; dev < device_count; ++dev) {
78 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
88 cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate,
device_id.value()));
100 auto constexpr
static const MAX_READ_CHUNK =
index_type{128};
101 auto constexpr
static const MAX_BLOCKS =
index_type{65536};
102 auto constexpr
static const WARP_SIZE =
index_type{32};
103 auto constexpr
static const MAX_THREADS_PER_BLOCK =
index_type{256};
// MAX_THREADS_PER_SM is chosen at preprocessing time: it is 1024 when
// __CUDA_ARCH__ equals one of the values tested below, and 2048 in the two
// other definitions that follow.
// NOTE(review): the directives that separate the three definitions (the
// matching #else / #endif, and presumably an enclosing conditional for the
// last one) are not visible in this listing -- confirm against the full header.
105 #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
106 __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 1200 || __CUDA_ARCH__ == 1210
// Value used for the architectures matched by the condition above.
107 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{1024};
// Value used when the condition above does not hold.
109 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
// Second 2048 definition; which condition selects it is not shown here.
112 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
115 auto constexpr
static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:46
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:61
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:84
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:69
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:92
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:30
uint32_t index_type
Definition: index_type.hpp:20
Definition: dbscan.hpp:29
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:26
detail::device_id< D > device_id
Definition: device_id.hpp:29