22 #include <cuda_runtime_api.h>
27 namespace experimental {
34 auto thread_local cache = std::vector<int>{};
35 if (cache.size() == 0) {
36 auto device_count =
int{};
38 cache.resize(device_count);
39 for (
auto dev = 0; dev < device_count; ++dev) {
41 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
49 auto thread_local cache = std::vector<int>{};
50 if (cache.size() == 0) {
51 auto device_count =
int{};
53 cache.resize(device_count);
54 for (
auto dev = 0; dev < device_count; ++dev) {
56 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
66 cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor,
device_id.value()));
72 auto thread_local cache = std::vector<int>{};
73 if (cache.size() == 0) {
74 auto device_count =
int{};
76 cache.resize(device_count);
77 for (
auto dev = 0; dev < device_count; ++dev) {
79 cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
89 cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate,
device_id.value()));
101 auto constexpr
static const MAX_READ_CHUNK =
index_type{128};
102 auto constexpr
static const MAX_BLOCKS =
index_type{65536};
103 auto constexpr
static const WARP_SIZE =
index_type{32};
104 auto constexpr
static const MAX_THREADS_PER_BLOCK =
index_type{256};
// Compile-time selection of MAX_THREADS_PER_SM based on __CUDA_ARCH__.
// When __CUDA_ARCH__ is one of 720, 750, 860, 870 or 890 the value is 1024;
// the remaining definitions below use 2048.
// NOTE(review): the #else / #endif directives (and any enclosing guards) that
// separate the three definitions are not visible in this excerpt -- confirm
// the exact branch structure against the full file before editing.
106 #if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || \
107 __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890
// Value used for the architectures listed in the condition above.
108 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{1024};
// Value used when the architecture condition above does not match.
// NOTE(review): presumably the #else branch of that condition -- confirm.
110 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
// Second 2048 definition.
// NOTE(review): presumably the fallback for an outer guard whose directive is
// not visible here (e.g. __CUDA_ARCH__ undefined on host passes) -- confirm.
113 auto constexpr
static const MAX_THREADS_PER_SM =
index_type{2048};
116 auto constexpr
static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:85
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:47
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:93
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:62
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:70
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:31
uint32_t index_type
Definition: index_type.hpp:21
Definition: dbscan.hpp:30
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:26
detail::device_id< D > device_id
Definition: device_id.hpp:29