dense.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include "base.hpp"
8 
9 #include <raft/core/handle.hpp>
10 #include <raft/linalg/add.cuh>
11 #include <raft/linalg/ternary_op.cuh>
12 #include <raft/util/cuda_utils.cuh>
13 #include <raft/util/cudart_utils.hpp>
14 
15 #include <iostream>
16 #include <vector>
17 // #TODO: Replace with public header when ready
18 #include <raft/linalg/detail/cublas_wrappers.hpp>
19 #include <raft/linalg/map_then_reduce.cuh>
20 #include <raft/linalg/norm.cuh>
21 #include <raft/linalg/unary_op.cuh>
22 
23 #include <rmm/device_uvector.hpp>
24 
25 namespace ML {
26 
27 enum STORAGE_ORDER { COL_MAJOR = 0, ROW_MAJOR = 1 };
28 
29 template <typename T>
32  int len;
33  T* data;
34 
35  STORAGE_ORDER ord; // storage order: runtime param for compile time sake
36 
37  SimpleDenseMat(STORAGE_ORDER order = COL_MAJOR) : Super(0, 0), data(nullptr), len(0), ord(order)
38  {
39  }
40 
41  SimpleDenseMat(T* data, int m, int n, STORAGE_ORDER order = COL_MAJOR)
42  : Super(m, n), data(data), len(m * n), ord(order)
43  {
44  }
45 
46  void reset(T* data_, int m_, int n_)
47  {
48  this->m = m_;
49  this->n = n_;
50  data = data_;
51  len = m_ * n_;
52  }
53 
54  // Implemented GEMM as a static method here to improve readability
55  inline static void gemm(const raft::handle_t& handle,
56  const T alpha,
57  const SimpleDenseMat<T>& A,
58  const bool transA,
59  const SimpleDenseMat<T>& B,
60  const bool transB,
61  const T beta,
63  cudaStream_t stream)
64  {
65  int kA = A.n;
66  int kB = B.m;
67 
68  if (transA) {
69  ASSERT(A.n == C.m, "GEMM invalid dims: m");
70  kA = A.m;
71  } else {
72  ASSERT(A.m == C.m, "GEMM invalid dims: m");
73  }
74 
75  if (transB) {
76  ASSERT(B.m == C.n, "GEMM invalid dims: n");
77  kB = B.n;
78  } else {
79  ASSERT(B.n == C.n, "GEMM invalid dims: n");
80  }
81  ASSERT(kA == kB, "GEMM invalid dims: k");
82 
83  if (A.ord == COL_MAJOR && B.ord == COL_MAJOR && C.ord == COL_MAJOR) {
84  // #TODO: Call from public API when ready
85  raft::linalg::detail::cublasgemm(handle.get_cublas_handle(), // handle
86  transA ? CUBLAS_OP_T : CUBLAS_OP_N, // transA
87  transB ? CUBLAS_OP_T : CUBLAS_OP_N, // transB
88  C.m,
89  C.n,
90  kA, // dimensions m,n,k
91  &alpha,
92  A.data,
93  A.m, // lda
94  B.data,
95  B.m, // ldb
96  &beta,
97  C.data,
98  C.m, // ldc,
99  stream);
100  return;
101  }
102  if (A.ord == ROW_MAJOR) {
103  const SimpleDenseMat<T> Acm(A.data, A.n, A.m, COL_MAJOR);
104  gemm(handle, alpha, Acm, !transA, B, transB, beta, C, stream);
105  return;
106  }
107  if (B.ord == ROW_MAJOR) {
108  const SimpleDenseMat<T> Bcm(B.data, B.n, B.m, COL_MAJOR);
109  gemm(handle, alpha, A, transA, Bcm, !transB, beta, C, stream);
110  return;
111  }
112  if (C.ord == ROW_MAJOR) {
113  SimpleDenseMat<T> Ccm(C.data, C.n, C.m, COL_MAJOR);
114  gemm(handle, alpha, B, !transB, A, !transA, beta, Ccm, stream);
115  return;
116  }
117  }
118 
119  inline void gemmb(const raft::handle_t& handle,
120  const T alpha,
121  const SimpleDenseMat<T>& A,
122  const bool transA,
123  const bool transB,
124  const T beta,
126  cudaStream_t stream) const override
127  {
128  SimpleDenseMat<T>::gemm(handle, alpha, A, transA, *this, transB, beta, C, stream);
129  }
130 
138  inline void assign_gemm(const raft::handle_t& handle,
139  const T alpha,
140  const SimpleDenseMat<T>& A,
141  const bool transA,
142  const SimpleMat<T>& B,
143  const bool transB,
144  const T beta,
145  cudaStream_t stream)
146  {
147  B.gemmb(handle, alpha, A, transA, transB, beta, *this, stream);
148  }
149 
150  // this = a*x
151  inline void ax(const T a, const SimpleDenseMat<T>& x, cudaStream_t stream)
152  {
153  ASSERT(ord == x.ord, "SimpleDenseMat::ax: Storage orders must match");
154 
155  auto scale = [a] __device__(const T x) { return a * x; };
156  raft::linalg::unaryOp(data, x.data, len, scale, stream);
157  }
158 
159  // this = a*x + y
160  inline void axpy(const T a,
161  const SimpleDenseMat<T>& x,
162  const SimpleDenseMat<T>& y,
163  cudaStream_t stream)
164  {
165  ASSERT(ord == x.ord, "SimpleDenseMat::axpy: Storage orders must match");
166  ASSERT(ord == y.ord, "SimpleDenseMat::axpy: Storage orders must match");
167 
168  auto axpy = [a] __device__(const T x, const T y) { return a * x + y; };
169  raft::linalg::binaryOp(data, x.data, y.data, len, axpy, stream);
170  }
171 
172  template <typename Lambda>
173  inline void assign_unary(const SimpleDenseMat<T>& other, Lambda f, cudaStream_t stream)
174  {
175  ASSERT(ord == other.ord, "SimpleDenseMat::assign_unary: Storage orders must match");
176 
177  raft::linalg::unaryOp(data, other.data, len, f, stream);
178  }
179 
180  template <typename Lambda>
181  inline void assign_binary(const SimpleDenseMat<T>& other1,
182  const SimpleDenseMat<T>& other2,
183  Lambda& f,
184  cudaStream_t stream)
185  {
186  ASSERT(ord == other1.ord, "SimpleDenseMat::assign_binary: Storage orders must match");
187  ASSERT(ord == other2.ord, "SimpleDenseMat::assign_binary: Storage orders must match");
188 
189  raft::linalg::binaryOp(data, other1.data, other2.data, len, f, stream);
190  }
191 
192  template <typename Lambda>
193  inline void assign_ternary(const SimpleDenseMat<T>& other1,
194  const SimpleDenseMat<T>& other2,
195  const SimpleDenseMat<T>& other3,
196  Lambda& f,
197  cudaStream_t stream)
198  {
199  ASSERT(ord == other1.ord, "SimpleDenseMat::assign_ternary: Storage orders must match");
200  ASSERT(ord == other2.ord, "SimpleDenseMat::assign_ternary: Storage orders must match");
201  ASSERT(ord == other3.ord, "SimpleDenseMat::assign_ternary: Storage orders must match");
202 
203  raft::linalg::ternaryOp(data, other1.data, other2.data, other3.data, len, f, stream);
204  }
205 
206  inline void fill(const T val, cudaStream_t stream)
207  {
208  // TODO this reads data unnecessary, though it's mostly used for testing
209  auto f = [val] __device__(const T x) { return val; };
210  raft::linalg::unaryOp(data, data, len, f, stream);
211  }
212 
213  inline void copy_async(const SimpleDenseMat<T>& other, cudaStream_t stream)
214  {
215  ASSERT((ord == other.ord) && (this->m == other.m) && (this->n == other.n),
216  "SimpleDenseMat::copy: matrices not compatible");
217 
218  RAFT_CUDA_TRY(
219  cudaMemcpyAsync(data, other.data, len * sizeof(T), cudaMemcpyDeviceToDevice, stream));
220  }
221 
222  void print(std::ostream& oss) const override { oss << (*this) << std::endl; }
223 
224  void operator=(const SimpleDenseMat<T>& other) = delete;
225 };
226 
227 template <typename T>
230 
231  SimpleVec(T* data, const int n) : Super(data, n, 1, COL_MAJOR) {}
232  // this = alpha * A * x + beta * this
233  void assign_gemv(const raft::handle_t& handle,
234  const T alpha,
235  const SimpleDenseMat<T>& A,
236  bool transA,
237  const SimpleVec<T>& x,
238  const T beta,
239  cudaStream_t stream)
240  {
241  Super::assign_gemm(handle, alpha, A, transA, x, false, beta, stream);
242  }
243 
245 
246  inline void reset(T* new_data, int n) { Super::reset(new_data, n, 1); }
247 };
248 
249 template <typename T>
250 inline void col_ref(const SimpleDenseMat<T>& mat, SimpleVec<T>& mask_vec, int c)
251 {
252  ASSERT(mat.ord == COL_MAJOR, "col_ref only available for column major mats");
253  T* tmp = &mat.data[mat.m * c];
254  mask_vec.reset(tmp, mat.m);
255 }
256 
257 template <typename T>
258 inline void col_slice(const SimpleDenseMat<T>& mat,
259  SimpleDenseMat<T>& mask_mat,
260  int c_from,
261  int c_to)
262 {
263  ASSERT(c_from >= 0 && c_from < mat.n, "col_slice: invalid from");
264  ASSERT(c_to >= 0 && c_to <= mat.n, "col_slice: invalid to");
265 
266  ASSERT(mat.ord == COL_MAJOR, "col_ref only available for column major mats");
267  ASSERT(mask_mat.ord == COL_MAJOR, "col_ref only available for column major mask");
268  T* tmp = &mat.data[mat.m * c_from];
269  mask_mat.reset(tmp, mat.m, c_to - c_from);
270 }
271 
272 // Reductions such as dot or norm require an additional location in dev mem
273 // to hold the result. We don't want to deal with this in the SimpleVec class
274 // as it impedes thread safety and constness
275 
276 template <typename T>
277 inline T dot(const SimpleVec<T>& u, const SimpleVec<T>& v, T* tmp_dev, cudaStream_t stream)
278 {
279  auto f = [] __device__(const T x, const T y) { return x * y; };
280  raft::linalg::mapThenSumReduce(tmp_dev, u.len, f, stream, u.data, v.data);
281  T tmp_host;
282  raft::update_host(&tmp_host, tmp_dev, 1, stream);
283 
285  return tmp_host;
286 }
287 
288 template <typename T>
289 inline T squaredNorm(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
290 {
291  return dot(u, u, tmp_dev, stream);
292 }
293 
294 template <typename T>
295 inline T nrmMax(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
296 {
297  auto f = [] __device__(const T x) { return raft::abs<T>(x); };
298  auto r = [] __device__(const T x, const T y) { return raft::max<T>(x, y); };
299  raft::linalg::mapThenReduce(tmp_dev, u.len, T(0), f, r, stream, u.data);
300  T tmp_host;
301  raft::update_host(&tmp_host, tmp_dev, 1, stream);
303  return tmp_host;
304 }
305 
306 template <typename T>
307 inline T nrm2(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
308 {
309  return raft::mySqrt<T>(squaredNorm(u, tmp_dev, stream));
310 }
311 
312 template <typename T>
313 inline T nrm1(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
314 {
315  raft::linalg::rowNorm<raft::linalg::NormType::L1Norm, true>(
316  tmp_dev, u.data, u.len, 1, stream, raft::Nop<T>());
317  T tmp_host;
318  raft::update_host(&tmp_host, tmp_dev, 1, stream);
320  return tmp_host;
321 }
322 
323 template <typename T>
324 std::ostream& operator<<(std::ostream& os, const SimpleVec<T>& v)
325 {
326  std::vector<T> out(v.len);
327  raft::update_host(&out[0], v.data, v.len, 0);
328  raft::interruptible::synchronize(rmm::cuda_stream_view());
329  int it = 0;
330  for (; it < v.len - 1;) {
331  os << out[it] << " ";
332  it++;
333  }
334  os << out[it];
335  return os;
336 }
337 
338 template <typename T>
339 std::ostream& operator<<(std::ostream& os, const SimpleDenseMat<T>& mat)
340 {
341  os << "ord=" << (mat.ord == COL_MAJOR ? "CM" : "RM") << "\n";
342  std::vector<T> out(mat.len);
343  raft::update_host(&out[0], mat.data, mat.len, rmm::cuda_stream_default);
344  raft::interruptible::synchronize(rmm::cuda_stream_view());
345  if (mat.ord == COL_MAJOR) {
346  for (int r = 0; r < mat.m; r++) {
347  int idx = r;
348  for (int c = 0; c < mat.n - 1; c++) {
349  os << out[idx] << ",";
350  idx += mat.m;
351  }
352  os << out[idx] << std::endl;
353  }
354  } else {
355  for (int c = 0; c < mat.m; c++) {
356  int idx = c * mat.n;
357  for (int r = 0; r < mat.n - 1; r++) {
358  os << out[idx] << ",";
359  idx += 1;
360  }
361  os << out[idx] << std::endl;
362  }
363  }
364 
365  return os;
366 }
367 
368 template <typename T>
371  typedef rmm::device_uvector<T> Buffer;
373 
374  SimpleVecOwning() = delete;
375 
376  SimpleVecOwning(int n, cudaStream_t stream) : Super(), buf(n, stream)
377  {
378  Super::reset(buf.data(), n);
379  }
380 
381  void operator=(const SimpleVec<T>& other) = delete;
382 };
383 
384 template <typename T>
387  typedef rmm::device_uvector<T> Buffer;
389  using Super::m;
390  using Super::n;
391  using Super::ord;
392 
393  SimpleMatOwning() = delete;
394 
395  SimpleMatOwning(int m, int n, cudaStream_t stream, STORAGE_ORDER order = COL_MAJOR)
396  : Super(order), buf(m * n, stream)
397  {
398  Super::reset(buf.data(), m, n);
399  }
400 
401  void operator=(const SimpleVec<T>& other) = delete;
402 };
403 
404 }; // namespace ML
Definition: dbscan.hpp:18
void col_slice(const SimpleDenseMat< T > &mat, SimpleDenseMat< T > &mask_mat, int c_from, int c_to)
Definition: dense.hpp:258
T nrm1(const SimpleVec< T > &u, T *tmp_dev, cudaStream_t stream)
Definition: dense.hpp:313
std::ostream & operator<<(std::ostream &os, const SimpleVec< T > &v)
Definition: dense.hpp:324
T nrmMax(const SimpleVec< T > &u, T *tmp_dev, cudaStream_t stream)
Definition: dense.hpp:295
T squaredNorm(const SimpleVec< T > &u, T *tmp_dev, cudaStream_t stream)
Definition: dense.hpp:289
T dot(const SimpleVec< T > &u, const SimpleVec< T > &v, T *tmp_dev, cudaStream_t stream)
Definition: dense.hpp:277
T nrm2(const SimpleVec< T > &u, T *tmp_dev, cudaStream_t stream)
Definition: dense.hpp:307
STORAGE_ORDER
Definition: dense.hpp:27
@ ROW_MAJOR
Definition: dense.hpp:27
@ COL_MAJOR
Definition: dense.hpp:27
void col_ref(const SimpleDenseMat< T > &mat, SimpleVec< T > &mask_vec, int c)
Definition: dense.hpp:250
void synchronize(cuda_stream stream)
Definition: cuda_stream.hpp:16
Definition: dense.hpp:30
void fill(const T val, cudaStream_t stream)
Definition: dense.hpp:206
void assign_binary(const SimpleDenseMat< T > &other1, const SimpleDenseMat< T > &other2, Lambda &f, cudaStream_t stream)
Definition: dense.hpp:181
void assign_gemm(const raft::handle_t &handle, const T alpha, const SimpleDenseMat< T > &A, const bool transA, const SimpleMat< T > &B, const bool transB, const T beta, cudaStream_t stream)
Definition: dense.hpp:138
static void gemm(const raft::handle_t &handle, const T alpha, const SimpleDenseMat< T > &A, const bool transA, const SimpleDenseMat< T > &B, const bool transB, const T beta, SimpleDenseMat< T > &C, cudaStream_t stream)
Definition: dense.hpp:55
void ax(const T a, const SimpleDenseMat< T > &x, cudaStream_t stream)
Definition: dense.hpp:151
void gemmb(const raft::handle_t &handle, const T alpha, const SimpleDenseMat< T > &A, const bool transA, const bool transB, const T beta, SimpleDenseMat< T > &C, cudaStream_t stream) const override
Definition: dense.hpp:119
void assign_unary(const SimpleDenseMat< T > &other, Lambda f, cudaStream_t stream)
Definition: dense.hpp:173
SimpleDenseMat(T *data, int m, int n, STORAGE_ORDER order=COL_MAJOR)
Definition: dense.hpp:41
void axpy(const T a, const SimpleDenseMat< T > &x, const SimpleDenseMat< T > &y, cudaStream_t stream)
Definition: dense.hpp:160
SimpleDenseMat(STORAGE_ORDER order=COL_MAJOR)
Definition: dense.hpp:37
void assign_ternary(const SimpleDenseMat< T > &other1, const SimpleDenseMat< T > &other2, const SimpleDenseMat< T > &other3, Lambda &f, cudaStream_t stream)
Definition: dense.hpp:193
int len
Definition: dense.hpp:32
void operator=(const SimpleDenseMat< T > &other)=delete
void copy_async(const SimpleDenseMat< T > &other, cudaStream_t stream)
Definition: dense.hpp:213
T * data
Definition: dense.hpp:33
void print(std::ostream &oss) const override
Definition: dense.hpp:222
SimpleMat< T > Super
Definition: dense.hpp:31
void reset(T *data_, int m_, int n_)
Definition: dense.hpp:46
STORAGE_ORDER ord
Definition: dense.hpp:35
Definition: dense.hpp:385
int m
Definition: base.hpp:18
SimpleMatOwning(int m, int n, cudaStream_t stream, STORAGE_ORDER order=COL_MAJOR)
Definition: dense.hpp:395
SimpleMatOwning()=delete
Buffer buf
Definition: dense.hpp:388
int n
Definition: base.hpp:18
rmm::device_uvector< T > Buffer
Definition: dense.hpp:387
SimpleDenseMat< T > Super
Definition: dense.hpp:386
void operator=(const SimpleVec< T > &other)=delete
Definition: base.hpp:17
int m
Definition: base.hpp:18
int n
Definition: base.hpp:18
virtual void gemmb(const raft::handle_t &handle, const T alpha, const SimpleDenseMat< T > &A, const bool transA, const bool transB, const T beta, SimpleDenseMat< T > &C, cudaStream_t stream) const =0
Definition: dense.hpp:369
void operator=(const SimpleVec< T > &other)=delete
SimpleVecOwning()=delete
SimpleVecOwning(int n, cudaStream_t stream)
Definition: dense.hpp:376
SimpleVec< T > Super
Definition: dense.hpp:370
rmm::device_uvector< T > Buffer
Definition: dense.hpp:371
Buffer buf
Definition: dense.hpp:372
Definition: dense.hpp:228
SimpleDenseMat< T > Super
Definition: dense.hpp:229
void reset(T *new_data, int n)
Definition: dense.hpp:246
void assign_gemv(const raft::handle_t &handle, const T alpha, const SimpleDenseMat< T > &A, bool transA, const SimpleVec< T > &x, const T beta, cudaStream_t stream)
Definition: dense.hpp:233
SimpleVec(T *data, const int n)
Definition: dense.hpp:231
SimpleVec()
Definition: dense.hpp:244