arima_common.h
/*
 * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <raft/util/cudart_utils.hpp>

#include <rmm/aligned.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>

namespace ML {

/**
 * Structure to hold the ARIMA order (orders, seasonality, intercept and
 * exogenous settings), making it easy to pass as a single argument
 */
struct ARIMAOrder {
  int p;  // Basic order
  int d;
  int q;
  int P;  // Seasonal order
  int D;
  int Q;
  int s;       // Seasonal period
  int k;       // Fit intercept?
  int n_exog;  // Number of exogenous regressors

  inline int n_diff() const { return d + s * D; }
  inline int n_phi() const { return p + s * P; }
  inline int n_theta() const { return q + s * Q; }
  inline int r() const { return std::max(n_phi(), n_theta() + 1); }
  inline int rd() const { return n_diff() + r(); }
  inline int complexity() const { return p + P + q + Q + k + n_exog + 1; }
  inline bool need_diff() const { return static_cast<bool>(d + D); }
};
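
// A worked example of the derived quantities (illustration only, not part of
// the original header): for a seasonal ARIMA(1,1,1)(1,1,1) model with period
// s = 12, an intercept and no exogenous regressors, i.e.
// ARIMAOrder{1, 1, 1, 1, 1, 1, 12, 1, 0}:
//   n_diff()     = d + s * D       = 1 + 12 * 1 = 13
//   n_phi()      = p + s * P       = 1 + 12 * 1 = 13
//   n_theta()    = q + s * Q       = 1 + 12 * 1 = 13
//   r()          = max(13, 13 + 1) = 14
//   rd()         = 13 + 14         = 27
//   complexity() = 1 + 1 + 1 + 1 + 1 + 0 + 1 = 6 parameters per series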

/**
 * Structure to hold the parameters of a batch of ARIMA models
 * (one device array per parameter type)
 */
template <typename DataT>
struct ARIMAParams {
  DataT* mu     = nullptr;
  DataT* beta   = nullptr;
  DataT* ar     = nullptr;
  DataT* ma     = nullptr;
  DataT* sar    = nullptr;
  DataT* sma    = nullptr;
  DataT* sigma2 = nullptr;

  /**
   * Allocate the parameter device arrays with the current device resource.
   *
   * @param[in] order      ARIMA order
   * @param[in] batch_size Number of series in the batch
   * @param[in] stream     CUDA stream
   * @param[in] tr         Whether these are transformed parameters (mu and
   *                       beta are not transformed, so they are skipped)
   */
  void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
  {
    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
    if (order.k && !tr)
      mu = (DataT*)rmm_alloc.allocate_async(
        batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.n_exog && !tr)
      beta = (DataT*)rmm_alloc.allocate_async(
        order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.p)
      ar = (DataT*)rmm_alloc.allocate_async(
        order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.q)
      ma = (DataT*)rmm_alloc.allocate_async(
        order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.P)
      sar = (DataT*)rmm_alloc.allocate_async(
        order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.Q)
      sma = (DataT*)rmm_alloc.allocate_async(
        order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    sigma2 = (DataT*)rmm_alloc.allocate_async(
      batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
  }

  /**
   * Deallocate the parameter device arrays allocated by allocate().
   * Must be called with the same order, batch size and tr flag.
   *
   * @param[in] order      ARIMA order
   * @param[in] batch_size Number of series in the batch
   * @param[in] stream     CUDA stream
   * @param[in] tr         Whether these are transformed parameters
   */
  void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
  {
    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
    if (order.k && !tr)
      rmm_alloc.deallocate_async(
        mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.n_exog && !tr)
      rmm_alloc.deallocate_async(
        beta, order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.p)
      rmm_alloc.deallocate_async(
        ar, order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.q)
      rmm_alloc.deallocate_async(
        ma, order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.P)
      rmm_alloc.deallocate_async(
        sar, order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.Q)
      rmm_alloc.deallocate_async(
        sma, order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    rmm_alloc.deallocate_async(
      sigma2, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
  }

  /**
   * Pack the separate parameter arrays into a single contiguous vector,
   * one segment of order.complexity() values per batch member.
   *
   * @param[in]  order      ARIMA order
   * @param[in]  batch_size Number of series in the batch
   * @param[out] param_vec  Linear array of all parameters
   *                        (size: batch_size * order.complexity())
   * @param[in]  stream     CUDA stream
   */
  void pack(const ARIMAOrder& order, int batch_size, DataT* param_vec, cudaStream_t stream) const
  {
    int N         = order.complexity();
    auto counting = thrust::make_counting_iterator(0);
    // The device lambda can't capture structure members...
    const DataT *_mu = mu, *_beta = beta, *_ar = ar, *_ma = ma, *_sar = sar, *_sma = sma,
                *_sigma2 = sigma2;
    thrust::for_each(
      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
        DataT* param = param_vec + bid * N;
        if (order.k) {
          *param = _mu[bid];
          param++;
        }
        for (int i = 0; i < order.n_exog; i++) {
          param[i] = _beta[order.n_exog * bid + i];
        }
        param += order.n_exog;
        for (int ip = 0; ip < order.p; ip++) {
          param[ip] = _ar[order.p * bid + ip];
        }
        param += order.p;
        for (int iq = 0; iq < order.q; iq++) {
          param[iq] = _ma[order.q * bid + iq];
        }
        param += order.q;
        for (int iP = 0; iP < order.P; iP++) {
          param[iP] = _sar[order.P * bid + iP];
        }
        param += order.P;
        for (int iQ = 0; iQ < order.Q; iQ++) {
          param[iQ] = _sma[order.Q * bid + iQ];
        }
        param += order.Q;
        *param = _sigma2[bid];
      });
  }
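
  // Layout written by pack() for batch member `bid`, with N = order.complexity()
  // (illustration of the code above):
  //   param_vec[bid * N ... (bid + 1) * N - 1] =
  //     [ mu (k) | beta (n_exog) | ar (p) | ma (q) | sar (P) | sma (Q) | sigma2 (1) ]
  // unpack() below reads the same layout back into the separate arrays.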

  /**
   * Unpack a single contiguous parameter vector into the separate arrays
   * (the exact inverse of pack()).
   *
   * @param[in] order      ARIMA order
   * @param[in] batch_size Number of series in the batch
   * @param[in] param_vec  Linear array of all parameters
   *                       (size: batch_size * order.complexity())
   * @param[in] stream     CUDA stream
   */
  void unpack(const ARIMAOrder& order, int batch_size, const DataT* param_vec, cudaStream_t stream)
  {
    int N         = order.complexity();
    auto counting = thrust::make_counting_iterator(0);
    // The device lambda can't capture structure members...
    DataT *_mu = mu, *_beta = beta, *_ar = ar, *_ma = ma, *_sar = sar, *_sma = sma,
          *_sigma2 = sigma2;
    thrust::for_each(
      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
        const DataT* param = param_vec + bid * N;
        if (order.k) {
          _mu[bid] = *param;
          param++;
        }
        for (int i = 0; i < order.n_exog; i++) {
          _beta[order.n_exog * bid + i] = param[i];
        }
        param += order.n_exog;
        for (int ip = 0; ip < order.p; ip++) {
          _ar[order.p * bid + ip] = param[ip];
        }
        param += order.p;
        for (int iq = 0; iq < order.q; iq++) {
          _ma[order.q * bid + iq] = param[iq];
        }
        param += order.q;
        for (int iP = 0; iP < order.P; iP++) {
          _sar[order.P * bid + iP] = param[iP];
        }
        param += order.P;
        for (int iQ = 0; iQ < order.Q; iQ++) {
          _sma[order.Q * bid + iQ] = param[iQ];
        }
        param += order.Q;
        _sigma2[bid] = *param;
      });
  }
};
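
// Usage sketch (illustration only; `d_x`, `batch_size` and `stream` are
// assumed to come from the caller, and the order below is hypothetical):
//
//   ML::ARIMAOrder order{1, 1, 1, 0, 0, 0, 0, 1, 0};  // ARIMA(1,1,1) + intercept
//   ML::ARIMAParams<double> params;
//   params.allocate(order, batch_size, stream);
//   params.unpack(order, batch_size, d_x, stream);  // d_x: flat optimizer vector
//   // ... evaluate the model using the separate arrays ...
//   params.pack(order, batch_size, d_x, stream);    // write the flat layout back
//   params.deallocate(order, batch_size, stream);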

/**
 * Structure to manage the temporary memory used by the batched ARIMA
 * solvers: all arrays are sub-allocated from a single caller-provided
 * buffer, each offset rounded up to ALIGN bytes.
 */
template <typename T, int ALIGN = 256>
struct ARIMAMemory {
  T *params_mu, *params_beta, *params_ar, *params_ma, *params_sar, *params_sma, *params_sigma2,
    *Tparams_ar, *Tparams_ma, *Tparams_sar, *Tparams_sma, *Tparams_sigma2, *d_params, *d_Tparams,
    *Z_dense, *R_dense, *T_dense, *RQ_dense, *RQR_dense, *P_dense, *alpha_dense, *ImT_dense,
    *ImT_inv_dense, *v_tmp_dense, *m_tmp_dense, *K_dense, *TP_dense, *pred, *y_diff, *exog_diff,
    *loglike, *loglike_base, *loglike_pert, *x_pert, *I_m_AxA_dense, *I_m_AxA_inv_dense, *Ts_dense,
    *RQRs_dense, *Ps_dense;
  T **Z_batches, **R_batches, **T_batches, **RQ_batches, **RQR_batches, **P_batches,
    **alpha_batches, **ImT_batches, **ImT_inv_batches, **v_tmp_batches, **m_tmp_batches,
    **K_batches, **TP_batches, **I_m_AxA_batches, **I_m_AxA_inv_batches, **Ts_batches,
    **RQRs_batches, **Ps_batches;
  int *ImT_inv_P, *ImT_inv_info, *I_m_AxA_P, *I_m_AxA_info;

  size_t size;

 protected:
  char* buf;

  template <bool assign, typename ValType>
  inline void append_buffer(ValType*& ptr, size_t n_elem)
  {
    if (assign) { ptr = reinterpret_cast<ValType*>(buf + size); }
    size += ((n_elem * sizeof(ValType) + ALIGN - 1) / ALIGN) * ALIGN;
  }
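
  // Rounding example for the expression above (illustration only): with
  // ALIGN = 256, n_elem = 100 and sizeof(ValType) = 8, the raw size is
  // 800 bytes, padded up to ((800 + 255) / 256) * 256 = 1024 so that the
  // next array starts on a 256-byte boundary.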

  template <bool assign>
  inline void buf_offsets(const ARIMAOrder& order, int batch_size, int n_obs, char* in_buf = nullptr)
  {
    buf  = in_buf;
    size = 0;

    int r      = order.r();
    int rd     = order.rd();
    int N      = order.complexity();
    int n_diff = order.n_diff();

    append_buffer<assign>(params_mu, order.k * batch_size);
    append_buffer<assign>(params_beta, order.n_exog * batch_size);
    append_buffer<assign>(params_ar, order.p * batch_size);
    append_buffer<assign>(params_ma, order.q * batch_size);
    append_buffer<assign>(params_sar, order.P * batch_size);
    append_buffer<assign>(params_sma, order.Q * batch_size);
    append_buffer<assign>(params_sigma2, batch_size);

    append_buffer<assign>(Tparams_ar, order.p * batch_size);
    append_buffer<assign>(Tparams_ma, order.q * batch_size);
    append_buffer<assign>(Tparams_sar, order.P * batch_size);
    append_buffer<assign>(Tparams_sma, order.Q * batch_size);
    append_buffer<assign>(Tparams_sigma2, batch_size);

    append_buffer<assign>(d_params, N * batch_size);
    append_buffer<assign>(d_Tparams, N * batch_size);
    append_buffer<assign>(Z_dense, rd * batch_size);
    append_buffer<assign>(Z_batches, batch_size);
    append_buffer<assign>(R_dense, rd * batch_size);
    append_buffer<assign>(R_batches, batch_size);
    append_buffer<assign>(T_dense, rd * rd * batch_size);
    append_buffer<assign>(T_batches, batch_size);
    append_buffer<assign>(RQ_dense, rd * batch_size);
    append_buffer<assign>(RQ_batches, batch_size);
    append_buffer<assign>(RQR_dense, rd * rd * batch_size);
    append_buffer<assign>(RQR_batches, batch_size);
    append_buffer<assign>(P_dense, rd * rd * batch_size);
    append_buffer<assign>(P_batches, batch_size);
    append_buffer<assign>(alpha_dense, rd * batch_size);
    append_buffer<assign>(alpha_batches, batch_size);
    append_buffer<assign>(ImT_dense, r * r * batch_size);
    append_buffer<assign>(ImT_batches, batch_size);
    append_buffer<assign>(ImT_inv_dense, r * r * batch_size);
    append_buffer<assign>(ImT_inv_batches, batch_size);
    append_buffer<assign>(ImT_inv_P, r * batch_size);
    append_buffer<assign>(ImT_inv_info, batch_size);
    append_buffer<assign>(v_tmp_dense, rd * batch_size);
    append_buffer<assign>(v_tmp_batches, batch_size);
    append_buffer<assign>(m_tmp_dense, rd * rd * batch_size);
    append_buffer<assign>(m_tmp_batches, batch_size);
    append_buffer<assign>(K_dense, rd * batch_size);
    append_buffer<assign>(K_batches, batch_size);
    append_buffer<assign>(TP_dense, rd * rd * batch_size);
    append_buffer<assign>(TP_batches, batch_size);

    append_buffer<assign>(pred, n_obs * batch_size);
    append_buffer<assign>(y_diff, n_obs * batch_size);
    append_buffer<assign>(exog_diff, n_obs * order.n_exog * batch_size);
    append_buffer<assign>(loglike, batch_size);
    append_buffer<assign>(loglike_base, batch_size);
    append_buffer<assign>(loglike_pert, batch_size);
    append_buffer<assign>(x_pert, N * batch_size);

    if (n_diff > 0) {
      append_buffer<assign>(Ts_dense, r * r * batch_size);
      append_buffer<assign>(Ts_batches, batch_size);
      append_buffer<assign>(RQRs_dense, r * r * batch_size);
      append_buffer<assign>(RQRs_batches, batch_size);
      append_buffer<assign>(Ps_dense, r * r * batch_size);
      append_buffer<assign>(Ps_batches, batch_size);
    }

    if (r <= 5) {
      // Note: temp mem for the direct Lyapunov solver grows very quickly!
      // This solver is used iff the condition above is satisfied
      append_buffer<assign>(I_m_AxA_dense, r * r * r * r * batch_size);
      append_buffer<assign>(I_m_AxA_batches, batch_size);
      append_buffer<assign>(I_m_AxA_inv_dense, r * r * r * r * batch_size);
      append_buffer<assign>(I_m_AxA_inv_batches, batch_size);
      append_buffer<assign>(I_m_AxA_P, r * r * batch_size);
      append_buffer<assign>(I_m_AxA_info, batch_size);
    }
  }
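
  // Design note (describing the code above): buf_offsets makes two kinds of
  // passes over the same list of append_buffer calls. With assign = false
  // (protected constructor below) it only accumulates `size`; with
  // assign = true (public constructor) it also points each member into
  // `in_buf`. Keeping both passes in a single function guarantees that the
  // computed size and the assigned offsets cannot drift apart.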

  /** Protected constructor: runs the buffer layout in measuring mode only,
   *  computing the total size without assigning any pointers */
  ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs)
  {
    buf_offsets<false>(order, batch_size, n_obs);
  }

 public:
  /**
   * Constructor: assigns all the array pointers into the given buffer.
   *
   * @param[in] order      ARIMA order
   * @param[in] batch_size Number of series in the batch
   * @param[in] n_obs      Length of the series
   * @param[in] in_buf     Pointer to the temporary memory buffer; it must be
   *                       at least compute_size(order, batch_size, n_obs) bytes
   */
  ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs, char* in_buf)
  {
    buf_offsets<true>(order, batch_size, n_obs, in_buf);
  }

  /**
   * Compute the size (in bytes) of the temporary buffer required for the
   * given problem dimensions.
   *
   * @param[in] order      ARIMA order
   * @param[in] batch_size Number of series in the batch
   * @param[in] n_obs      Length of the series
   * @return Buffer size in bytes
   */
  static size_t compute_size(const ARIMAOrder& order, int batch_size, int n_obs)
  {
    ARIMAMemory temp(order, batch_size, n_obs);
    return temp.size;
  }
};
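
// Usage sketch (illustration only; the rmm::device_uvector temporary and the
// stream are assumptions about the calling code, not part of this header):
//
//   size_t bytes = ML::ARIMAMemory<double>::compute_size(order, batch_size, n_obs);
//   rmm::device_uvector<char> temp(bytes, stream);
//   ML::ARIMAMemory<double> mem(order, batch_size, n_obs, temp.data());
//   // mem.pred, mem.y_diff, mem.T_dense, ... now point into `temp`
//   // at ALIGN-byte aligned offsets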

}  // namespace ML