#include <raft/util/cudart_utils.hpp>

#include <rmm/aligned.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
/**
 * Structure to hold the ARIMA order: p, d, q are the classical order, P, D, Q
 * the seasonal order with period s, k whether to fit an intercept, and n_exog
 * the number of exogenous regressors
 */
struct ARIMAOrder {
  int p, d, q, P, D, Q, s, k, n_exog;

  inline int n_diff() const { return d + s * D; }
  inline int n_phi() const { return p + s * P; }
  inline int n_theta() const { return q + s * Q; }
  inline int r() const { return std::max(n_phi(), n_theta() + 1); }
  inline int rd() const { return n_diff() + r(); }
  inline int complexity() const { return p + P + q + Q + k + n_exog + 1; }

  // Whether any simple or seasonal differencing is needed
  inline bool need_diff() const { return static_cast<bool>(d + D); }
};
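/*
 * Illustrative sketch (not part of the header), assuming the member order
 * declared above: a seasonal model such as (1,1,1)(0,1,1)_12 with no intercept
 * and no exogenous regressors has d + D = 2, so need_diff() returns true.
 *
 * @code
 * ARIMAOrder order{1, 1, 1, 0, 1, 1, 12, 0, 0};  // p d q P D Q s k n_exog
 * assert(order.need_diff());
 * assert(order.complexity() == 4);  // ar, ma, sma, sigma2
 * @endcode
 */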
/**
 * Structure to hold the ARIMA parameters: separate device arrays, each of
 * size <number of parameters of this type> * batch_size
 */
template <typename DataT>
struct ARIMAParams {
  DataT* mu     = nullptr;
  DataT* beta   = nullptr;
  DataT* ar     = nullptr;
  DataT* ma     = nullptr;
  DataT* sar    = nullptr;
  DataT* sma    = nullptr;
  DataT* sigma2 = nullptr;

  /**
   * Allocate the parameter device arrays. Arrays are only allocated when the
   * corresponding order component is nonzero; mu and beta are skipped when
   * allocating transformed parameters (tr = true)
   */
  void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
  {
    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
    if (order.k && !tr)
      mu = (DataT*)rmm_alloc.allocate_async(
        batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.n_exog && !tr)
      beta = (DataT*)rmm_alloc.allocate_async(
        order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.p)
      ar = (DataT*)rmm_alloc.allocate_async(
        order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.q)
      ma = (DataT*)rmm_alloc.allocate_async(
        order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.P)
      sar = (DataT*)rmm_alloc.allocate_async(
        order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.Q)
      sma = (DataT*)rmm_alloc.allocate_async(
        order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    sigma2 = (DataT*)rmm_alloc.allocate_async(
      batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
  }
  /**
   * Deallocate the parameter device arrays (counterpart of allocate; must be
   * called with the same order, batch_size and tr so the sizes match)
   */
  void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
  {
    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
    if (order.k && !tr)
      rmm_alloc.deallocate_async(
        mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.n_exog && !tr)
      rmm_alloc.deallocate_async(
        beta, order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.p)
      rmm_alloc.deallocate_async(
        ar, order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.q)
      rmm_alloc.deallocate_async(
        ma, order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.P)
      rmm_alloc.deallocate_async(
        sar, order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    if (order.Q)
      rmm_alloc.deallocate_async(
        sma, order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
    rmm_alloc.deallocate_async(
      sigma2, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
  }
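  /*
   * Illustrative lifecycle sketch (not part of the header): release with the
   * same arguments that were used to allocate, so the sizes passed to
   * deallocate_async match the original stream-ordered allocations.
   *
   * @code
   * ARIMAParams<double> params;
   * params.allocate(order, batch_size, stream);
   * // ... fill / use the parameter arrays on `stream` ...
   * params.deallocate(order, batch_size, stream);
   * @endcode
   */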
  /**
   * Pack the separate parameter arrays into a unified parameter vector, in
   * ascending order of batch id: [mu, beta, ar, ma, sar, sma, sigma2] for
   * each batch member
   */
  void pack(const ARIMAOrder& order, int batch_size, DataT* param_vec, cudaStream_t stream) const
  {
    int N         = order.complexity();
    auto counting = thrust::make_counting_iterator(0);
    // The device lambda can't capture struct members, so use local copies
    const DataT *_mu = mu, *_beta = beta, *_ar = ar, *_ma = ma, *_sar = sar, *_sma = sma,
                *_sigma2 = sigma2;
    thrust::for_each(
      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
        DataT* param = param_vec + bid * N;
        if (order.k) {
          *param = _mu[bid];
          param++;
        }
        for (int i = 0; i < order.n_exog; i++) {
          param[i] = _beta[order.n_exog * bid + i];
        }
        param += order.n_exog;
        for (int ip = 0; ip < order.p; ip++) {
          param[ip] = _ar[order.p * bid + ip];
        }
        param += order.p;
        for (int iq = 0; iq < order.q; iq++) {
          param[iq] = _ma[order.q * bid + iq];
        }
        param += order.q;
        for (int iP = 0; iP < order.P; iP++) {
          param[iP] = _sar[order.P * bid + iP];
        }
        param += order.P;
        for (int iQ = 0; iQ < order.Q; iQ++) {
          param[iQ] = _sma[order.Q * bid + iQ];
        }
        param += order.Q;
        *param = _sigma2[bid];
      });
  }
  /**
   * Unpack a unified parameter vector into the separate parameter arrays
   * (inverse of pack, same layout)
   */
  void unpack(const ARIMAOrder& order, int batch_size, const DataT* param_vec, cudaStream_t stream)
  {
    int N         = order.complexity();
    auto counting = thrust::make_counting_iterator(0);
    // The device lambda can't capture struct members, so use local copies
    DataT *_mu = mu, *_beta = beta, *_ar = ar, *_ma = ma, *_sar = sar, *_sma = sma,
          *_sigma2 = sigma2;
    thrust::for_each(
      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
        const DataT* param = param_vec + bid * N;
        if (order.k) {
          _mu[bid] = *param;
          param++;
        }
        for (int i = 0; i < order.n_exog; i++) {
          _beta[order.n_exog * bid + i] = param[i];
        }
        param += order.n_exog;
        for (int ip = 0; ip < order.p; ip++) {
          _ar[order.p * bid + ip] = param[ip];
        }
        param += order.p;
        for (int iq = 0; iq < order.q; iq++) {
          _ma[order.q * bid + iq] = param[iq];
        }
        param += order.q;
        for (int iP = 0; iP < order.P; iP++) {
          _sar[order.P * bid + iP] = param[iP];
        }
        param += order.P;
        for (int iQ = 0; iQ < order.Q; iQ++) {
          _sma[order.Q * bid + iQ] = param[iQ];
        }
        param += order.Q;
        _sigma2[bid] = *param;
      });
  }
};
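/*
 * Illustrative sketch (not part of the header): optimizers work on the unified
 * vector of order.complexity() values per batch member, while the filters use
 * the separate arrays; pack and unpack convert between the two layouts.
 *
 * @code
 * rmm::device_uvector<double> x(order.complexity() * batch_size, stream);
 * params.pack(order, batch_size, x.data(), stream);    // arrays -> vector
 * // ... update x on the device ...
 * params.unpack(order, batch_size, x.data(), stream);  // vector -> arrays
 * @endcode
 */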
/**
 * Structure to manage all the device memory used by the ARIMA computations:
 * a single buffer is allocated once and the individual work arrays are
 * aligned sub-buffers carved out of it
 */
template <typename T, int ALIGN = 256>
struct ARIMAMemory {
  T *params_mu, *params_beta, *params_ar, *params_ma, *params_sar, *params_sma, *params_sigma2,
    *Tparams_ar, *Tparams_ma, *Tparams_sar, *Tparams_sma, *Tparams_sigma2, *d_params, *d_Tparams,
    *Z_dense, *R_dense, *T_dense, *RQ_dense, *RQR_dense, *P_dense, *alpha_dense, *ImT_dense,
    *ImT_inv_dense, *v_tmp_dense, *m_tmp_dense, *K_dense, *TP_dense, *y_diff, *exog_diff, *pred,
    *loglike, *loglike_base, *loglike_pert, *x_pert, *I_m_AxA_dense, *I_m_AxA_inv_dense,
    *Ts_dense, *RQRs_dense, *Ps_dense;
  T **Z_batches, **R_batches, **T_batches, **RQ_batches, **RQR_batches, **P_batches,
    **alpha_batches, **ImT_batches, **ImT_inv_batches, **v_tmp_batches, **m_tmp_batches,
    **K_batches, **TP_batches, **I_m_AxA_batches, **I_m_AxA_inv_batches, **Ts_batches,
    **RQRs_batches, **Ps_batches;
  int *ImT_inv_P, *ImT_inv_info, *I_m_AxA_P, *I_m_AxA_info;

  size_t size;

 protected:
  char* buf;

  // Reserve an aligned sub-buffer for n_elem elements; in the assigning pass,
  // also point ptr at its offset within the main buffer
  template <bool assign, typename ValType>
  inline void append_buffer(ValType*& ptr, size_t n_elem)
  {
    if (assign) { ptr = reinterpret_cast<ValType*>(buf + size); }
    size += ((n_elem * sizeof(ValType) + ALIGN - 1) / ALIGN) * ALIGN;
  }
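  /*
   * Illustrative sketch (not part of the header) of the align-and-bump
   * arithmetic above: each sub-buffer is rounded up to a multiple of ALIGN
   * bytes, e.g. 100 doubles (800 bytes) reserve 1024 bytes when ALIGN = 256.
   *
   * @code
   * constexpr size_t align_up(size_t bytes, size_t align)
   * {
   *   return ((bytes + align - 1) / align) * align;
   * }
   * static_assert(align_up(100 * sizeof(double), 256) == 1024, "");
   * @endcode
   */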
  // Lay out all the sub-buffers. With assign=false this pass only accumulates
  // the total size; with assign=true it also assigns the member pointers
  template <bool assign>
  inline void buf_offsets(const ARIMAOrder& order, int batch_size, int n_obs, char* in_buf = nullptr)
  {
    buf  = in_buf;
    size = 0;

    int r      = order.r();
    int rd     = order.rd();
    int N      = order.complexity();
    int n_diff = order.n_diff();

    // Parameters and their transformed counterparts
    append_buffer<assign>(params_mu, order.k * batch_size);
    append_buffer<assign>(params_beta, order.n_exog * batch_size);
    append_buffer<assign>(params_ar, order.p * batch_size);
    append_buffer<assign>(params_ma, order.q * batch_size);
    append_buffer<assign>(params_sar, order.P * batch_size);
    append_buffer<assign>(params_sma, order.Q * batch_size);
    append_buffer<assign>(params_sigma2, batch_size);

    append_buffer<assign>(Tparams_ar, order.p * batch_size);
    append_buffer<assign>(Tparams_ma, order.q * batch_size);
    append_buffer<assign>(Tparams_sar, order.P * batch_size);
    append_buffer<assign>(Tparams_sma, order.Q * batch_size);
    append_buffer<assign>(Tparams_sigma2, batch_size);

    append_buffer<assign>(d_params, N * batch_size);
    append_buffer<assign>(d_Tparams, N * batch_size);

    // Kalman filter matrices: dense data plus batched pointer arrays
    append_buffer<assign>(Z_dense, rd * batch_size);
    append_buffer<assign>(Z_batches, batch_size);
    append_buffer<assign>(R_dense, rd * batch_size);
    append_buffer<assign>(R_batches, batch_size);
    append_buffer<assign>(T_dense, rd * rd * batch_size);
    append_buffer<assign>(T_batches, batch_size);
    append_buffer<assign>(RQ_dense, rd * batch_size);
    append_buffer<assign>(RQ_batches, batch_size);
    append_buffer<assign>(RQR_dense, rd * rd * batch_size);
    append_buffer<assign>(RQR_batches, batch_size);
    append_buffer<assign>(P_dense, rd * rd * batch_size);
    append_buffer<assign>(P_batches, batch_size);
    append_buffer<assign>(alpha_dense, rd * batch_size);
    append_buffer<assign>(alpha_batches, batch_size);
    append_buffer<assign>(ImT_dense, r * r * batch_size);
    append_buffer<assign>(ImT_batches, batch_size);
    append_buffer<assign>(ImT_inv_dense, r * r * batch_size);
    append_buffer<assign>(ImT_inv_batches, batch_size);
    append_buffer<assign>(ImT_inv_P, r * batch_size);
    append_buffer<assign>(ImT_inv_info, batch_size);
    append_buffer<assign>(v_tmp_dense, rd * batch_size);
    append_buffer<assign>(v_tmp_batches, batch_size);
    append_buffer<assign>(m_tmp_dense, rd * rd * batch_size);
    append_buffer<assign>(m_tmp_batches, batch_size);
    append_buffer<assign>(K_dense, rd * batch_size);
    append_buffer<assign>(K_batches, batch_size);
    append_buffer<assign>(TP_dense, rd * rd * batch_size);
    append_buffer<assign>(TP_batches, batch_size);

    append_buffer<assign>(pred, n_obs * batch_size);
    append_buffer<assign>(y_diff, n_obs * batch_size);
    append_buffer<assign>(exog_diff, n_obs * order.n_exog * batch_size);
    append_buffer<assign>(loglike, batch_size);
    append_buffer<assign>(loglike_base, batch_size);
    append_buffer<assign>(loglike_pert, batch_size);
    append_buffer<assign>(x_pert, N * batch_size);

    // Extra buffers used when the series are differenced
    if (n_diff > 0) {
      append_buffer<assign>(Ts_dense, r * r * batch_size);
      append_buffer<assign>(Ts_batches, batch_size);
      append_buffer<assign>(RQRs_dense, r * r * batch_size);
      append_buffer<assign>(RQRs_batches, batch_size);
      append_buffer<assign>(Ps_dense, r * r * batch_size);
      append_buffer<assign>(Ps_batches, batch_size);
    }

    if (r <= 5) {
      // Note: temp mem for the direct Lyapunov solver grows very quickly!
      // This solver is used iff the condition above is satisfied
      append_buffer<assign>(I_m_AxA_dense, r * r * r * r * batch_size);
      append_buffer<assign>(I_m_AxA_batches, batch_size);
      append_buffer<assign>(I_m_AxA_inv_dense, r * r * r * r * batch_size);
      append_buffer<assign>(I_m_AxA_inv_batches, batch_size);
      append_buffer<assign>(I_m_AxA_P, r * r * batch_size);
      append_buffer<assign>(I_m_AxA_info, batch_size);
    }
  }

 public:
  /** Constructor that only computes the required buffer size */
  ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs)
  {
    buf_offsets<false>(order, batch_size, n_obs);
  }

  /** Constructor that assigns all pointers into a pre-allocated buffer */
  ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs, char* in_buf)
  {
    buf_offsets<true>(order, batch_size, n_obs, in_buf);
  }

  /** Compute the buffer size needed for the given problem dimensions */
  static size_t compute_size(const ARIMAOrder& order, int batch_size, int n_obs)
  {
    ARIMAMemory temp(order, batch_size, n_obs);
    return temp.size;
  }
};
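/*
 * Illustrative usage sketch (not part of the header): size the scratch buffer
 * with the count-only pass, allocate it once, then construct the assigning
 * object over it so all members point inside that single allocation.
 *
 * @code
 * size_t bytes = ARIMAMemory<double>::compute_size(order, batch_size, n_obs);
 * rmm::device_uvector<char> scratch(bytes, stream);
 * ARIMAMemory<double> mem(order, batch_size, n_obs, scratch.data());
 * // mem.Z_dense, mem.T_dense, ... now point into `scratch`
 * @endcode
 */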