cpu.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include <cuml/fil/constants.hpp>
18 #include <cuml/fil/infer_kind.hpp>
19 
20 #include <cstddef>
21 #include <optional>
22 
23 namespace ML {
24 namespace fil {
25 namespace detail {
26 namespace inference {
27 
28 /* A wrapper around the underlying inference kernels to support dispatching to
29  * the right kernel
30  *
31  * This specialization is used for CPU inference and for requests for GPU
32  * inference on non-GPU-enabled builds. An exception will be thrown if a
33  * request is made for GPU on inference on a non-GPU-enabled build.
34  *
35  * @tparam D The type of device (CPU/GPU) on which to perform inference.
36  * @tparam has_categorical_nodes Whether or not any node in the model has
37  * categorical splits.
38  * @tparam vector_output_t If non-nullptr_t, the type of vector leaf output
39  * @tparam categorical_data_t If non-nullptr_t, the type of non-local
40  * categorical data storage
41  *
42  * @param forest The forest to be used for inference.
43  * @param postproc The postprocessor object to be used for postprocessing raw
44  * output from the forest.
45  * @param row_count The number of rows in the input
46  * @param col_count The number of columns per row in the input
47  * @param output_count The number of output elements per row
48  * @param vector_output If non-nullptr, a pointer to storage for vector leaf
49  * outputs
50  * @param categorical_data If non-nullptr, a pointer to non-local storage for
51  * data on categorical splits.
52  * @param infer_type Type of inference to perform. Defaults to summing the outputs of all trees
53  * and produce an output per row. If set to "per_tree", we will instead output all outputs of
54  * individual trees. If set to "leaf_id", we will output the integer ID of the leaf node
55  * for each tree.
56  * @param specified_chunk_size If non-nullopt, the mini-batch size used for
57  * processing rows in a batch. For CPU inference, this essentially determines
58  * the granularity of parallelism. A larger chunk size means that a single
59  * thread will process more rows for its assigned trees before fetching a
60  * new batch of rows. In general, so long as the chunk size remains much
61  * smaller than the batch size (minimally less than the batch size divided by
62  * the number of available cores), larger batches see improved performance with
63  * larger chunk sizes. Unlike for GPU, any positive value is valid (up to
64  * hardware constraints), but it is recommended to test powers of 2 from 1
65  * (for individual row inference) to 512 (for very large batch
66  * inference). A value of 64 is a generally-useful default.
67  */
68 template <raft_proto::device_type D,
69  bool has_categorical_nodes,
70  typename forest_t,
71  typename vector_output_t = std::nullptr_t,
72  typename categorical_data_t = std::nullptr_t>
73 std::enable_if_t<std::disjunction_v<std::bool_constant<D == raft_proto::device_type::cpu>,
74  std::bool_constant<!raft_proto::GPU_ENABLED>>,
75  void>
76 infer(forest_t const& forest,
78  typename forest_t::io_type* output,
79  typename forest_t::io_type* input,
80  index_type row_count,
81  index_type col_count,
82  index_type output_count,
83  vector_output_t vector_output = nullptr,
84  categorical_data_t categorical_data = nullptr,
86  std::optional<index_type> specified_chunk_size = std::nullopt,
89 {
90  if constexpr (D == raft_proto::device_type::gpu) {
91  throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build");
92  } else {
93  if (infer_type == infer_kind::leaf_id) {
94  infer_kernel_cpu<has_categorical_nodes, true>(
95  forest,
96  postproc,
97  output,
98  input,
99  row_count,
100  col_count,
101  output_count,
102  specified_chunk_size.value_or(hardware_constructive_interference_size),
103  hardware_constructive_interference_size,
104  vector_output,
105  categorical_data,
106  infer_type);
107  } else {
108  infer_kernel_cpu<has_categorical_nodes, false>(
109  forest,
110  postproc,
111  output,
112  input,
113  row_count,
114  col_count,
115  output_count,
116  specified_chunk_size.value_or(hardware_constructive_interference_size),
117  hardware_constructive_interference_size,
118  vector_output,
119  categorical_data,
120  infer_type);
121  }
122  }
123 }
124 
125 /* This macro is invoked here to declare all standard specializations of this
126  * template as extern. This ensures that this (relatively complex) code is
127  * compiled as few times as possible. A macro is used because every
128  * specialization must be explicitly declared. The final argument to the macro
129  * is the index of the specialization variant being declared. */
131 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 1)
132 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 2)
133 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 3)
134 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
135 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
136 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
137 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
138 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 8)
139 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 9)
140 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 10)
141 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 11)
142 
143 } // namespace inference
144 } // namespace detail
145 } // namespace fil
146 
147 } // namespace ML
#define CUML_FIL_INFER_ALL(template_type, dev, variant_index)
Definition: infer_macros.hpp:137
std::enable_if_t< std::disjunction_v< std::bool_constant< D==raft_proto::device_type::cpu >, std::bool_constant<!raft_proto::GPU_ENABLED > >, void > infer(forest_t const &forest, postprocessor< typename forest_t::io_type > const &postproc, typename forest_t::io_type *output, typename forest_t::io_type *input, index_type row_count, index_type col_count, index_type output_count, vector_output_t vector_output=nullptr, categorical_data_t categorical_data=nullptr, infer_kind infer_type=infer_kind::default_kind, std::optional< index_type > specified_chunk_size=std::nullopt, raft_proto::device_id< D > device=raft_proto::device_id< D >{}, raft_proto::cuda_stream=raft_proto::cuda_stream{})
Definition: cpu.hpp:76
infer_kind
Definition: infer_kind.hpp:8
uint32_t index_type
Definition: index_type.hpp:9
Definition: dbscan.hpp:18
Definition: buffer.hpp:24
int cuda_stream
Definition: cuda_stream.hpp:14
device_type
Definition: device_type.hpp:7
Definition: forest.hpp:24
Definition: postprocessor.hpp:135
Definition: base.hpp:11
Definition: gpu_support.hpp:36