cpu.hpp
/*
 * Copyright (c) 2023-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <cuml/fil/constants.hpp>
#include <cuml/fil/detail/cpu_introspection.hpp>
#include <cuml/fil/detail/forest.hpp>
#include <cuml/fil/detail/index_type.hpp>
#include <cuml/fil/detail/infer_kernel/cpu.hpp>
#include <cuml/fil/detail/infer_macros.hpp>
#include <cuml/fil/detail/postprocessor.hpp>
#include <cuml/fil/detail/raft_proto/cuda_stream.hpp>
#include <cuml/fil/detail/raft_proto/device_id.hpp>
#include <cuml/fil/detail/raft_proto/device_type.hpp>
#include <cuml/fil/detail/raft_proto/gpu_support.hpp>
#include <cuml/fil/infer_kind.hpp>

#include <cstddef>
#include <optional>

namespace ML {
namespace fil {
namespace detail {
namespace inference {

/* A wrapper around the underlying inference kernels to support dispatching
 * to the right kernel.
 *
 * This specialization is used for CPU inference and for requests for GPU
 * inference on non-GPU-enabled builds. An exception will be thrown if a
 * request is made for GPU inference on a non-GPU-enabled build.
 *
 * @tparam D The type of device (CPU/GPU) on which to perform inference.
 * @tparam has_categorical_nodes Whether or not any node in the model has
 * categorical splits.
 * @tparam forest_t The type of the forest object on which inference is
 * performed.
 * @tparam vector_output_t If not std::nullptr_t, the type of vector leaf
 * output.
 * @tparam categorical_data_t If not std::nullptr_t, the type of non-local
 * categorical data storage.
 *
 * @param forest The forest to be used for inference.
 * @param postproc The postprocessor object to be used for postprocessing raw
 * output from the forest.
 * @param output Pointer to the buffer where output should be written.
 * @param input Pointer to the buffer containing input rows.
 * @param row_count The number of rows in the input.
 * @param col_count The number of columns per row in the input.
 * @param output_count The number of output elements per row.
 * @param vector_output If non-nullptr, a pointer to storage for vector leaf
 * outputs.
 * @param categorical_data If non-nullptr, a pointer to non-local storage for
 * data on categorical splits.
 * @param infer_type Type of inference to perform. Defaults to summing the
 * outputs of all trees and producing one output per row. If set to
 * "per_tree", we will instead output all outputs of individual trees. If set
 * to "leaf_id", we will output the integer ID of the leaf node for each
 * tree.
 * @param specified_chunk_size If non-nullopt, the mini-batch size used for
 * processing rows in a batch. For CPU inference, this essentially determines
 * the granularity of parallelism. A larger chunk size means that a single
 * thread will process more rows for its assigned trees before fetching a
 * new batch of rows. In general, so long as the chunk size remains much
 * smaller than the batch size (minimally less than the batch size divided by
 * the number of available cores), larger batches see improved performance
 * with larger chunk sizes. Unlike for GPU, any positive value is valid (up
 * to hardware constraints), but it is recommended to test powers of 2 from 1
 * (for individual row inference) to 512 (for very large batch inference). A
 * value of 64 is a generally-useful default; see the usage sketch following
 * this function.
 */
template <raft_proto::device_type D,
          bool has_categorical_nodes,
          typename forest_t,
          typename vector_output_t    = std::nullptr_t,
          typename categorical_data_t = std::nullptr_t>
std::enable_if_t<std::disjunction_v<std::bool_constant<D == raft_proto::device_type::cpu>,
                                    std::bool_constant<!raft_proto::GPU_ENABLED>>,
                 void>
infer(forest_t const& forest,
      postprocessor<typename forest_t::io_type> const& postproc,
      typename forest_t::io_type* output,
      typename forest_t::io_type* input,
      index_type row_count,
      index_type col_count,
      index_type output_count,
      vector_output_t vector_output                  = nullptr,
      categorical_data_t categorical_data            = nullptr,
      infer_kind infer_type                          = infer_kind::default_kind,
      std::optional<index_type> specified_chunk_size = std::nullopt,
      raft_proto::device_id<D> device                = raft_proto::device_id<D>{},
      raft_proto::cuda_stream                        = raft_proto::cuda_stream{})
{
  if constexpr (D == raft_proto::device_type::gpu) {
    throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build");
  } else {
    // Select the kernel variant that records leaf IDs only when
    // infer_kind::leaf_id is requested, since this is a compile-time
    // parameter of the underlying CPU kernel.
    if (infer_type == infer_kind::leaf_id) {
      infer_kernel_cpu<has_categorical_nodes, true>(
        forest,
        postproc,
        output,
        input,
        row_count,
        col_count,
        output_count,
        specified_chunk_size.value_or(hardware_constructive_interference_size),
        hardware_constructive_interference_size,
        vector_output,
        categorical_data,
        infer_type);
    } else {
      infer_kernel_cpu<has_categorical_nodes, false>(
        forest,
        postproc,
        output,
        input,
        row_count,
        col_count,
        output_count,
        specified_chunk_size.value_or(hardware_constructive_interference_size),
        hardware_constructive_interference_size,
        vector_output,
        categorical_data,
        infer_type);
    }
  }
}
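
/* Usage sketch: a minimal illustration of invoking the CPU specialization
 * with an explicit chunk size, as discussed in the documentation above. The
 * wrapper name `run_cpu_inference` and its parameters are hypothetical; only
 * `infer` itself is part of this header.
 *
 * @code
 * template <typename forest_t>
 * void run_cpu_inference(forest_t const& forest,
 *                        postprocessor<typename forest_t::io_type> const& postproc,
 *                        typename forest_t::io_type* output,
 *                        typename forest_t::io_type* input,
 *                        index_type row_count,
 *                        index_type col_count,
 *                        index_type output_count)
 * {
 *   // Chunk size 64 is a generally-useful CPU default; benchmarking powers
 *   // of 2 from 1 (single-row inference) to 512 (very large batches) is
 *   // recommended for a given workload.
 *   infer<raft_proto::device_type::cpu, false, forest_t>(
 *     forest, postproc, output, input, row_count, col_count, output_count,
 *     nullptr, nullptr, infer_kind::default_kind,
 *     std::optional<index_type>{64});
 * }
 * @endcode
 */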

/* This macro is invoked here to declare all standard specializations of this
 * template as extern. This ensures that this (relatively complex) code is
 * compiled as few times as possible. A macro is used because every
 * specialization must be explicitly declared. The final argument to the
 * macro references the 12 specialization variants compiled in standard cuML
 * FIL. */
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 0)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 1)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 2)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 3)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 8)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 9)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 10)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 11)
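
/* For illustration, a generic sketch of the extern-template pattern these
 * declarations rely on (unrelated to the actual expansion of
 * CUML_FIL_INFER_ALL):
 *
 * @code
 * // In a header: every including translation unit sees the declaration and
 * // skips implicit instantiation of this specialization.
 * template <typename T> T twice(T x) { return x + x; }
 * extern template int twice<int>(int);
 *
 * // In exactly one .cpp file: the single explicit instantiation that all
 * // other translation units link against.
 * template int twice<int>(int);
 * @endcode
 */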

}  // namespace inference
}  // namespace detail
}  // namespace fil
}  // namespace ML