cpu.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
30 
31 #include <cstddef>
32 #include <optional>
33 
34 namespace ML {
35 namespace experimental {
36 namespace fil {
37 namespace detail {
38 namespace inference {
39 
40 /* A wrapper around the underlying inference kernels to support dispatching to
41  * the right kernel
42  *
43  * This specialization is used for CPU inference and for requests for GPU
44  * inference on non-GPU-enabled builds. An exception will be thrown if a
45  * request is made for GPU inference on a non-GPU-enabled build.
46  *
47  * @tparam D The type of device (CPU/GPU) on which to perform inference.
48  * @tparam has_categorical_nodes Whether or not any node in the model has
49  * categorical splits.
50  * @tparam vector_output_t If non-nullptr_t, the type of vector leaf output
51  * @tparam categorical_data_t If non-nullptr_t, the type of non-local
52  * categorical data storage
53  *
54  * @param forest The forest to be used for inference.
55  * @param postproc The postprocessor object to be used for postprocessing raw
56  * output from the forest.
57  * @param row_count The number of rows in the input
58  * @param col_count The number of columns per row in the input
59  * @param output_count The number of output elements per row
60  * @param vector_output If non-nullptr, a pointer to storage for vector leaf
61  * outputs
62  * @param categorical_data If non-nullptr, a pointer to non-local storage for
63  * data on categorical splits.
64  * @param infer_type Type of inference to perform. Defaults to summing the outputs of all trees
65  * and producing an output per row. If set to "per_tree", we will instead output all outputs of
66  * individual trees. If set to "leaf_id", we will output the integer ID of the leaf node
67  * for each tree.
68  * @param specified_chunk_size If non-nullopt, the mini-batch size used for
69  * processing rows in a batch. For CPU inference, this essentially determines
70  * the granularity of parallelism. A larger chunk size means that a single
71  * thread will process more rows for its assigned trees before fetching a
72  * new batch of rows. In general, so long as the chunk size remains much
73  * smaller than the batch size (minimally less than the batch size divided by
74  * the number of available cores), larger batches see improved performance with
75  * larger chunk sizes. Unlike for GPU, any positive value is valid (up to
76  * hardware constraints), but it is recommended to test powers of 2 from 1
77  * (for individual row inference) to 512 (for very large batch
78  * inference). A value of 64 is a generally-useful default.
79  */
80 template <raft_proto::device_type D,
81  bool has_categorical_nodes,
82  typename forest_t,
83  typename vector_output_t = std::nullptr_t,
84  typename categorical_data_t = std::nullptr_t>
85 std::enable_if_t<std::disjunction_v<std::bool_constant<D == raft_proto::device_type::cpu>,
86  std::bool_constant<!raft_proto::GPU_ENABLED>>,
87  void>  // NOTE(review): listing lines 88-89, 97 and 99-100 are absent from this view (the function name `infer` and the forest, postproc, infer_type, device and stream parameters, per the signature summary further down the page); confirm against the original header.
90  typename forest_t::io_type* output,
91  typename forest_t::io_type* input,
92  index_type row_count,
93  index_type col_count,
94  index_type output_count,
95  vector_output_t vector_output = nullptr,
96  categorical_data_t categorical_data = nullptr,
98  std::optional<index_type> specified_chunk_size = std::nullopt,
101 {
102  if constexpr (D == raft_proto::device_type::gpu) {  // Given the enable_if constraint above, D == gpu is only instantiable when !raft_proto::GPU_ENABLED.
103  throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build");
104  } else {
105  if (infer_type == infer_kind::leaf_id) {  // leaf_id requests dispatch to the <has_categorical_nodes, true> kernel instantiation.
106  infer_kernel_cpu<has_categorical_nodes, true>(
107  forest,
108  postproc,
109  output,
110  input,
111  row_count,
112  col_count,
113  output_count,
114  specified_chunk_size.value_or(hardware_constructive_interference_size),  // Falls back to hardware_constructive_interference_size when no chunk size was specified.
115  hardware_constructive_interference_size,
116  vector_output,
117  categorical_data,
118  infer_type);
119  } else {  // Every other infer_kind dispatches to the <has_categorical_nodes, false> instantiation with otherwise identical arguments.
120  infer_kernel_cpu<has_categorical_nodes, false>(
121  forest,
122  postproc,
123  output,
124  input,
125  row_count,
126  col_count,
127  output_count,
128  specified_chunk_size.value_or(hardware_constructive_interference_size),  // Same fallback as in the leaf_id branch.
129  hardware_constructive_interference_size,
130  vector_output,
131  categorical_data,
132  infer_type);
133  }
134  }
135 }
136 
137 /* This macro is invoked here to declare all standard specializations of this
138  * template as extern. This ensures that this (relatively complex) code is
139  * compiled as few times as possible. A macro is used because every
140  * specialization must be explicitly declared. The final argument to the macro
141  * references the 8 specialization variants compiled in standard cuML FIL. NOTE(review): only variant indices 1-7 are visible below and listing line 142 is absent from this view; presumably variant 0 was declared there -- confirm against the original header. */
143 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 1)
144 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 2)
145 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 3)
146 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
147 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
148 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
149 CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
150 
151 } // namespace inference
152 } // namespace detail
153 } // namespace fil
154 
155 } // namespace experimental
156 } // namespace ML
#define CUML_FIL_INFER_ALL(template_type, dev, variant_index)
Definition: infer_macros.hpp:148
std::enable_if_t< std::disjunction_v< std::bool_constant< D==raft_proto::device_type::cpu >, std::bool_constant<!raft_proto::GPU_ENABLED > >, void > infer(forest_t const &forest, postprocessor< typename forest_t::io_type > const &postproc, typename forest_t::io_type *output, typename forest_t::io_type *input, index_type row_count, index_type col_count, index_type output_count, vector_output_t vector_output=nullptr, categorical_data_t categorical_data=nullptr, infer_kind infer_type=infer_kind::default_kind, std::optional< index_type > specified_chunk_size=std::nullopt, raft_proto::device_id< D > device=raft_proto::device_id< D >{}, raft_proto::cuda_stream=raft_proto::cuda_stream{})
Definition: cpu.hpp:88
uint32_t index_type
Definition: index_type.hpp:21
infer_kind
Definition: infer_kind.hpp:20
forest< real_t > * forest_t
Definition: fil.h:89
Definition: dbscan.hpp:30
Definition: buffer.hpp:35
int cuda_stream
Definition: cuda_stream.hpp:25
device_type
Definition: device_type.hpp:18
Definition: forest.hpp:36
Definition: postprocessor.hpp:141
Definition: base.hpp:22
Definition: gpu_support.hpp:47