Loading [MathJax]/extensions/tex2jax.js
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
decision_forest_builder.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
26 
27 #include <stdint.h>
28 
29 #include <algorithm>
30 #include <cmath>
31 #include <cstddef>
32 #include <numeric>
33 #include <optional>
34 #include <vector>
35 
36 namespace ML {
37 namespace experimental {
38 namespace fil {
39 namespace detail {
40 
41 /*
42  * Exception indicating that FIL model could not be built from given input
43  */
44 struct model_builder_error : std::exception {
45  model_builder_error() : model_builder_error("Error while building model") {}
46  model_builder_error(char const* msg) : msg_{msg} {}
47  virtual char const* what() const noexcept { return msg_; }
48 
49  private:
50  char const* msg_;
51 };
52 
53 /*
54  * Struct used to build FIL forests
55  */
56 template <typename decision_forest_t>
58  /* The type for nodes in the given decision_forest type */
59  using node_type = typename decision_forest_t::node_type;
60 
61  /* Add a node with a categorical split */
62  template <typename iter_t>
64  iter_t vec_begin,
65  iter_t vec_end,
66  std::optional<int> tl_node_id = std::nullopt,
67  std::size_t depth = std::size_t{1},
68  bool default_to_distant_child = false,
69  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
70  typename node_type::offset_type offset = typename node_type::offset_type{})
71  {
72  auto constexpr const bin_width = index_type(sizeof(typename node_type::index_type) * 8);
73  auto node_value = typename node_type::index_type{};
74  auto set_storage = &node_value;
75  auto max_node_categories = *std::max_element(vec_begin, vec_end) + 1;
76  if (max_num_categories_ > bin_width) {
77  // TODO(wphicks): Check for overflow here
78  node_value = categorical_storage_.size();
79  auto bins_required = raft_proto::ceildiv(max_node_categories, bin_width);
80  categorical_storage_.push_back(max_node_categories);
81  categorical_storage_.resize(categorical_storage_.size() + bins_required);
82  set_storage = &(categorical_storage_[node_value + 1]);
83  }
84  auto set = bitset{set_storage, max_node_categories};
85  std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
86 
87  add_node(
88  node_value, tl_node_id, depth, false, default_to_distant_child, true, feature, offset, false);
89  }
90 
91  /* Add a leaf node with vector output */
92  template <typename iter_t>
93  void add_leaf_vector_node(iter_t vec_begin,
94  iter_t vec_end,
95  std::optional<int> tl_node_id = std::nullopt,
96  std::size_t depth = std::size_t{1})
97  {
98  auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
99  std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
100 
101  add_node(leaf_index,
102  tl_node_id,
103  depth,
104  true,
105  false,
106  false,
107  typename node_type::metadata_storage_type{},
108  typename node_type::offset_type{},
109  false);
110  }
111 
112  /* Add a node to the model */
113  template <typename value_t>
114  void add_node(
115  value_t val,
116  std::optional<int> tl_node_id = std::nullopt,
117  std::size_t depth = std::size_t{1},
118  bool is_leaf_node = true,
119  bool default_to_distant_child = false,
120  bool is_categorical_node = false,
121  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
122  typename node_type::offset_type offset = typename node_type::offset_type{},
123  bool is_inclusive = false)
124  {
125  if (depth == std::size_t{}) {
126  if (alignment_ != index_type{}) {
127  if (cur_node_index_ % alignment_ != index_type{}) {
128  auto padding = (alignment_ - cur_node_index_ % alignment_);
129  for (auto i = index_type{}; i < padding; ++i) {
130  add_node(typename node_type::threshold_type{}, std::nullopt);
131  }
132  }
133  }
134  root_node_indexes_.push_back(cur_node_index_);
135  }
136 
137  if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
138  nodes_.emplace_back(
139  val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
140  // 0 indicates the lack of ID mapping for a particular node
141  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
142  ++cur_node_index_;
143  }
144 
145  /* Set the element-wise postprocessing operation for this model */
146  void set_element_postproc(element_op val) { element_postproc_ = val; }
147  /* Set the row-wise postprocessing operation for this model */
148  void set_row_postproc(row_op val) { row_postproc_ = val; }
149  /* Set the value to divide by during postprocessing */
150  void set_average_factor(double val) { average_factor_ = val; }
151  /* Set the the bias term to remove during postprocessing */
152  void set_bias(double val) { bias_ = val; }
153  /* Set the the value of the constant used in the postprocessing operation
154  * (if any) */
155  void set_postproc_constant(double val) { postproc_constant_ = val; }
156  /* Set the number of outputs per row for this model */
158  {
159  if (output_size_ != index_type{1} && output_size_ != val) {
160  throw model_import_error("Inconsistent leaf vector size");
161  }
162  output_size_ = val;
163  }
164 
166  index_type align_bytes = index_type{})
167  : cur_node_index_{},
168  max_num_categories_{max_num_categories},
169  alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
170  output_size_{1},
171  row_postproc_{},
172  element_postproc_{},
173  average_factor_{},
174  bias_{},
175  postproc_constant_{},
176  nodes_{},
177  root_node_indexes_{},
178  vector_output_{}
179  {
180  }
181 
182  /* Return the FIL decision forest built by this builder */
183  auto get_decision_forest(index_type num_feature,
184  index_type num_class,
186  int device = 0,
188  {
189  // Allow narrowing for preprocessing constants. They are stored as doubles
190  // for consistency in the builder but must be converted to the proper types
191  // for the concrete forest model.
192 #pragma GCC diagnostic push
193 #pragma GCC diagnostic ignored "-Wnarrowing"
194  return decision_forest_t{
196  raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
197  raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()},
198  mem_type,
199  device,
200  stream},
201  raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()},
202  mem_type,
203  device,
204  stream},
205  num_feature,
206  num_class,
207  max_num_categories_ != 0,
208  vector_output_.empty()
209  ? std::nullopt
210  : std::make_optional<raft_proto::buffer<typename node_type::threshold_type>>(
211  raft_proto::buffer{vector_output_.data(), vector_output_.size()},
212  mem_type,
213  device,
214  stream),
215  categorical_storage_.empty()
216  ? std::nullopt
217  : std::make_optional<raft_proto::buffer<typename node_type::index_type>>(
218  raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()},
219  mem_type,
220  device,
221  stream),
222  output_size_,
223  row_postproc_,
224  element_postproc_,
225  static_cast<typename node_type::threshold_type>(average_factor_),
226  static_cast<typename node_type::threshold_type>(bias_),
227  static_cast<typename node_type::threshold_type>(postproc_constant_)};
228 #pragma GCC diagnostic pop
229  }
230 
231  private:
232  index_type cur_node_index_;
233  index_type max_num_categories_;
234  index_type alignment_;
235  index_type output_size_;
236  row_op row_postproc_;
237  element_op element_postproc_;
238  double average_factor_;
239  double bias_;
240  double postproc_constant_;
241 
242  std::vector<node_type> nodes_;
243  std::vector<index_type> root_node_indexes_;
244  std::vector<typename node_type::threshold_type> vector_output_;
245  std::vector<typename node_type::index_type> categorical_storage_;
246  std::vector<index_type> node_id_mapping_;
247 };
248 
249 } // namespace detail
250 } // namespace fil
251 } // namespace experimental
252 } // namespace ML
element_op
Definition: postproc_ops.hpp:29
uint32_t index_type
Definition: index_type.hpp:21
row_op
Definition: postproc_ops.hpp:22
Definition: dbscan.hpp:30
HOST DEVICE constexpr auto ceildiv(T dividend, U divisor)
Definition: ceildiv.hpp:21
int cuda_stream
Definition: cuda_stream.hpp:25
device_type
Definition: device_type.hpp:18
const_agnostic_same_t< T, U > copy(buffer< T > &dst, buffer< U > const &src, typename buffer< T >::index_type dst_offset, typename buffer< U >::index_type src_offset, typename buffer< T >::index_type size, cuda_stream stream)
Definition: buffer.hpp:327
Definition: decision_forest_builder.hpp:57
void set_element_postproc(element_op val)
Definition: decision_forest_builder.hpp:146
typename decision_forest_t::node_type node_type
Definition: decision_forest_builder.hpp:59
void add_leaf_vector_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1})
Definition: decision_forest_builder.hpp:93
void set_average_factor(double val)
Definition: decision_forest_builder.hpp:150
void set_output_size(index_type val)
Definition: decision_forest_builder.hpp:157
void set_postproc_constant(double val)
Definition: decision_forest_builder.hpp:155
void set_bias(double val)
Definition: decision_forest_builder.hpp:152
decision_forest_builder(index_type max_num_categories=index_type{}, index_type align_bytes=index_type{})
Definition: decision_forest_builder.hpp:165
auto get_decision_forest(index_type num_feature, index_type num_class, raft_proto::device_type mem_type=raft_proto::device_type::cpu, int device=0, raft_proto::cuda_stream stream=raft_proto::cuda_stream{})
Definition: decision_forest_builder.hpp:183
void set_row_postproc(row_op val)
Definition: decision_forest_builder.hpp:148
void add_node(value_t val, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1}, bool is_leaf_node=true, bool default_to_distant_child=false, bool is_categorical_node=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{}, bool is_inclusive=false)
Definition: decision_forest_builder.hpp:114
void add_categorical_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1}, bool default_to_distant_child=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{})
Definition: decision_forest_builder.hpp:63
Definition: decision_forest_builder.hpp:44
model_builder_error(char const *msg)
Definition: decision_forest_builder.hpp:46
model_builder_error()
Definition: decision_forest_builder.hpp:45
virtual char const * what() const noexcept
Definition: decision_forest_builder.hpp:47
Definition: exceptions.hpp:36
A container which may or may not own its own data on host or device.
Definition: buffer.hpp:41