decision_forest_builder.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
26 
27 #include <stdint.h>
28 
29 #include <algorithm>
30 #include <cmath>
31 #include <cstddef>
32 #include <numeric>
33 #include <optional>
34 #include <vector>
35 
36 namespace ML {
37 namespace experimental {
38 namespace fil {
39 namespace detail {
40 
41 /*
42  * Exception indicating that FIL model could not be built from given input
43  */
44 struct model_builder_error : std::exception {
45  model_builder_error() : model_builder_error("Error while building model") {}
46  model_builder_error(char const* msg) : msg_{msg} {}
47  virtual char const* what() const noexcept { return msg_; }
48 
49  private:
50  char const* msg_;
51 };
52 
53 /*
54  * Struct used to build FIL forests
55  */
56 template <typename decision_forest_t>
58  /* The type for nodes in the given decision_forest type */
59  using node_type = typename decision_forest_t::node_type;
60 
61  /* Add a root node, indicating the beginning of a new tree */
63  {
64  if (root_node_indexes_.empty()) {
65  root_node_indexes_.emplace_back();
66  } else {
67  max_tree_size_ = std::max(cur_tree_size_, max_tree_size_);
68  if (alignment_ != index_type{}) {
69  if (cur_tree_size_ % alignment_ != index_type{}) {
70  auto padding = (alignment_ - cur_tree_size_ % alignment_);
71  for (auto i = index_type{}; i < padding; ++i) {
72  add_node(typename node_type::threshold_type{}, std::nullopt);
73  }
74  }
75  }
76  root_node_indexes_.push_back(root_node_indexes_.back() + cur_tree_size_);
77  cur_tree_size_ = index_type{};
78  }
79  }
80 
81  /* Add a node with a categorical split */
82  template <typename iter_t>
84  iter_t vec_begin,
85  iter_t vec_end,
86  std::optional<int> tl_node_id = std::nullopt,
87  bool default_to_distant_child = false,
88  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
89  typename node_type::offset_type offset = typename node_type::offset_type{})
90  {
91  auto constexpr const bin_width = index_type(sizeof(typename node_type::index_type) * 8);
92  auto node_value = typename node_type::index_type{};
93  auto set_storage = &node_value;
94  auto max_node_categories = *std::max_element(vec_begin, vec_end) + 1;
95  if (max_num_categories_ > bin_width) {
96  // TODO(wphicks): Check for overflow here
97  node_value = categorical_storage_.size();
98  auto bins_required = raft_proto::ceildiv(max_node_categories, bin_width);
99  categorical_storage_.push_back(max_node_categories);
100  categorical_storage_.resize(categorical_storage_.size() + bins_required);
101  set_storage = &(categorical_storage_[node_value + 1]);
102  }
103  auto set = bitset{set_storage, max_node_categories};
104  std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
105 
106  add_node(node_value, tl_node_id, false, default_to_distant_child, true, feature, offset, false);
107  }
108 
109  /* Add a leaf node with vector output */
110  template <typename iter_t>
111  void add_leaf_vector_node(iter_t vec_begin,
112  iter_t vec_end,
113  std::optional<int> tl_node_id = std::nullopt)
114  {
115  auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
116  std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
117  nodes_.emplace_back(leaf_index,
118  true,
119  false,
120  false,
121  typename node_type::metadata_storage_type{},
122  typename node_type::offset_type{});
123  // 0 indicates the lack of ID mapping for a particular node
124  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
125  ++cur_tree_size_;
126  }
127 
128  /* Add a node to the model */
129  template <typename value_t>
130  void add_node(
131  value_t val,
132  std::optional<int> tl_node_id = std::nullopt,
133  bool is_leaf_node = true,
134  bool default_to_distant_child = false,
135  bool is_categorical_node = false,
136  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
137  typename node_type::offset_type offset = typename node_type::offset_type{},
138  bool is_inclusive = false)
139  {
140  if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
141  nodes_.emplace_back(
142  val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
143  // 0 indicates the lack of ID mapping for a particular node
144  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
145  ++cur_tree_size_;
146  }
147 
148  /* Set the element-wise postprocessing operation for this model */
149  void set_element_postproc(element_op val) { element_postproc_ = val; }
150  /* Set the row-wise postprocessing operation for this model */
151  void set_row_postproc(row_op val) { row_postproc_ = val; }
152  /* Set the value to divide by during postprocessing */
153  void set_average_factor(double val) { average_factor_ = val; }
154  /* Set the the bias term to remove during postprocessing */
155  void set_bias(double val) { bias_ = val; }
156  /* Set the the value of the constant used in the postprocessing operation
157  * (if any) */
158  void set_postproc_constant(double val) { postproc_constant_ = val; }
159  /* Set the number of outputs per row for this model */
161  {
162  if (output_size_ != index_type{1} && output_size_ != val) {
163  throw model_import_error("Inconsistent leaf vector size");
164  }
165  output_size_ = val;
166  }
167 
169  index_type align_bytes = index_type{})
170  : cur_tree_size_{},
171  max_num_categories_{max_num_categories},
172  alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
173  output_size_{1},
174  element_postproc_{},
175  average_factor_{},
176  row_postproc_{},
177  bias_{},
178  postproc_constant_{},
179  max_tree_size_{},
180  nodes_{},
181  root_node_indexes_{},
182  vector_output_{}
183  {
184  }
185 
186  /* Return the FIL decision forest built by this builder */
187  auto get_decision_forest(index_type num_feature,
188  index_type num_class,
190  int device = 0,
192  {
193  // Allow narrowing for preprocessing constants. They are stored as doubles
194  // for consistency in the builder but must be converted to the proper types
195  // for the concrete forest model.
196 #pragma GCC diagnostic push
197 #pragma GCC diagnostic ignored "-Wnarrowing"
198  return decision_forest_t{
200  raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
201  raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()},
202  mem_type,
203  device,
204  stream},
205  raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()},
206  mem_type,
207  device,
208  stream},
209  num_feature,
210  num_class,
211  max_num_categories_ != 0,
212  vector_output_.empty()
213  ? std::nullopt
214  : std::make_optional<raft_proto::buffer<typename node_type::threshold_type>>(
215  raft_proto::buffer{vector_output_.data(), vector_output_.size()},
216  mem_type,
217  device,
218  stream),
219  categorical_storage_.empty()
220  ? std::nullopt
221  : std::make_optional<raft_proto::buffer<typename node_type::index_type>>(
222  raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()},
223  mem_type,
224  device,
225  stream),
226  output_size_,
227  row_postproc_,
228  element_postproc_,
229  static_cast<typename node_type::threshold_type>(average_factor_),
230  static_cast<typename node_type::threshold_type>(bias_),
231  static_cast<typename node_type::threshold_type>(postproc_constant_)};
232 #pragma GCC diagnostic pop
233  }
234 
235  private:
236  index_type cur_tree_size_;
237  index_type max_num_categories_;
238  index_type alignment_;
239  index_type output_size_;
240  row_op row_postproc_;
241  element_op element_postproc_;
242  double average_factor_;
243  double bias_;
244  double postproc_constant_;
245  index_type max_tree_size_;
246 
247  std::vector<node_type> nodes_;
248  std::vector<index_type> root_node_indexes_;
249  std::vector<typename node_type::threshold_type> vector_output_;
250  std::vector<typename node_type::index_type> categorical_storage_;
251  std::vector<index_type> node_id_mapping_;
252 };
253 
254 } // namespace detail
255 } // namespace fil
256 } // namespace experimental
257 } // namespace ML
math_t max(math_t a, math_t b)
Definition: learning_rate.h:27
element_op
Definition: postproc_ops.hpp:29
uint32_t index_type
Definition: index_type.hpp:21
row_op
Definition: postproc_ops.hpp:22
Definition: dbscan.hpp:30
HOST DEVICE constexpr auto ceildiv(T dividend, U divisor)
Definition: ceildiv.hpp:21
int cuda_stream
Definition: cuda_stream.hpp:25
device_type
Definition: device_type.hpp:18
const_agnostic_same_t< T, U > copy(buffer< T > &dst, buffer< U > const &src, typename buffer< T >::index_type dst_offset, typename buffer< U >::index_type src_offset, typename buffer< T >::index_type size, cuda_stream stream)
Definition: buffer.hpp:327
Definition: decision_forest_builder.hpp:57
void set_element_postproc(element_op val)
Definition: decision_forest_builder.hpp:149
void start_new_tree()
Definition: decision_forest_builder.hpp:62
typename decision_forest_t::node_type node_type
Definition: decision_forest_builder.hpp:59
void set_average_factor(double val)
Definition: decision_forest_builder.hpp:153
void add_leaf_vector_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt)
Definition: decision_forest_builder.hpp:111
void set_output_size(index_type val)
Definition: decision_forest_builder.hpp:160
void set_postproc_constant(double val)
Definition: decision_forest_builder.hpp:158
void set_bias(double val)
Definition: decision_forest_builder.hpp:155
decision_forest_builder(index_type max_num_categories=index_type{}, index_type align_bytes=index_type{})
Definition: decision_forest_builder.hpp:168
void add_categorical_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, bool default_to_distant_child=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{})
Definition: decision_forest_builder.hpp:83
auto get_decision_forest(index_type num_feature, index_type num_class, raft_proto::device_type mem_type=raft_proto::device_type::cpu, int device=0, raft_proto::cuda_stream stream=raft_proto::cuda_stream{})
Definition: decision_forest_builder.hpp:187
void set_row_postproc(row_op val)
Definition: decision_forest_builder.hpp:151
void add_node(value_t val, std::optional< int > tl_node_id=std::nullopt, bool is_leaf_node=true, bool default_to_distant_child=false, bool is_categorical_node=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{}, bool is_inclusive=false)
Definition: decision_forest_builder.hpp:130
Definition: decision_forest_builder.hpp:44
model_builder_error(char const *msg)
Definition: decision_forest_builder.hpp:46
model_builder_error()
Definition: decision_forest_builder.hpp:45
virtual char const * what() const noexcept
Definition: decision_forest_builder.hpp:47
Definition: exceptions.hpp:36
A container which may or may not own its own data on host or device.
Definition: buffer.hpp:41