decision_forest_builder.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
13 #include <cuml/fil/exceptions.hpp>
15 
16 #include <stdint.h>
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <cstddef>
21 #include <numeric>
22 #include <optional>
23 #include <vector>
24 
25 namespace ML {
26 namespace fil {
27 namespace detail {
28 
29 /*
30  * Exception indicating that FIL model could not be built from given input
31  */
32 struct model_builder_error : std::exception {
33  model_builder_error() : model_builder_error("Error while building model") {}
34  model_builder_error(char const* msg) : msg_{msg} {}
35  virtual char const* what() const noexcept { return msg_; }
36 
37  private:
38  char const* msg_;
39 };
40 
41 /*
42  * Struct used to build FIL forests
43  */
44 template <typename decision_forest_t>
46  /* The type for nodes in the given decision_forest type */
47  using node_type = typename decision_forest_t::node_type;
48 
49  /* Add a node with a categorical split */
50  template <typename iter_t>
52  iter_t vec_begin,
53  iter_t vec_end,
54  std::optional<int> tl_node_id = std::nullopt,
55  std::size_t depth = std::size_t{1},
56  bool default_to_distant_child = false,
57  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
58  typename node_type::offset_type offset = typename node_type::offset_type{})
59  {
60  auto constexpr const bin_width = index_type(sizeof(typename node_type::index_type) * 8);
61  auto node_value = typename node_type::index_type{};
62  auto set_storage = &node_value;
63  auto max_node_categories =
64  (vec_begin != vec_end) ? *std::max_element(vec_begin, vec_end) + 1 : 1;
65  if (max_num_categories_ > bin_width) {
66  // TODO(wphicks): Check for overflow here
67  node_value = categorical_storage_.size();
68  auto bins_required = raft_proto::ceildiv(max_node_categories, bin_width);
69  categorical_storage_.push_back(max_node_categories);
70  categorical_storage_.resize(categorical_storage_.size() + bins_required);
71  set_storage = &(categorical_storage_[node_value + 1]);
72  }
73  auto set = bitset{set_storage, max_node_categories};
74  std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
75 
76  add_node(
77  node_value, tl_node_id, depth, false, default_to_distant_child, true, feature, offset, false);
78  }
79 
80  /* Add a leaf node with vector output */
81  template <typename iter_t>
82  void add_leaf_vector_node(iter_t vec_begin,
83  iter_t vec_end,
84  std::optional<int> tl_node_id = std::nullopt,
85  std::size_t depth = std::size_t{1})
86  {
87  auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
88  std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
89 
90  add_node(leaf_index,
91  tl_node_id,
92  depth,
93  true,
94  false,
95  false,
96  typename node_type::metadata_storage_type{},
97  typename node_type::offset_type{},
98  false);
99  }
100 
101  /* Add a node to the model */
102  template <typename value_t>
103  void add_node(
104  value_t val,
105  std::optional<int> tl_node_id = std::nullopt,
106  std::size_t depth = std::size_t{1},
107  bool is_leaf_node = true,
108  bool default_to_distant_child = false,
109  bool is_categorical_node = false,
110  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
111  typename node_type::offset_type offset = typename node_type::offset_type{},
112  bool is_inclusive = false)
113  {
114  if (depth == std::size_t{}) {
115  if (alignment_ != index_type{}) {
116  if (cur_node_index_ % alignment_ != index_type{}) {
117  auto padding = (alignment_ - cur_node_index_ % alignment_);
118  for (auto i = index_type{}; i < padding; ++i) {
119  add_node(typename node_type::threshold_type{}, std::nullopt);
120  }
121  }
122  }
123  root_node_indexes_.push_back(cur_node_index_);
124  }
125 
126  if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
127  nodes_.emplace_back(
128  val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
129  // 0 indicates the lack of ID mapping for a particular node
130  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
131  ++cur_node_index_;
132  }
133 
134  /* Set the element-wise postprocessing operation for this model */
135  void set_element_postproc(element_op val) { element_postproc_ = val; }
136  /* Set the row-wise postprocessing operation for this model */
137  void set_row_postproc(row_op val) { row_postproc_ = val; }
138  /* Set the value to divide by during postprocessing */
139  void set_average_factor(double val) { average_factor_ = val; }
140  /* Set the bias term, which is added to the output. The bias term
141  * should have the same length as output_size. */
142  void set_bias(std::vector<double> val)
143  {
144  bias_.resize(val.size());
145  std::transform(val.begin(), val.end(), bias_.begin(), [](double e) {
146  return static_cast<typename node_type::threshold_type>(e);
147  });
148  }
149  /* Set the value of the constant used in the postprocessing operation
150  * (if any) */
151  void set_postproc_constant(double val) { postproc_constant_ = val; }
152  /* Set the number of outputs per row for this model */
154  {
155  if (output_size_ != index_type{1} && output_size_ != val) {
156  throw model_import_error("Inconsistent leaf vector size");
157  }
158  output_size_ = val;
159  }
160 
162  index_type align_bytes = index_type{})
163  : cur_node_index_{},
164  max_num_categories_{max_num_categories},
165  alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
166  output_size_{1},
167  row_postproc_{},
168  element_postproc_{},
169  average_factor_{},
170  postproc_constant_{},
171  nodes_{},
172  root_node_indexes_{},
173  vector_output_{},
174  bias_{}
175  {
176  }
177 
178  /* Return the FIL decision forest built by this builder */
179  auto get_decision_forest(index_type num_feature,
180  index_type num_class,
182  int device = 0,
184  {
185  // Allow narrowing for preprocessing constants. They are stored as doubles
186  // for consistency in the builder but must be converted to the proper types
187  // for the concrete forest model.
188 #pragma GCC diagnostic push
189 #pragma GCC diagnostic ignored "-Wnarrowing"
190  return decision_forest_t{
192  raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
193  raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()},
194  mem_type,
195  device,
196  stream},
197  raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()},
198  mem_type,
199  device,
200  stream},
201  raft_proto::buffer{raft_proto::buffer{bias_.data(), bias_.size()}, mem_type, device, stream},
202  num_feature,
203  num_class,
204  max_num_categories_ != 0,
205  vector_output_.empty()
206  ? std::nullopt
207  : std::make_optional<raft_proto::buffer<typename node_type::threshold_type>>(
208  raft_proto::buffer{vector_output_.data(), vector_output_.size()},
209  mem_type,
210  device,
211  stream),
212  categorical_storage_.empty()
213  ? std::nullopt
214  : std::make_optional<raft_proto::buffer<typename node_type::index_type>>(
215  raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()},
216  mem_type,
217  device,
218  stream),
219  output_size_,
220  row_postproc_,
221  element_postproc_,
222  static_cast<typename node_type::threshold_type>(average_factor_),
223  static_cast<typename node_type::threshold_type>(postproc_constant_)};
224 #pragma GCC diagnostic pop
225  }
226 
227  private:
228  index_type cur_node_index_;
229  index_type max_num_categories_;
230  index_type alignment_;
231  index_type output_size_;
232  row_op row_postproc_;
233  element_op element_postproc_;
234  double average_factor_;
235  double postproc_constant_;
236 
237  std::vector<node_type> nodes_;
238  std::vector<index_type> root_node_indexes_;
239  std::vector<typename node_type::threshold_type> vector_output_;
240  std::vector<typename node_type::threshold_type> bias_;
241  std::vector<typename node_type::index_type> categorical_storage_;
242  std::vector<index_type> node_id_mapping_;
243 };
244 
245 } // namespace detail
246 } // namespace fil
247 } // namespace ML
row_op
Definition: postproc_ops.hpp:10
element_op
Definition: postproc_ops.hpp:17
uint32_t index_type
Definition: index_type.hpp:9
void transform(const raft::handle_t &handle, const KMeansParams ¶ms, const float *centroids, const float *X, int n_samples, int n_features, float *X_new)
Transform X to a cluster-distance space.
Definition: dbscan.hpp:18
HOST DEVICE constexpr auto ceildiv(T dividend, U divisor)
Definition: ceildiv.hpp:10
int cuda_stream
Definition: cuda_stream.hpp:14
device_type
Definition: device_type.hpp:7
const_agnostic_same_t< T, U > copy(buffer< T > &dst, buffer< U > const &src, typename buffer< T >::index_type dst_offset, typename buffer< U >::index_type src_offset, typename buffer< T >::index_type size, cuda_stream stream)
Definition: buffer.hpp:316
Definition: decision_forest_builder.hpp:45
void set_average_factor(double val)
Definition: decision_forest_builder.hpp:139
void set_bias(std::vector< double > val)
Definition: decision_forest_builder.hpp:142
void set_postproc_constant(double val)
Definition: decision_forest_builder.hpp:151
decision_forest_builder(index_type max_num_categories=index_type{}, index_type align_bytes=index_type{})
Definition: decision_forest_builder.hpp:161
void set_row_postproc(row_op val)
Definition: decision_forest_builder.hpp:137
void add_node(value_t val, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1}, bool is_leaf_node=true, bool default_to_distant_child=false, bool is_categorical_node=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{}, bool is_inclusive=false)
Definition: decision_forest_builder.hpp:103
void add_categorical_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1}, bool default_to_distant_child=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{})
Definition: decision_forest_builder.hpp:51
void set_output_size(index_type val)
Definition: decision_forest_builder.hpp:153
auto get_decision_forest(index_type num_feature, index_type num_class, raft_proto::device_type mem_type=raft_proto::device_type::cpu, int device=0, raft_proto::cuda_stream stream=raft_proto::cuda_stream{})
Definition: decision_forest_builder.hpp:179
void set_element_postproc(element_op val)
Definition: decision_forest_builder.hpp:135
void add_leaf_vector_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, std::size_t depth=std::size_t{1})
Definition: decision_forest_builder.hpp:82
typename decision_forest_t::node_type node_type
Definition: decision_forest_builder.hpp:47
Definition: decision_forest_builder.hpp:32
model_builder_error(char const *msg)
Definition: decision_forest_builder.hpp:34
virtual char const * what() const noexcept
Definition: decision_forest_builder.hpp:35
model_builder_error()
Definition: decision_forest_builder.hpp:33
Definition: exceptions.hpp:24
A container which may or may not own its own data on host or device.
Definition: buffer.hpp:30