cuML C++ API: include/cuml/fil/detail/infer_kernel/cpu.hpp Source File

 /*

  * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.

  * SPDX-License-Identifier: Apache-2.0

  */

 #pragma once

 #include <cuml/fil/detail/cpu_introspection.hpp>

 #include <cuml/fil/detail/evaluate_tree.hpp>

 #include <cuml/fil/detail/index_type.hpp>

 #include <cuml/fil/detail/postprocessor.hpp>

 #include <cuml/fil/detail/raft_proto/ceildiv.hpp>

 #include <cuml/fil/infer_kind.hpp>


 // Thread-count query used by the CPU inference kernel. With OpenMP enabled the
 // real runtime header is pulled in; otherwise a function-like macro stands in
 // for omp_get_max_threads() and always yields 1.
 #if defined(_OPENMP)
 #include <omp.h>
 #elif defined(omp_get_max_threads)
 // A placeholder macro is already defined (e.g. by another header); it must
 // agree with the one this header would define.
 #if omp_get_max_threads() != 1
 #error "Inconsistent placeholders for omp_get_max_threads"
 #endif
 #else
 #define omp_get_max_threads() 1
 #endif


 #include <algorithm>

 #include <cstddef>

 #include <iostream>

 #include <new>

 #include <numeric>

 #include <vector>


 namespace ML {

 namespace fil {

 namespace detail {


 template <bool has_categorical_nodes,

           bool predict_leaf,

           typename forest_t,

           typename vector_output_t    = std::nullptr_t,

           typename categorical_data_t = std::nullptr_t>

 void infer_kernel_cpu(forest_t const& forest,

                       postprocessor<typename forest_t::io_type> const& postproc,

                       typename forest_t::io_type* output,

                       typename forest_t::io_type const* input,

                       index_type row_count,

                       index_type col_count,

                       index_type num_outputs,

                       index_type chunk_size               = hardware_constructive_interference_size,

                       index_type grove_size               = hardware_constructive_interference_size,

                       vector_output_t vector_output_p     = nullptr,

                       categorical_data_t categorical_data = nullptr,

                       infer_kind infer_type               = infer_kind::default_kind)

 {

   auto constexpr has_vector_leaves       = !std::is_same_v<vector_output_t, std::nullptr_t>;

   auto constexpr has_nonlocal_categories = !std::is_same_v<categorical_data_t, std::nullptr_t>;


   using node_t = typename forest_t::node_type;


   using output_t = typename forest_t::template raw_output_type<vector_output_t>;


   auto const num_tree  = forest.tree_count();

   auto const num_grove = raft_proto::ceildiv(num_tree, grove_size);

   auto const num_chunk = raft_proto::ceildiv(row_count, chunk_size);


   auto output_workspace = std::vector<output_t>(row_count * num_outputs * num_grove, output_t{});

   auto const task_count = num_grove * num_chunk;


 #pragma omp parallel num_threads(std::min(index_type(omp_get_max_threads()), task_count))

   {

     // Infer on each grove and chunk

 #pragma omp for

     for (auto task_index = index_type{}; task_index < task_count; ++task_index) {

       auto const grove_index = task_index / num_chunk;

       auto const chunk_index = task_index % num_chunk;

       auto const start_row   = chunk_index * chunk_size;

       auto const end_row     = std::min(start_row + chunk_size, row_count);

       auto const start_tree  = grove_index * grove_size;

       auto const end_tree    = std::min(start_tree + grove_size, num_tree);


       for (auto row_index = start_row; row_index < end_row; ++row_index) {

         for (auto tree_index = start_tree; tree_index < end_tree; ++tree_index) {

           auto tree_output =

             std::conditional_t<predict_leaf,

                                index_type,

                                std::conditional_t<has_vector_leaves,

                                                   typename node_t::index_type,

                                                   typename node_t::threshold_type>>{};

           tree_output = evaluate_tree<has_vector_leaves,

                                       has_categorical_nodes,

                                       has_nonlocal_categories,

                                       predict_leaf>(

             forest, tree_index, input + row_index * col_count, categorical_data);

           if constexpr (predict_leaf) {

             output_workspace[row_index * num_outputs * num_grove + tree_index * num_grove +

                              grove_index] = static_cast<typename forest_t::io_type>(tree_output);

           } else {

             auto const default_num_outputs = forest.num_outputs();

             if constexpr (has_vector_leaves) {

               auto output_offset = (row_index * num_outputs * num_grove +

                                     tree_index * default_num_outputs * num_grove *

                                       (infer_type == infer_kind::per_tree) +

                                     grove_index);

               for (auto output_index = index_type{}; output_index < default_num_outputs;

                    ++output_index) {

                 output_workspace[output_offset + output_index * num_grove] +=

                   vector_output_p[tree_output * default_num_outputs + output_index];

               }

             } else {

               auto output_offset =

                 (row_index * num_outputs * num_grove +

                  (tree_index % default_num_outputs) * num_grove *

                    (infer_type == infer_kind::default_kind) +

                  tree_index * num_grove * (infer_type == infer_kind::per_tree) + grove_index);

               output_workspace[output_offset] += tree_output;

             }

           }

         }  // Trees

       }  // Rows

     }  // Tasks


     // Sum over grove and postprocess

 #pragma omp for

     for (auto row_index = index_type{}; row_index < row_count; ++row_index) {

       for (auto output_index = index_type{}; output_index < num_outputs; ++output_index) {

         auto grove_offset = (row_index * num_outputs * num_grove + output_index * num_grove);


         output_workspace[grove_offset] =

           std::accumulate(std::begin(output_workspace) + grove_offset,

                           std::begin(output_workspace) + grove_offset + num_grove,

                           output_t{});

       }

       postproc(infer_type,

                output_workspace.data() + row_index * num_outputs * num_grove,

                num_outputs,

                forest.bias(),

                output + row_index * num_outputs,

                num_grove);

     }

   }  // End omp parallel

 }


 }  // namespace detail

 }  // namespace fil

 }  // namespace ML

ceildiv.hpp

cpu_introspection.hpp

evaluate_tree.hpp

index_type.hpp

infer_kind.hpp

ML::fil::detail::infer_kernel_cpu
void infer_kernel_cpu(forest_t const &forest, postprocessor< typename forest_t::io_type > const &postproc, typename forest_t::io_type *output, typename forest_t::io_type const *input, index_type row_count, index_type col_count, index_type num_outputs, index_type chunk_size=hardware_constructive_interference_size, index_type grove_size=hardware_constructive_interference_size, vector_output_t vector_output_p=nullptr, categorical_data_t categorical_data=nullptr, infer_kind infer_type=infer_kind::default_kind)
Definition: cpu.hpp:75

ML::fil::detail::evaluate_tree
HOST DEVICE auto evaluate_tree(forest_t const &forest, index_type tree_index, io_t const *__restrict__ row, categorical_data_t categorical_data)
Definition: evaluate_tree.hpp:162

ML::fil::infer_kind
infer_kind
Definition: infer_kind.hpp:8

ML::fil::infer_kind::default_kind
@ default_kind

ML::fil::infer_kind::per_tree
@ per_tree

ML::fil::index_type
uint32_t index_type
Definition: index_type.hpp:9

ML
Definition: dbscan.hpp:18

raft_proto::ceildiv
HOST DEVICE constexpr auto ceildiv(T dividend, U divisor)
Definition: ceildiv.hpp:10

postprocessor.hpp

ML::fil::forest
Definition: forest.hpp:24

ML::fil::forest::num_outputs
HOST DEVICE auto num_outputs() const
Definition: forest.hpp:65

ML::fil::forest::tree_count
HOST DEVICE auto tree_count() const
Definition: forest.hpp:61

ML::fil::forest::bias
HOST DEVICE const auto * bias() const
Definition: forest.hpp:58

ML::fil::postprocessor
Definition: postprocessor.hpp:135