rrun.hpp
1 
6 #pragma once
7 
8 #include <optional>
9 #include <string>
10 #include <vector>
11 
namespace cucascade::memory {
/// Topology description consumed by the declarations below. Only forward-declared
/// here; its definition is not part of this header.
struct system_topology_info;
}  // namespace cucascade::memory

namespace rapidsmpf::rrun {

/**
 * @brief Options controlling which topology-based resource bindings to apply.
 *
 * Every option defaults to `true`.
 */
struct bind_options {
    bool cpu{true};  ///< Set CPU affinity to cores near the GPU.
    bool memory{true};  ///< Set NUMA memory policy to nodes near the GPU.
    bool network{true};  ///< Set UCX_NET_DEVICES to NICs near the GPU.
    bool verify{true};  ///< Read back and verify bindings after applying them.
};

/**
 * @brief Live resource binding configuration collected from the running process.
 */
struct resource_binding {
    int rank = -1;  ///< Process rank (-1 if not available).
    int gpu_id = -1;  ///< GPU device ID (-1 if not available).
    std::string gpu_pci_bus_id;  ///< GPU PCI bus ID (empty if unavailable).
    std::string cpu_affinity;  ///< CPU affinity string (e.g., "0-19,40-59").
    std::vector<int> numa_nodes;  ///< NUMA node IDs bound to this process.
    std::string ucx_net_devices;  ///< Value of the UCX_NET_DEVICES env var.
};

/**
 * @brief Expected resource binding derived from topology information.
 */
struct expected_binding {
    std::string cpu_affinity;  ///< Expected CPU affinity list.
    std::vector<int> memory_binding;  ///< Expected NUMA node IDs.
    std::vector<std::string> network_devices;  ///< Expected network devices.
};

/**
 * @brief Results of validating actual vs. expected resource bindings.
 *
 * All three flags start out `true`, so a default-constructed object reports
 * success from `all_passed()`.
 */
struct binding_validation {
    bool cpu_ok = true;  ///< CPU affinity check passed.
    bool numa_ok = true;  ///< NUMA binding check passed.
    bool ucx_ok = true;  ///< UCX network devices check passed.
    std::string expected_ucx_devices;  ///< Expected UCX devices (comma-separated).

    /**
     * @brief Check if all validations passed.
     *
     * @return `true` only when `cpu_ok`, `numa_ok` and `ucx_ok` are all `true`.
     */
    [[nodiscard]] bool all_passed() const {
        return cpu_ok && numa_ok && ucx_ok;
    }
};

/**
 * @brief Collect the live resource binding of the calling process.
 *
 * @param gpu_id_hint Defaults to -1.
 *        NOTE(review): how the hint is used, and what -1 selects, is not visible
 *        in this header - confirm against the implementation.
 * @return The collected binding.
 */
resource_binding check_binding(int gpu_id_hint = -1);

/**
 * @brief Obtain the expected binding for a GPU from pre-discovered topology.
 *
 * @param topology Pre-discovered system topology.
 * @param gpu_id GPU whose expected binding is requested.
 * @return The expected binding, or an empty optional.
 *         NOTE(review): the conditions that yield `std::nullopt` are not visible
 *         here - confirm against the implementation.
 */
std::optional<expected_binding> get_expected_binding(
    cucascade::memory::system_topology_info const& topology, int gpu_id
);

/**
 * @brief Validate an actual resource binding against an expected one.
 *
 * @param actual Binding to check.
 * @param expected Binding to check against.
 * @return Per-resource validation results.
 */
binding_validation validate_binding(
    resource_binding const& actual, expected_binding const& expected
);

/**
 * @brief Bind the calling process to resources topologically close to a GPU.
 *
 * @param gpu_id Optional GPU ID; defaults to `std::nullopt`.
 *        NOTE(review): how the GPU is chosen when no ID is given is not visible
 *        here - confirm against the implementation.
 * @param options Selects which topology-based bindings to apply.
 */
void bind(
    std::optional<unsigned int> gpu_id = std::nullopt, bind_options const& options = {}
);

/**
 * @brief Overload of `bind` that additionally receives a topology object.
 *
 * NOTE(review): presumably identical to the overload above except that it uses
 * the supplied topology - confirm against the implementation.
 *
 * @param topology System topology passed in by the caller.
 * @param gpu_id Optional GPU ID; defaults to `std::nullopt`.
 * @param options Selects which topology-based bindings to apply.
 */
void bind(
    cucascade::memory::system_topology_info const& topology,
    std::optional<unsigned int> gpu_id = std::nullopt,
    bind_options const& options = {}
);

}  // namespace rapidsmpf::rrun
std::optional< expected_binding > get_expected_binding(cucascade::memory::system_topology_info const &topology, int gpu_id)
Obtain the expected binding for a GPU from pre-discovered topology.
resource_binding check_binding(int gpu_id_hint=-1)
Collect the live resource binding of the calling process.
binding_validation validate_binding(resource_binding const &actual, expected_binding const &expected)
Validate an actual resource binding against an expected one.
void bind(std::optional< unsigned int > gpu_id=std::nullopt, bind_options const &options={})
Bind the calling process to resources topologically close to a GPU.
Options controlling which topology-based resource bindings to apply.
Definition: rrun.hpp:24
bool network
Set UCX_NET_DEVICES to NICs near the GPU.
Definition: rrun.hpp:27
bool memory
Set NUMA memory policy to nodes near the GPU.
Definition: rrun.hpp:26
bool verify
Read back and verify bindings after applying them.
Definition: rrun.hpp:28
bool cpu
Set CPU affinity to cores near the GPU.
Definition: rrun.hpp:25
Results of validating actual vs. expected resource bindings.
Definition: rrun.hpp:61
std::string expected_ucx_devices
Expected UCX devices (comma-separated).
Definition: rrun.hpp:65
bool cpu_ok
CPU affinity check passed.
Definition: rrun.hpp:62
bool all_passed() const
Check if all validations passed.
Definition: rrun.hpp:71
bool numa_ok
NUMA binding check passed.
Definition: rrun.hpp:63
bool ucx_ok
UCX network devices check passed.
Definition: rrun.hpp:64
Expected resource binding derived from topology information.
Definition: rrun.hpp:52
std::vector< std::string > network_devices
Expected network devices.
Definition: rrun.hpp:55
std::vector< int > memory_binding
Expected NUMA node IDs.
Definition: rrun.hpp:54
std::string cpu_affinity
Expected CPU affinity list.
Definition: rrun.hpp:53
Live resource binding configuration collected from the running process.
Definition: rrun.hpp:37
std::string ucx_net_devices
Value of the UCX_NET_DEVICES env var.
Definition: rrun.hpp:43
int rank
Process rank (-1 if not available).
Definition: rrun.hpp:38
std::vector< int > numa_nodes
NUMA node IDs bound to this process.
Definition: rrun.hpp:42
std::string cpu_affinity
CPU affinity string (e.g., "0-19,40-59").
Definition: rrun.hpp:41
std::string gpu_pci_bus_id
GPU PCI bus ID (empty if unavailable).
Definition: rrun.hpp:40
int gpu_id
GPU device ID (-1 if not available).
Definition: rrun.hpp:39