From 6c0ad21a4a3b5a0778271762cc3571c8c55917c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isa=C3=AFe?= Date: Mon, 15 Jul 2024 11:28:33 +0200 Subject: [PATCH] refactor: replace `hwloc2` with `hwlocality` (#91) * replace hwloc2 by hwlocality * fix thread binding code this is patchwork; I'm not sure this works as intended * add a CLI switch to bind threads when `--bind-threads` is present, rayon threads will be bound to cores * add hwloc dep to clippy & build CI * fix ci? * add railguards for arch that do not support thread binding --- .github/workflows/simple-ci.yml | 5 +- fastiron/Cargo.toml | 2 +- fastiron/src/main.rs | 61 ++++++++++++++++++------- fastiron/src/parameters.rs | 4 ++ fastiron/src/utils/input.rs | 6 ++- fastiron/src/utils/mc_processor_info.rs | 4 ++ 6 files changed, 61 insertions(+), 21 deletions(-) diff --git a/.github/workflows/simple-ci.yml b/.github/workflows/simple-ci.yml index 3e8ad028..e7e6295a 100644 --- a/.github/workflows/simple-ci.yml +++ b/.github/workflows/simple-ci.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - - run: sudo apt-get install -y libhwloc-dev + - run: sudo apt-get install -y libhwloc-dev libudev-dev - name: Run rust tests run: cargo test --all @@ -38,6 +38,7 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 + - run: sudo apt-get install -y libhwloc-dev - name: Run clippy run: cargo clippy -- -D warnings @@ -48,6 +49,6 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - - run: sudo apt-get install -y libhwloc-dev + - run: sudo apt-get install -y libhwloc-dev libudev-dev - name: Build the crate run: cargo build --all diff --git a/fastiron/Cargo.toml b/fastiron/Cargo.toml index 7f7c0354..1dfb2a1b 100644 --- a/fastiron/Cargo.toml +++ b/fastiron/Cargo.toml @@ -13,7 +13,7 @@ rand = { version = "0.8.5", features = ["small_rng"] } tinyvec = { version = "1.6.1" } rayon = { version = "1.10.0" } atomic = { version = "0.5.3" } # further upgrade == breaking change -hwloc2 = { version = "2.2.0" } # should be replaced with hwlocality +hwlocality = { version = "1.0.0-alpha.5" } libc = { version = "0.2.155" } rustc-hash = { version = "2.0.0" } diff --git a/fastiron/src/main.rs b/fastiron/src/main.rs index dcc9ab3a..c107b9e4 100644 --- a/fastiron/src/main.rs +++ b/fastiron/src/main.rs @@ -3,13 +3,18 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use clap::Parser; -use fastiron::data::tallies::TalliedEvent; -use hwloc2::{CpuBindFlags, ObjectType, Topology, TopologyObject}; +use hwlocality::topology::support::{DiscoverySupport, FeatureSupport}; use num::{one, zero, FromPrimitive}; + +use hwlocality::cpu::binding::CpuBindingFlags; +use hwlocality::object::types::ObjectType; +use hwlocality::object::TopologyObject; +use hwlocality::Topology; use rayon::ThreadPoolBuilder; use fastiron::constants::sim::SRC_FRACTION; use fastiron::constants::CustomFloat; +use fastiron::data::tallies::TalliedEvent; use fastiron::init::{init_mcdata, init_mcunits, init_particle_containers, init_results}; use fastiron::montecarlo::{MonteCarloData, MonteCarloResults, MonteCarloUnit}; use fastiron::parameters::Parameters; @@ -70,13 +75,27 @@ pub fn run(cli: Cli) { // rayon only => one global thread pool if mcdata.exec_info.exec_policy == ExecPolicy::Rayon { // custom thread-pool init in this case - if mcdata.exec_info.n_rayon_threads != 0 { - let topo = Arc::new(Mutex::new(Topology::new().unwrap())); - ThreadPoolBuilder::new() - .num_threads(mcdata.exec_info.n_rayon_threads) - .start_handler(move |thread_id| bind_threads(thread_id, &topo)) - .build_global() - .unwrap(); + if mcdata.exec_info.n_rayon_threads != 0 && mcdata.exec_info.bind_threads { + let topology = Topology::new().unwrap(); + // these railguards were taken from the bind_threads_cpu of hwlocality + if !topology.supports(FeatureSupport::discovery, DiscoverySupport::pu_count) { + println!("[Warning] cannot bind threads to core -- reporting of PU objects not supported"); + } else if topology.feature_support().cpu_binding().is_none() { + println!("[Warning] cannot bind threads to core -- CPU binding not supported"); + } else { + let cpu_support = topology.feature_support().cpu_binding().unwrap(); + if !(cpu_support.get_thread() && cpu_support.set_thread()) { + println!("[Warning] cannot bind threads to core -- CPU binding queries not supported"); + } else { + // we can get to work + let topo = Arc::new(Mutex::new(topology)); + ThreadPoolBuilder::new() + .num_threads(mcdata.exec_info.n_rayon_threads) + .start_handler(move |thread_id| bind_threads(thread_id, &topo)) + .build_global() + .unwrap(); + } + } } mcdata.exec_info.n_rayon_threads = rayon::current_num_threads(); } @@ -132,6 +151,10 @@ pub fn run(cli: Cli) { coral_benchmark_correctness(&mcresults); } +//========================== +// End of simulation cleanup +//========================== + pub fn game_over( mcdata: &MonteCarloData, mcunits: &mut [MonteCarloUnit], @@ -153,21 +176,25 @@ pub fn game_over( } } +//======================== +// Thread binding routines +//======================== + pub fn bind_threads(thread_id: usize, topo: &Arc>) { // get thread id let pthread_id = unsafe { libc::pthread_self() }; // get cpu topology - let mut locked_topo = topo.lock().unwrap(); + let locked_topo = topo.lock().unwrap(); // get current thread's cpu affinity let cpu_set = { let ancestor_lvl = locked_topo - .depth_or_above_for_type(&ObjectType::NUMANode) - .unwrap_or(0); - let targets = locked_topo.objects_at_depth(ancestor_lvl); - let ancestor = targets.first().expect("No common ancestor found"); - let processing_units = locked_topo.objects_with_type(&ObjectType::PU).unwrap(); + .depth_or_above_for_type(ObjectType::NUMANode) + .unwrap_or_default(); + let mut targets = locked_topo.objects_at_depth(ancestor_lvl); + let ancestor = targets.next().expect("No common ancestor found"); + let processing_units = locked_topo.objects_with_type(ObjectType::PU); let unit = processing_units - .iter() + .into_iter() .filter(|pu| has_ancestor(pu, ancestor)) .cycle() .nth(thread_id) @@ -175,7 +202,7 @@ pub fn bind_threads(thread_id: usize, topo: &Arc>) { unit.cpuset().unwrap() }; - match locked_topo.set_cpubind_for_thread(pthread_id, cpu_set, CpuBindFlags::CPUBIND_THREAD) { + match locked_topo.bind_thread_cpu(pthread_id, cpu_set, CpuBindingFlags::THREAD) { Ok(_) => {} Err(e) => { println!("[Error]: Could not bind threads to cpu cores:"); diff --git a/fastiron/src/parameters.rs b/fastiron/src/parameters.rs index 85a96f90..0837e3b2 100644 --- a/fastiron/src/parameters.rs +++ b/fastiron/src/parameters.rs @@ -330,6 +330,8 @@ pub struct SimulationParameters { pub chunk_size: u64, /// Number of threads that should be used to run the simulation. pub n_rayon_threads: u64, + /// Switch used to bind rayon threads to physical cores. + pub bind_threads: bool, /// Number of units that should be used to run the simulation. pub n_units: u64, /// Number of steps simulated by the program. @@ -403,6 +405,7 @@ impl SimulationParameters { fetch_from_cli!(n_particles); fetch_from_cli!(chunk_size); fetch_from_cli!(n_rayon_threads); + simulation_params.bind_threads = cli.bind_threads; fetch_from_cli!(n_units); fetch_from_cli!(n_steps); fetch_from_cli!(nx); @@ -427,6 +430,7 @@ impl Default for SimulationParameters { n_particles: 1000000, chunk_size: 0, n_rayon_threads: 1, + bind_threads: false, n_units: 1, n_steps: 10, nx: 10, diff --git a/fastiron/src/utils/input.rs b/fastiron/src/utils/input.rs index bb231dc4..c11d1595 100644 --- a/fastiron/src/utils/input.rs +++ b/fastiron/src/utils/input.rs @@ -52,7 +52,7 @@ pub struct Cli { /// enable thread debugging if present #[arg(short = 't', long = "debug-threads", num_args(0))] - pub debug_threads: bool, + pub debug_threads: bool, // currently unused /// enable single-precision float type usage if present #[arg(short = 'p', long = "single-precision", num_args(0))] @@ -97,6 +97,10 @@ pub struct Cli { )] pub n_rayon_threads: Option, + /// bind rayon threads to physical cores -- can improve performance on large scale NUMA systems + #[arg(long = "bind-threads", num_args(0))] + pub bind_threads: bool, + /// number of units that should be used to run the simulation #[arg( short = 'u', diff --git a/fastiron/src/utils/mc_processor_info.rs b/fastiron/src/utils/mc_processor_info.rs index 6afa871c..9cee9ff6 100644 --- a/fastiron/src/utils/mc_processor_info.rs +++ b/fastiron/src/utils/mc_processor_info.rs @@ -38,6 +38,8 @@ pub struct MCProcessorInfo { pub n_processors: usize, /// Number of thread(s) used for execution. pub n_rayon_threads: usize, + /// Switch to bind rayon threads to physical cores. + pub bind_threads: bool, /// Size of the chunks used by rayon. pub chunk_size: usize, /// Number of unit(s) used for (distributed) execution. @@ -59,6 +61,7 @@ impl MCProcessorInfo { } } else if sim_params.n_rayon_threads != 1 { res.n_rayon_threads = sim_params.n_rayon_threads as usize; + res.bind_threads = sim_params.bind_threads; res.exec_policy = ExecPolicy::Rayon; }; res.chunk_size = sim_params.chunk_size as usize; @@ -78,6 +81,7 @@ impl Default for MCProcessorInfo { exec_policy: Default::default(), n_processors: 1, n_rayon_threads: 1, + bind_threads: false, chunk_size: 0, n_units: 1, }