From be5b9eac88977f3b3461a4d29f26a4b7cbdc132a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 14 Apr 2024 22:07:22 -0700 Subject: [PATCH 1/4] Support per-core state using #[thread_local] - Controlled by `thread_local` feature - Does not require nightly features, but using it does - Intercepts `__pre_init` to copy .tdata into the per-core state - Implements `__aeabi_read_tp` to return the per-core state for the code the compiler generates when accessing `thread_local` variables - Needs linker script support to set up the layout and symbols This is based on [picolibc](https://github.com/picolibc/picolibc/blob/58d6157cc2135df5043d62c3e89feedc20ffcd57/newlib/libc/picolib/machine/arm/read_tp.S#L71)'s support for TLS; it's the only example I found of it for rp2040. --- memory.x | 34 +++++++++++++++++++ rp2040-hal/Cargo.toml | 3 ++ rp2040-hal/src/multicore.rs | 65 +++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/memory.x b/memory.x index e6b80c473..372860d94 100644 --- a/memory.x +++ b/memory.x @@ -34,3 +34,37 @@ SECTIONS { KEEP(*(.boot2)); } > BOOT2 } INSERT BEFORE .text; + +/* Per-core (thread) data into flash */ +SECTIONS { + .tdata : ALIGN(4) + { + . = ALIGN(4); + PROVIDE(__tdata_start = .); + *(.tdata .tdata.*); + . = ALIGN(4); + PROVIDE(__tdata_end = .); + } > FLASH + PROVIDE(__tdata_len = __tdata_end - __tdata_start); +} INSERT AFTER .data; + +/* Size per-core state and allocate bss space for each core */ +SECTIONS { + .tbss (NOLOAD) : ALIGN(4) + { + . = ALIGN(4); + PROVIDE(__tbss_start = .); + *(.tbss .tbss.*); + *(.tcommon); + . = ALIGN(4); + PROVIDE(__tbss_end = .); + } > RAM + PROVIDE(__tbss_len = __tbss_end - __tbss_start); + + .tls_state (NOLOAD) : ALIGN(4) { + PROVIDE(TLS_CORE_0 = ALIGN(4)); + . += __tdata_len + __tbss_len; + PROVIDE(TLS_CORE_1 = ALIGN(4)); + . 
+= __tdata_len + __tbss_len; + } > RAM +} INSERT AFTER .bss; diff --git a/rp2040-hal/Cargo.toml b/rp2040-hal/Cargo.toml index 0718629ff..fec750101 100644 --- a/rp2040-hal/Cargo.toml +++ b/rp2040-hal/Cargo.toml @@ -106,6 +106,9 @@ rtic-monotonic = ["dep:rtic-monotonic"] # Implement `i2c-write-iter` traits i2c-write-iter = ["dep:i2c-write-iter"] +# Enable use of thread-local variables for multicore state +thread_local = [] + [[example]] # irq example uses cortex-m-rt::interrupt, need rt feature for that name = "gpio_irq_example" diff --git a/rp2040-hal/src/multicore.rs b/rp2040-hal/src/multicore.rs index d1b018402..afe809cf4 100644 --- a/rp2040-hal/src/multicore.rs +++ b/rp2040-hal/src/multicore.rs @@ -290,3 +290,68 @@ impl<'p> Core<'p> { } } } + +#[cfg(all(target_arch = "arm", feature = "thread_local"))] +mod thread_local { + use core::arch::global_asm; + use core::ptr::addr_of; + + extern "C" { + static TLS_CORE_0: u8; + static TLS_CORE_1: u8; + } + // Not really a const pointer, but we reform it into mut in the asm + static mut TLS_STATE: [*const u8; 2] = [ + // Point to linker-allocated space in .bss + unsafe { addr_of!(TLS_CORE_0) }, + unsafe { addr_of!(TLS_CORE_1) }, + ]; + + // Define `__aeabi_read_tp` called by the compiler to get access to + // thread-local storage. + global_asm! { + ".pushsection .text.__aeabi_read_tp", + ".align 4", + ".p2align 4,,15", + ".global __aeabi_read_tp", + ".type __aeabi_read_tp,%function", + + "__aeabi_read_tp:", + " push {{r1, lr}}", + " ldr r1, =0xd0000000", // Load SIO CPUID addr + " ldr r1, [r1]", // Get current CPUID + " lsls r1, r1, #2", // Scale by 4 + " ldr r0, ={tls_state}", // Load TLS_STATE base addr + " ldr r0, [r0, r1]", // Load CPU per-thread + " pop {{r1, pc}}", + + ".popsection", + tls_state = sym TLS_STATE, + } + + // Intercept __pre_init to hook into the startup code to copy the tdata into + // TLS_CORE_[01]. + global_asm! 
{ + ".pushsection .text.__pre_init", + ".align 4", + ".p2align 4,,15", + ".global __pre_init", + ".type __pre_init,%function", + + "__pre_init:", + " push {{lr}}", + " ldr r0, ={tls_core_0}", + " ldr r1, =__tdata_start", + " ldr r2, =__tdata_len", + " bl __aeabi_memcpy", + " ldr r0, ={tls_core_1}", + " ldr r1, =__tdata_start", + " ldr r2, =__tdata_len", + " bl __aeabi_memcpy", + " pop {{pc}}", + + ".popsection", + tls_core_0 = sym TLS_CORE_0, + tls_core_1 = sym TLS_CORE_1, + } +} From 5716506aaebe128cd18739e4e6a612b4b9180c8e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 19 Apr 2024 23:24:03 -0700 Subject: [PATCH 2/4] Add docs for per-core state. --- rp2040-hal/src/multicore.rs | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/rp2040-hal/src/multicore.rs b/rp2040-hal/src/multicore.rs index afe809cf4..264e52530 100644 --- a/rp2040-hal/src/multicore.rs +++ b/rp2040-hal/src/multicore.rs @@ -33,6 +33,47 @@ //! For inter-processor communications, see [`crate::sio::SioFifo`] and [`crate::sio::Spinlock0`] //! //! For a detailed example, see [examples/multicore_fifo_blink.rs](https://github.com/rp-rs/rp-hal/tree/main/rp2040-hal/examples/multicore_fifo_blink.rs) +//! +//! ## Per-core static data +//! +//! Both cores share the same memory, so a `static` variable will be accessible +//! and shared by both, requiring the same care as it would in a multi-threaded +//! program. +//! +//! With the `thread_local` feature enabled, this module supports the use of the +//! ([unstable](https://github.com/rust-lang/rust/issues/29594)) +//! `#[thread_local]` attribute to make these per-core variables. This allows +//! the same code to run on both cores but with its own core-specific static +//! state, such as maintaining program state, or for things like DMA buffers. +//! +//! For example: +//! ```rust,ignore +//! #![feature(thread_local)] +//! # use core::cell::RefCell; +//! +//! #[thread_local] +//! 
static MY_COUNTER: RefCell<usize> = RefCell::new(0); +//! +//! fn next_id() -> usize { +//! MY_COUNTER.replace_with(|c| *c + 1) +//! } +//! ``` +//! +//! Each core will get its own instance of the `MY_COUNTER` variable. Since +//! these are not shared, they do not need atomic operations to update. +//! +//! These core-local variables are initialized on program startup and retain +//! their value from there on, even between invocations of [`Core::spawn`]. +//! +//! Note that this requires some setup in the linker script to allocate space +//! for the static data. See memory.x for details. +//! +//! If the variables are zero-initialized then they will be reserved space in +//! the `.tbss` section in the executable, and then space in `.bss` for each +//! core. Similarly, variables initialized with non-zero constants will be in +//! the executable's `.tdata` section, and have space reserved in `.bss`; the +//! initial values are copied at program startup. Note that this uses the +//! `__pre_init` hook to do this, so it won't be available for other uses. use core::mem::ManuallyDrop; use core::sync::atomic::compiler_fence; From 70c1bb6bae200dfb739dc52d6505cbdffd0e3d60 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 21 Apr 2024 00:35:56 -0700 Subject: [PATCH 3/4] Update asm: - for __aeabi_read_tp, simplify to just directly load TLS_CORE_[01] as needed without the need for an indirection. This only uses r0 and doesn't touch the stack. - Write the tdata copy in Rust, being careful to only use raw pointers. The generated asm is functionally identical to the hand-written asm. 
--- rp2040-hal/Cargo.toml | 4 ++- rp2040-hal/src/multicore.rs | 61 +++++++++++++------------------------ 2 files changed, 25 insertions(+), 40 deletions(-) diff --git a/rp2040-hal/Cargo.toml b/rp2040-hal/Cargo.toml index fec750101..b48b4db1a 100644 --- a/rp2040-hal/Cargo.toml +++ b/rp2040-hal/Cargo.toml @@ -52,6 +52,8 @@ bitfield = { version = "0.14.0" } i2c-write-iter = { version = "1.0.0", features = ["async"], optional = true } +cortex-m-rt = { version = "0.7", optional = true } + [dev-dependencies] cortex-m-rt = "0.7" cortex-m-rtic = "1.1.4" @@ -107,7 +109,7 @@ rtic-monotonic = ["dep:rtic-monotonic"] i2c-write-iter = ["dep:i2c-write-iter"] # Enable use of thread-local variables for multicore state -thread_local = [] +thread_local = ["dep:cortex-m-rt"] [[example]] # irq example uses cortex-m-rt::interrupt, need rt feature for that diff --git a/rp2040-hal/src/multicore.rs b/rp2040-hal/src/multicore.rs index 264e52530..f543075a8 100644 --- a/rp2040-hal/src/multicore.rs +++ b/rp2040-hal/src/multicore.rs @@ -335,18 +335,14 @@ impl<'p> Core<'p> { #[cfg(all(target_arch = "arm", feature = "thread_local"))] mod thread_local { use core::arch::global_asm; - use core::ptr::addr_of; + use core::ptr::{addr_of, addr_of_mut}; extern "C" { - static TLS_CORE_0: u8; - static TLS_CORE_1: u8; + static mut TLS_CORE_0: u8; + static mut TLS_CORE_1: u8; + static __tdata_start: u8; + static __tdata_len: u8; } - // Not really a const pointer, but we reform it into mut in the asm - static mut TLS_STATE: [*const u8; 2] = [ - // Point to linker-allocated space in .bss - unsafe { addr_of!(TLS_CORE_0) }, - unsafe { addr_of!(TLS_CORE_1) }, - ]; // Define `__aeabi_read_tp` called by the compiler to get access to // thread-local storage. 
@@ -358,41 +354,28 @@ mod thread_local { ".type __aeabi_read_tp,%function", "__aeabi_read_tp:", - " push {{r1, lr}}", - " ldr r1, =0xd0000000", // Load SIO CPUID addr - " ldr r1, [r1]", // Get current CPUID - " lsls r1, r1, #2", // Scale by 4 - " ldr r0, ={tls_state}", // Load TLS_STATE base addr - " ldr r0, [r0, r1]", // Load CPU per-thread - " pop {{r1, pc}}", + " ldr r0, =0xd0000000", // Load SIO CPUID addr + " ldr r0, [r0]", // Load CPUID + " cmp r0, #0", // Check core 0 + " ldr r0, ={core_0}", // Set TLS_CORE_0 + " beq 1f", // skip if done + " ldr r0, ={core_1}", // Set TLS_CORE_1 + "1: bx lr", ".popsection", - tls_state = sym TLS_STATE, + core_0 = sym TLS_CORE_0, + core_1 = sym TLS_CORE_1, } // Intercept __pre_init to hook into the startup code to copy the tdata into // TLS_CORE_[01]. - global_asm! { - ".pushsection .text.__pre_init", - ".align 4", - ".p2align 4,,15", - ".global __pre_init", - ".type __pre_init,%function", - - "__pre_init:", - " push {{lr}}", - " ldr r0, ={tls_core_0}", - " ldr r1, =__tdata_start", - " ldr r2, =__tdata_len", - " bl __aeabi_memcpy", - " ldr r0, ={tls_core_1}", - " ldr r1, =__tdata_start", - " ldr r2, =__tdata_len", - " bl __aeabi_memcpy", - " pop {{pc}}", - - ".popsection", - tls_core_0 = sym TLS_CORE_0, - tls_core_1 = sym TLS_CORE_1, + // + // NB: Run as the very first thing, nothing has been initialized and memory + // could be in arbitrary state, so we only deal with things via raw pointers. 
+ #[cortex_m_rt::pre_init] + unsafe fn tls_pre_init_hook() { + for dst in [addr_of_mut!(TLS_CORE_0), addr_of_mut!(TLS_CORE_1)] { + core::ptr::copy(addr_of!(__tdata_start), dst, addr_of!(__tdata_len) as usize); + } } } From 5a547d0234269d6e1972df9786e000dee9b71fd1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 20 Apr 2024 00:28:03 -0700 Subject: [PATCH 4/4] Add an example of per-core state FIXME: nightly only --- rp2040-hal/Cargo.toml | 4 + rp2040-hal/examples/multicore_percore_data.rs | 166 ++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 rp2040-hal/examples/multicore_percore_data.rs diff --git a/rp2040-hal/Cargo.toml b/rp2040-hal/Cargo.toml index b48b4db1a..ae428c8ea 100644 --- a/rp2040-hal/Cargo.toml +++ b/rp2040-hal/Cargo.toml @@ -203,6 +203,10 @@ required-features = ["critical-section-impl"] name = "multicore_polyblink" required-features = ["critical-section-impl"] +[[example]] +name = "multicore_percore_data" +required-features = ["critical-section-impl", "thread_local"] + [[example]] name = "pio_blink" required-features = ["critical-section-impl"] diff --git a/rp2040-hal/examples/multicore_percore_data.rs b/rp2040-hal/examples/multicore_percore_data.rs new file mode 100644 index 000000000..c36968fc1 --- /dev/null +++ b/rp2040-hal/examples/multicore_percore_data.rs @@ -0,0 +1,166 @@ +//! # Multicore Blinking Example +//! +//! This application blinks two LEDs on GPIOs 2 and 3 at different rates (3Hz +//! and 4Hz respectively.) +//! +//! See the `Cargo.toml` file for Copyright and licence details. 
+#![no_std] +//#![cfg(feature = "thread_local")] +#![feature(thread_local)] +#![no_main] + +use core::cell::RefCell; + +use cortex_m::delay::Delay; + +use hal::clocks::Clock; +use hal::gpio::{DynPinId, FunctionSio, Pin, Pins, PullDown, SioOutput}; +use hal::multicore::{Multicore, Stack}; +use hal::sio::Sio; +// Ensure we halt the program on panic (if we don't mention this crate it won't +// be linked) +use panic_halt as _; + +// Alias for our HAL crate +use rp2040_hal as hal; + +// A shorter alias for the Peripheral Access Crate, which provides low-level +// register access +use hal::pac; + +// Some traits we need +use embedded_hal::digital::StatefulOutputPin; + +/// The linker will place this boot block at the start of our program image. We +/// need this to help the ROM bootloader get our code up and running. +/// Note: This boot block is not necessary when using a rp-hal based BSP +/// as the BSPs already perform this step. +#[link_section = ".boot2"] +#[used] +pub static BOOT2: [u8; 256] = rp2040_boot2::BOOT_LOADER_GENERIC_03H; + +/// External high-speed crystal on the Raspberry Pi Pico board is 12 MHz. Adjust +/// if your board has a different frequency +const XTAL_FREQ_HZ: u32 = 12_000_000u32; + +/// The frequency at which core 0 will blink its LED (Hz). +const CORE0_FREQ: u32 = 3; +/// The frequency at which core 1 will blink its LED (Hz). +const CORE1_FREQ: u32 = 4; +/// The delay between each toggle of core 0's LED (us). +const CORE0_DELAY: u32 = 1_000_000 / CORE0_FREQ; +/// The delay between each toggle of core 1's LED (us). +const CORE1_DELAY: u32 = 1_000_000 / CORE1_FREQ; + +/// Stack for core 1 +/// +/// Core 0 gets its stack via the normal route - any memory not used by static +/// values is reserved for stack and initialised by cortex-m-rt. +/// To get the same for Core 1, we would need to compile everything separately +/// and modify the linker file for both programs, and that's quite annoying. 
+/// So instead, core1.spawn takes a [usize] which gets used for the stack. +/// NOTE: We use the `Stack` struct here to ensure that it has 32-byte +/// alignment, which allows the stack guard to take up the least amount of +/// usable RAM. +static mut CORE1_STACK: Stack<4096> = Stack::new(); + +/// State for the blinker +struct BlinkState { + led: Pin<DynPinId, FunctionSio<SioOutput>, PullDown>, + delay: Delay, + delay_time: u32, +} + +/// Per core blinker state +#[thread_local] +static STATE: RefCell<Option<BlinkState>> = RefCell::new(None); + +/// Blink whichever LED with whatever delay, according to the per-core state. +fn blinker() -> ! { + let mut state = STATE.borrow_mut(); + let BlinkState { + led, + delay, + delay_time, + } = state.as_mut().unwrap(); + loop { + led.toggle().unwrap(); + delay.delay_us(*delay_time); + } +} + +/// Entry point to our bare-metal application. +/// +/// The `#[rp2040_hal::entry]` macro ensures the Cortex-M start-up code calls this function +/// as soon as all global variables and the spinlock are initialised. +#[rp2040_hal::entry] +fn main() -> ! 
{ + // Grab our singleton objects + let mut pac = pac::Peripherals::take().unwrap(); + let core = pac::CorePeripherals::take().unwrap(); + + // Set up the watchdog driver - needed by the clock setup code + let mut watchdog = hal::watchdog::Watchdog::new(pac.WATCHDOG); + + // Configure the clocks + let clocks = hal::clocks::init_clocks_and_plls( + XTAL_FREQ_HZ, + pac.XOSC, + pac.CLOCKS, + pac.PLL_SYS, + pac.PLL_USB, + &mut pac.RESETS, + &mut watchdog, + ) + .unwrap(); + + let sys_freq = clocks.system_clock.freq().to_Hz(); + + // Set up the GPIO pins + let mut sio = Sio::new(pac.SIO); + let pins = Pins::new( + pac.IO_BANK0, + pac.PADS_BANK0, + sio.gpio_bank0, + &mut pac.RESETS, + ); + let led1 = pins.gpio2.into_push_pull_output(); + let led2 = pins.gpio3.into_push_pull_output(); + + // Start up the second core to blink the second LED + let mut mc = Multicore::new(&mut pac.PSM, &mut pac.PPB, &mut sio.fifo); + let cores = mc.cores(); + let core1 = &mut cores[1]; + core1 + .spawn(unsafe { &mut CORE1_STACK.mem }, move || { + // Get the second core's copy of the `CorePeripherals`, which are per-core. + // Unfortunately, `cortex-m` doesn't support this properly right now, + // so we have to use `steal`. + let core = unsafe { pac::CorePeripherals::steal() }; + // Set up the delay for the second core. + let delay = Delay::new(core.SYST, sys_freq); + + STATE.borrow_mut().replace(BlinkState { + led: led2.into_dyn_pin(), + delay, + delay_time: CORE1_DELAY, + }); + + // Blink the second LED. + blinker(); + }) + .unwrap(); + + // Set up the delay for the first core. + let delay = Delay::new(core.SYST, sys_freq); + + // Blink the first LED. + STATE.borrow_mut().replace(BlinkState { + led: led1.into_dyn_pin(), + delay, + delay_time: CORE0_DELAY, + }); + blinker(); +} + +// End of file