diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b309b9aa..d8ceb0f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,25 @@ concurrency: cancel-in-progress: true jobs: + compat: + if: github.event.pull_request.draft == false + name: Wasm-compatibility + runs-on: ubuntu-latest + strategy: + matrix: + target: + - wasm32-unknown-unknown + - wasm32-wasi + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + + - name: Download WASM targets + run: rustup target add "${{ matrix.target }}" + # We run WASM build (for tests) which compiles the lib allowing us to have + # `getrandom` as a dev-dependency. + - name: Build + run: cargo build --tests --release --features "bn256-table derive_serde prefetch" --target "${{ matrix.target }}" test: if: github.event.pull_request.draft == false name: Test @@ -41,8 +60,10 @@ jobs: strategy: matrix: include: - - feature: - feature: default + - feature: bn256-table + - feature: derive_serde + - feature: asm steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 diff --git a/Cargo.toml b/Cargo.toml index a5c1730e..0810983f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "halo2curves-axiom" -version = "0.5.3" +version = "0.6.1" authors = ["Privacy Scaling Explorations team", "Taiko Labs", "Intrinsic Technologies"] license = "MIT/Apache-2.0" edition = "2021" @@ -19,6 +19,11 @@ hex = "0.4" rand_chacha = "0.3.1" sha3 = "0.10.8" +# Added to make sure we are able to build the lib in the CI. +# Note that this will never be loaded for someone using this lib as a dependency. 
+[target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies] +getrandom = { version = "0.2", features = ["js"] } + [dependencies] subtle = "2.4" ff = { version = "0.13.0", default-features = false, features = ["std"] } @@ -36,13 +41,13 @@ serde = { version = "1.0", default-features = false, optional = true } serde_arrays = { version = "0.1.0", optional = true } hex = { version = "0.4", optional = true, default-features = false, features = ["alloc", "serde"] } blake2b_simd = "1" -maybe-rayon = { version = "0.1.0", default-features = false } +rayon = "1.8" digest = "0.10.7" sha2 = "0.10.8" +unroll = "0.1.5" [features] -default = ["bits", "multicore", "bn256-table", "derive_serde"] -multicore = ["maybe-rayon/threads"] +default = ["bits", "bn256-table", "derive_serde"] asm = [] bits = ["ff/bits"] bn256-table = [] diff --git a/README.md b/README.md index 3ede071c..a7057af1 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,19 @@ The implementations were originally ported from [matterlabs/pairing](https://git * Various features related to serialization and deserialization of curve points and field elements. * Curve-specific optimizations and benchmarking capabilities. +## Controlling parallelism + +`halo2curves` currently uses [rayon](https://github.com/rayon-rs/rayon) for parallel +computation. + +The `RAYON_NUM_THREADS` environment variable can be used to set the number of +threads. + +When compiling to WASM targets, note that since version `1.7`, `rayon` will automatically fall back (with no need to handle features) to requiring `getrandom` in order to work. +For more information on WASM compilation, see the link below. + +See: [Rayon: Usage with WebAssembly](https://github.com/rayon-rs/rayon#usage-with-webassembly) for more info. + ## Benchmarks Benchmarking is supported through the use of Rust's built-in test framework. 
Benchmarks can be run without assembly optimizations: diff --git a/rust-toolchain b/rust-toolchain index 832e9afb..dc87e8af 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -1.70.0 +1.74.0 diff --git a/src/arithmetic.rs b/src/arithmetic.rs index 4575d5e3..b88adeb5 100644 --- a/src/arithmetic.rs +++ b/src/arithmetic.rs @@ -45,6 +45,30 @@ pub(crate) const fn macx(a: u64, b: u64, c: u64) -> (u64, u64) { (res as u64, (res >> 64) as u64) } +/// Returns a >= b +#[inline(always)] +pub(crate) const fn bigint_geq(a: &[u64; 4], b: &[u64; 4]) -> bool { + if a[3] > b[3] { + return true; + } else if a[3] < b[3] { + return false; + } + if a[2] > b[2] { + return true; + } else if a[2] < b[2] { + return false; + } + if a[1] > b[1] { + return true; + } else if a[1] < b[1] { + return false; + } + if a[0] >= b[0] { + return true; + } + false +} + /// Compute a * b, returning the result. #[inline(always)] pub(crate) fn mul_512(a: [u64; 4], b: [u64; 4]) -> [u64; 8] { diff --git a/src/bn256/fq.rs b/src/bn256/fq.rs index 23da849a..8f96ded8 100644 --- a/src/bn256/fq.rs +++ b/src/bn256/fq.rs @@ -3,7 +3,7 @@ use crate::bn256::assembly::field_arithmetic_asm; #[cfg(not(feature = "asm"))] use crate::{arithmetic::macx, field_arithmetic, field_specific}; -use crate::arithmetic::{adc, mac, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, sbb}; use crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use crate::{ diff --git a/src/bn256/fr.rs b/src/bn256/fr.rs index c256f488..bd418b14 100644 --- a/src/bn256/fr.rs +++ b/src/bn256/fr.rs @@ -18,7 +18,7 @@ pub use table::FR_TABLE; #[cfg(not(feature = "bn256-table"))] use crate::impl_from_u64; -use crate::arithmetic::{adc, mac, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, sbb}; use crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use crate::{ diff --git a/src/derive/field.rs b/src/derive/field.rs index 8d4ef783..24f95ccf 100644 --- 
a/src/derive/field.rs +++ b/src/derive/field.rs @@ -63,73 +63,88 @@ macro_rules! field_common { $crate::ff_ext::jacobi::jacobi::<5>(&self.0, &$modulus.0) } - #[cfg(feature = "asm")] const fn montgomery_form(val: [u64; 4], r: $field) -> $field { // Converts a 4 64-bit limb value into its congruent field representation. // If `val` representes a 256 bit value then `r` should be R^2, // if `val` represents the 256 MSB of a 512 bit value, then `r` should be R^3. - let (r0, carry) = mac(0, val[0], r.0[0], 0); - let (r1, carry) = mac(0, val[0], r.0[1], carry); - let (r2, carry) = mac(0, val[0], r.0[2], carry); - let (r3, r4) = mac(0, val[0], r.0[3], carry); - - let (r1, carry) = mac(r1, val[1], r.0[0], 0); - let (r2, carry) = mac(r2, val[1], r.0[1], carry); - let (r3, carry) = mac(r3, val[1], r.0[2], carry); - let (r4, r5) = mac(r4, val[1], r.0[3], carry); - - let (r2, carry) = mac(r2, val[2], r.0[0], 0); - let (r3, carry) = mac(r3, val[2], r.0[1], carry); - let (r4, carry) = mac(r4, val[2], r.0[2], carry); - let (r5, r6) = mac(r5, val[2], r.0[3], carry); - - let (r3, carry) = mac(r3, val[3], r.0[0], 0); - let (r4, carry) = mac(r4, val[3], r.0[1], carry); - let (r5, carry) = mac(r5, val[3], r.0[2], carry); - let (r6, r7) = mac(r6, val[3], r.0[3], carry); - - // Montgomery reduction - let k = r0.wrapping_mul($inv); - let (_, carry) = mac(r0, k, $modulus.0[0], 0); - let (r1, carry) = mac(r1, k, $modulus.0[1], carry); - let (r2, carry) = mac(r2, k, $modulus.0[2], carry); - let (r3, carry) = mac(r3, k, $modulus.0[3], carry); - let (r4, carry2) = adc(r4, 0, carry); - - let k = r1.wrapping_mul($inv); - let (_, carry) = mac(r1, k, $modulus.0[0], 0); - let (r2, carry) = mac(r2, k, $modulus.0[1], carry); - let (r3, carry) = mac(r3, k, $modulus.0[2], carry); - let (r4, carry) = mac(r4, k, $modulus.0[3], carry); - let (r5, carry2) = adc(r5, carry2, carry); - - let k = r2.wrapping_mul($inv); - let (_, carry) = mac(r2, k, $modulus.0[0], 0); - let (r3, carry) = mac(r3, k, 
$modulus.0[1], carry); - let (r4, carry) = mac(r4, k, $modulus.0[2], carry); - let (r5, carry) = mac(r5, k, $modulus.0[3], carry); - let (r6, carry2) = adc(r6, carry2, carry); - - let k = r3.wrapping_mul($inv); - let (_, carry) = mac(r3, k, $modulus.0[0], 0); - let (r4, carry) = mac(r4, k, $modulus.0[1], carry); - let (r5, carry) = mac(r5, k, $modulus.0[2], carry); - let (r6, carry) = mac(r6, k, $modulus.0[3], carry); - let (r7, carry2) = adc(r7, carry2, carry); - - // Result may be within MODULUS of the correct value - let (d0, borrow) = sbb(r4, $modulus.0[0], 0); - let (d1, borrow) = sbb(r5, $modulus.0[1], borrow); - let (d2, borrow) = sbb(r6, $modulus.0[2], borrow); - let (d3, borrow) = sbb(r7, $modulus.0[3], borrow); - let (_, borrow) = sbb(carry2, 0, borrow); - let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0); - let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry); - let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry); - let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry); + #[cfg(feature = "asm")] + { + let (r0, carry) = mac(0, val[0], r.0[0], 0); + let (r1, carry) = mac(0, val[0], r.0[1], carry); + let (r2, carry) = mac(0, val[0], r.0[2], carry); + let (r3, r4) = mac(0, val[0], r.0[3], carry); + + let (r1, carry) = mac(r1, val[1], r.0[0], 0); + let (r2, carry) = mac(r2, val[1], r.0[1], carry); + let (r3, carry) = mac(r3, val[1], r.0[2], carry); + let (r4, r5) = mac(r4, val[1], r.0[3], carry); + + let (r2, carry) = mac(r2, val[2], r.0[0], 0); + let (r3, carry) = mac(r3, val[2], r.0[1], carry); + let (r4, carry) = mac(r4, val[2], r.0[2], carry); + let (r5, r6) = mac(r5, val[2], r.0[3], carry); + + let (r3, carry) = mac(r3, val[3], r.0[0], 0); + let (r4, carry) = mac(r4, val[3], r.0[1], carry); + let (r5, carry) = mac(r5, val[3], r.0[2], carry); + let (r6, r7) = mac(r6, val[3], r.0[3], carry); + + // Montgomery reduction + let k = r0.wrapping_mul($inv); + let (_, carry) = mac(r0, k, $modulus.0[0], 0); + let (r1, carry) = mac(r1, k, $modulus.0[1], 
carry); + let (r2, carry) = mac(r2, k, $modulus.0[2], carry); + let (r3, carry) = mac(r3, k, $modulus.0[3], carry); + let (r4, carry2) = adc(r4, 0, carry); + + let k = r1.wrapping_mul($inv); + let (_, carry) = mac(r1, k, $modulus.0[0], 0); + let (r2, carry) = mac(r2, k, $modulus.0[1], carry); + let (r3, carry) = mac(r3, k, $modulus.0[2], carry); + let (r4, carry) = mac(r4, k, $modulus.0[3], carry); + let (r5, carry2) = adc(r5, carry2, carry); + + let k = r2.wrapping_mul($inv); + let (_, carry) = mac(r2, k, $modulus.0[0], 0); + let (r3, carry) = mac(r3, k, $modulus.0[1], carry); + let (r4, carry) = mac(r4, k, $modulus.0[2], carry); + let (r5, carry) = mac(r5, k, $modulus.0[3], carry); + let (r6, carry2) = adc(r6, carry2, carry); + + let k = r3.wrapping_mul($inv); + let (_, carry) = mac(r3, k, $modulus.0[0], 0); + let (r4, carry) = mac(r4, k, $modulus.0[1], carry); + let (r5, carry) = mac(r5, k, $modulus.0[2], carry); + let (r6, carry) = mac(r6, k, $modulus.0[3], carry); + let (r7, carry2) = adc(r7, carry2, carry); + + // Result may be within MODULUS of the correct value + let (d0, borrow) = sbb(r4, $modulus.0[0], 0); + let (d1, borrow) = sbb(r5, $modulus.0[1], borrow); + let (d2, borrow) = sbb(r6, $modulus.0[2], borrow); + let (d3, borrow) = sbb(r7, $modulus.0[3], borrow); + let (_, borrow) = sbb(carry2, 0, borrow); + let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0); + let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry); + let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry); + let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry); + + $field([d0, d1, d2, d3]) + } - $field([d0, d1, d2, d3]) + #[cfg(not(feature = "asm"))] + { + let mut val = val; + if bigint_geq(&val, &$modulus.0) { + let mut borrow = 0; + (val[0], borrow) = sbb(val[0], $modulus.0[0], borrow); + (val[1], borrow) = sbb(val[1], $modulus.0[1], borrow); + (val[2], borrow) = sbb(val[2], $modulus.0[2], borrow); + (val[3], _) = sbb(val[3], $modulus.0[3], borrow); + } + $field::mul(&$field(val), 
&r) + } } fn from_u512(limbs: [u64; 8]) -> $field { @@ -150,27 +165,13 @@ macro_rules! field_common { let lower_256 = [limbs[0], limbs[1], limbs[2], limbs[3]]; let upper_256 = [limbs[4], limbs[5], limbs[6], limbs[7]]; - #[cfg(feature = "asm")] - { - Self::montgomery_form(lower_256, $r2) + Self::montgomery_form(upper_256, $r3) - } - #[cfg(not(feature = "asm"))] - { - $field(lower_256) * $r2 + $field(upper_256) * $r3 - } + Self::montgomery_form(lower_256, $r2) + Self::montgomery_form(upper_256, $r3) } /// Converts from an integer represented in little endian /// into its (congruent) `$field` representation. pub const fn from_raw(val: [u64; 4]) -> Self { - #[cfg(feature = "asm")] - { - Self::montgomery_form(val, $r2) - } - #[cfg(not(feature = "asm"))] - { - (&$field(val)).mul(&$r2) - } + Self::montgomery_form(val, $r2) } /// Attempts to convert a little-endian byte representation of @@ -429,31 +430,69 @@ macro_rules! field_arithmetic { } /// Multiplies `rhs` by `self`, returning the result. 
- #[inline] - pub const fn mul(&self, rhs: &Self) -> $field { - // Schoolbook multiplication + #[inline(always)] + #[unroll::unroll_for_loops] + #[allow(unused_assignments)] + pub const fn mul(&self, rhs: &Self) -> Self { + // Fast Coarsely Integrated Operand Scanning (CIOS) as described + // in Algorithm 2 of EdMSM: https://eprint.iacr.org/2022/1400.pdf + // + // Cannot use the fast version (algorithm 2) if + // modulus_high_word >= (WORD_SIZE - 1) / 2 - 1 = (2^64 - 1)/2 - 1 + + if $modulus.0[3] < (u64::MAX / 2) { + const N: usize = 4; + let mut t: [u64; N] = [0u64; N]; + let mut c_2: u64; + for i in 0..4 { + let mut c: u64 = 0u64; + for j in 0..4 { + (t[j], c) = mac(t[j], self.0[j], rhs.0[i], c); + } + c_2 = c; + + let m = t[0].wrapping_mul(INV); + (_, c) = macx(t[0], m, $modulus.0[0]); + + for j in 1..4 { + (t[j - 1], c) = mac(t[j], m, $modulus.0[j], c); + } + (t[N - 1], _) = adc(c_2, c, 0); + } + + if bigint_geq(&t, &$modulus.0) { + let mut borrow = 0; + (t[0], borrow) = sbb(t[0], $modulus.0[0], borrow); + (t[1], borrow) = sbb(t[1], $modulus.0[1], borrow); + (t[2], borrow) = sbb(t[2], $modulus.0[2], borrow); + (t[3], borrow) = sbb(t[3], $modulus.0[3], borrow); + } + $field(t) + } else { + // Schoolbook multiplication - let (r0, carry) = mac(0, self.0[0], rhs.0[0], 0); - let (r1, carry) = mac(0, self.0[0], rhs.0[1], carry); - let (r2, carry) = mac(0, self.0[0], rhs.0[2], carry); - let (r3, r4) = mac(0, self.0[0], rhs.0[3], carry); + let (r0, carry) = mac(0, self.0[0], rhs.0[0], 0); + let (r1, carry) = mac(0, self.0[0], rhs.0[1], carry); + let (r2, carry) = mac(0, self.0[0], rhs.0[2], carry); + let (r3, r4) = mac(0, self.0[0], rhs.0[3], carry); - let (r1, carry) = mac(r1, self.0[1], rhs.0[0], 0); - let (r2, carry) = mac(r2, self.0[1], rhs.0[1], carry); - let (r3, carry) = mac(r3, self.0[1], rhs.0[2], carry); - let (r4, r5) = mac(r4, self.0[1], rhs.0[3], carry); + let (r1, carry) = mac(r1, self.0[1], rhs.0[0], 0); + let (r2, carry) = mac(r2, self.0[1], rhs.0[1], 
carry); + let (r3, carry) = mac(r3, self.0[1], rhs.0[2], carry); + let (r4, r5) = mac(r4, self.0[1], rhs.0[3], carry); - let (r2, carry) = mac(r2, self.0[2], rhs.0[0], 0); - let (r3, carry) = mac(r3, self.0[2], rhs.0[1], carry); - let (r4, carry) = mac(r4, self.0[2], rhs.0[2], carry); - let (r5, r6) = mac(r5, self.0[2], rhs.0[3], carry); + let (r2, carry) = mac(r2, self.0[2], rhs.0[0], 0); + let (r3, carry) = mac(r3, self.0[2], rhs.0[1], carry); + let (r4, carry) = mac(r4, self.0[2], rhs.0[2], carry); + let (r5, r6) = mac(r5, self.0[2], rhs.0[3], carry); - let (r3, carry) = mac(r3, self.0[3], rhs.0[0], 0); - let (r4, carry) = mac(r4, self.0[3], rhs.0[1], carry); - let (r5, carry) = mac(r5, self.0[3], rhs.0[2], carry); - let (r6, r7) = mac(r6, self.0[3], rhs.0[3], carry); + let (r3, carry) = mac(r3, self.0[3], rhs.0[0], 0); + let (r4, carry) = mac(r4, self.0[3], rhs.0[1], carry); + let (r5, carry) = mac(r5, self.0[3], rhs.0[2], carry); + let (r6, r7) = mac(r6, self.0[3], rhs.0[3], carry); - $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7]) + $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7]) + } } /// Subtracts `rhs` from `self`, returning the result. 
diff --git a/src/ed25519/fq.rs b/src/ed25519/fq.rs index fed7e413..5d04442a 100644 --- a/src/ed25519/fq.rs +++ b/src/ed25519/fq.rs @@ -8,7 +8,7 @@ use subtle::{Choice, ConditionallySelectable, ConstantTimeEq, CtOption}; #[cfg(feature = "derive_serde")] use serde::{Deserialize, Serialize}; -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; /// This represents an element of $\mathbb{F}_q$ where /// diff --git a/src/ed25519/fr.rs b/src/ed25519/fr.rs index 4ef3ab4b..e91ed4fe 100644 --- a/src/ed25519/fr.rs +++ b/src/ed25519/fr.rs @@ -8,7 +8,7 @@ use subtle::{Choice, ConditionallySelectable, ConstantTimeEq, CtOption}; #[cfg(feature = "derive_serde")] use serde::{Deserialize, Serialize}; -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; /// This represents an element of $\mathbb{F}_q$ where /// diff --git a/src/fft.rs b/src/fft.rs index 6eb3487e..00eca39a 100644 --- a/src/fft.rs +++ b/src/fft.rs @@ -1,4 +1,3 @@ -use crate::multicore; pub use crate::{CurveAffine, CurveExt}; use ff::Field; use group::{GroupOpsOwned, ScalarMulOwned}; @@ -38,7 +37,7 @@ pub fn best_fft>(a: &mut [G], omega: Scalar, r } - let threads = multicore::current_num_threads(); + let threads = rayon::current_num_threads(); let log_threads = threads.ilog2(); let n = a.len(); assert_eq!(n, 1 << log_n); @@ -107,7 +106,7 @@ pub fn recursive_butterfly_arithmetic>( a[1] -= &t; } else { let (left, right) = a.split_at_mut(n / 2); - multicore::join( + rayon::join( || recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles), || recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles), ); diff --git a/src/lib.rs b/src/lib.rs index 36f1fcda..3397043d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,6 @@ pub mod ff_ext; pub mod fft; pub mod hash_to_curve; pub mod msm; -pub mod multicore; pub mod serde; pub mod bls12_381; diff --git a/src/msm.rs b/src/msm.rs index 
1a3709c1..25af9711 100644 --- a/src/msm.rs +++ b/src/msm.rs @@ -1,10 +1,14 @@ use std::ops::Neg; +use crate::CurveAffine; +use ff::Field; use ff::PrimeField; use group::Group; -use pasta_curves::arithmetic::CurveAffine; +use rayon::iter::{ + IndexedParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator, +}; -use crate::multicore; +const BATCH_SIZE: usize = 64; fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 { // Booth encoding: @@ -50,6 +54,238 @@ fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 { } } +fn batch_add( + size: usize, + buckets: &mut [BucketAffine], + points: &[SchedulePoint], + bases: &[Affine], +) { + let mut t = vec![C::Base::ZERO; size]; + let mut z = vec![C::Base::ZERO; size]; + let mut acc = C::Base::ONE; + + for ( + ( + SchedulePoint { + base_idx, + buck_idx, + sign, + }, + t, + ), + z, + ) in points.iter().zip(t.iter_mut()).zip(z.iter_mut()) + { + *z = buckets[*buck_idx].x() - bases[*base_idx].x; + if *sign { + *t = acc * (buckets[*buck_idx].y() - bases[*base_idx].y); + } else { + *t = acc * (buckets[*buck_idx].y() + bases[*base_idx].y); + } + acc *= *z; + } + + acc = acc.invert().unwrap(); + + for ( + ( + SchedulePoint { + base_idx, + buck_idx, + sign, + }, + t, + ), + z, + ) in points.iter().zip(t.iter()).zip(z.iter()).rev() + { + let lambda = acc * t; + acc *= z; + + let x = lambda.square() - (buckets[*buck_idx].x() + bases[*base_idx].x); + if *sign { + buckets[*buck_idx].set_y(&((lambda * (bases[*base_idx].x - x)) - bases[*base_idx].y)); + } else { + buckets[*buck_idx].set_y(&((lambda * (bases[*base_idx].x - x)) + bases[*base_idx].y)); + } + buckets[*buck_idx].set_x(&x); + } +} + +#[derive(Debug, Clone, Copy)] +struct Affine { + x: C::Base, + y: C::Base, +} + +impl Affine { + fn from(point: &C) -> Self { + let coords = point.coordinates().unwrap(); + + Self { + x: *coords.x(), + y: *coords.y(), + } + } + + fn neg(&self) -> Self { + Self { + x: 
self.x, + y: -self.y, + } + } + + fn eval(&self) -> C { + C::from_xy(self.x, self.y).unwrap() + } +} + +#[derive(Debug, Clone)] +enum BucketAffine { + None, + Point(Affine), +} + +#[derive(Debug, Clone)] +enum Bucket { + None, + Point(C::Curve), +} + +impl Bucket { + fn add_assign(&mut self, point: &C, sign: bool) { + *self = match *self { + Bucket::None => Bucket::Point({ + if sign { + point.to_curve() + } else { + point.to_curve().neg() + } + }), + Bucket::Point(a) => { + if sign { + Self::Point(a + point) + } else { + Self::Point(a - point) + } + } + } + } + + fn add(&self, other: &BucketAffine) -> C::Curve { + match (self, other) { + (Self::Point(this), BucketAffine::Point(other)) => *this + other.eval(), + (Self::Point(this), BucketAffine::None) => *this, + (Self::None, BucketAffine::Point(other)) => other.eval().to_curve(), + (Self::None, BucketAffine::None) => C::Curve::identity(), + } + } +} + +impl BucketAffine { + fn assign(&mut self, point: &Affine, sign: bool) -> bool { + match *self { + Self::None => { + *self = Self::Point(if sign { *point } else { point.neg() }); + true + } + Self::Point(_) => false, + } + } + + fn x(&self) -> C::Base { + match self { + Self::None => panic!("::x None"), + Self::Point(a) => a.x, + } + } + + fn y(&self) -> C::Base { + match self { + Self::None => panic!("::y None"), + Self::Point(a) => a.y, + } + } + + fn set_x(&mut self, x: &C::Base) { + match self { + Self::None => panic!("::set_x None"), + Self::Point(ref mut a) => a.x = *x, + } + } + + fn set_y(&mut self, y: &C::Base) { + match self { + Self::None => panic!("::set_y None"), + Self::Point(ref mut a) => a.y = *y, + } + } +} + +struct Schedule { + buckets: Vec>, + set: [SchedulePoint; BATCH_SIZE], + ptr: usize, +} + +#[derive(Debug, Clone, Default)] +struct SchedulePoint { + base_idx: usize, + buck_idx: usize, + sign: bool, +} + +impl SchedulePoint { + fn new(base_idx: usize, buck_idx: usize, sign: bool) -> Self { + Self { + base_idx, + buck_idx, + sign, + } + } +} + 
+impl Schedule { + fn new(c: usize) -> Self { + let set = (0..BATCH_SIZE) + .map(|_| SchedulePoint::default()) + .collect::>() + .try_into() + .unwrap(); + + Self { + buckets: vec![BucketAffine::None; 1 << (c - 1)], + set, + ptr: 0, + } + } + + fn contains(&self, buck_idx: usize) -> bool { + self.set.iter().any(|sch| sch.buck_idx == buck_idx) + } + + fn execute(&mut self, bases: &[Affine]) { + if self.ptr != 0 { + batch_add(self.ptr, &mut self.buckets, &self.set, bases); + self.ptr = 0; + self.set + .iter_mut() + .for_each(|sch| *sch = SchedulePoint::default()); + } + } + + fn add(&mut self, bases: &[Affine], base_idx: usize, buck_idx: usize, sign: bool) { + if !self.buckets[buck_idx].assign(&bases[base_idx], sign) { + self.set[self.ptr] = SchedulePoint::new(base_idx, buck_idx, sign); + self.ptr += 1; + } + + if self.ptr == self.set.len() { + self.execute(bases); + } + } +} + pub fn multiexp_serial(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) { let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect(); @@ -123,30 +359,6 @@ pub fn multiexp_serial(coeffs: &[C::Scalar], bases: &[C], acc: & } } -/// Performs a small multi-exponentiation operation. -/// Uses the double-and-add algorithm with doublings shared across points. -pub fn small_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve { - let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect(); - let mut acc = C::Curve::identity(); - - // for byte idx - for byte_idx in (0..32).rev() { - // for bit idx - for bit_idx in (0..8).rev() { - acc = acc.double(); - // for each coeff - for coeff_idx in 0..coeffs.len() { - let byte = coeffs[coeff_idx].as_ref()[byte_idx]; - if ((byte >> bit_idx) & 1) != 0 { - acc += bases[coeff_idx]; - } - } - } - } - - acc -} - /// Performs a multi-exponentiation operation. /// /// This function will panic if coeffs and bases have a different length. 
@@ -155,12 +367,12 @@ pub fn small_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::C pub fn best_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve { assert_eq!(coeffs.len(), bases.len()); - let num_threads = multicore::current_num_threads(); + let num_threads = rayon::current_num_threads(); if coeffs.len() > num_threads { let chunk = coeffs.len() / num_threads; let num_chunks = coeffs.chunks(chunk).len(); let mut results = vec![C::Curve::identity(); num_chunks]; - multicore::scope(|scope| { + rayon::scope(|scope| { let chunk = coeffs.len() / num_threads; for ((coeffs, bases), acc) in coeffs @@ -180,142 +392,96 @@ pub fn best_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Cu acc } } +/// Performs a multi-exponentiation operation. +/// This function will panic if coeffs and bases have a different length. +/// +/// This will use multithreading if beneficial. +pub fn best_multiexp_independent_points( + coeffs: &[C::Scalar], + bases: &[C], +) -> C::Curve { + assert_eq!(coeffs.len(), bases.len()); -#[cfg(test)] -mod test { - - use std::ops::Neg; - - use crate::{ - bn256::{Fr, G1Affine, G1}, - multicore, + // TODO: consider adjusting it with empirical data? 
+ let c = if bases.len() < 4 { + 1 + } else if bases.len() < 32 { + 3 + } else { + (f64::from(bases.len() as u32)).ln().ceil() as usize }; - use ark_std::{end_timer, start_timer}; - use ff::{Field, PrimeField}; - use group::{Curve, Group}; - use pasta_curves::arithmetic::CurveAffine; - use rand_core::OsRng; - - // keeping older implementation it here for baseline comparison, debugging & benchmarking - fn best_multiexp(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve { - assert_eq!(coeffs.len(), bases.len()); - let num_threads = multicore::current_num_threads(); - if coeffs.len() > num_threads { - let chunk = coeffs.len() / num_threads; - let num_chunks = coeffs.chunks(chunk).len(); - let mut results = vec![C::Curve::identity(); num_chunks]; - multicore::scope(|scope| { - let chunk = coeffs.len() / num_threads; - - for ((coeffs, bases), acc) in coeffs - .chunks(chunk) - .zip(bases.chunks(chunk)) - .zip(results.iter_mut()) - { - scope.spawn(move |_| { - multiexp_serial(coeffs, bases, acc); - }); - } - }); - results.iter().fold(C::Curve::identity(), |a, b| a + b) - } else { - let mut acc = C::Curve::identity(); - multiexp_serial(coeffs, bases, &mut acc); - acc - } + if c < 10 { + return best_multiexp(coeffs, bases); } - // keeping older implementation it here for baseline comparision, debugging & benchmarking - fn multiexp_serial(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) { - let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect(); - - let c = if bases.len() < 4 { - 1 - } else if bases.len() < 32 { - 3 - } else { - (f64::from(bases.len() as u32)).ln().ceil() as usize - }; - - fn get_at(segment: usize, c: usize, bytes: &F::Repr) -> usize { - let skip_bits = segment * c; - let skip_bytes = skip_bits / 8; + // coeffs to byte representation + let coeffs: Vec<_> = coeffs.par_iter().map(|a| a.to_repr()).collect(); + // copy bases into `Affine` to skip the on-curve check on every access + let bases_local: Vec<_> = 
bases.par_iter().map(Affine::from).collect(); - if skip_bytes >= 32 { - return 0; - } - - let mut v = [0; 8]; - for (v, o) in v.iter_mut().zip(bytes.as_ref()[skip_bytes..].iter()) { - *v = *o; + // number of windows + let number_of_windows = C::Scalar::NUM_BITS as usize / c + 1; + // accumulator for each window + let mut acc = vec![C::Curve::identity(); number_of_windows]; + acc.par_iter_mut().enumerate().rev().for_each(|(w, acc)| { + // jacobian buckets for already scheduled points + let mut j_bucks = vec![Bucket::::None; 1 << (c - 1)]; + + // scheduler for affine addition + let mut sched = Schedule::new(c); + + for (base_idx, coeff) in coeffs.iter().enumerate() { + let buck_idx = get_booth_index(w, c, coeff.as_ref()); + + if buck_idx != 0 { + // parse bucket index + let sign = buck_idx.is_positive(); + let buck_idx = buck_idx.unsigned_abs() as usize - 1; + + if sched.contains(buck_idx) { + // greedy accumulation + // we use original bases here + j_bucks[buck_idx].add_assign(&bases[base_idx], sign); + } else { + // also flushes the schedule if full + sched.add(&bases_local, base_idx, buck_idx, sign); + } } - - let mut tmp = u64::from_le_bytes(v); - tmp >>= skip_bits - (skip_bytes * 8); - tmp %= 1 << c; - - tmp as usize } - let segments = (256 / c) + 1; - - for current_segment in (0..segments).rev() { - for _ in 0..c { - *acc = acc.double(); - } - - #[derive(Clone, Copy)] - enum Bucket { - None, - Affine(C), - Projective(C::Curve), - } + // flush the schedule + sched.execute(&bases_local); - impl Bucket { - fn add_assign(&mut self, other: &C) { - *self = match *self { - Bucket::None => Bucket::Affine(*other), - Bucket::Affine(a) => Bucket::Projective(a + *other), - Bucket::Projective(mut a) => { - a += *other; - Bucket::Projective(a) - } - } - } + // summation by parts + // e.g. 
3a + 2b + 1c = a + + // (a) + b + + // ((a) + b) + c + let mut running_sum = C::Curve::identity(); + for (j_buck, a_buck) in j_bucks.iter().zip(sched.buckets.iter()).rev() { + running_sum += j_buck.add(a_buck); + *acc += running_sum; + } - fn add(self, mut other: C::Curve) -> C::Curve { - match self { - Bucket::None => other, - Bucket::Affine(a) => { - other += a; - other - } - Bucket::Projective(a) => other + a, - } - } - } + // shift accumulator to the window position + for _ in 0..c * w { + *acc = acc.double(); + } + }); + acc.into_iter().sum::<_>() +} - let mut buckets: Vec> = vec![Bucket::None; (1 << c) - 1]; +#[cfg(test)] +mod test { - for (coeff, base) in coeffs.iter().zip(bases.iter()) { - let coeff = get_at::(current_segment, c, coeff); - if coeff != 0 { - buckets[coeff - 1].add_assign(base); - } - } + use std::ops::Neg; - // Summation by parts - // e.g. 3a + 2b + 1c = a + - // (a) + b + - // ((a) + b) + c - let mut running_sum = C::Curve::identity(); - for exp in buckets.into_iter().rev() { - running_sum = exp.add(running_sum); - *acc += &running_sum; - } - } - } + use crate::bn256::{Fr, G1Affine, G1}; + use ark_std::{end_timer, start_timer}; + use ff::{Field, PrimeField}; + use group::{Curve, Group}; + use pasta_curves::arithmetic::CurveAffine; + use rand_core::OsRng; #[test] fn test_booth_encoding() { @@ -379,21 +545,19 @@ mod test { let points = &points[..1 << k]; let scalars = &scalars[..1 << k]; - let t0 = start_timer!(|| format!("w/ booth k={}", k)); - let e0 = super::best_multiexp(scalars, points); + let t0 = start_timer!(|| format!("cyclone k={}", k)); + let e0 = super::best_multiexp_independent_points(scalars, points); end_timer!(t0); - let t1 = start_timer!(|| format!("w/o booth k={}", k)); - let e1 = best_multiexp(scalars, points); + let t1 = start_timer!(|| format!("older k={}", k)); + let e1 = super::best_multiexp(scalars, points); end_timer!(t1); - assert_eq!(e0, e1); } } #[test] fn test_msm_cross() { - run_msm_cross::(10, 18); - // 
run_msm_cross::(19, 23); + run_msm_cross::(14, 22); } } diff --git a/src/multicore.rs b/src/multicore.rs deleted file mode 100644 index d8323553..00000000 --- a/src/multicore.rs +++ /dev/null @@ -1,16 +0,0 @@ -pub use maybe_rayon::{ - iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator}, - join, scope, Scope, -}; - -#[cfg(feature = "multicore")] -pub use maybe_rayon::{ - current_num_threads, - iter::{IndexedParallelIterator, IntoParallelRefIterator}, - slice::ParallelSliceMut, -}; - -#[cfg(not(feature = "multicore"))] -pub fn current_num_threads() -> usize { - 1 -} diff --git a/src/secp256k1/fp.rs b/src/secp256k1/fp.rs index 1538544f..cb3493f0 100644 --- a/src/secp256k1/fp.rs +++ b/src/secp256k1/fp.rs @@ -1,4 +1,4 @@ -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; use crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use crate::{ diff --git a/src/secp256k1/fq.rs b/src/secp256k1/fq.rs index 09087227..f013f61a 100644 --- a/src/secp256k1/fq.rs +++ b/src/secp256k1/fq.rs @@ -1,4 +1,4 @@ -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; use crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use crate::{ diff --git a/src/secp256r1/fp.rs b/src/secp256r1/fp.rs index f3497c81..6669de11 100644 --- a/src/secp256r1/fp.rs +++ b/src/secp256r1/fp.rs @@ -1,4 +1,4 @@ -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; use crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use crate::{ diff --git a/src/secp256r1/fq.rs b/src/secp256r1/fq.rs index 86005d35..b96f05d5 100644 --- a/src/secp256r1/fq.rs +++ b/src/secp256r1/fq.rs @@ -1,4 +1,4 @@ -use crate::arithmetic::{adc, mac, macx, sbb}; +use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb}; use 
crate::extend_field_legendre; use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup}; use core::fmt;