diff --git a/Cargo.toml b/Cargo.toml index b90f4199b..bcad6488d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ members = [ "poly-benches", "test-curves", "test-templates", + + "scripts/glv-lattice-basis", ] [profile.release] diff --git a/ec/Cargo.toml b/ec/Cargo.toml index d255e6819..9432e0e88 100644 --- a/ec/Cargo.toml +++ b/ec/Cargo.toml @@ -19,9 +19,25 @@ ark-ff = { path = "../ff", default-features = false } derivative = { version = "2", features = ["use_core"] } num-traits = { version = "0.2", default-features = false } rayon = { version = "1", optional = true } +itertools = { version = "0.9.0", default-features = false } +either = { version = "1.6.0", default-features = false } +thread-id = { version = "3.3.0", optional = true } +backtrace = { version = "0.3", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +peekmore = "0.5.6" +closure = { version = "0.3.0", optional = true } +lazy_static = { version = "1.4.0", optional = true } +serde_json = { version = "1.0.58", optional = true } +dirs = { version = "1.0.5", optional = true } +log = { version = "0.4.11", optional = true } +paste = "0.1" zeroize = { version = "1", default-features = false, features = ["zeroize_derive"] } +[dev-dependencies] +rand_xorshift = "0.2" + [features] +cuda = [ "std", "parallel", "accel", "lazy_static", "serde_json", "dirs", "closure", "log" ] default = [] std = [ "ark-std/std", "ark-ff/std", "ark-serialize/std" ] parallel = [ "std", "rayon", "ark-std/parallel" ] diff --git a/ec/src/batch_arith.rs b/ec/src/batch_arith.rs new file mode 100644 index 000000000..6a5ba0529 --- /dev/null +++ b/ec/src/batch_arith.rs @@ -0,0 +1,309 @@ +use crate::AffineCurve; +use ark_ff::{biginteger::BigInteger, fields::Field}; +use ark_std::{ops::Neg, vec::Vec}; +use either::Either; +use num_traits::Zero; + +/// We use a batch size that is big enough to amortise the cost of the actual +/// inversion close to zero while not straining the CPU cache by generating and +/// fetching from large w-NAF tables and slices [G] +pub const BATCH_AFFINE_BATCH_SIZE: usize = 4096; + +/// We code this in the second operand for the `batch_add_in_place_read_only` +/// method utilised in the `batch_scalar_mul_in_place` method. +/// 0 == Identity; 1 == Neg; 2 == GLV; 3 == GLV + Neg +pub const ENDO_CODING_BITS: usize = 2; + +#[inline(always)] +pub fn decode_endo_from_u32(index_code: u32) -> (usize, u8) { + ( + index_code as usize >> ENDO_CODING_BITS, + index_code as u8 % 4, + ) +} + +pub trait BatchGroupArithmetic +where + Self: Sized + Clone + Copy + Zero + Neg, +{ + type BaseFieldForBatch: Field; + + // We use the w-NAF method, achieving point density of approximately 1/(w + 1) + // and requiring storage of only 2^(w - 1). + // Refer to e.g. Improved Techniques for Fast Exponentiation, Section 4 + // Bodo M¨oller 2002. 
https://www.bmoeller.de/pdf/fastexp-icisc2002.pdf + + /// Computes [[p_1, 3 * p_1, ..., (2^w - 1) * p_1], ..., [p_n, 3*p_n, ..., + /// (2^w - 1) p_n]] We need to manipulate the offsets when using the + /// table + fn batch_wnaf_tables(bases: &[Self], w: usize) -> Vec { + let half_size = 1 << (w - 1); + let batch_size = bases.len(); + + let mut two_a = bases.to_vec(); + let instr = (0..batch_size).map(|x| x as u32).collect::>(); + Self::batch_double_in_place(&mut two_a, &instr[..], None); + + let mut tables = Vec::::with_capacity(half_size * batch_size); + tables.extend_from_slice(bases); + let mut scratch_space = Vec::>::with_capacity((batch_size - 1) / 2 + 1); + + for i in 1..half_size { + let instr = (0..batch_size) + .map(|x| (((i - 1) * batch_size + x) as u32, x as u32)) + .collect::>(); + Self::batch_add_write_read_self( + &two_a[..], + &instr[..], + &mut tables, + &mut scratch_space, + ); + } + tables + } + + /// Computes the vectorised version of the wnaf integer recoding + /// Optionally takes a slice of booleans which indicate whether that + /// scalar is negative. If so, it negates the recoding. + /// Mutates scalars in place + fn batch_wnaf_opcode_recoding( + scalars: &mut [BigInt], + w: usize, + negate: Option<&[bool]>, + ) -> Vec>> { + debug_assert!(w > 0); + let batch_size = scalars.len(); + let window_size: i16 = 1 << (w + 1); + let half_window_size: i16 = 1 << w; + + let mut op_code_vectorised = Vec::>>::with_capacity(BigInt::NUM_LIMBS * 64); + + let mut all_none = false; + + if negate.is_some() { + debug_assert_eq!(scalars.len(), negate.unwrap().len()); // precompute + } + + let f = false; + while !all_none { + let iter = match negate { + None => Either::Left(core::iter::repeat(&f).take(batch_size)), + Some(bools) => Either::Right(bools.iter()), + }; + let mut opcode_row = Vec::with_capacity(batch_size); + for (s, &neg) in scalars.iter_mut().zip(iter) { + if s.is_zero() { + opcode_row.push(None); + } else { + let op = if s.is_odd() { + let mut z: i16 = (s.as_ref()[0] % (1 << (w + 1))) as i16; + if z < half_window_size { + s.sub_noborrow(&BigInt::from(z as u64)); + } else { + z = z - window_size; + s.add_nocarry(&BigInt::from((-z) as u64)); + } + if neg { + -z + } else { + z + } + } else { + 0 + }; + opcode_row.push(Some(op)); + s.div2(); + } + } + all_none = opcode_row.iter().all(|x| x.is_none()); + if !all_none { + op_code_vectorised.push(opcode_row); + } + } + op_code_vectorised + } + + // We define a series of batched primitive EC ops, each of which is most + // suitable to a given scenario. + // + // We encode the indexes as u32s to save on fetch latency via better cacheing. + // The principle we are applying is that the len of the batch ops should + // never exceed about 2^20, and the table size would never exceed 2^10, so + // 32 bits will always be enough + + /// Mutates bases to be doubled in place + /// Accepts optional scratch space which might help by reducing the + /// number of heap allocations for the Vector-based scratch_space + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + scratch_space: Option<&mut Vec>, + ); + + /// Mutates bases in place and stores result in the first operand. + /// The element corresponding to the second operand becomes junk data. + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]); + + /// Mutates bases in place and stores result in bases. + /// The elements in other become junk data. 
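+    ///
+    /// A hypothetical usage sketch (the slices, indices and the curve type
+    /// `G` below are made up purely for illustration):
+    /// ```ignore
+    /// // Perform bases[0] += other[1] and bases[2] += other[0] as one batch,
+    /// // sharing a single field inversion across both affine additions.
+    /// let index = [(0u32, 1u32), (2u32, 0u32)];
+    /// G::batch_add_in_place(&mut bases[..], &mut other[..], &index[..]);
+    /// // other[0] and other[1] now hold junk data and must not be read.
+    /// ```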
+ fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]); + + /// Adds elements in bases with elements in other (for instance, a table), + /// utilising a scratch space to store intermediate results. + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ); + + /// Lookups up group elements according to index, and either adds and writes + /// or simply writes them to new_elems, using scratch space to store + /// intermediate values. Scratch space is always cleared after use. + + /// No-ops, or copies of the elem in the slice `lookup` in the position of + /// the index of the first operand to the new_elems vector, are encoded + /// as !0u32 in the index for the second operand + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + /// Similar to batch_add_write, only that the lookup for the first operand + /// is performed in new_elems rather than lookup + + /// No-ops, or copies of the elem in the slice `lookup` in the position of + /// the index of the first operand to the new_elems vector, are encoded + /// as !0u32 in the index for the second operand + fn batch_add_write_read_self( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + /// Performs a batch scalar multiplication using the w-NAF encoding + /// utilising the primitive batched ops + fn batch_scalar_mul_in_place( + mut bases: &mut [Self], + scalars: &mut [BigInt], + w: usize, + ) { + let batch_size = bases.len(); + let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); + let tables = Self::batch_wnaf_tables(bases, w); + + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + for opcode_row in opcode_vectorised.iter().rev() { + let index_double: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|x| x.1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place(&mut bases, &index_double[..], None); + + let mut add_ops: Vec = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + tables[(idx as usize) / 2 * batch_size + i].clone() + } else { + tables[(-idx as usize) / 2 * batch_size + i].clone().neg() + } + }) + .collect(); + + let index_add: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|x| x.0) + .enumerate() + .map(|(x, y)| (y as u32, x as u32)) + .collect(); + + Self::batch_add_in_place(&mut bases, &mut add_ops[..], &index_add[..]); + } + } + + /// Chunks vectorised instructions into a size that does not require + /// storing a lot of intermediate state + fn get_chunked_instr(instr: &[T], batch_size: usize) -> Vec> { + let mut res = Vec::new(); + + let rem = instr.chunks_exact(batch_size).remainder(); + let mut chunks = instr.chunks_exact(batch_size).peekable(); + + if chunks.len() == 0 { + res.push(rem.to_vec()); + } + + while let Some(chunk) = chunks.next() { + let chunk = if chunks.peek().is_none() { + [chunk, rem].concat() + } else { + chunk.to_vec() + }; + res.push(chunk); + } + res + } +} + +/// We make the syntax for performing batch ops on slices cleaner +/// by defining a corresponding trait and impl for [G] rather than on G +pub trait BatchGroupArithmeticSlice { + fn batch_double_in_place(&mut self, index: &[u32]); + + fn 
batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]); + + fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]); + + fn batch_add_write( + &self, + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + fn batch_scalar_mul_in_place(&mut self, scalars: &mut [BigInt], w: usize); +} + +impl BatchGroupArithmeticSlice for [G] { + fn batch_double_in_place(&mut self, index: &[u32]) { + G::batch_double_in_place(self, index, None); + } + + fn batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]) { + G::batch_add_in_place_same_slice(self, index); + } + + fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]) { + G::batch_add_in_place(self, other, index); + } + + fn batch_add_write( + &self, + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + G::batch_add_write(self, index, new_elems, scratch_space); + } + + fn batch_scalar_mul_in_place(&mut self, scalars: &mut [BigInt], w: usize) { + G::batch_scalar_mul_in_place(self, scalars, w); + } +} diff --git a/ec/src/batch_verify.rs b/ec/src/batch_verify.rs new file mode 100644 index 000000000..ad4105edc --- /dev/null +++ b/ec/src/batch_verify.rs @@ -0,0 +1,181 @@ +use crate::{ + batch_bucketed_add, AffineCurve, BatchGroupArithmeticSlice, BucketPosition, PrimeField, + ProjectiveCurve, BATCH_AFFINE_BATCH_SIZE, +}; +use ark_ff::fields::FpParameters; +use ark_std::{cfg_chunks_mut, fmt, vec::Vec, rand::Rng}; +use num_traits::identities::Zero; + +#[cfg(feature = "parallel")] +use {rand::thread_rng, rayon::prelude::*}; + +#[derive(Debug, Clone)] +pub struct VerificationError; + +impl fmt::Display for VerificationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Verification Error. Not in subgroup") + } +} + +fn verify_points( + points: &[C], + num_buckets: usize, + _new_security_param: Option, // Only pass new_security_param if possibly recursing + rng: &mut R, +) -> Result<(), VerificationError> { + let n_points = points.len(); + let mut bucket_assign = Vec::with_capacity(points.len()); + for i in 0..n_points { + bucket_assign.push(BucketPosition { + bucket: rng.gen_range(0, num_buckets) as u32, + position: i as u32, + }); + } + let mut buckets = batch_bucketed_add(num_buckets, &mut points.to_vec(), &mut bucket_assign[..]); + + // We use the batch_scalar_mul to check the subgroup condition if + // there are sufficient number of buckets. For SW curves, the number + // elems for the batch mul to become useful is around 2^24. 
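+    //
+    // In either branch below, the check is the same: for every bucket B we
+    // verify that r * B is the identity, where r = C::ScalarField::modulus()
+    // is the prime subgroup order. Each bucket is a random subset-sum of the
+    // input points, so a non-identity result exposes, with high probability,
+    // at least one point lying outside the prime-order subgroup.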
+ let verification_failure = if num_buckets >= BATCH_AFFINE_BATCH_SIZE { + cfg_chunks_mut!(buckets, BATCH_AFFINE_BATCH_SIZE).for_each(|e| { + let length = e.len(); + e[..].batch_scalar_mul_in_place::<::BigInt>( + &mut vec![C::ScalarField::modulus().into(); length][..], + 4, + ); + }); + !buckets.iter().all(|&p| p.is_zero()) + } else { + !buckets + .iter() + .all(|&b| b.into_projective().mul(C::ScalarField::modulus()).is_zero()) + }; + if verification_failure { + return Err(VerificationError); + } + Ok(()) +} + +fn run_rounds( + points: &[C], + num_buckets: usize, + num_rounds: usize, + new_security_param: Option, + rng: &mut R, +) -> Result<(), VerificationError> { + #[cfg(feature = "parallel")] + if num_rounds > 2 { + use std::sync::Arc; + let ref_points = Arc::new(points.to_vec()); + let mut threads = vec![]; + for _ in 0..num_rounds { + let ref_points_thread = ref_points.clone(); + // We only use std when a multicore environment is available + threads.push(std::thread::spawn( + move || -> Result<(), VerificationError> { + let mut rng = &mut thread_rng(); + verify_points( + &ref_points_thread[..], + num_buckets, + new_security_param, + &mut rng, + )?; + Ok(()) + }, + )); + } + for thread in threads { + thread.join().unwrap()?; + } + } else { + for _ in 0..num_rounds { + verify_points(points, num_buckets, new_security_param, rng)?; + } + } + + #[cfg(not(feature = "parallel"))] + for _ in 0..num_rounds { + verify_points(points, num_buckets, new_security_param, rng)?; + } + + Ok(()) +} + +pub fn batch_verify_in_subgroup( + points: &[C], + security_param: usize, + rng: &mut R, +) -> Result<(), VerificationError> { + #[cfg(feature = "std")] + let cost_estimate = (::Params::MODULUS_BITS as f64 + * (0.5 * 7.0 / 6.0 * 0.8 + 1.0 / 5.0)) + .ceil() as usize; + #[cfg(not(feature = "std"))] + let cost_estimate = ::Params::MODULUS_BITS as usize * 5 / 4; + + let (num_buckets, num_rounds, _) = get_max_bucket( + security_param, + points.len(), + // We estimate the costs of a single scalar multiplication in the batch affine, w-NAF GLV + // case as 7/6 * 0.5 * n_bits * 0.8 (doubling) + 0.5 * 1/(w + 1) * n_bits + // (addition) We take into account that doubling in the batch add model is cheaper + // as it requires less cache use + cost_estimate, + ); + run_rounds(points, num_buckets, num_rounds, None, rng)?; + Ok(()) +} + +/// We get the greatest power of 2 number of buckets such that we minimise the +/// number of rounds while satisfying the constraint that +/// n_rounds * buckets * next_check_per_elem_cost < n +fn get_max_bucket( + security_param: usize, + n_elems: usize, + next_check_per_elem_cost: usize, +) -> (usize, usize, usize) { + #[cfg(feature = "std")] + { + let mut log2_num_buckets = 1f64; + let num_rounds = |log2_num_buckets: f64| -> usize { + (security_param as f64 / log2_num_buckets).ceil() as usize + }; + + while num_rounds(log2_num_buckets) + * next_check_per_elem_cost + * (2f64.powf(log2_num_buckets).ceil() as usize) + < n_elems + && num_rounds(log2_num_buckets + 0.1) > 1 + { + log2_num_buckets += 0.1; + } + ( + 2f64.powf(log2_num_buckets).ceil() as usize, // number of buckets + num_rounds(log2_num_buckets), // number of rounds + log2_num_buckets.ceil() as usize, // new security param + ) + } + + #[cfg(not(feature = "std"))] + { + let mut log2_num_buckets: u32 = 1; + let num_rounds = |log2_num_buckets: u32| -> usize { + (security_param - 1) / (log2_num_buckets as usize) + 1 + }; + + while num_rounds(log2_num_buckets) + * next_check_per_elem_cost + * (2_i32.pow(log2_num_buckets) as usize) + 
< n_elems + && num_rounds(log2_num_buckets + 1) > 1 + { + log2_num_buckets += 1; + } + ( + 2_i32.pow(log2_num_buckets) as usize, // number of buckets + num_rounds(log2_num_buckets), // number of rounds + log2_num_buckets as usize, // new security param + ) + } +} diff --git a/ec/src/bucketed_add.rs b/ec/src/bucketed_add.rs new file mode 100644 index 000000000..e5711d9d0 --- /dev/null +++ b/ec/src/bucketed_add.rs @@ -0,0 +1,213 @@ +use crate::{BatchGroupArithmeticSlice, BATCH_AFFINE_BATCH_SIZE}; + +use ark_std::vec::Vec; + +use crate::AffineCurve; + +#[derive(Copy, Clone, Debug)] +pub struct BucketPosition { + pub bucket: u32, + pub position: u32, +} + +/// The objective of this function is to identify an addition tree of +/// independent elliptic curve group additions for each bucket, and to batch the +/// independent additions using the batch affine inversion method. + +/// The strategy taken is to sort a list of bucket assignments of all the +/// elements (which we can for most intents and purposes, think of as being +/// uniformly random) by bucket, so that indices corresponding to elements that +/// must be added together are physically collocated in memory. Then, in the +/// first round, we proceed to perform independent additions producing +/// intermediate results at the greatest depth for each addition tree (each +/// corresponding to a bucket), and write the result to a new vector. We do so +/// to improve cache locality for future rounds, and take advantage of the +/// CPU-intensive nature of elliptic curve operations along with prfetching to +/// hide the latency of reading from essentially random locations in memory. + +/// Subsequently, we perform the additions in place, and the second operands +/// become junk data. Finally, when we only have the buckets left (no more +/// additions left to perform), we copy the result into a destination `res` +/// slice. +#[inline] +pub fn batch_bucketed_add( + buckets: usize, + elems: &[C], + bucket_positions: &mut [BucketPosition], +) -> Vec { + assert_eq!(elems.len(), bucket_positions.len()); + assert!(elems.len() > 0); + + // We sort the bucket positions so that indices of elements assigned + // to the same bucket are continguous. This way, we can easily identify + // how to construct the addition tree for that bucket. + bucket_positions.sort_unstable_by_key(|x| x.bucket); + + let mut len = bucket_positions.len(); + let mut all_ones = true; + let mut new_len = 0; // len counter + let mut glob = 0; // global counters + let mut loc = 1; // local counter + let mut batch = 0; // batch counter + let mut instr = Vec::<(u32, u32)>::with_capacity(BATCH_AFFINE_BATCH_SIZE); + let mut new_elems = Vec::::with_capacity(elems.len() * 3 / 8); + + let mut scratch_space = Vec::>::with_capacity(BATCH_AFFINE_BATCH_SIZE / 2); + + // In the first loop, we copy the results of the first in place addition tree + // to a local vector, new_elems + // Subsequently, we perform all the operations in place + while glob < len { + let current_bucket = bucket_positions[glob].bucket; + // We are iterating over elements using a global `glob` counter, and counting + // how many in a row are being assigned to the same bucket, using the `loc` + // counter. 
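+        //
+        // For example (made-up data), if the sorted bucket assignments are
+        // [0, 0, 0, 1, 1, 2], the runs found here have loc = 3, 2 and 1:
+        // the first run emits one pairwise-add instruction plus one lone
+        // copy, the second a single pairwise add, and the third only a copy.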
+ while glob + 1 < len && bucket_positions[glob + 1].bucket == current_bucket { + glob += 1; + loc += 1; + } + // If the current bucket exceeds buckets, it encodes a noop + if current_bucket >= buckets as u32 { + loc = 1; + } else if loc > 1 { + // all ones is false if next len is not 1 + + // in other words, we have not reached the terminating + // condition that after the current round of addition + // there is only one element left in each addition tree + + // This would be the case, if each addition tree had at + // most 2 elements in the current round. + if loc > 2 { + all_ones = false; + } + let is_odd = loc % 2 == 1; + let half = loc / 2; + // We encode instructions to add adjacent elements + for i in 0..half { + instr.push(( + bucket_positions[glob - (loc - 1) + 2 * i].position, + bucket_positions[glob - (loc - 1) + 2 * i + 1].position, + )); + // Compactification of buckets + bucket_positions[new_len + i] = BucketPosition { + bucket: current_bucket, + position: (new_len + i) as u32, + }; + } + // If there are an odd number of elements, the lone element + // without a partner will be copied over to the `new_elems` + // vector, a noop which is encoded as !0u32 + if is_odd { + instr.push((bucket_positions[glob].position, !0u32)); + bucket_positions[new_len + half] = BucketPosition { + bucket: current_bucket, + position: (new_len + half) as u32, + }; + } + // Reset the local_counter and update state + + // We compactify the `bucket_positions` data by shifing left + // `new_len` is the len of the current compactified vector. + + // We also update the `batch` counter to decide when it is + // optimal to invoke the batch inversion, i.e. when we have + // accumulated enough independent additions. + new_len += half + (loc % 2); + batch += half; + loc = 1; + + if batch >= BATCH_AFFINE_BATCH_SIZE / 2 { + // We need instructions for copying data in the case + // of noops. We encode noops/copies as !0u32 + elems[..].batch_add_write(&instr[..], &mut new_elems, &mut scratch_space); + + instr.clear(); + batch = 0; + } + } else { + instr.push((bucket_positions[glob].position, !0u32)); + bucket_positions[new_len] = BucketPosition { + bucket: current_bucket, + position: new_len as u32, + }; + new_len += 1; + } + glob += 1; + } + if instr.len() > 0 { + elems[..].batch_add_write(&instr[..], &mut new_elems, &mut scratch_space); + instr.clear(); + } + glob = 0; + batch = 0; + loc = 1; + len = new_len; + new_len = 0; + + // We repeat the above procedure, except, since we are performing the addition + // trees in place, we do not need to encode noops to force a copy to a new + // vector. 
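+    //
+    // For example, a bucket that started with 5 elements was reduced to 3
+    // partial sums by the copying pass above; the in-place rounds below then
+    // reduce it to 2 and finally to 1. `all_ones` only stays true once every
+    // remaining addition tree has at most 2 elements, i.e. the current round
+    // of pairwise additions finishes every bucket.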
+ while !all_ones { + all_ones = true; + while glob < len { + let current_bucket = bucket_positions[glob].bucket; + while glob + 1 < len && bucket_positions[glob + 1].bucket == current_bucket { + glob += 1; + loc += 1; + } + if current_bucket >= buckets as u32 { + loc = 1; + } else if loc > 1 { + // all ones is false if next len is not 1 + if loc != 2 { + all_ones = false; + } + let is_odd = loc % 2 == 1; + let half = loc / 2; + for i in 0..half { + instr.push(( + bucket_positions[glob - (loc - 1) + 2 * i].position, + bucket_positions[glob - (loc - 1) + 2 * i + 1].position, + )); + bucket_positions[new_len + i] = bucket_positions[glob - (loc - 1) + 2 * i]; + } + if is_odd { + bucket_positions[new_len + half] = bucket_positions[glob]; + } + // Reset the local_counter and update state + new_len += half + (loc % 2); + batch += half; + loc = 1; + + if batch >= BATCH_AFFINE_BATCH_SIZE / 2 { + &mut new_elems[..].batch_add_in_place_same_slice(&instr[..]); + instr.clear(); + batch = 0; + } + } else { + bucket_positions[new_len] = bucket_positions[glob]; + new_len += 1; + } + glob += 1; + } + if instr.len() > 0 { + &mut new_elems[..].batch_add_in_place_same_slice(&instr[..]); + instr.clear(); + } + glob = 0; + batch = 0; + loc = 1; + len = new_len; + new_len = 0; + } + + let zero = C::zero(); + let mut res = vec![zero; buckets]; + + for i in 0..len { + let (pos, buc) = (bucket_positions[i].position, bucket_positions[i].bucket); + res[buc as usize] = new_elems[pos as usize]; + } + res +} diff --git a/ec/src/cuda/accel_dummy.rs b/ec/src/cuda/accel_dummy.rs new file mode 100644 index 000000000..6acbe17cf --- /dev/null +++ b/ec/src/cuda/accel_dummy.rs @@ -0,0 +1,9 @@ +use ark_std::vec::Vec; + +pub mod error { + pub type Result = T; +} + +pub struct Context {} + +pub type DeviceMemory = Vec; diff --git a/ec/src/cuda/mod.rs b/ec/src/cuda/mod.rs new file mode 100644 index 000000000..f2dc0829d --- /dev/null +++ b/ec/src/cuda/mod.rs @@ -0,0 +1,6 @@ +#[macro_use] +pub mod scalar_mul; +pub use scalar_mul::*; + +#[cfg(not(feature = "cuda"))] +pub mod accel_dummy; diff --git a/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs b/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs new file mode 100644 index 000000000..b979a0658 --- /dev/null +++ b/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs @@ -0,0 +1,286 @@ +// TODO: make this more generic +#[macro_export] +macro_rules! 
impl_gpu_cpu_run_kernel { + () => { + #[allow(unused_qualifications)] + fn init_gpu_cache_dir() -> Result { + #[cfg(feature = "cuda")] + { + let dir = dirs::cache_dir() + .unwrap() + .join("zexe-algebra") + .join("cuda-scalar-mul-profiler") + .join(P::namespace()); + std::fs::create_dir_all(&dir)?; + Ok(dir.to_str().unwrap().to_string()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_qualifications)] + fn read_profile_data() -> Result { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let data = std::fs::read_to_string(&dir.join("profile_data.txt"))?; + Ok(data) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + fn clear_gpu_profiling_data() -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + std::fs::File::create(&dir.join("profile_data.txt"))?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_variables)] + fn write_profile_data(profile_data: &str) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let mut file = std::fs::File::create(&dir.join("profile_data.txt"))?; + file.write_all(profile_data.as_bytes())?; + file.sync_all()?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + /// We split up the job statically between the CPU and GPUs + /// based on continuous profiling stored both in a static location in memory + /// that is lost the moment the progam stops running. + /// and also a txt file in the OS' cache dir. + + /// Only one such procedure should be running at any time. + #[allow(unused_variables)] + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + if !Device::init() { + panic!("Do not call this function unless the device has been checked to initialise successfully"); + } + let n_devices = Device::get_count().unwrap(); + let n = bases_h.len(); + // Create references so we can split the slices + let mut res_ref = &mut bases_h[..]; + let mut exps_h_ref = exps_h; + + // Get data for proportion of total throughput achieved by each device + let _ = Self::init_gpu_cache_dir()?; + + let arc_mutex = P::scalar_mul_static_profiler(); + let mut profile_data = arc_mutex.lock().unwrap(); + let mut proportions: Vec = profile_data.0.clone(); + + // If the program has just been initialised, we must check for the existence of existing + // cached profile data. 
If it does not exist, we create a new file + if proportions.is_empty() { + let _ = Self::read_profile_data() + .and_then(|s| { let res = serde_json::from_str(&s).map_err(|_| crate::CudaScalarMulError::ProfilingDeserializationError)?; Ok(res) }) + .and_then(|cached_data| { + *profile_data = cached_data; + proportions = profile_data.0.clone(); + Ok(()) + } + ); + } + + if proportions.is_empty() { + // By default we split the work evenly between devices and host + proportions = vec![1.0 / (n_devices as f64 + 1.0); n_devices]; + } + + assert_eq!(proportions.len(), n_devices); + // Allocate the number of elements in the job to each device/host + let n_gpus = proportions.iter().map(|r| (r * n as f64).round() as usize).collect::>(); + let n_cpu = n - n_gpus.iter().sum::(); + + // Create storage for buffers and contexts for variable number of devices + let mut bases_split = Vec::with_capacity(n_devices); + let mut tables = Vec::with_capacity(n_devices); + let mut exps = Vec::with_capacity(n_devices); + let mut ctxs = Vec::with_capacity(n_devices); + let (mut time_cpu, mut times_gpu) = (0, vec![0; n_devices]); + + // Split data and generate tables and u8 scalar encoding in device memory + for (i, &num) in n_gpus.iter().enumerate() { + let device = Device::nth(i).unwrap(); + let ctx = device.create_context(); + + let (lower, upper) = res_ref.split_at_mut(num); + res_ref = upper; + let lower_exps = &exps_h_ref[..num]; + exps_h_ref = &exps_h_ref[num..]; + + let mut table = DeviceMemory::::zeros(&ctx, num * Self::table_size()); + let mut exp = DeviceMemory::::zeros(&ctx, num * Self::num_u8()); + + Self::generate_tables_and_recoding(lower, &mut table[..], lower_exps, &mut exp[..]); + + ctxs.push((device, ctx)); + bases_split.push(lower); + tables.push(table); + exps.push(exp); + }; + + let jobs_result: std::sync::Arc>> = std::sync::Arc::new(Mutex::new(Ok(()))); + + rayon::scope(|s| { + // Run jobs on GPUs + for (i, (bases_gpu, time_gpu)) in bases_split.iter_mut().zip(times_gpu.iter_mut()).enumerate() { + let n_gpu = n_gpus[i]; + let ctx = &ctxs[i].1; + let table = &tables[i]; + let exp = &exps[i]; + + let jobs_result_inner = jobs_result.clone(); + + s.spawn(move |_| { + let now = std::time::Instant::now(); + + let mut out = DeviceMemory::::zeros(ctx, n_gpu); + let result = P::scalar_mul_kernel( + ctx, + (n_gpu - 1) / cuda_group_size + 1, // grid + cuda_group_size, // block + table.as_ptr(), exp.as_ptr(), out.as_mut_ptr(), n_gpu as isize + ).map_err(|_| crate::CudaScalarMulError::KernelFailedError); + if result.is_err() { + *jobs_result_inner.lock().unwrap() = result; + return; + } + Self::batch_normalization(&mut out[..]); + bases_gpu.clone_from_slice(&out.par_iter().map(|p| p.into_affine()).collect::>()[..]); + *time_gpu = now.elapsed().as_micros(); + }); + } + + // Run on CPU + s.spawn(|_| { + let now = std::time::Instant::now(); + + let exps_mut = &mut exps_h_ref.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in res_ref.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + + time_cpu = now.elapsed().as_micros(); + }); + }); + + // It's safe to do this, since after the rayon scope we only have one reference. 
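+                // Each GPU task stored any kernel failure in the shared
+                // `jobs_result`; unwrapping the Arc and then the Mutex here
+                // lets a recorded error propagate via `?`.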
+ std::sync::Arc::try_unwrap(jobs_result).unwrap().into_inner().unwrap()?; + + // Update global microbenchmarking state + debug!("CUDA old profile_data: {:?}", profile_data); + let cpu_throughput = n_cpu as f64 / time_cpu as f64; + let gpu_throughputs = n_gpus + .iter() + .zip(times_gpu.iter()) + .map(|(n_gpu, time_gpu)| { + *n_gpu as f64 / *time_gpu as f64 + }) + .collect::>(); + let total_throughput = cpu_throughput + gpu_throughputs.iter().sum::(); + let n_data_points = profile_data.1 as f64; + profile_data.1 += 1; + let new_proportions = gpu_throughputs.iter().map(|t| t / total_throughput); + + if !profile_data.0.is_empty() { + profile_data.0 = new_proportions.zip(profile_data.0.clone()).map(|(new, old)| { + (new + n_data_points * old) / profile_data.1 as f64 + }).collect(); + } else { + profile_data.0 = new_proportions.collect(); + } + + // Update cached profiling data on disk + let s: String = serde_json::to_string(&(*profile_data)).map_err(|_| crate::CudaScalarMulError::ProfilingSerializationError)?; + Self::write_profile_data(&s)?; + + debug!("CUDA new profile_data: {:?}", profile_data); + } + + Ok(()) + } + + #[allow(unused_variables)] + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec<::Affine> { + #[cfg(feature = "cuda")] + { + let mut bases_res = bases_h.to_vec(); + let queue = Mutex::new(bases_res.chunks_mut(job_size).zip(exps_h.chunks(job_size)).peekmore()); + + rayon::scope(|s| { + // We launch two concurrent GPU threads that block on waiting for GPU to hide latency + for i in 0..2 { + s.spawn(closure!(move i, ref queue, |_| { + std::thread::sleep(std::time::Duration::from_millis(i * 500)); + let mut iter = queue.lock().unwrap(); + while let Some((bases, exps)) = iter.next() { + iter.peek(); + if iter.peek().is_none() { break; } + let mut proj_res = Self::par_run_kernel_sync(ctx, bases, exps, cuda_group_size, iter); + Self::batch_normalization(&mut proj_res[..]); + bases.clone_from_slice(&proj_res.par_iter().map(|p| p.into_affine()).collect::>()[..]); + iter = queue.lock().unwrap(); + } + })); + } + + s.spawn(|_| { + std::thread::sleep(std::time::Duration::from_millis(20)); + let mut iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + while let Some((bases, exps)) = iter.next() { + let exps_mut = &mut exps.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in bases.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + // Sleep to allow other threads to unlock + drop(iter); + debug!("CUDA unlocked cpu"); + std::thread::sleep(std::time::Duration::from_millis(20)); + iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + } + debug!("CUDA cpu finish"); + }); + }); + drop(queue); + bases_res + } + + #[cfg(not(feature = "cuda"))] + Vec::new() + } + } +} diff --git a/ec/src/cuda/scalar_mul/kernel_macros.rs b/ec/src/cuda/scalar_mul/kernel_macros.rs new file mode 100644 index 000000000..005bec24e --- /dev/null +++ b/ec/src/cuda/scalar_mul/kernel_macros.rs @@ -0,0 +1,177 @@ +#[macro_export] +macro_rules! impl_scalar_mul_kernel { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! 
{ + #[cfg(feature = "cuda")] + use {accel::*, ark_std::{sync::{Arc, Mutex}, vec::Vec}}; + + #[cfg(not(feature = "cuda"))] + use ark_ec::accel_dummy::*; + + use ark_ec::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! { + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((Vec::new(), 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("ark_ff" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ff", default_features = false})] + #[dependencies("ark_ec" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ec", default_features = false})] + #[dependencies("curve" = { git = "https://github.com/arkworks-rs/curves", branch = "master", package = $curve_string, features = ["curve"], default_features = false})] + pub mod scalar_mul { + use curve::$ProjCurve; + use ark_ec::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const NUM_U8: isize = (NUM_BITS - 1) / LOG2_W + 1; + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + + for j in 1..NUM_U8 as isize { + for _ in 0..LOG2_W { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + j) as isize)); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! impl_scalar_mul_kernel_glv { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, ark_std::{sync::{Arc, Mutex}, vec::Vec}}; + + #[cfg(not(feature = "cuda"))] + use ark_ec::accel_dummy::*; + + use ark_ec::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((Vec::new(), 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("ark_ff" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ff", default_features = false})] + #[dependencies("ark_ec" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ec", default_features = false})] + #[dependencies("curve" = { git = "https://github.com/arkworks-rs/curves", branch = "master", package = $curve_string, features = ["curve"], default_features = false })] + pub mod scalar_mul { + use curve::$ProjCurve; + use {ark_ec::ProjectiveCurve, ark_ff::{PrimeField, FpParameters, Zero}}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const HALF_TABLE_SIZE: isize = 1 << (LOG2_W - 1); + const NUM_U8: isize = 2 * ((NUM_BITS - 1) / (2 * (LOG2_W - 1)) + 2); + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + HALF_TABLE_SIZE + *exps.offset(i * NUM_U8 + 1) as isize, + )); + + for j in 1..NUM_U8 as isize / 2 { + for _ in 0..(LOG2_W - 1) { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + 2 * j) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + + HALF_TABLE_SIZE + + *exps.offset(i * NUM_U8 + 2 * j + 1) as isize, + )); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! 
impl_scalar_mul_parameters { + ($ProjCurve:ident) => { + #[allow(unused_variables)] + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const $ProjCurve, + exps: *const u8, + out: *mut $ProjCurve, + n: isize, + ) -> error::Result<()> { + #[cfg(feature = "cuda")] + scalar_mul(ctx, grid, block, (table, exps, out, n)) + } + + fn scalar_mul_static_profiler() -> ScalarMulProfiler { + #[cfg(feature = "cuda")] + return (*MICROBENCH_CPU_GPU_AVG_RATIO).clone(); + + #[cfg(not(feature = "cuda"))] + MICROBENCH_CPU_GPU_AVG_RATIO + } + + fn namespace() -> &'static str { + NAMESPACE + } + }; +} diff --git a/ec/src/cuda/scalar_mul/mod.rs b/ec/src/cuda/scalar_mul/mod.rs new file mode 100644 index 000000000..89855fd48 --- /dev/null +++ b/ec/src/cuda/scalar_mul/mod.rs @@ -0,0 +1,355 @@ +#[cfg(feature = "cuda")] +use std::sync::{Arc, Mutex}; + +use ark_ff::fields::PrimeField; +use ark_std::cfg_chunks_mut; +use core::fmt; + +use crate::{AffineCurve, BatchGroupArithmeticSlice}; +use internal::GPUScalarMulInternal; + +#[macro_use] +mod kernel_macros; +pub use kernel_macros::*; + +#[macro_use] +mod cpu_gpu_macros; + +#[macro_use] +mod run_kernel_macros; + +#[cfg(feature = "cuda")] +pub type ScalarMulProfiler = Arc, usize)>>; +#[cfg(not(feature = "cuda"))] +pub type ScalarMulProfiler = (); + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +pub const MAX_GROUP_ELEM_BYTES: usize = 400; + +#[derive(Debug)] +pub enum CudaScalarMulError { + CudaDisabledError, + IoError, + KernelFailedError, + ProfilingSerializationError, + ProfilingDeserializationError, +} + +#[cfg(feature = "std")] +impl std::error::Error for CudaScalarMulError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +#[cfg(feature = "std")] +impl From for CudaScalarMulError { + fn from(_: std::io::Error) -> Self { + CudaScalarMulError::IoError + } +} + +impl fmt::Display for CudaScalarMulError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + match self { + CudaScalarMulError::CudaDisabledError => write!(f, "CUDA is disabled"), + CudaScalarMulError::IoError => write!(f, "IO error"), + CudaScalarMulError::KernelFailedError => write!(f, "Failed running kernel"), + CudaScalarMulError::ProfilingSerializationError => { + write!(f, "Failed serlializing profiling data") + } + CudaScalarMulError::ProfilingDeserializationError => { + write!(f, "Failed deserializing profiling data") + } + } + } +} + +pub trait GPUScalarMul: GPUScalarMulInternal { + fn clear_gpu_profiling_data() { + #[cfg(feature = "cuda")] + >::clear_gpu_profiling_data() + .expect("Should have cleared GPU profiling data"); + } + + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + elems: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + // CUDA will return ILLEGAL_ADRESS if group elem size is too large. 
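+            // Dispatch sketch: if a CUDA device initialises and the group
+            // element representation fits under MAX_GROUP_ELEM_BYTES, the
+            // work is statically partitioned between the host and the
+            // devices; otherwise we fall back to the pure-CPU batched w-NAF
+            // path in the else branch.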
+ if accel::Device::init() && core::mem::size_of::() < MAX_GROUP_ELEM_BYTES { + ::Projective::cpu_gpu_static_partition_run_kernel( + elems, + exps_h, + cuda_group_size, + cpu_chunk_size, + )?; + } else { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + } + + #[cfg(not(feature = "cuda"))] + { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + + Ok(()) + } +} + +impl GPUScalarMul for G::Projective {} + +pub(crate) mod internal { + use ark_std::{string::String, vec::Vec}; + + #[cfg(feature = "cuda")] + use accel::*; + + #[cfg(not(feature = "cuda"))] + use crate::accel_dummy::*; + + use crate::{AffineCurve, CudaScalarMulError}; + use ark_ff::fields::PrimeField; + + #[allow(unused_variables)] + pub trait GPUScalarMulInternal: Sized { + const NUM_BITS: usize; + const LOG2_W: usize; + + fn table_size() -> usize { + 1 << Self::LOG2_W + } + + fn num_u8() -> usize; + + fn init_gpu_cache_dir() -> Result; + fn read_profile_data() -> Result; + fn write_profile_data(profile_data: &str) -> Result<(), CudaScalarMulError>; + fn clear_gpu_profiling_data() -> Result<(), CudaScalarMulError>; + + fn par_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory; + + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory; + + fn generate_tables_and_recoding( + bases_h: &[G], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ); + + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec; + + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; + } +} + +#[macro_export] +macro_rules! impl_gpu_sw_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective

{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn num_u8() -> usize { + if P::has_glv() { + 2 * ((Self::NUM_BITS - 1) / (2 * (Self::LOG2_W - 1)) + 2) + } else { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + } + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + if P::has_glv() { + let scalar_recode_glv = + |k1: &mut <::ScalarField as PrimeField>::BigInt, k2: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let table_size_glv: u64 = 1u64 << (Self::LOG2_W - 1); + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8() / 2).rev() { + out[2 * i] = (k1.as_ref()[0] % table_size_glv) as u8; + out[2 * i + 1] = (k2.as_ref()[0] % table_size_glv) as u8; + k1.divn(Self::LOG2_W as u32 - 1); + k2.divn(Self::LOG2_W as u32 - 1); + } + assert!(k1.is_zero()); + assert!(k2.is_zero()); + out + }; + + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let ((k1_neg, mut k1), (k2_neg, mut k2)) = + P::glv_scalar_decomposition(*k); + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode_glv(&mut k1, &mut k2)); + + table[0] = Self::zero(); + table[Self::table_size() / 2] = Self::zero(); + + for i in 1..Self::table_size() / 2 { + let mut res = if k1_neg { + table[i - 1] - base + } else { + table[i - 1] + base + }; + table[i] = res; + + P::glv_endomorphism_in_place(&mut res.x); + table[Self::table_size() / 2 + i] = + if k2_neg != k1_neg { res.neg() } else { res }; + } + }); + } else { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + }); + } + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +#[macro_export] +macro_rules! impl_gpu_te_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective

{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + } + ); + } + + fn num_u8() -> usize { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +pub trait GPUScalarMulSlice { + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; +} + +impl GPUScalarMulSlice for [G] { + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + G::Projective::cpu_gpu_scalar_mul(self, exps_h, cuda_group_size, cpu_chunk_size) + } +} diff --git a/ec/src/cuda/scalar_mul/run_kernel_macros.rs b/ec/src/cuda/scalar_mul/run_kernel_macros.rs new file mode 100644 index 000000000..4545243d0 --- /dev/null +++ b/ec/src/cuda/scalar_mul/run_kernel_macros.rs @@ -0,0 +1,86 @@ +#[macro_export] +macro_rules! 
impl_run_kernel { + () => { + // We drop a lock only after the parallel portion has been handled + #[allow(unused_variables)] + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables_h = vec![Self::zero(); n * Self::table_size()]; + let mut exps_recode_h = vec![0u8; n * Self::num_u8()]; + + Self::generate_tables_and_recoding( + bases_h, + &mut tables_h[..], + exps_h, + &mut exps_recode_h[..], + ); + drop(lock); + + let mut out = DeviceMemory::::zeros(&ctx, n); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + + tables.copy_from_slice(&tables_h); + exps.copy_from_slice(&exps_recode_h); + + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + + #[allow(unused_variables)] + fn par_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + let mut out = DeviceMemory::::zeros(&ctx, n); + + Self::generate_tables_and_recoding(bases_h, &mut tables[..], exps_h, &mut exps[..]); + + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + }; +} diff --git a/ec/src/glv.rs b/ec/src/glv.rs new file mode 100644 index 000000000..37566741c --- /dev/null +++ b/ec/src/glv.rs @@ -0,0 +1,128 @@ +use crate::ModelParameters; +use ark_ff::{biginteger::BigInteger, fields::PrimeField}; +use core::ops::Neg; + +/// The GLV parameters here require the following conditions to be satisfied: +/// 1. MODULUS_BITS < NUM_LIMBS * 64 - 1. So 2 * n < 1 << (64 * NUM_LIMBS) +/// We also assume that (|b1| + 2) * (|b2| + 2) < 2 * n +/// We also know that either B1 is neg or B2 is. +pub trait GLVParameters: Send + Sync + 'static + ModelParameters { + type WideBigInt: BigInteger; + + const LAMBDA: Self::ScalarField; // lambda in ZZ s.t. 
phi(P) = lambda*P for all P + const OMEGA: Self::BaseField; // phi((x, y)) = (\omega x, y) + const Q1: ::BigInt; // round(R*|b2|/n) + const Q2: ::BigInt; // round(R*|b1|/n) + const B1: ::BigInt; // |b1| + const B2: ::BigInt; // |b2| + const B1_IS_NEG: bool; + + const R_BITS: u32; + + #[inline] + fn glv_scalar_decomposition_inner( + k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + let limbs = ::BigInt::NUM_LIMBS; + let modulus = Self::ScalarField::modulus(); + + // If we are doing a subgroup check, we should multiply by the original scalar + // since the GLV decomposition does not guarantee that we would not be + // adding and subtracting back to zero + if k == modulus { + return ( + (false, k), + (false, ::BigInt::from(0)), + ); + } + + let mut half = Self::WideBigInt::from(1); + half.muln(Self::R_BITS - 1); + + let mut c1_wide = Self::WideBigInt::mul_no_reduce(k.as_ref(), Self::Q1.as_ref()); + // add half to achieve rounding rather than flooring + c1_wide.add_nocarry(&half); + // Approximation to round(|b2|*k/n) + c1_wide.divn(Self::R_BITS); + let c1 = &c1_wide.as_ref()[..limbs]; + + let mut c2_wide = Self::WideBigInt::mul_no_reduce(k.as_ref(), Self::Q2.as_ref()); + c2_wide.add_nocarry(&half); + c2_wide.divn(Self::R_BITS); + let c2 = &c2_wide.as_ref()[..limbs]; + + // We first assume that the final 2 bits of the representation for the modulus + // is not set, so that 2 * n < R = 1 << (64 * NUM_LIMBS). + + // wlog c1 = round(k * round(|b_1|R / n) / R) < ceil(k * ceil(|b_1|* R / n) / R) + // < k * (b_1 * R / n + 1) / R + 1 < b_1 * k / n + 2 < b_1 + 2, so a + // bound like (|b1| + 2) * (|b2| + 2) < 2 * n is good enough for wlog d1 + // < 2 * n + let mut d1 = + ::BigInt::mul_no_reduce_lo(&c1, Self::B1.as_ref()); + if d1 > modulus { + d1.sub_noborrow(&modulus); + } + let mut d2 = + ::BigInt::mul_no_reduce_lo(&c2, Self::B2.as_ref()); + if d2 > modulus { + d2.sub_noborrow(&modulus); + } + // We compute k_2 = -(c1.b1 + c1.b1) = sign(b1)*(c2|b2| - c1|b1|) = sign(b1)(d2 + // - d1) + let k2_field = if !Self::B1_IS_NEG { + Self::ScalarField::from_repr(d2).unwrap() - &Self::ScalarField::from_repr(d1).unwrap() + } else { + Self::ScalarField::from_repr(d1).unwrap() - &Self::ScalarField::from_repr(d2).unwrap() + }; + + let k1 = + (Self::ScalarField::from_repr(k).unwrap() - &(k2_field * &Self::LAMBDA)).into_repr(); + let k2 = k2_field.into_repr(); + + let (neg2, k2) = if k2.num_bits() > Self::R_BITS / 2 + 1 { + (true, k2_field.neg().into_repr()) + } else { + (false, k2) + }; + + let (neg1, k1) = if k1.num_bits() > Self::R_BITS / 2 + 1 { + ( + true, + Self::ScalarField::from_repr(k1).unwrap().neg().into_repr(), + ) + } else { + (false, k1) + }; + + ((neg1, k1), (neg2, k2)) + } +} + +#[macro_export] +macro_rules! 
impl_glv_for_sw { + () => { + #[inline(always)] + fn has_glv() -> bool { + true + } + + #[inline(always)] + fn glv_endomorphism_in_place(elem: &mut Self::BaseField) { + *elem *= &::OMEGA; + } + + #[inline] + fn glv_scalar_decomposition( + k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + ::glv_scalar_decomposition_inner(k) + } + }; +} diff --git a/ec/src/lib.rs b/ec/src/lib.rs index 87712cfc3..ceccaca15 100644 --- a/ec/src/lib.rs +++ b/ec/src/lib.rs @@ -29,11 +29,28 @@ use ark_std::{ use num_traits::Zero; use zeroize::Zeroize; +pub mod batch_verify; +pub use self::batch_verify::*; + +pub mod batch_arith; +pub use self::batch_arith::*; + +pub mod bucketed_add; +pub use self::bucketed_add::*; + +#[macro_use] +pub mod glv; +pub use self::glv::*; + pub mod models; pub use self::models::*; pub mod group; +#[macro_use] +pub mod cuda; +pub use cuda::*; + pub mod msm; pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + PartialEq { @@ -44,6 +61,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G1Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G1. @@ -59,6 +77,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G2Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G2. @@ -145,6 +164,7 @@ pub trait ProjectiveCurve: + core::iter::Sum + for<'a> core::iter::Sum<&'a Self> + From<::Affine> + + GPUScalarMul<::Affine> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField; @@ -214,6 +234,8 @@ pub trait ProjectiveCurve: self = res; self } + + fn get_x(&mut self) -> &mut Self::BaseField; } /// Affine representation of an elliptic curve point guaranteed to be @@ -238,6 +260,7 @@ pub trait AffineCurve: + Neg + Zeroize + From<::Projective> + + BatchGroupArithmetic::BaseField> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; @@ -245,6 +268,7 @@ pub trait AffineCurve: type Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// Returns a fixed generator of unknown exponent. 
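// Illustrative use of the new public API exposed by the `ec/src/lib.rs`
// changes above. This is a hedged sketch: `G1Affine` stands in for whatever
// affine curve type a downstream crate provides, and the security parameter
// and RNG are placeholders.
//
//     use ark_ec::{batch_verify_in_subgroup, VerificationError};
//
//     fn check_subgroup(points: &[G1Affine]) -> Result<(), VerificationError> {
//         // Any RNG implementing `ark_std::rand::Rng` works here.
//         let mut rng = ark_std::test_rng();
//         // 128 is the target statistical security parameter, in bits.
//         batch_verify_in_subgroup(points, 128, &mut rng)
//     }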
diff --git a/ec/src/models/mod.rs b/ec/src/models/mod.rs index 1769ae08f..ce8cfaa42 100644 --- a/ec/src/models/mod.rs +++ b/ec/src/models/mod.rs @@ -8,53 +8,15 @@ pub mod mnt6; pub mod short_weierstrass_jacobian; pub mod twisted_edwards_extended; +pub use { + short_weierstrass_jacobian::SWModelParameters, + twisted_edwards_extended::{MontgomeryModelParameters, TEModelParameters}, +}; + pub trait ModelParameters: Send + Sync + 'static { type BaseField: Field + SquareRootField; - type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; -} - -pub trait SWModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } - - #[inline(always)] - fn add_b(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy += &Self::COEFF_B; - copy - } -} - -pub trait TEModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_D: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - type MontgomeryModelParameters: MontgomeryModelParameters; - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } -} - -pub trait MontgomeryModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - - type TEModelParameters: TEModelParameters; + type ScalarField: PrimeField + + SquareRootField + + From<::BigInt> + + Into<::BigInt>; } diff --git a/ec/src/models/short_weierstrass_jacobian.rs b/ec/src/models/short_weierstrass_jacobian.rs index 1b91a1132..5f30a100b 100644 --- a/ec/src/models/short_weierstrass_jacobian.rs +++ b/ec/src/models/short_weierstrass_jacobian.rs @@ -1,3 +1,5 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use ark_serialize::{ CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, CanonicalSerializeWithFlags, SWFlags, SerializationError, @@ -7,16 +9,28 @@ use ark_std::{ io::{Read, Result as IoResult, Write}, marker::PhantomData, ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, + string::String, vec::Vec, }; +#[cfg(feature = "cuda")] +use { + crate::BatchGroupArithmeticSlice, accel::*, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; use ark_ff::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, ToConstraintField, UniformRand, }; -use crate::{models::SWModelParameters as Parameters, AffineCurve, ProjectiveCurve}; +use crate::{ + batch_arith::{decode_endo_from_u32, BatchGroupArithmetic, ENDO_CODING_BITS}, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + impl_gpu_cpu_run_kernel, impl_gpu_sw_projective, impl_run_kernel, AffineCurve, ModelParameters, + ProjectiveCurve, +}; use num_traits::{One, Zero}; use zeroize::Zeroize; @@ -29,6 +43,157 @@ use ark_std::rand::{ #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait SWModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + const COFACTOR: &'static [u64]; + const 
COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + #[inline(always)] + fn glv_window_size() -> usize { + 4 + } + + #[inline(always)] + fn add_b(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy += &Self::COEFF_B; + copy + } + + #[inline(always)] + fn has_glv() -> bool { + false + } + + #[inline(always)] + fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { + unimplemented!() + } + + #[inline(always)] + fn glv_scalar_decomposition( + _k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + unimplemented!() + } + + // CUDA kernels are parameter specific and cannot + // be instantiated generically + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +/// Implements GLV mul for a single element with a wNAF tables +#[macro_export] +macro_rules! impl_glv_mul { + ($Projective: ty, $P: ident, $w: ident, $self_proj: ident, $res: ident, $by: ident) => { + // In the future, make this a GLV parameter entry + let wnaf_recoding = + |s: &mut ::BigInt, is_neg: bool| -> Vec { + let window_size: i16 = 1 << ($w + 1); + let half_window_size: i16 = 1 << $w; + + let mut recoding = Vec::::with_capacity(s.num_bits() as usize / ($w + 1)); + + while !s.is_zero() { + let op = if s.is_odd() { + let mut z: i16 = (s.as_ref()[0] % (1 << ($w + 1))) as i16; + + if z < half_window_size { + s.sub_noborrow(&(z as u64).into()); + } else { + z = z - window_size; + s.add_nocarry(&((-z) as u64).into()); + } + if is_neg { + -z + } else { + z + } + } else { + 0 + }; + recoding.push(op); + s.div2(); + } + recoding + }; + + let ((k1_neg, mut k1), (k2_neg, mut k2)) = $P::glv_scalar_decomposition($by.into()); + let mut wnaf_table_k1 = Vec::<$Projective>::with_capacity(1 << $w); + let double = $self_proj.double(); + wnaf_table_k1.push($self_proj); + for _ in 1..(1 << ($w - 1)) { + wnaf_table_k1.push(*wnaf_table_k1.last().unwrap() + &double); + } + let mut wnaf_table_k2 = wnaf_table_k1.clone(); + wnaf_table_k2 + .iter_mut() + .for_each(|p| $P::glv_endomorphism_in_place(&mut p.x)); + + let k1_ops = wnaf_recoding(&mut k1, k1_neg); + let k2_ops = wnaf_recoding(&mut k2, k2_neg); + + if k1_ops.len() > k2_ops.len() { + for &op in k1_ops[k2_ops.len()..].iter().rev() { + $res.double_in_place(); + if op > 0 { + $res += &wnaf_table_k1[(op as usize) / 2]; + } else if op < 0 { + $res += &wnaf_table_k1[(-op as usize) / 2].neg(); + } + } + } else { + for &op in k2_ops[k1_ops.len()..].iter().rev() { + $res.double_in_place(); + if op > 0 { + $res += &wnaf_table_k2[(op as usize) / 2]; + } else if op < 0 { + $res += &wnaf_table_k2[(-op as usize) / 2].neg(); + } + } + } + for (&op1, &op2) in k1_ops.iter().zip(k2_ops.iter()).rev() { + $res.double_in_place(); + if op1 > 0 { + $res += &wnaf_table_k1[(op1 as usize) / 2]; + } else if op1 < 0 { + $res += &wnaf_table_k1[(-op1 as usize) / 2].neg(); + } + if op2 > 0 { + $res += &wnaf_table_k2[(op2 as usize) / 2]; + } else if op2 < 0 { + $res += &wnaf_table_k2[(-op2 as usize) / 2].neg(); + } + } + }; +} + +use SWModelParameters as Parameters; + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -202,9 +367,17 @@ impl 
<P: Parameters> AffineCurve for GroupAffine<P> { } #[inline] - fn mul<S: Into<<Self::ScalarField as PrimeField>::BigInt>>(&self, by: S) -> GroupProjective<P> { - let bits = BitIteratorBE::new(by.into()); - self.mul_bits(bits) + fn mul<S: Into<<Self::ScalarField as PrimeField>::BigInt>>(&self, by: S) -> Self::Projective { + if P::has_glv() { + let w = P::glv_window_size(); + let mut res = Self::Projective::zero(); + let self_proj = self.into_projective(); + impl_glv_mul!(Self::Projective, P, w, self_proj, res, by); + res + } else { + let bits = BitIteratorBE::new(by.into()); + self.mul_bits(bits) + } } #[inline] @@ -256,6 +429,607 @@ impl<P: Parameters> Default for GroupAffine<P>
{ } } +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + prefetch::(&mut $slice_1[*idp_1 as usize]); + prefetch::(&mut $slice_2[*idp_2 as usize]); + } + }; + + ($slice_1: ident, $prefetch_iter: ident) => { + if let Some((idp_1, _)) = $prefetch_iter.next() { + prefetch::(&mut $slice_1[*idp_1 as usize]); + } + }; +} + +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice_endo { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + let (idp_2, _) = decode_endo_from_u32(*idp_2); + prefetch::(&mut $slice_1[*idp_1 as usize]); + prefetch::(&$slice_2[idp_2]); + } + }; +} + +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice_write { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + prefetch::(&$slice_1[*idp_1 as usize]); + if *idp_2 != !0u32 { + prefetch::(&$slice_2[*idp_2 as usize]); + } + } + }; +} + +macro_rules! batch_add_loop_1 { + ($a: ident, $b: ident, $half: ident, $inversion_tmp: ident) => { + if $a.is_zero() || $b.is_zero() { + (); + } else if $a.x == $b.x { + $half = match $half { + None => P::BaseField::one().double().inverse(), + _ => $half, + }; + let h = $half.unwrap(); + + // Double + // In our model, we consider self additions rare. + // So we consider it inconsequential to make them more expensive + // This costs 1 modular mul more than a standard squaring, + // and one amortised inversion + if $a.y == $b.y { + let x_sq = $b.x.square(); + $b.x -= &$b.y; // x - y + $a.x = $b.y.double(); // denominator = 2y + $a.y = x_sq.double() + &x_sq + &P::COEFF_A; // numerator = 3x^2 + a + $b.y -= &(h * &$a.y); // y - (3x^2 + $a./2 + $a.y *= &$inversion_tmp; // (3x^2 + a) * tmp + $inversion_tmp *= &$a.x; // update tmp + } else { + // No inversions take place if either operand is zero + $a.infinity = true; + $b.infinity = true; + } + } else { + // We can recover x1 + x2 from this. Note this is never 0. + $a.x -= &$b.x; // denominator = x1 - x2 + $a.y -= &$b.y; // numerator = y1 - y2 + $a.y *= &$inversion_tmp; // (y1 - y2)*tmp + $inversion_tmp *= &$a.x // update tmp + } + }; +} + +macro_rules! batch_add_loop_2 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() { + *$a = $b; + } else if !$b.is_zero() { + let lambda = $a.y * &$inversion_tmp; + $inversion_tmp *= &$a.x; // Remove the top layer of the denominator + + // x3 = l^2 - x1 - x2 or for squaring: 2y + l^2 + 2x - 2y = l^2 - 2x + $a.x += &$b.x.double(); + $a.x = lambda.square() - &$a.x; + // y3 = l*(x2 - x3) - y2 or + // for squaring: (3x^2 + a)/2y(x - y - x3) - (y - (3x^2 + a)/2) = l*(x - x3) - y + $a.y = lambda * &($b.x - &$a.x) - &$b.y; + } + }; +} + +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; + /// This implementation of batch group ops takes particular + /// care to make most use of points fetched from memory to prevent + /// reallocations + + /// It is inspired by Aztec's approach: + /// https://github.com/AztecProtocol/barretenberg/blob/ + /// c358fee3259a949da830f9867df49dc18768fa26/barretenberg/ + /// src/aztec/ecc/curves/bn254/scalar_multiplication/scalar_multiplication. + /// cpp + + // We require extra scratch space, and since we want to prevent allocation/deallocation + // overhead we pass it externally for when this function is called many times + #[inline] + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + scratch_space: Option<&mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + let mut _scratch_space_inner = if scratch_space.is_none() { + Vec::with_capacity(index.len()) + } else { + vec![] + }; + let scratch_space = match scratch_space { + Some(vec) => vec, + None => &mut _scratch_space_inner, + }; + + debug_assert!(scratch_space.len() == 0); + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for idx in index.iter() { + // Prefetch next group into cache + #[cfg(feature = "prefetch")] + if let Some(idp) = prefetch_iter.next() { + prefetch::(&mut bases[*idp as usize]); + } + let mut a = &mut bases[*idx as usize]; + if !a.is_zero() { + if a.y.is_zero() { + a.infinity = true; + } else { + let x_sq = a.x.square(); + let x_sq_3 = x_sq.double() + &x_sq + &P::COEFF_A; // numerator = 3x^2 + a + scratch_space.push(x_sq_3 * &inversion_tmp); // (3x^2 + a) * tmp + inversion_tmp *= &a.y.double(); // update tmp + } + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for idx in index.iter().rev() { + #[cfg(feature = "prefetch")] + if let Some(idp) = prefetch_iter.next() { + prefetch::(&mut bases[*idp as usize]); + } + let mut a = &mut bases[*idx as usize]; + if !a.is_zero() { + let z = scratch_space.pop().unwrap(); + #[cfg(feature = "prefetch")] + if let Some(e) = scratch_space.last() { + prefetch::(e); + } + let lambda = z * &inversion_tmp; + inversion_tmp *= &a.y.double(); // Remove the top layer of the denominator + + // x3 = l^2 + 2x + let x3 = &(lambda.square() - &a.x.double()); + // y3 = l*(x - x3) - y + a.y = lambda * &(a.x - x3) - &a.y; + a.x = *x3; + } + } + + debug_assert!(scratch_space.len() == 0); + + // We reset the vector + // Clearing is really unnecessary, but we can do it anyway + scratch_space.clear(); + } + + #[inline] + fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, other, prefetch_iter); + + let (mut a, mut b) = (&mut bases[*idx as usize], &mut other[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for (idx, idy) in 
index.iter().rev() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, other, prefetch_iter); + let (mut a, b) = (&mut bases[*idx as usize], other[*idy as usize]); + batch_add_loop_2!(a, b, inversion_tmp) + } + } + + #[inline] + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, bases, prefetch_iter); + let (mut a, mut b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], &mut y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], &mut x[*idy as usize]) + }; + batch_add_loop_1!(a, b, half, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + for (idx, idy) in index.iter().rev() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, bases, prefetch_iter); + let (mut a, b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], x[*idy as usize]) + }; + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + #[inline] + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (idy, endomorphism) = decode_endo_from_u32(*idy); + #[cfg(feature = "prefetch")] + prefetch_slice_endo!(bases, other, prefetch_iter); + + let mut a = &mut bases[*idx as usize]; + + // Apply endomorphisms according to encoding + let mut b = if endomorphism % 2 == 1 { + other[idy].neg() + } else { + other[idy] + }; + + if P::has_glv() { + if endomorphism >> 1 == 1 { + P::glv_endomorphism_in_place(&mut b.x); + } + } + batch_add_loop_1!(a, b, half, inversion_tmp); + scratch_space.push(b); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for (idx, _) in index.iter().rev() { + #[cfg(feature = "prefetch")] + { + prefetch_slice!(bases, prefetch_iter); + let len = scratch_space.len(); + if len > 0 { + prefetch::(&mut scratch_space[len - 1]); + } + } + let (mut a, b) = (&mut bases[*idx as usize], scratch_space.pop().unwrap()); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + 
prefetch_slice_write!(lookup, lookup, prefetch_iter); + + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (lookup[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_add_write_read_self( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice_write!(new_elems, lookup, prefetch_iter); + + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (new_elems[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_scalar_mul_in_place( + mut bases: &mut [Self], + scalars: &mut [BigInt], + w: usize, + ) { + debug_assert!(bases.len() == scalars.len()); + if bases.len() == 0 { + return; + } + let batch_size = bases.len(); + if P::has_glv() { + use itertools::{EitherOrBoth::*, Itertools}; + let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space_group = Vec::::with_capacity(bases.len() / w); + + let k_vec: Vec<_> = scalars + .iter() + .map(|k| { + P::glv_scalar_decomposition(::BigInt::from_slice( + k.as_ref(), + )) + }) + .collect(); + + let mut k1_scalars: Vec<_> = k_vec.iter().map(|x| (x.0).1).collect(); + let k1_negates: Vec<_> = k_vec.iter().map(|x| (x.0).0).collect(); + let mut k2_scalars: Vec<_> = k_vec.iter().map(|x| (x.1).1).collect(); + let k2_negates: Vec<_> = k_vec.iter().map(|x| (x.1).0).collect(); + + let opcode_vectorised_k1 = Self::batch_wnaf_opcode_recoding( + &mut k1_scalars[..], + w, + Some(k1_negates.as_slice()), + ); + let opcode_vectorised_k2 = Self::batch_wnaf_opcode_recoding( + &mut k2_scalars[..], + w, + Some(k2_negates.as_slice()), + ); + + let tables = Self::batch_wnaf_tables(bases, w); + let tables_k2: Vec<_> = tables + .iter() + .map(|&p| { + let mut p = p; + P::glv_endomorphism_in_place(&mut p.x); + p + }) + .collect(); + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + let noop_vec = vec![None; batch_size]; + for (opcode_row_k1, opcode_row_k2) in opcode_vectorised_k1 + .iter() + .zip_longest(opcode_vectorised_k2.iter()) + .map(|x| match x { + Both(a, b) => (a, b), + Left(a) => (a, &noop_vec), + Right(b) => (&noop_vec, b), + }) + .rev() + { + let index_double: Vec<_> = opcode_row_k1 + .iter() + .zip(opcode_row_k2.iter()) + .enumerate() + .filter(|x| (x.1).0.is_some() 
|| (x.1).1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place( + &mut bases, + &index_double[..], + Some(&mut scratch_space), + ); + let index_add_k1: Vec<_> = opcode_row_k1 + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + let op2 = ((idx as usize) / 2 * batch_size + i) as u32; + (i as u32, op2 << ENDO_CODING_BITS) + } else { + let op2 = ((-idx as usize) / 2 * batch_size + i) as u32; + (i as u32, (op2 << ENDO_CODING_BITS) + 1) + } + }) + .collect(); + + Self::batch_add_in_place_read_only( + &mut bases, + &tables[..], + &index_add_k1[..], + &mut scratch_space_group, + ); + let index_add_k2: Vec<_> = opcode_row_k2 + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + let op2 = ((idx as usize) / 2 * batch_size + i) as u32; + (i as u32, op2 << ENDO_CODING_BITS) + } else { + let op2 = ((-idx as usize) / 2 * batch_size + i) as u32; + (i as u32, (op2 << ENDO_CODING_BITS) + 1) + } + }) + .collect(); + + Self::batch_add_in_place_read_only( + &mut bases, + &tables_k2[..], + &index_add_k2[..], + &mut scratch_space_group, + ); + } + } else { + let mut scratch_space = Vec::::with_capacity(bases.len()); + let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); + let tables = Self::batch_wnaf_tables(bases, w); + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + for opcode_row in opcode_vectorised.iter().rev() { + let index_double: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|x| x.1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place( + &mut bases, + &index_double[..], + Some(&mut scratch_space), + ); + + let mut add_ops: Vec = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + tables[(idx as usize) / 2 * batch_size + i].clone() + } else { + tables[(-idx as usize) / 2 * batch_size + i].clone().neg() + } + }) + .collect(); + + let index_add: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|x| x.0) + .enumerate() + .map(|(x, y)| (y as u32, x as u32)) + .collect(); + + Self::batch_add_in_place(&mut bases, &mut add_ops[..], &index_add[..]); + } + } + } +} + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -384,12 +1158,19 @@ impl Zero for GroupProjective

{ } } +impl_gpu_sw_projective!(Parameters); + impl<P: Parameters> ProjectiveCurve for GroupProjective<P> { const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; type Affine = GroupAffine<P>; + #[inline(always)] + fn get_x(&mut self) -> &mut Self::BaseField { + &mut self.x + } + #[inline] fn prime_subgroup_generator() -> Self { GroupAffine::prime_subgroup_generator().into() } @@ -560,6 +1341,26 @@ impl<P: Parameters> ProjectiveCurve for GroupProjective<P>
{ self.z -= &hh; } } + + fn mul<S: AsRef<[u64]>>(mut self, other: S) -> Self { + if P::has_glv() { + let w = P::glv_window_size(); + let mut res = Self::zero(); + let exponent_bigint = <Self::ScalarField as PrimeField>::BigInt::from_slice(other.as_ref()); + impl_glv_mul!(Self, P, w, self, res, exponent_bigint); + res + } else { + let mut res = Self::zero(); + for b in BitIteratorBE::without_leading_zeros(other.as_ref()) { + res.double_in_place(); + if b { + res += self; + } + } + self = res; + self + } + } } impl<P: Parameters> Neg for GroupProjective<P> { @@ -782,6 +1583,15 @@ impl<P: Parameters> CanonicalSerialize for GroupProjective<P> { impl<P: Parameters> CanonicalDeserialize for GroupAffine<P>
{ #[allow(unused_qualifications)] fn deserialize(reader: R) -> Result { + let p = Self::deserialize_unchecked(reader)?; + if !p.is_in_correct_subgroup_assuming_on_curve() { + return Err(SerializationError::InvalidData); + } + Ok(p) + } + + #[allow(unused_qualifications)] + fn deserialize_unchecked(reader: R) -> Result { let (x, flags): (P::BaseField, SWFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(reader)?; if flags.is_infinity() { @@ -789,19 +1599,13 @@ impl CanonicalDeserialize for GroupAffine

{ } else { let p = GroupAffine::<P>
::get_point_from_x(x, flags.is_positive().unwrap()) .ok_or(SerializationError::InvalidData)?; - if !p.is_in_correct_subgroup_assuming_on_curve() { - return Err(SerializationError::InvalidData); - } Ok(p) } } #[allow(unused_qualifications)] - fn deserialize_uncompressed( - reader: R, - ) -> Result { - let p = Self::deserialize_unchecked(reader)?; - + fn deserialize_uncompressed(reader: R) -> Result { + let p = Self::deserialize_uncompressed_unchecked(reader)?; if !p.is_in_correct_subgroup_assuming_on_curve() { return Err(SerializationError::InvalidData); } @@ -809,7 +1613,9 @@ impl CanonicalDeserialize for GroupAffine

{ } #[allow(unused_qualifications)] - fn deserialize_unchecked(mut reader: R) -> Result { + fn deserialize_uncompressed_unchecked( + mut reader: R, + ) -> Result { let x: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; let (y, flags): (P::BaseField, SWFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(&mut reader)?; diff --git a/ec/src/models/twisted_edwards_extended.rs b/ec/src/models/twisted_edwards_extended.rs index d4a5524ec..8922cc401 100644 --- a/ec/src/models/twisted_edwards_extended.rs +++ b/ec/src/models/twisted_edwards_extended.rs @@ -1,6 +1,10 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use crate::{ - models::{MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters}, - AffineCurve, ProjectiveCurve, + batch_arith::{decode_endo_from_u32, BatchGroupArithmetic}, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + impl_gpu_cpu_run_kernel, impl_gpu_te_projective, impl_run_kernel, AffineCurve, ModelParameters, + ProjectiveCurve, }; use ark_serialize::{ CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, @@ -15,20 +19,67 @@ use ark_std::{ io::{Read, Result as IoResult, Write}, marker::PhantomData, ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, + string::String, vec::Vec, }; use num_traits::{One, Zero}; +#[cfg(feature = "cuda")] +use { + crate::BatchGroupArithmeticSlice, accel::*, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; use zeroize::Zeroize; use ark_ff::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, - ToConstraintField, UniformRand, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_additive_ops_from_ref, ToConstraintField, UniformRand, }; #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait MontgomeryModelParameters: ModelParameters { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + + type TEModelParameters: TEModelParameters; +} + +pub trait TEModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_D: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + type MontgomeryModelParameters: MontgomeryModelParameters; + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +use {MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters}; + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -180,7 +231,7 @@ impl Neg for GroupAffine

{ } } -ark_ff::impl_additive_ops_from_ref!(GroupAffine, Parameters); +impl_additive_ops_from_ref!(GroupAffine, Parameters); impl<'a, P: Parameters> Add<&'a Self> for GroupAffine<P>
{ type Output = Self; @@ -291,6 +342,208 @@ mod group_impl { } } +macro_rules! batch_add_loop_1 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() || $b.is_zero() { + continue; + } else { + let y1y2 = $a.y * &$b.y; + let x1x2 = $a.x * &$b.x; + + $a.x = ($a.x + &$a.y) * &($b.x + &$b.y) - &y1y2 - &x1x2; + $a.y = y1y2; + if !P::COEFF_A.is_zero() { + $a.y -= &P::mul_by_a(&x1x2); + } + + let dx1x2y1y2 = P::COEFF_D * &y1y2 * &x1x2; + + let inversion_mul_d = $inversion_tmp * &dx1x2y1y2; + + $a.x *= &($inversion_tmp - &inversion_mul_d); + $a.y *= &($inversion_tmp + &inversion_mul_d); + + $b.x = P::BaseField::one() - &dx1x2y1y2.square(); + + $inversion_tmp *= &$b.x; + } + }; +} + +macro_rules! batch_add_loop_2 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() { + *$a = $b; + } else if !$b.is_zero() { + $a.x *= &$inversion_tmp; + $a.y *= &$inversion_tmp; + + $inversion_tmp *= &$b.x; + } + }; +} + +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; + + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + _scratch_space: Option<&mut Vec>, + ) { + Self::batch_add_in_place( + bases, + &mut bases.to_vec()[..], + &index.iter().map(|&x| (x, x)).collect::>()[..], + ); + } + + // Total cost: 12 mul. Projective formulas: 11 mul. + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (mut a, mut b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], &mut y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], &mut x[*idy as usize]) + }; + batch_add_loop_1!(a, b, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, idy) in index.iter().rev() { + let (a, b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], x[*idy as usize]) + }; + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + // Total cost: 12 mul. Projective formulas: 11 mul. + fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (mut a, mut b) = (&mut bases[*idx as usize], &mut other[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, idy) in index.iter().rev() { + let (a, b) = (&mut bases[*idx as usize], other[*idy as usize]); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + #[inline] + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (idy, endomorphism) = decode_endo_from_u32(*idy); + let mut a = &mut bases[*idx as usize]; + // Apply endomorphisms according to encoding + let mut b = if endomorphism % 2 == 1 { + other[idy].neg() + } else { + other[idy] + }; + + batch_add_loop_1!(a, b, inversion_tmp); + scratch_space.push(b); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, _) in index.iter().rev() { + let (a, b) = (&mut bases[*idx as usize], scratch_space.pop().unwrap()); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + for (idx, idy) in index.iter() { + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (lookup[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_add_write_read_self( + lookup: &[Self], + index: 
&[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + for (idx, idy) in index.iter() { + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (new_elems[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } +} + ////////////////////////////////////////////////////////////////////////////// /// `GroupProjective` implements Extended Twisted Edwards Coordinates @@ -427,12 +680,19 @@ impl Zero for GroupProjective

{ } } +impl_gpu_te_projective!(Parameters); + impl<P: Parameters> ProjectiveCurve for GroupProjective<P> { const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; type Affine = GroupAffine<P>
; + #[inline(always)] + fn get_x(&mut self) -> &mut Self::BaseField { + &mut self.x + } + fn prime_subgroup_generator() -> Self { GroupAffine::prime_subgroup_generator().into() } @@ -708,7 +968,6 @@ impl MontgomeryGroupAffine

{ } } } - impl<P: Parameters> CanonicalSerialize for GroupAffine<P> { #[allow(unused_qualifications)] #[inline] @@ -773,7 +1032,15 @@ impl<P: Parameters> CanonicalSerialize for GroupProjective<P> { impl<P: Parameters> CanonicalDeserialize for GroupAffine<P>
{ #[allow(unused_qualifications)] - fn deserialize(mut reader: R) -> Result { + fn deserialize(reader: R) -> Result { + let p = Self::deserialize_unchecked(reader)?; + if !p.is_in_correct_subgroup_assuming_on_curve() { + return Err(SerializationError::InvalidData); + } + Ok(p) + } + #[allow(unused_qualifications)] + fn deserialize_unchecked(mut reader: R) -> Result { let (x, flags): (P::BaseField, EdwardsFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(&mut reader)?; if x == P::BaseField::zero() { @@ -781,16 +1048,13 @@ impl CanonicalDeserialize for GroupAffine

{ } else { let p = GroupAffine::<P>
::get_point_from_x(x, flags.is_positive()) .ok_or(SerializationError::InvalidData)?; - if !p.is_in_correct_subgroup_assuming_on_curve() { - return Err(SerializationError::InvalidData); - } Ok(p) } } #[allow(unused_qualifications)] fn deserialize_uncompressed(reader: R) -> Result { - let p = Self::deserialize_unchecked(reader)?; + let p = Self::deserialize_uncompressed_unchecked(reader)?; if !p.is_in_correct_subgroup_assuming_on_curve() { return Err(SerializationError::InvalidData); @@ -799,7 +1063,9 @@ impl CanonicalDeserialize for GroupAffine

{ } #[allow(unused_qualifications)] - fn deserialize_unchecked(mut reader: R) -> Result { + fn deserialize_uncompressed_unchecked( + mut reader: R, + ) -> Result { let x: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; let y: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; diff --git a/ff-asm/src/lib.rs b/ff-asm/src/lib.rs index 70442ea8f..a6b75de4f 100644 --- a/ff-asm/src/lib.rs +++ b/ff-asm/src/lib.rs @@ -3,6 +3,7 @@ #![recursion_limit = "128"] use proc_macro::TokenStream; +use quote::quote; use syn::{ parse::{Parse, ParseStream}, Expr, Item, ItemFn, @@ -38,9 +39,9 @@ pub fn unroll_for_loops(_meta: TokenStream, input: TokenStream) -> TokenStream { block: Box::new(new_block), ..item_fn }); - quote::quote! ( #new_item ).into() + quote! ( #new_item ).into() } else { - quote::quote! ( #item ).into() + quote! ( #item ).into() } } diff --git a/ff/Cargo.toml b/ff/Cargo.toml index f9af60818..d9b7063ad 100644 --- a/ff/Cargo.toml +++ b/ff/Cargo.toml @@ -25,6 +25,7 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive [build-dependencies] rustc_version = "0.3" +cc = "1.0" [dev-dependencies] num-bigint = { version = "0.3.0", default-features = false } diff --git a/ff/build.rs b/ff/build.rs index c6a7dad16..fd69959af 100644 --- a/ff/build.rs +++ b/ff/build.rs @@ -15,4 +15,26 @@ fn main() { if should_use_asm { println!("cargo:rustc-cfg=use_asm"); } + + let should_use_bw6_asm = cfg!(any( + all( + feature = "bw6_asm", + target_feature = "bmi2", + target_feature = "adx", + target_arch = "x86_64" + ), + feature = "force_bw6_asm" + )); + if should_use_bw6_asm { + cc::Build::new() + .file("bw6-assembly/modmul768-sos1-adx.S") + .compile("modmul768"); + cc::Build::new() + .file("bw6-assembly/modadd768.S") + .compile("modadd768"); + cc::Build::new() + .file("bw6-assembly/modsub768.S") + .compile("modsub768"); + println!("cargo:rustc-cfg=use_bw6_asm"); + } } diff --git a/ff/bw6-assembly/modadd768.S b/ff/bw6-assembly/modadd768.S new file mode 100644 index 000000000..de291a781 --- /dev/null +++ b/ff/bw6-assembly/modadd768.S @@ -0,0 +1,181 @@ +// void modadd768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 + +# define t2 %rdi +# define t3 %rsi +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx + +# define t2 %r9 +# define t3 %r8 +#endif + +#define t0 %r11 +#define t1 %r10 +#define t4 %r15 +#define t5 %r14 + +#define t6 %r13 +#define t7 %r12 +#define t8 %rbx +#define t9 %rax +#define t10 %rbp +#define t11 x +#define t12 z + +#define y0 0*8(y) +#define y1 1*8(y) +#define y2 2*8(y) +#define y3 3*8(y) +#define y4 4*8(y) +#define y5 5*8(y) +#define y6 6*8(y) +#define y7 7*8(y) +#define y8 8*8(y) +#define y9 9*8(y) +#define y10 10*8(y) +#define y11 11*8(y) + +#define m0 0*8(m) +#define m1 1*8(m) +#define m2 2*8(m) +#define m3 3*8(m) +#define m4 4*8(m) +#define m5 5*8(m) +#define m6 6*8(m) +#define m7 7*8(m) +#define m8 8*8(m) +#define m9 9*8(m) +#define m10 10*8(m) +#define m11 11*8(m) + +// We only use these after replacing y with z + +#define z0 0*8(y) +#define z1 1*8(y) +#define z2 2*8(y) +#define z3 3*8(y) +#define z4 4*8(y) +#define z5 5*8(y) +#define z6 6*8(y) +#define z7 7*8(y) +#define z8 8*8(y) +#define z9 9*8(y) +#define z10 10*8(y) +#define z11 11*8(y) + +.text + +#ifdef __APPLE__ +#define modadd768 _modadd768 +#endif + +.globl modadd768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modadd768, @function +#endif 
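+// modadd768 computes z = (x + y) mod m over twelve 64-bit limbs: x is
+// loaded into registers, y is folded in with a single add/adc carry chain,
+// m is then subtracted with sub/sbb while the unreduced sum is parked in z,
+// and the cmovc sequence keeps the unreduced sum whenever that subtraction
+// borrowed, i.e. whenever x + y < m.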
+#endif + +.p2align 6,,15 +modadd768: + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rsp) + mov %rdi, 2*8(%rsp) +#endif + // Load x + push %r15; mov 0*8(x), t0; mov 1*8(x), t1 + push %r14; mov 2*8(x), t2; mov 3*8(x), t3 + push %r13; mov 4*8(x), t4; mov 5*8(x), t5 + push %r12; mov 6*8(x), t6; mov 7*8(x), t7 + push %rbx; mov 8*8(x), t8; mov 9*8(x), t9 + push %rbp; mov 10*8(x), t10; mov 11*8(x), t11 + push z + + xor t12, t12 + add y0, t0 + adc y1, t1 + adc y2, t2 + adc y3, t3 + adc y4, t4 + adc y5, t5 + adc y6, t6 + adc y7, t7 + adc y8, t8 + adc y9, t9 + adc y10, t10 + adc y11, t11 + adc $0, t12 + + // no more need for y. load z in its place + + pop y + + // Conditional subtraction of m + + mov t0, z0; sub m0, t0 + mov t1, z1; sbb m1, t1 + mov t2, z2; sbb m2, t2 + mov t3, z3; sbb m3, t3 + mov t4, z4; sbb m4, t4 + mov t5, z5; sbb m5, t5 + mov t6, z6; sbb m6, t6 + mov t7, z7; sbb m7, t7 + mov t8, z8; sbb m8, t8 + mov t9, z9; sbb m9, t9 + mov t10, z10; sbb m10, t10 + mov t11, z11; sbb m11, t11 + sbb $0, t12 + + cmovc z0, t0 + cmovc z1, t1 + cmovc z2, t2 + cmovc z3, t3 + cmovc z4, t4 + cmovc z5, t5 + cmovc z6, t6 + cmovc z7, t7 + cmovc z8, t8 + cmovc z9, t9 + cmovc z10, t10 + cmovc z11, t11 + + mov t0, z0 + mov t1, z1 + mov t2, z2 + mov t3, z3 + mov t4, z4 + mov t5, z5 + mov t6, z6 + mov t7, z7 + mov t8, z8 + mov t9, z9 + mov t10, z10 + mov t11, z11 + +#ifdef _WIN64 + mov 7*8(%rsp), %rsi + mov 8*8(%rsp), %rdi +#endif + // Load x + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +// vim: noet ts=8 sw=8 diff --git a/ff/bw6-assembly/modmul768-sos1-adx.S b/ff/bw6-assembly/modmul768-sos1-adx.S new file mode 100644 index 000000000..2d0e7fdd4 --- /dev/null +++ b/ff/bw6-assembly/modmul768-sos1-adx.S @@ -0,0 +1,755 @@ +// void modmul768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +// m[12] contains the least significant word of the negated inverse of the modulus mod 2^768 + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx +#endif + +#define l %rax +#define h %rbx + +#define t0 %rcx +#define t1 %rbp +#define t2 %rsi +#define t3 %rdi +#define t4 %r8 +#define t5 %r9 +#define t6 %r10 +#define t7 %r11 +#define t8 %r12 +#define t9 %r13 +#define t10 %r14 +#define t11 %r15 +#define t12 t0 +#define t13 t1 +#define t14 t2 +#define t15 t3 +#define t16 t4 +#define t17 t5 +#define t18 t6 +#define t19 t7 +#define t20 t8 +#define t21 t9 +#define t22 t10 +#define t23 t11 +#define t24 h + +#define zero 14*8(%rsp) + +#define x0 1*8(%rsp) +#define x1 2*8(%rsp) +#define x2 3*8(%rsp) +#define x3 4*8(%rsp) +#define x4 5*8(%rsp) +#define x5 6*8(%rsp) +#define x6 7*8(%rsp) +#define x7 8*8(%rsp) +#define x8 9*8(%rsp) +#define x9 10*8(%rsp) +#define x10 11*8(%rsp) +#define x11 12*8(%rsp) + +#define m0 x0 +#define m1 x1 +#define m2 x2 +#define m3 x3 +#define m4 x4 +#define m5 x5 +#define m6 x6 +#define m7 x7 +#define m8 x8 +#define m9 x9 +#define m10 x10 +#define m11 x11 +#define inv 13*8(%rsp) + +#define z0 16*8(%rsp) +#define z1 17*8(%rsp) +#define z2 18*8(%rsp) +#define z3 19*8(%rsp) +#define z4 20*8(%rsp) +#define z5 21*8(%rsp) +#define z6 22*8(%rsp) +#define z7 23*8(%rsp) +#define z8 24*8(%rsp) +#define z9 25*8(%rsp) +#define z10 26*8(%rsp) +#define z11 27*8(%rsp) +#define z12 28*8(%rsp) +#define z13 29*8(%rsp) +#define z14 30*8(%rsp) +#define z15 31*8(%rsp) +#define z16 32*8(%rsp) +#define z17 33*8(%rsp) +#define z18 34*8(%rsp) +#define z19 
35*8(%rsp) +#define z20 36*8(%rsp) +#define z21 37*8(%rsp) +#define z22 38*8(%rsp) +#define z23 39*8(%rsp) + +#define y1 z1 +#define y2 z2 +#define y3 z3 +#define y4 z4 +#define y5 z5 +#define y6 z6 +#define y7 z7 +#define y8 z8 +#define y9 z9 +#define y10 z10 +#define y11 z11 + +.text + +#ifdef __APPLE__ +#define modmul768 _modmul768 +#endif + +.globl modmul768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modmul768, @function +#endif +#endif + +.p2align 6,,63 +modmul768: + + // Allocate space on the stack: + // 1 word for padding, to make rsp offsets constant size for x and x*y + // 13 words for x or m + // 2 words for the m and z pointers + // 24 words for x*y (used initially to store y) + // 6 words for callee-saves + + mov %rsp, %rax + sub $8*(1+13+2+24+6), %rsp + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rax) + mov %rdi, 2*8(%rax) +#endif + mov %rbp, -6*8(%rax) + mov %rbx, -5*8(%rax) + mov %r12, -4*8(%rax) + mov %r13, -3*8(%rax) + mov %r14, -2*8(%rax) + mov %r15, -1*8(%rax) + + // m and z pointers + + mov m, 0*8(%rsp) + mov z, 15*8(%rsp) + + // zero + + movq $0, zero + + // Let m point to the stack space for y + + lea 16*8(%rsp), m + + // copy x to the stack + + mov 0*8(x), %rax; mov %rax, x0 + mov 1*8(x), %rax; mov %rax, x1 + mov 2*8(x), %rax; mov %rax, x2 + mov 3*8(x), %rax; mov %rax, x3 + mov 4*8(x), %rax; mov %rax, x4 + mov 5*8(x), %rax; mov %rax, x5 + mov 6*8(x), %rax; mov %rax, x6 + mov 7*8(x), %rax; mov %rax, x7 + mov 8*8(x), %rax; mov %rax, x8 + mov 9*8(x), %rax; mov %rax, x9 + mov 10*8(x), %rax; mov %rax, x10 + mov 11*8(x), %rax; mov %rax, x11 + + xor h, h // For padding + + // copy y to the stack + + mov 11*8(y), %rax; mov %rax, 11*8(m) + mov 10*8(y), %rax; mov %rax, 10*8(m) + mov 9*8(y), %rax; mov %rax, 9*8(m) + mov 8*8(y), %rax; mov %rax, 8*8(m) + mov 7*8(y), %rax; mov %rax, 7*8(m) + mov 6*8(y), %rax; mov %rax, 6*8(m) + mov 5*8(y), %rax; mov %rax, 5*8(m) + mov 4*8(y), %rax; mov %rax, 4*8(m) + mov 3*8(y), %rax; mov %rax, 3*8(m) + mov 2*8(y), %rax; mov %rax, 2*8(m) + mov 1*8(y), %rax; mov %rax, 1*8(m) + mov 0*8(y), %rdx + +// y0 + + // mov y0, %rdx + + xor h, h + + mulx x0, t0, t1 + mov t0, z0 + mulx x2, t2, t3 + mulx x1, l, h; adcx l, t1; adcx h, t2 + mulx x4, t4, t5 + mulx x3, l, h; adcx l, t3; adcx h, t4 + mulx x6, t6, t7 + mulx x5, l, h; adcx l, t5; adcx h, t6 + mulx x8, t8, t9 + mulx x7, l, h; adcx l, t7; adcx h, t8 + mulx x10, t10, t11 + mulx x9, l, h; adcx l, t9; adcx h, t10 + mulx x11, l, t12; adcx l, t11 + +// y1 + + mov y1, %rdx; adc $0, t12 + xor h, h + + mulx x1, l, h; adox l, t2; adox h, t3 + mulx x0, l, h; adcx l, t1; adcx h, t2 + mov t1, z1; mov zero, t13; + mulx x3, l, h; adox l, t4; adox h, t5 + mulx x2, l, h; adcx l, t3; adcx h, t4 + mulx x5, l, h; adox l, t6; adox h, t7 + mulx x4, l, h; adcx l, t5; adcx h, t6 + mulx x7, l, h; adox l, t8; adox h, t9 + mulx x6, l, h; adcx l, t7; adcx h, t8 + mulx x9, l, h; adox l, t10; adox h, t11 + mulx x8, l, h; adcx l, t9; adcx h, t10; nop + mulx x11, l, h; adox l, t12; adox h, t13 + mulx x10, l, h; adcx l, t11; adcx h, t12 + +// y2 + + mov y2, %rdx; adc $0, t13 + xor h, h + + mulx x1, l, h; adox l, t3; adox h, t4 + mulx x0, l, h; adcx l, t2; adcx h, t3 + mov t2, z2; mov zero, t14; + mulx x3, l, h; adox l, t5; adox h, t6 + mulx x2, l, h; adcx l, t4; adcx h, t5 + mulx x5, l, h; adox l, t7; adox h, t8 + mulx x4, l, h; adcx l, t6; adcx h, t7 + mulx x7, l, h; adox l, t9; adox h, t10 + mulx x6, l, h; adcx l, t8; adcx h, t9 + mulx x9, l, h; adox l, t11; adox h, t12 + mulx x8, l, h; adcx l, t10; adcx h, t11; nop + mulx 
x11, l, h; adox l, t13; adox h, t14 + mulx x10, l, h; adcx l, t12; adcx h, t13 + +// y3 + + mov y3, %rdx; adc $0, t14 + xor h, h + + mulx x1, l, h; adox l, t4; adox h, t5 + mulx x0, l, h; adcx l, t3; adcx h, t4 + mov t3, z3; mov zero, t15; + mulx x3, l, h; adox l, t6; adox h, t7 + mulx x2, l, h; adcx l, t5; adcx h, t6 + mulx x5, l, h; adox l, t8; adox h, t9 + mulx x4, l, h; adcx l, t7; adcx h, t8 + mulx x7, l, h; adox l, t10; adox h, t11 + mulx x6, l, h; adcx l, t9; adcx h, t10 + mulx x9, l, h; adox l, t12; adox h, t13 + mulx x8, l, h; adcx l, t11; adcx h, t12; nop + mulx x11, l, h; adox l, t14; adox h, t15 + mulx x10, l, h; adcx l, t13; adcx h, t14 + +// y4 + + mov y4, %rdx; adc $0, t15 + xor h, h + + mulx x1, l, h; adox l, t5; adox h, t6 + mulx x0, l, h; adcx l, t4; adcx h, t5 + mov t4, z4; mov zero, t16; + mulx x3, l, h; adox l, t7; adox h, t8 + mulx x2, l, h; adcx l, t6; adcx h, t7 + mulx x5, l, h; adox l, t9; adox h, t10 + mulx x4, l, h; adcx l, t8; adcx h, t9 + mulx x7, l, h; adox l, t11; adox h, t12 + mulx x6, l, h; adcx l, t10; adcx h, t11 + mulx x9, l, h; adox l, t13; adox h, t14 + mulx x8, l, h; adcx l, t12; adcx h, t13; nop + mulx x11, l, h; adox l, t15; adox h, t16 + mulx x10, l, h; adcx l, t14; adcx h, t15 + +// y5 + + mov y5, %rdx; adc $0, t16 + xor h, h + + mulx x1, l, h; adox l, t6; adox h, t7 + mulx x0, l, h; adcx l, t5; adcx h, t6 + mov t5, z5; mov zero, t17; + mulx x3, l, h; adox l, t8; adox h, t9 + mulx x2, l, h; adcx l, t7; adcx h, t8 + mulx x5, l, h; adox l, t10; adox h, t11 + mulx x4, l, h; adcx l, t9; adcx h, t10 + mulx x7, l, h; adox l, t12; adox h, t13 + mulx x6, l, h; adcx l, t11; adcx h, t12 + mulx x9, l, h; adox l, t14; adox h, t15 + mulx x8, l, h; adcx l, t13; adcx h, t14; nop + mulx x11, l, h; adox l, t16; adox h, t17 + mulx x10, l, h; adcx l, t15; adcx h, t16 + +// y6 + + mov y6, %rdx; adc $0, t17 + xor h, h + + mulx x1, l, h; adox l, t7; adox h, t8 + mulx x0, l, h; adcx l, t6; adcx h, t7 + mov t6, z6; mov zero, t18; + mulx x3, l, h; adox l, t9; adox h, t10 + mulx x2, l, h; adcx l, t8; adcx h, t9 + mulx x5, l, h; adox l, t11; adox h, t12 + mulx x4, l, h; adcx l, t10; adcx h, t11 + mulx x7, l, h; adox l, t13; adox h, t14 + mulx x6, l, h; adcx l, t12; adcx h, t13 + mulx x9, l, h; adox l, t15; adox h, t16 + mulx x8, l, h; adcx l, t14; adcx h, t15; nop + mulx x11, l, h; adox l, t17; adox h, t18 + mulx x10, l, h; adcx l, t16; adcx h, t17 + +// y7 + + mov y7, %rdx; adc $0, t18 + xor h, h + + mulx x1, l, h; adox l, t8; adox h, t9 + mulx x0, l, h; adcx l, t7; adcx h, t8 + mov t7, z7; mov zero, t19; + mulx x3, l, h; adox l, t10; adox h, t11 + mulx x2, l, h; adcx l, t9; adcx h, t10 + mulx x5, l, h; adox l, t12; adox h, t13 + mulx x4, l, h; adcx l, t11; adcx h, t12 + mulx x7, l, h; adox l, t14; adox h, t15 + mulx x6, l, h; adcx l, t13; adcx h, t14 + mulx x9, l, h; adox l, t16; adox h, t17 + mulx x8, l, h; adcx l, t15; adcx h, t16; nop + mulx x11, l, h; adox l, t18; adox h, t19 + mulx x10, l, h; adcx l, t17; adcx h, t18 + +// y8 + + mov y8, %rdx; adc $0, t19 + xor h, h + + mulx x1, l, h; adox l, t9; adox h, t10 + mulx x0, l, h; adcx l, t8; adcx h, t9 + mov t8, z8; mov zero, t20; + mulx x3, l, h; adox l, t11; adox h, t12 + mulx x2, l, h; adcx l, t10; adcx h, t11 + mulx x5, l, h; adox l, t13; adox h, t14 + mulx x4, l, h; adcx l, t12; adcx h, t13 + mulx x7, l, h; adox l, t15; adox h, t16 + mulx x6, l, h; adcx l, t14; adcx h, t15 + mulx x9, l, h; adox l, t17; adox h, t18 + mulx x8, l, h; adcx l, t16; adcx h, t17; nop + mulx x11, l, h; adox l, t19; adox h, t20 + mulx x10, l, 
h; adcx l, t18; adcx h, t19 + +// y9 + + mov y9, %rdx; adc $0, t20 + xor h, h + + mulx x1, l, h; adox l, t10; adox h, t11 + mulx x0, l, h; adcx l, t9; adcx h, t10 + mov t9, z9; mov zero, t21; + mulx x3, l, h; adox l, t12; adox h, t13 + mulx x2, l, h; adcx l, t11; adcx h, t12 + mulx x5, l, h; adox l, t14; adox h, t15 + mulx x4, l, h; adcx l, t13; adcx h, t14 + mulx x7, l, h; adox l, t16; adox h, t17 + mulx x6, l, h; adcx l, t15; adcx h, t16 + mulx x9, l, h; adox l, t18; adox h, t19 + mulx x8, l, h; adcx l, t17; adcx h, t18; nop + mulx x11, l, h; adox l, t20; adox h, t21 + mulx x10, l, h; adcx l, t19; adcx h, t20 + +// y10 + + mov y10, %rdx; adc $0, t21 + xor h, h + + mulx x1, l, h; adox l, t11; adox h, t12 + mulx x0, l, h; adcx l, t10; adcx h, t11 + mov t10, z10; mov zero, t22; + mulx x3, l, h; adox l, t13; adox h, t14 + mulx x2, l, h; adcx l, t12; adcx h, t13 + mulx x5, l, h; adox l, t15; adox h, t16 + mulx x4, l, h; adcx l, t14; adcx h, t15 + mulx x7, l, h; adox l, t17; adox h, t18 + mulx x6, l, h; adcx l, t16; adcx h, t17 + mulx x9, l, h; adox l, t19; adox h, t20 + mulx x8, l, h; adcx l, t18; adcx h, t19; nop + mulx x11, l, h; adox l, t21; adox h, t22 + mulx x10, l, h; adcx l, t20; adcx h, t21 + +// y11 + + mov y11, %rdx; adc $0, t22 + xor h, h + + mulx x1, l, h; adox l, t12; adox h, t13 + mulx x0, l, h; adcx l, t11; adcx h, t12 + mov t11, z11; mov zero, t23; + mulx x3, l, h; adox l, t14; adox h, t15 + mulx x2, l, h; adcx l, t13; adcx h, t14 + mulx x5, l, h; adox l, t16; adox h, t17 + mulx x4, l, h; adcx l, t15; adcx h, t16 + mulx x7, l, h; adox l, t18; adox h, t19 + mulx x6, l, h; adcx l, t17; adcx h, t18 + mulx x9, l, h; adox l, t20; adox h, t21 + mulx x8, l, h; adcx l, t19; adcx h, t20; nop + mulx x11, l, h; adox l, t22; adox h, t23 + mulx x10, l, h; adcx l, t21; adcx h, t22 + + // Copy m to the stack, overwriting x + + mov 0*8(%rsp), h; adc $0, t23 + mov 12*8(h), %rdx; mov %rdx, inv + mov 11*8(h), %rax; mov %rax, m11 + mov 10*8(h), %rax; mov %rax, m10 + mov 9*8(h), %rax; mov %rax, m9 + mov 8*8(h), %rax; mov %rax, m8 + mov 7*8(h), %rax; mov %rax, m7 + mov 6*8(h), %rax; mov %rax, m6 + mov 5*8(h), %rax; mov %rax, m5 + mov 4*8(h), %rax; mov %rax, m4 + mov 3*8(h), %rax; mov %rax, m3 + mov 2*8(h), %rax; mov %rax, m2 + mov 1*8(h), %rax; mov %rax, m1 + mov 0*8(h), %rax; mov %rax, m0 + + // Write out the top half of x*y to the stack, load the low half back in + + mov t12, z12; mov z0, t0 + mov t13, z13; mov z1, t1 + mov t14, z14; mov z2, t2 + mov t15, z15; mov z3, t3 + mov t16, z16; mov z4, t4 + mov t17, z17; mov z5, t5 + mov t18, z18; mov z6, t6 + mov t19, z19; mov z7, t7 + mov t20, z20; mov z8, t8 + mov t21, z21; mov z9, t9 + mov t22, z22; mov z10, t10 + mov t23, z23; mov z11, t11 + +//////////////////////////////////////////////////////////////// +// Reduction +//////////////////////////////////////////////////////////////// + +// z0 + + //mov inv, %rdx + mulx t0, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t1; adox h, t2 + mulx m0, l, h; adcx l, t0; adcx h, t1 + mulx m3, l, h; adox l, t3; adox h, t4 + mulx m2, l, h; adcx l, t2; adcx h, t3 + mulx m5, l, h; adox l, t5; adox h, t6 + mulx m4, l, h; adcx l, t4; adcx h, t5 + mulx m7, l, h; adox l, t7; adox h, t8 + mulx m6, l, h; adcx l, t6; adcx h, t7 + mulx m9, l, h; adox l, t9; adox h, t10 + mulx m8, l, h; adcx l, t8; adcx h, t9 + mulx m11, l, h; adox l, t11; adox h, t12 + mulx m10, l, h + +// z1 + + mov inv, %rdx; adcx l, t10; adcx h, t11; adc $0, t12 + mulx t1, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t2; adox h, t3 + mulx m0, l, h; 
adcx l, t1; adcx h, t2 + mulx m3, l, h; adox l, t4; adox h, t5 + mulx m2, l, h; adcx l, t3; adcx h, t4 + mulx m5, l, h; adox l, t6; adox h, t7 + mulx m4, l, h; adcx l, t5; adcx h, t6 + mulx m7, l, h; adox l, t8; adox h, t9 + mulx m6, l, h; adcx l, t7; adcx h, t8 + mulx m9, l, h; adox l, t10; adox h, t11 + mulx m8, l, h; adcx l, t9; adcx h, t10 + mulx m11, l, h; adox l, t12; adox h, t13 + mulx m10, l, h + +// z2 + + mov inv, %rdx; adcx l, t11; adcx h, t12; adc $0, t13 + mulx t2, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t3; adox h, t4 + mulx m0, l, h; adcx l, t2; adcx h, t3 + mulx m3, l, h; adox l, t5; adox h, t6 + mulx m2, l, h; adcx l, t4; adcx h, t5 + mulx m5, l, h; adox l, t7; adox h, t8 + mulx m4, l, h; adcx l, t6; adcx h, t7 + mulx m7, l, h; adox l, t9; adox h, t10 + mulx m6, l, h; adcx l, t8; adcx h, t9 + mulx m9, l, h; adox l, t11; adox h, t12 + mulx m8, l, h; adcx l, t10; adcx h, t11 + mulx m11, l, h; adox l, t13; adox h, t14 + mulx m10, l, h + +// z3 + + mov inv, %rdx; adcx l, t12; adcx h, t13; adc $0, t14 + mulx t3, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t4; adox h, t5 + mulx m0, l, h; adcx l, t3; adcx h, t4 + mulx m3, l, h; adox l, t6; adox h, t7 + mulx m2, l, h; adcx l, t5; adcx h, t6 + mulx m5, l, h; adox l, t8; adox h, t9 + mulx m4, l, h; adcx l, t7; adcx h, t8 + mulx m7, l, h; adox l, t10; adox h, t11 + mulx m6, l, h; adcx l, t9; adcx h, t10 + mulx m9, l, h; adox l, t12; adox h, t13 + mulx m8, l, h; adcx l, t11; adcx h, t12 + mulx m11, l, h; adox l, t14; adox h, t15 + mulx m10, l, h + +// z4 + + mov inv, %rdx; adcx l, t13; adcx h, t14; adc $0, t15 + mulx t4, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t5; adox h, t6 + mulx m0, l, h; adcx l, t4; adcx h, t5 + mulx m3, l, h; adox l, t7; adox h, t8 + mulx m2, l, h; adcx l, t6; adcx h, t7 + mulx m5, l, h; adox l, t9; adox h, t10 + mulx m4, l, h; adcx l, t8; adcx h, t9 + mulx m7, l, h; adox l, t11; adox h, t12 + mulx m6, l, h; adcx l, t10; adcx h, t11 + mulx m9, l, h; adox l, t13; adox h, t14 + mulx m8, l, h; adcx l, t12; adcx h, t13 + mulx m11, l, h; adox l, t15; adox h, t16 + mulx m10, l, h + +// z5 + + mov inv, %rdx; adcx l, t14; adcx h, t15; adc $0, t16 + mulx t5, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t6; adox h, t7 + mulx m0, l, h; adcx l, t5; adcx h, t6 + mulx m3, l, h; adox l, t8; adox h, t9 + mulx m2, l, h; adcx l, t7; adcx h, t8 + mulx m5, l, h; adox l, t10; adox h, t11 + mulx m4, l, h; adcx l, t9; adcx h, t10 + mulx m7, l, h; adox l, t12; adox h, t13 + mulx m6, l, h; adcx l, t11; adcx h, t12 + mulx m9, l, h; adox l, t14; adox h, t15 + mulx m8, l, h; adcx l, t13; adcx h, t14 + mulx m11, l, h; adox l, t16; adox h, t17 + mulx m10, l, h + +// z6 + + mov inv, %rdx; adcx l, t15; adcx h, t16; adc $0, t17 + mulx t6, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t7; adox h, t8 + mulx m0, l, h; adcx l, t6; adcx h, t7 + mulx m3, l, h; adox l, t9; adox h, t10 + mulx m2, l, h; adcx l, t8; adcx h, t9 + mulx m5, l, h; adox l, t11; adox h, t12 + mulx m4, l, h; adcx l, t10; adcx h, t11 + mulx m7, l, h; adox l, t13; adox h, t14 + mulx m6, l, h; adcx l, t12; adcx h, t13 + mulx m9, l, h; adox l, t15; adox h, t16 + mulx m8, l, h; adcx l, t14; adcx h, t15 + mulx m11, l, h; adox l, t17; adox h, t18 + mulx m10, l, h + +// z7 + + mov inv, %rdx; adcx l, t16; adcx h, t17; adc $0, t18 + mulx t7, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t8; adox h, t9 + mulx m0, l, h; adcx l, t7; adcx h, t8 + mulx m3, l, h; adox l, t10; adox h, t11 + mulx m2, l, h; adcx l, t9; adcx h, t10 + mulx m5, l, h; adox l, t12; adox h, t13 + mulx m4, 
l, h; adcx l, t11; adcx h, t12 + mulx m7, l, h; adox l, t14; adox h, t15 + mulx m6, l, h; adcx l, t13; adcx h, t14 + mulx m9, l, h; adox l, t16; adox h, t17 + mulx m8, l, h; adcx l, t15; adcx h, t16 + mulx m11, l, h; adox l, t18; adox h, t19 + mulx m10, l, h + +// z8 + + mov inv, %rdx; adcx l, t17; adcx h, t18; adc $0, t19 + mulx t8, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t9; adox h, t10 + mulx m0, l, h; adcx l, t8; adcx h, t9 + mulx m3, l, h; adox l, t11; adox h, t12 + mulx m2, l, h; adcx l, t10; adcx h, t11 + mulx m5, l, h; adox l, t13; adox h, t14 + mulx m4, l, h; adcx l, t12; adcx h, t13 + mulx m7, l, h; adox l, t15; adox h, t16 + mulx m6, l, h; adcx l, t14; adcx h, t15 + mulx m9, l, h; adox l, t17; adox h, t18 + mulx m8, l, h; adcx l, t16; adcx h, t17 + mulx m11, l, h; adox l, t19; adox h, t20 + mulx m10, l, h + +// z9 + + mov inv, %rdx; adcx l, t18; adcx h, t19; adc $0, t20 + mulx t9, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t10; adox h, t11 + mulx m0, l, h; adcx l, t9; adcx h, t10 + mulx m3, l, h; adox l, t12; adox h, t13 + mulx m2, l, h; adcx l, t11; adcx h, t12 + mulx m5, l, h; adox l, t14; adox h, t15 + mulx m4, l, h; adcx l, t13; adcx h, t14 + mulx m7, l, h; adox l, t16; adox h, t17 + mulx m6, l, h; adcx l, t15; adcx h, t16 + mulx m9, l, h; adox l, t18; adox h, t19 + mulx m8, l, h; adcx l, t17; adcx h, t18 + mulx m11, l, h; adox l, t20; adox h, t21 + mulx m10, l, h + +// z10 + + mov inv, %rdx; adcx l, t19; adcx h, t20; adc $0, t21 + mulx t10, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t11; adox h, t12 + mulx m0, l, h; adcx l, t10; adcx h, t11 + mulx m3, l, h; adox l, t13; adox h, t14 + mulx m2, l, h; adcx l, t12; adcx h, t13 + mulx m5, l, h; adox l, t15; adox h, t16 + mulx m4, l, h; adcx l, t14; adcx h, t15 + mulx m7, l, h; adox l, t17; adox h, t18 + mulx m6, l, h; adcx l, t16; adcx h, t17 + mulx m9, l, h; adox l, t19; adox h, t20 + mulx m8, l, h; adcx l, t18; adcx h, t19 + mulx m11, l, h; adox l, t21; adox h, t22 + mulx m10, l, h + +// z11 + + mov inv, %rdx; adcx l, t20; adcx h, t21; adc $0, t22 + mulx t11, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t12; adox h, t13 + mulx m0, l, h; adcx l, t11; adcx h, t12 + mulx m3, l, h; adox l, t14; adox h, t15 + mulx m2, l, h; adcx l, t13; adcx h, t14 + mulx m5, l, h; adox l, t16; adox h, t17 + mulx m4, l, h; adcx l, t15; adcx h, t16 + mulx m7, l, h; adox l, t18; adox h, t19 + mulx m6, l, h; adcx l, t17; adcx h, t18 + mulx m9, l, h; adox l, t20; adox h, t21 + mulx m8, l, h; adcx l, t19; adcx h, t20 + mulx m11, l, h; adox l, t22; adox h, t23 + mulx m10, l, h; adcx l, t21; adcx h, t22; adc $0, t23 + + xor t24, t24; lea 8*(1+13+2+24+6)(%rsp), %rax + + add z12, t12 + adc z13, t13 + adc z14, t14 + adc z15, t15 + adc z16, t16 + adc z17, t17 + adc z18, t18 + adc z19, t19 + adc z20, t20 + adc z21, t21 + adc z22, t22 + adc z23, t23 + adc $0, t24; mov 15*8(%rsp), %rdx + + // Conditional subtraction of m + + mov t12, z0; sub m0, t12 + mov t13, z1; sbb m1, t13 + mov t14, z2; sbb m2, t14 + mov t15, z3; sbb m3, t15 + mov t16, z4; sbb m4, t16 + mov t17, z5; sbb m5, t17 + mov t18, z6; sbb m6, t18 + mov t19, z7; sbb m7, t19 + mov t20, z8; sbb m8, t20 + mov t21, z9; sbb m9, t21 + mov t22, z10; sbb m10, t22 + mov t23, z11; sbb m11, t23 + sbb $0, t24 + + cmovc z0, t12 + cmovc z1, t13 + cmovc z2, t14 + cmovc z3, t15 + cmovc z4, t16 + cmovc z5, t17 + cmovc z6, t18 + cmovc z7, t19 + cmovc z8, t20 + cmovc z9, t21 + cmovc z10, t22 + cmovc z11, t23 + + mov t12, 0*8(%rdx) + mov t13, 1*8(%rdx) + mov t14, 2*8(%rdx) + mov t15, 3*8(%rdx) + mov t16, 
4*8(%rdx) + mov t17, 5*8(%rdx) + mov t18, 6*8(%rdx); mov -6*8(%rax), %rbp + mov t19, 7*8(%rdx); mov -5*8(%rax), %rbx + mov t20, 8*8(%rdx); mov -4*8(%rax), %r12 + mov t21, 9*8(%rdx); mov -3*8(%rax), %r13 + mov t22, 10*8(%rdx); mov -2*8(%rax), %r14 + mov t23, 11*8(%rdx); mov -1*8(%rax), %r15 + + add $8*(1+13+2+24+6), %rsp + +#ifdef _WIN64 + mov 1*8(%rax), %rsi + mov 2*8(%rax), %rdi +#endif + ret + +// vim: noet ts=8 sw=8 \ No newline at end of file diff --git a/ff/bw6-assembly/modsub768.S b/ff/bw6-assembly/modsub768.S new file mode 100644 index 000000000..ccc8e7368 --- /dev/null +++ b/ff/bw6-assembly/modsub768.S @@ -0,0 +1,182 @@ +// void modsub768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 + +# define t2 %rdi +# define t3 %rsi +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx + +# define t2 %r9 +# define t3 %r8 +#endif + +#define t0 %r11 +#define t1 %r10 +#define t4 %r15 +#define t5 %r14 + +#define t6 %r13 +#define t7 %r12 +#define t8 %rbx +#define t9 %rax +#define t10 %rbp +#define t11 x +#define t12 z + +#define y0 0*8(y) +#define y1 1*8(y) +#define y2 2*8(y) +#define y3 3*8(y) +#define y4 4*8(y) +#define y5 5*8(y) +#define y6 6*8(y) +#define y7 7*8(y) +#define y8 8*8(y) +#define y9 9*8(y) +#define y10 10*8(y) +#define y11 11*8(y) + +#define m0 0*8(m) +#define m1 1*8(m) +#define m2 2*8(m) +#define m3 3*8(m) +#define m4 4*8(m) +#define m5 5*8(m) +#define m6 6*8(m) +#define m7 7*8(m) +#define m8 8*8(m) +#define m9 9*8(m) +#define m10 10*8(m) +#define m11 11*8(m) + +// We only use these after replacing y with z + +#define z0 0*8(y) +#define z1 1*8(y) +#define z2 2*8(y) +#define z3 3*8(y) +#define z4 4*8(y) +#define z5 5*8(y) +#define z6 6*8(y) +#define z7 7*8(y) +#define z8 8*8(y) +#define z9 9*8(y) +#define z10 10*8(y) +#define z11 11*8(y) + +.text + +#ifdef __APPLE__ +#define modsub768 _modsub768 +#endif + +.globl modsub768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modsub768, @function +#endif +#endif + +.p2align 6,,15 +modsub768: + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rsp) + mov %rdi, 2*8(%rsp) +#endif + // Load x + push %r15; mov 0*8(x), t0; mov 1*8(x), t1 + push %r14; mov 2*8(x), t2; mov 3*8(x), t3 + push %r13; mov 4*8(x), t4; mov 5*8(x), t5 + push %r12; mov 6*8(x), t6; mov 7*8(x), t7 + push %rbx; mov 8*8(x), t8; mov 9*8(x), t9 + push %rbp; mov 10*8(x), t10; mov 11*8(x), t11 + push z + + xor t12, t12 + sub y0, t0 + sbb y1, t1 + sbb y2, t2 + sbb y3, t3 + sbb y4, t4 + sbb y5, t5 + sbb y6, t6 + sbb y7, t7 + sbb y8, t8 + sbb y9, t9 + sbb y10, t10 + sbb y11, t11 + sbb $0, t12 // -1 if y>x + + // no more need for y. 
load z in its place + + pop y + + // Conditional addition of m + + mov t0, z0; add m0, t0 + mov t1, z1; adc m1, t1 + mov t2, z2; adc m2, t2 + mov t3, z3; adc m3, t3 + mov t4, z4; adc m4, t4 + mov t5, z5; adc m5, t5 + mov t6, z6; adc m6, t6 + mov t7, z7; adc m7, t7 + mov t8, z8; adc m8, t8 + mov t9, z9; adc m9, t9 + mov t10, z10; adc m10, t10 + mov t11, z11; adc m11, t11 + + add $1, t12 // sets carry if adding m is needed + + cmovnc z0, t0 + cmovnc z1, t1 + cmovnc z2, t2 + cmovnc z3, t3 + cmovnc z4, t4 + cmovnc z5, t5 + cmovnc z6, t6 + cmovnc z7, t7 + cmovnc z8, t8 + cmovnc z9, t9 + cmovnc z10, t10 + cmovnc z11, t11 + + mov t0, z0 + mov t1, z1 + mov t2, z2 + mov t3, z3 + mov t4, z4 + mov t5, z5 + mov t6, z6 + mov t7, z7 + mov t8, z8 + mov t9, z9 + mov t10, z10 + mov t11, z11 + +#ifdef _WIN64 + mov 7*8(%rsp), %rsi + mov 8*8(%rsp), %rdi +#endif + // Load x + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +// vim: noet ts=8 sw=8 diff --git a/ff/src/biginteger/macros.rs b/ff/src/biginteger/macros.rs index d1c370016..b71d2bfbc 100644 --- a/ff/src/biginteger/macros.rs +++ b/ff/src/biginteger/macros.rs @@ -277,6 +277,48 @@ macro_rules! bigint_impl { res } + + #[inline] + fn mul_no_reduce(this: &[u64], other: &[u64]) -> Self { + assert!(this.len() == $num_limbs / 2); + assert!(other.len() == $num_limbs / 2); + + let mut r = [0u64; $num_limbs]; + for i in 0..$num_limbs / 2 { + let mut carry = 0u64; + for j in 0..$num_limbs / 2 { + r[j + i] = + mac_with_carry!(r[j + i], this[i], other[j], &mut carry); + } + r[$num_limbs / 2 + i] = carry; + } + Self::new(r) + } + + #[inline] + fn mul_no_reduce_lo(this: &[u64], other: &[u64]) -> Self { + assert!(this.len() == $num_limbs); + assert!(other.len() == $num_limbs); + + let mut r = [0u64; $num_limbs]; + for i in 0..$num_limbs { + let mut carry = 0u64; + for j in 0..($num_limbs - i) { + r[j + i] = + mac_with_carry!(r[j + i], this[i], other[j], &mut carry); + } + } + Self::new(r) + } + + #[inline] + fn from_slice(slice: &[u64]) -> Self { + let mut repr = Self::default(); + for (limb, &value) in repr.0.iter_mut().zip(slice) { + *limb = value; + } + repr + } } impl CanonicalSerialize for $name { diff --git a/ff/src/biginteger/mod.rs b/ff/src/biginteger/mod.rs index 3f342e357..3aa2266cb 100644 --- a/ff/src/biginteger/mod.rs +++ b/ff/src/biginteger/mod.rs @@ -25,6 +25,7 @@ bigint_impl!(BigInteger128, 2); bigint_impl!(BigInteger256, 4); bigint_impl!(BigInteger320, 5); bigint_impl!(BigInteger384, 6); +bigint_impl!(BigInteger512, 6); bigint_impl!(BigInteger768, 12); bigint_impl!(BigInteger832, 13); @@ -137,4 +138,18 @@ pub trait BigInteger: *self = Self::read(reader)?; Ok(()) } + + /// Takes two slices of u64 representing big integers and returns a bigger + /// BigInteger of type Self representing their product. Preferably used + /// only for even NUM_LIMBS. We require the invariant that this.len() == + /// other.len() == NUM_LIMBS / 2 + fn mul_no_reduce(this: &[u64], other: &[u64]) -> Self; + + /// Similar to `mul_no_reduce` but accepts slices of len == NUM_LIMBS and + /// only returns lower half of the result + fn mul_no_reduce_lo(this: &[u64], other: &[u64]) -> Self; + + /// Copies data from a slice to Self in a len agnostic way, + // based on whichever of the two is shorter. 
+ fn from_slice(slice: &[u64]) -> Self; } diff --git a/ff/src/fields/arithmetic.rs b/ff/src/fields/arithmetic.rs index 82fdaef61..85e7977a6 100644 --- a/ff/src/fields/arithmetic.rs +++ b/ff/src/fields/arithmetic.rs @@ -1,14 +1,53 @@ +/// All of these methods store intermediate results on the stack, and so +/// they support overlap of input and output parameters. +#[cfg(use_bw6_asm)] +extern "C" { + pub fn modmul768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); + pub fn modadd768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); + pub fn modsub768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); +} /// This modular multiplication algorithm uses Montgomery /// reduction for efficient implementation. It also additionally /// uses the "no-carry optimization" outlined /// [here](https://hackmd.io/@zkteam/modular_multiplication) if -/// `P::MODULUS` has (a) a non-zero MSB, and (b) at least one +/// `P::MODULUS` has BOTH (a) a zero MSB, AND (b) at least one /// zero bit in the rest of the modulus. + macro_rules! impl_field_mul_assign { ($limbs:expr) => { #[inline] #[ark_ff_asm::unroll_for_loops] fn mul_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modmul768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } // Checking the modulus at compile time let first_bit_set = P::MODULUS.0[$limbs - 1] >> 63 != 0; // $limbs can be 1, hence we can run into a case with an unused mut. @@ -17,6 +56,7 @@ macro_rules! impl_field_mul_assign { for i in 1..$limbs { all_bits_set &= P::MODULUS.0[$limbs - i - 1] == !0u64; } + let _no_carry: bool = !(first_bit_set || all_bits_set); // No-carry optimisation applied to CIOS @@ -56,6 +96,93 @@ macro_rules! impl_field_mul_assign { }; } +macro_rules! impl_field_add_assign { + ($limbs:expr) => { + #[inline] + #[ark_ff_asm::unroll_for_loops] + fn add_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modadd768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } + // This cannot exceed the backing capacity. + self.0.add_nocarry(&other.0); + // However, it may need to be reduced + self.reduce(); + } + }; +} + +macro_rules! 
impl_field_sub_assign { + ($limbs:expr) => { + #[inline] + #[ark_ff_asm::unroll_for_loops] + fn sub_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modsub768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } + // If `other` is larger than `self`, add the modulus to self first. + if other.0 > self.0 { + self.0.add_nocarry(&P::MODULUS); + } + self.0.sub_noborrow(&other.0); + } + }; +} + macro_rules! impl_field_into_repr { ($limbs:expr, $BigIntegerType:ty) => { #[inline] @@ -91,6 +218,37 @@ macro_rules! impl_field_square_in_place { *self = *self * *self; return self; } + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modmul768( + ((self.0).0).as_ptr(), + ((self.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return self; + } + } + } + #[cfg(use_asm)] #[allow(unsafe_code, unused_mut)] { diff --git a/ff/src/fields/macros.rs b/ff/src/fields/macros.rs index 8a33dc11f..9965ade4f 100644 --- a/ff/src/fields/macros.rs +++ b/ff/src/fields/macros.rs @@ -658,25 +658,12 @@ macro_rules! impl_Fp { impl_multiplicative_ops_from_ref!($Fp, $FpParameters); impl<'a, P: $FpParameters> AddAssign<&'a Self> for $Fp
<P>
{ - #[inline] - fn add_assign(&mut self, other: &Self) { - // This cannot exceed the backing capacity. - self.0.add_nocarry(&other.0); - // However, it may need to be reduced - self.reduce(); - } + impl_field_add_assign!($limbs); } impl<'a, P: $FpParameters> SubAssign<&'a Self> for $Fp
<P>
{ - #[inline] - fn sub_assign(&mut self, other: &Self) { - // If `other` is larger than `self`, add the modulus to self first. - if other.0 > self.0 { - self.0.add_nocarry(&P::MODULUS); - } - self.0.sub_noborrow(&other.0); - } - } + impl_field_sub_assign!($limbs); + } impl<'a, P: $FpParameters> MulAssign<&'a Self> for $Fp
<P>
{ impl_field_mul_assign!($limbs); diff --git a/ff/src/fields/mod.rs b/ff/src/fields/mod.rs index 678962868..c66fc112d 100644 --- a/ff/src/fields/mod.rs +++ b/ff/src/fields/mod.rs @@ -67,6 +67,31 @@ macro_rules! field_new { }; } +#[macro_export] +macro_rules! field_new_from_raw_repr { + ($name:ident, $c0:expr) => { + $name { + 0: $c0, + 1: core::marker::PhantomData, + } + }; + ($name:ident, $c0:expr, $c1:expr $(,)?) => { + $name { + c0: $c0, + c1: $c1, + _parameters: core::marker::PhantomData, + } + }; + ($name:ident, $c0:expr, $c1:expr, $c2:expr $(,)?) => { + $name { + c0: $c0, + c1: $c1, + c2: $c2, + _parameters: core::marker::PhantomData, + } + }; +} + /// The interface for a generic field. pub trait Field: ToBytes @@ -418,6 +443,11 @@ pub trait PrimeField: Self::Params::T_MINUS_ONE_DIV_TWO } + /// Returns the trace minus one divided by two. + fn modulus() -> Self::BigInt { + Self::Params::MODULUS + } + /// Returns the modulus minus one divided by two. fn modulus_minus_one_div_two() -> Self::BigInt { Self::Params::MODULUS_MINUS_ONE_DIV_TWO diff --git a/scripts/glv-lattice-basis/Cargo.toml b/scripts/glv-lattice-basis/Cargo.toml new file mode 100644 index 000000000..c9a045bda --- /dev/null +++ b/scripts/glv-lattice-basis/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "glv_lattice_basis" +version = "0.1.0" +authors = ["Jonathan Chuang"] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ark-bls12-381 = { git = "https://github.com/arkworks-rs/curves", features = ["curve"], default-features = false } +ark-ff = { path = "../../ff", default-features = false } +ark-ec = { path = "../../ec", default-features = false } +num-traits = { version = "0.2", default-features = false } +num-bigint = "0.4.0" + +[features] +default = [ "std" ] +std = [] diff --git a/scripts/glv-lattice-basis/LICENSE-APACHE b/scripts/glv-lattice-basis/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/scripts/glv-lattice-basis/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/scripts/glv-lattice-basis/LICENSE-MIT b/scripts/glv-lattice-basis/LICENSE-MIT new file mode 100644 index 000000000..72dc60d84 --- /dev/null +++ b/scripts/glv-lattice-basis/LICENSE-MIT @@ -0,0 +1,19 @@ +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/scripts/glv-lattice-basis/examples-rename/main.rs b/scripts/glv-lattice-basis/examples-rename/main.rs new file mode 100644 index 000000000..30fc643f4 --- /dev/null +++ b/scripts/glv-lattice-basis/examples-rename/main.rs @@ -0,0 +1,12 @@ +extern crate ark_bls12_381; + +use ark_bls12_381::G1Projective as GroupProjective; +use ark_ff::{ + BigInteger384 as BaseFieldBigInt, + BigInteger512 as FrWideBigInt, +}; +use glv_lattice_basis::*; + +fn main() { + print_glv_params::(); +} diff --git a/scripts/glv-lattice-basis/src/arithmetic.rs b/scripts/glv-lattice-basis/src/arithmetic.rs new file mode 100644 index 000000000..bf6cb9747 --- /dev/null +++ b/scripts/glv-lattice-basis/src/arithmetic.rs @@ -0,0 +1,34 @@ +use ark_ff::biginteger::BigInteger; + +// Naive long division +pub fn div_with_remainder( + numerator: BigInt, + divisor: BigInt, +) -> (BigInt, BigInt) { + assert!(divisor != BigInt::from(0), "Divisor cannot be zero"); + let mut remainder = numerator; + let mut quotient = BigInt::from(0); + + let div_num_bits = divisor.num_bits(); + + while remainder >= divisor { + let mut current_divisor = divisor; + let mut num_bits = 1 + remainder.num_bits() - div_num_bits; + current_divisor.muln(num_bits); + while current_divisor > remainder { + current_divisor.div2(); + num_bits -= 1; + } + remainder.sub_noborrow(¤t_divisor); + + let mut pow2_quot = BigInt::from(1); + pow2_quot.muln(num_bits); + quotient.add_nocarry(&pow2_quot); + } + + let mut reconstructed_numerator = + BigInt::mul_no_reduce_lo("ient.as_ref(), &divisor.as_ref()); + reconstructed_numerator.add_nocarry(&remainder); + assert_eq!(reconstructed_numerator, numerator); + (quotient, remainder) +} diff --git a/scripts/glv-lattice-basis/src/lib.rs b/scripts/glv-lattice-basis/src/lib.rs new file mode 100644 index 000000000..32ca76380 --- /dev/null +++ b/scripts/glv-lattice-basis/src/lib.rs @@ -0,0 +1,238 @@ +extern crate ark_ff; +extern crate ark_ec; +extern crate num_bigint; +extern crate num_traits; + +mod arithmetic; + +use num_bigint::BigUint; +use ark_ff::{BigInteger, Field, PrimeField}; +use ark_ec::ProjectiveCurve; +pub use arithmetic::*; +use num_traits::Zero; +use std::ops::Neg; + +/// Takes data from two endomorphisms and sorts out which corresponds to which +fn which_endo( + base_roots: (G::BaseField, G::BaseField), + scalar_roots: (G::ScalarField, G::ScalarField), +) -> ( + (G::BaseField, G::ScalarField), + (G::BaseField, G::ScalarField), +) { + // println!("{:?}, {:?}", base_roots, scalar_roots); + let g = G::prime_subgroup_generator(); + + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.0; + + let d1 = if g.mul(scalar_roots.0.into_repr()) == g_endo { + (base_roots.0, scalar_roots.0) + } else { + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.1; + assert!(g.mul(scalar_roots.0.into_repr()) == g_endo); + + (base_roots.1, scalar_roots.0) + }; + + let d2 = if g.mul(scalar_roots.1.into_repr()) == g_endo { + (base_roots.0, scalar_roots.1) + } else { + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.1; + assert!(g.mul(scalar_roots.1.into_repr()) == g_endo); + + (base_roots.1, scalar_roots.1) + }; + + (d1, d2) +} + +fn cube_root_unity() -> (F, F) { + let char = B::from_slice(F::characteristic()); + let deg = F::extension_degree(); + let mut modulus = 
char; + for _ in 1..deg { + modulus = B::mul_no_reduce_lo(&modulus.as_ref(), &char.as_ref()); + } + + modulus.sub_noborrow(&B::from(1)); + let (q, r) = div_with_remainder(modulus, B::from(3)); + assert!(r == B::from(0)); + + let mut g = 2u32; + let mut root1 = F::one(); + loop { + if root1 != F::one() { + break; + } + let x = F::from(g); + root1 = x.pow(q); + g += 1; + } + let root2 = root1 * root1; + assert!(root1.pow(&[3]) == F::one()); + assert!(root2.pow(&[3]) == F::one()); + assert!(root1 != root2); + + (root1, root2) +} + +fn get_endo_data() -> (G::BaseField, G::ScalarField) { + which_endo::( + cube_root_unity::(), + cube_root_unity::::BigInt>(), + ) + .1 +} + +fn to_str(x: B) -> String { + BigUint::from_bytes_be(&x.to_bytes_be()[..]).to_string() +} + +pub fn print_glv_params() { + let (omega, lambda) = get_endo_data::(); + let g = G::prime_subgroup_generator(); + let mut g_endo = g; + *g_endo.get_x() *= ω + assert!(g.mul(lambda.into_repr()) == g_endo); + + println!("const OMEGA: Self::BaseField = {:?};", omega); + let n = ::modulus(); + println!("const LAMBDA: Self::ScalarField = {:?};", to_str(lambda.into_repr())); + + let vecs = get_lattice_basis::(n, lambda.into_repr()); + + // We check that `(|B1| + 2) * (|B2| + 2) < 2n` + // and `B_i^2 < 2n` e.g. `|B_i| < \sqrt{2n}$ + // We use this to prove some bounds later + let wide_modulus = WideBigInt::from_slice(&n.as_ref()[..]); + let two_modulus = WideBigInt::mul_no_reduce_lo( + &wide_modulus.as_ref()[..], + &WideBigInt::from(2).as_ref()[..], + ); + + let mut b1 = ((vecs.0).1).1; + let mut b2 = ((vecs.1).1).1; + let two = ::BigInt::from(2); + let b1b1 = WideBigInt::mul_no_reduce(&b1.as_ref()[..], &b1.as_ref()[..]); + let b2b2 = WideBigInt::mul_no_reduce(&b2.as_ref()[..], &b2.as_ref()[..]); + + b1.add_nocarry(&two); + b2.add_nocarry(&two); + let b1b2 = WideBigInt::mul_no_reduce(&b1.as_ref()[..], &b2.as_ref()[..]); + + assert!(b1b1 < two_modulus); + assert!(b2b2 < two_modulus); + assert!(b1b2 < two_modulus); + + for (i, vec) in [vecs.0, vecs.1].iter().enumerate() { + let (s1, (flag, t1)) = vec; + + let mut t1_big = WideBigInt::from_slice(t1.as_ref()); + let n_big = WideBigInt::from_slice(n.as_ref()); + t1_big.muln(::BigInt::NUM_LIMBS as u32 * 64); + let (g1_big, _) = div_with_remainder::(t1_big, n_big); + let g1 = ::BigInt::from_slice(g1_big.as_ref()); + + println!("/// |round(B{} * R / n)|", i + 1); + println!( + "const Q{}: ::BigInt = {:?};", + ((i + 1) % 2) + 1, + to_str(g1) + ); + println!( + "const B{}: ::BigInt = {:?};", + i + 1, + to_str(*t1) + ); + println!("const B{}_IS_NEG: bool = {:?};", i + 1, flag); + + debug_assert_eq!( + recompose_integer( + G::ScalarField::from_repr(*s1).unwrap(), + if !flag { + G::ScalarField::from_repr(*t1).unwrap() + } else { + G::ScalarField::from_repr(*t1).unwrap().neg() + }, + lambda + ), + G::ScalarField::zero() + ); + } + println!( + "const R_BITS: u32 = {:?};", + ::BigInt::NUM_LIMBS * 64 + ); +} + +// We work on arrays of size 3 +// We assume that |E(F_q)| < R = 2^{ceil(limbs/2) * 64} +pub fn get_lattice_basis( + n: F::BigInt, + lambda: F::BigInt, +) -> ( + (F::BigInt, (bool, F::BigInt)), + (F::BigInt, (bool, F::BigInt)), +) { + let mut r = [n, lambda, n]; + let one = F::one(); + let zero = F::zero(); + let mut t: [F; 3] = [zero, one, zero]; + let max_num_bits_lattice = (F::BigInt::from_slice(F::characteristic()).num_bits() - 1) / 2 + 1; + + // We can use an approximation as we are merely using a heuristic. 
We should + // check that the parameters obtained from this heuristic satisfies the + // required conditions separately. + let sqrt_n = as_f64(n.as_ref()).sqrt(); + + // println!("Log sqrtn: {}", sqrt_n.log2()); + + let mut i = 0; + // While r_i >= sqrt(n), we perform the extended euclidean algorithm so that + // si*n + ti*lambda = ri then return the vectors (r_i, (sign(t_i), |t_i|)), + // (r_i+1, (sign(t_i+1), |t_i+1|)) Notice this makes ri + (-ti)*lambda = 0 + // mod n, which is what we desire for our short lattice basis + while as_f64(r[(i + 1) % 3].as_ref()) >= sqrt_n { + // while i < 20 { + let (q, rem): (F::BigInt, F::BigInt) = + div_with_remainder::(r[i % 3], r[(i + 1) % 3]); + r[(i + 2) % 3] = rem; + let int_q = F::from_repr(q).unwrap(); + t[(i + 2) % 3] = t[i % 3] - int_q * (t[(i + 1) % 3]); + + i += 1; + } + let just_computed = (i + 1) % 3; + // We reverse the signs due to s_i*n = r_i - t_i*LAMBDA + let (neg_flag1, t1) = if t[just_computed].into_repr().num_bits() <= max_num_bits_lattice { + (true, t[just_computed].into_repr()) + } else { + (false, t[just_computed].neg().into_repr()) + }; + let vec_1 = (r[just_computed], (neg_flag1, t1)); + + let prev = i % 3; + let (neg_flag2, t2) = if t[prev].into_repr().num_bits() <= max_num_bits_lattice { + (true, t[prev].into_repr()) + } else { + (false, t[prev].neg().into_repr()) + }; + let vec_2 = (r[prev], (neg_flag2, t2)); + + (vec_1, vec_2) +} + +pub fn recompose_integer(k1: F, k2: F, lambda: F) -> F { + k1 + &(k2 * &lambda) +} + +fn as_f64(bigint_ref: &[u64]) -> f64 { + let mut n_float: f64 = 0.0; + for (i, limb) in bigint_ref.iter().enumerate() { + n_float += (*limb as f64) * 2f64.powf((i as f64) * 64f64) + } + n_float +} diff --git a/scripts/to_dec_str.py b/scripts/to_dec_str.py new file mode 100644 index 000000000..828b4057e --- /dev/null +++ b/scripts/to_dec_str.py @@ -0,0 +1,14 @@ +def from_u64_slice_to_decimal_str(x): + ret = 0 + for i, limb in enumerate(x): + print(i) + print(ret) + ret += 2 ** (i*64) * limb + return ret + +print(from_u64_slice_to_decimal_str([ + 7865245318337523249, + 18346590209729131401, + 15545362854776399464, + 6505881510324251116, +])) diff --git a/serialize/Cargo.toml b/serialize/Cargo.toml index ca7e6eadf..cef3e52b7 100644 --- a/serialize/Cargo.toml +++ b/serialize/Cargo.toml @@ -15,6 +15,7 @@ edition = "2018" [dependencies] ark-serialize-derive = { path = "../serialize-derive", optional = true } ark-std = { git = "https://github.com/arkworks-rs/utils", default-features = false } +paste = "0.1" [features] default = [] diff --git a/serialize/src/lib.rs b/serialize/src/lib.rs index a375c5752..3b5db4fcc 100644 --- a/serialize/src/lib.rs +++ b/serialize/src/lib.rs @@ -12,6 +12,7 @@ use ark_std::{ rc::Rc, string::String, vec::Vec, + vec, }; pub use error::*; pub use flags::*; @@ -58,6 +59,7 @@ pub trait CanonicalSerializeWithFlags: CanonicalSerialize { /// /// If your code depends on `algebra` instead, the example works analogously /// when importing `algebra::serialize::*`. + pub trait CanonicalSerialize { /// Serializes `self` into `writer`. /// It is left up to a particular type for how it strikes the @@ -74,17 +76,26 @@ pub trait CanonicalSerialize { fn serialized_size(&self) -> usize; + /// Serializes `self` into `writer` with compression, and without + /// performing validity checks. Should be used *only* when there is no + /// danger of adversarial manipulation of the output. 
+ #[inline] + fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { + self.serialize(writer) + } + /// Serializes `self` into `writer` without compression. #[inline] fn serialize_uncompressed(&self, writer: W) -> Result<(), SerializationError> { self.serialize(writer) } - /// Serializes `self` into `writer` without compression, and without - /// performing validity checks. Should be used *only* when there is no - /// danger of adversarial manipulation of the output. + /// Serializes `self` into `writer` without compression. #[inline] - fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { + fn serialize_uncompressed_unchecked( + &self, + writer: W, + ) -> Result<(), SerializationError> { self.serialize_uncompressed(writer) } @@ -126,6 +137,13 @@ pub trait CanonicalDeserialize: Sized { /// Reads `Self` from `reader`. fn deserialize(reader: R) -> Result; + /// Reads `self` from `reader` with compression, and without performing + /// validity checks. Should be used *only* when the input is trusted. + #[inline] + fn deserialize_unchecked(reader: R) -> Result { + Self::deserialize(reader) + } + /// Reads `Self` from `reader` without compression. #[inline] fn deserialize_uncompressed(reader: R) -> Result { @@ -135,7 +153,7 @@ pub trait CanonicalDeserialize: Sized { /// Reads `self` from `reader` without compression, and without performing /// validity checks. Should be used *only* when the input is trusted. #[inline] - fn deserialize_unchecked(reader: R) -> Result { + fn deserialize_uncompressed_unchecked(reader: R) -> Result { Self::deserialize_uncompressed(reader) } } @@ -193,6 +211,23 @@ impl CanonicalDeserialize for usize { } } +macro_rules! impl_serialize_for_slice { + ($($name:ident),*) => { + $( + #[inline] + fn $name(&self, mut writer: W) -> Result<(), SerializationError> { + let len = self.len() as u64; + len.serialize(&mut writer)?; + for item in self.iter() { + item.$name(&mut writer)?; + } + Ok(()) + } + )* + } +} + + // Implement Serialization for `String` // It is serialized by obtaining its byte representation as a Vec and // serializing that. This yields an end serialization of @@ -217,16 +252,14 @@ impl CanonicalDeserialize for String { } } + impl CanonicalSerialize for [T] { - #[inline] - fn serialize(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize(&mut writer)?; - } - Ok(()) - } + impl_serialize_for_slice!( + serialize, + serialize_unchecked, + serialize_uncompressed, + serialize_uncompressed_unchecked + ); #[inline] fn serialized_size(&self) -> usize { @@ -236,26 +269,6 @@ impl CanonicalSerialize for [T] { .sum::() } - #[inline] - fn serialize_uncompressed(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize_uncompressed(&mut writer)?; - } - Ok(()) - } - - #[inline] - fn serialize_unchecked(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize_unchecked(&mut writer)?; - } - Ok(()) - } - #[inline] fn uncompressed_size(&self) -> usize { 8 + self @@ -265,63 +278,59 @@ impl CanonicalSerialize for [T] { } } -impl CanonicalSerialize for Vec { - #[inline] - fn serialize(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize(writer) +macro_rules! 
impl_serialize_for_vec { + ($($name:ident),*) => { + $( + #[inline] + fn $name(&self, writer: W) -> Result<(), SerializationError> { + self.as_slice().$name(writer) + } + )* } +} + +impl CanonicalSerialize for Vec { + impl_serialize_for_vec!( + serialize, + serialize_unchecked, + serialize_uncompressed, + serialize_uncompressed_unchecked + ); #[inline] fn serialized_size(&self) -> usize { self.as_slice().serialized_size() } - #[inline] - fn serialize_uncompressed(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize_uncompressed(writer) - } - - #[inline] - fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize_unchecked(writer) - } - #[inline] fn uncompressed_size(&self) -> usize { self.as_slice().uncompressed_size() } } -impl CanonicalDeserialize for Vec { - #[inline] - fn deserialize(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize(&mut reader)?); - } - Ok(values) - } - - #[inline] - fn deserialize_uncompressed(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize_uncompressed(&mut reader)?); - } - Ok(values) +macro_rules! impl_deserialize_for_vec { + ($($name:ident),*) => { + $( + #[inline] + fn $name(mut reader: R) -> Result { + let len = u64::deserialize(&mut reader)?; + let mut values = vec![]; + for _ in 0..len { + values.push(T::$name(&mut reader)?); + } + Ok(values) + } + )* } +} - #[inline] - fn deserialize_unchecked(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize_unchecked(&mut reader)?); - } - Ok(values) - } +impl CanonicalDeserialize for Vec { + impl_deserialize_for_vec!( + deserialize, + deserialize_unchecked, + deserialize_uncompressed, + deserialize_uncompressed_unchecked + ); } #[inline] @@ -876,6 +885,19 @@ mod test { } } + macro_rules! impl_test { + ($data:ident, $(($name:ident, $size:ident)),*) => { + $( + paste::item! { + let mut serialized = vec![0; $data.[< $size _size>]()]; + $data.[< serialize_ $name >](&mut serialized[..]).unwrap(); + let de = T::[< deserialize_ $name >](&serialized[..]).unwrap(); + assert_eq!($data, de); + } + )* + } + } + fn test_serialize< T: PartialEq + core::fmt::Debug + CanonicalSerialize + CanonicalDeserialize, >( @@ -886,15 +908,12 @@ mod test { let de = T::deserialize(&serialized[..]).unwrap(); assert_eq!(data, de); - let mut serialized = vec![0; data.uncompressed_size()]; - data.serialize_uncompressed(&mut serialized[..]).unwrap(); - let de = T::deserialize_uncompressed(&serialized[..]).unwrap(); - assert_eq!(data, de); - - let mut serialized = vec![0; data.uncompressed_size()]; - data.serialize_unchecked(&mut serialized[..]).unwrap(); - let de = T::deserialize_unchecked(&serialized[..]).unwrap(); - assert_eq!(data, de); + impl_test!( + data, + (unchecked, serialized), + (uncompressed, uncompressed), + (uncompressed_unchecked, uncompressed) + ); } // Serialize T, randomly mutate the data, and deserialize it. 
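A minimal round-trip sketch of the serialization surface after the hunk above (not part of the patch): `serialize_unchecked`/`deserialize_unchecked` now follow the compressed path, `serialize_uncompressed_unchecked`/`deserialize_uncompressed_unchecked` cover the uncompressed one, and the slice/Vec impls are macro-generated so all four modes stay in sync. The sketch assumes only the method names declared in this diff and the existing `u64` impls, using `&mut [u8]`/`&[u8]` as writer and reader as in the test helpers above.

use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};

fn roundtrip_all_modes() {
    let data: Vec<u64> = vec![1, 2, 3, 4];

    // Compressed, with validity checks on deserialization.
    let mut buf = vec![0u8; data.serialized_size()];
    data.serialize(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize(&buf[..]).unwrap());

    // Compressed, skipping validity checks (trusted input only).
    let mut buf = vec![0u8; data.serialized_size()];
    data.serialize_unchecked(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize_unchecked(&buf[..]).unwrap());

    // Uncompressed, with and without validity checks.
    let mut buf = vec![0u8; data.uncompressed_size()];
    data.serialize_uncompressed(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize_uncompressed(&buf[..]).unwrap());

    let mut buf = vec![0u8; data.uncompressed_size()];
    data.serialize_uncompressed_unchecked(&mut buf[..]).unwrap();
    assert_eq!(
        data,
        Vec::<u64>::deserialize_uncompressed_unchecked(&buf[..]).unwrap()
    );
}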
diff --git a/test-curves/Cargo.toml b/test-curves/Cargo.toml index 61093dfd0..da48978d6 100644 --- a/test-curves/Cargo.toml +++ b/test-curves/Cargo.toml @@ -9,10 +9,15 @@ edition = "2018" publish = false [dependencies] +paste = "0.1" ark-std = { git = "https://github.com/arkworks-rs/utils", default-features = false } ark-ff = { path = "../ff", default-features = false } ark-ec = { path = "../ec", default-features = false } + +lazy_static = { version = "1.4.0", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } + [dev-dependencies] ark-serialize = { path = "../serialize", default-features = false } ark-algebra-test-templates = { path = "../test-templates", default-features = false } @@ -20,8 +25,8 @@ ark-algebra-test-templates = { path = "../test-templates", default-features = fa [features] default = [] +cuda = [ "ark-ec/cuda", "accel", "lazy_static", "ark-ec/std" ] asm = [ "ark-ff/asm" ] - parallel = [ "ark-ff/parallel", "ark-ec/parallel", "ark-std/parallel" ] bls12_381_scalar_field = [] @@ -33,4 +38,4 @@ mnt4_753_curve = [ "mnt4_753_scalar_field", "mnt4_753_base_field" ] bn384_small_two_adicity_scalar_field = [] bn384_small_two_adicity_base_field = [] -bn384_small_two_adicity_curve = [ "bn384_small_two_adicity_scalar_field", "bn384_small_two_adicity_base_field" ] \ No newline at end of file +bn384_small_two_adicity_curve = [ "bn384_small_two_adicity_scalar_field", "bn384_small_two_adicity_base_field" ] diff --git a/test-curves/src/bls12_381/g1.rs b/test-curves/src/bls12_381/g1.rs index 0e8391f91..64fabeec3 100644 --- a/test-curves/src/bls12_381/g1.rs +++ b/test-curves/src/bls12_381/g1.rs @@ -1,9 +1,14 @@ use crate::bls12_381::*; use ark_ec::{ + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, + GLVParameters, +}; +use ark_ff::{ + biginteger::{BigInteger256, BigInteger384, BigInteger512}, + field_new, field_new_from_raw_repr, PrimeField, Zero, }; -use ark_ff::{field_new, Zero}; pub type G1Affine = GroupAffine; pub type G1Projective = GroupProjective; @@ -16,6 +21,41 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bls12_381, "ark-bls12-381", g1, G1Projective); + +impl GLVParameters for Parameters { + type WideBigInt = BigInteger512; + const OMEGA: Self::BaseField = field_new_from_raw_repr!( + Fq, + BigInteger384([ + 3526659474838938856, + 17562030475567847978, + 1632777218702014455, + 14009062335050482331, + 3906511377122991214, + 368068849512964448, + ]) + ); + const LAMBDA: Self::ScalarField = field_new_from_raw_repr!( + Fr, + BigInteger256([ + 7865245318337523249, + 18346590209729131401, + 15545362854776399464, + 6505881510324251116, + ]) + ); + /// |round(B1 * R / n)| + const Q2: ::BigInt = + BigInteger256([7203196592358157870, 8965520006802549469, 1, 0]); + const B1: ::BigInt = + BigInteger256([4294967295, 12413508272118670338, 0, 0]); + const B1_IS_NEG: bool = true; + /// |round(B2 * R / n)| + const Q1: ::BigInt = BigInteger256([2, 0, 0, 0]); + const B2: ::BigInt = BigInteger256([1, 0, 0, 0]); + const R_BITS: u32 = 256; +} impl SWModelParameters for Parameters { /// COEFF_A = 0 const COEFF_A: Fq = field_new!(Fq, "0"); @@ -40,6 +80,9 @@ impl SWModelParameters for Parameters { fn mul_by_a(_: &Self::BaseField) -> Self::BaseField { Self::BaseField::zero() } + + impl_scalar_mul_parameters!(G1Projective); + impl_glv_for_sw!(); } /// G1_GENERATOR_X = diff --git 
a/test-curves/src/bn384_small_two_adicity/g1.rs b/test-curves/src/bn384_small_two_adicity/g1.rs index aadf08151..27fc71349 100644 --- a/test-curves/src/bn384_small_two_adicity/g1.rs +++ b/test-curves/src/bn384_small_two_adicity/g1.rs @@ -1,4 +1,5 @@ use ark_ec::{ + impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, }; @@ -12,6 +13,8 @@ pub type G1Projective = GroupProjective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; +impl_scalar_mul_kernel!(bn384, "ark-bn384", g1, G1Projective); + impl ModelParameters for Parameters { type BaseField = Fq; type ScalarField = Fr; @@ -38,6 +41,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(_: &Self::BaseField) -> Self::BaseField { Self::BaseField::zero() } + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = -1 diff --git a/test-curves/src/mnt4_753/g1.rs b/test-curves/src/mnt4_753/g1.rs index 71863ead0..1bbfe0d80 100644 --- a/test-curves/src/mnt4_753/g1.rs +++ b/test-curves/src/mnt4_753/g1.rs @@ -1,4 +1,5 @@ use ark_ec::{ + impl_scalar_mul_kernel, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, }; @@ -17,6 +18,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_753, "ark-mnt4-753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 #[rustfmt::skip] @@ -38,6 +41,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1
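A minimal sanity check of the GLV constants added for the bls12-381 G1 parameters earlier in this diff (not part of the patch): the endomorphism phi(x, y) = (OMEGA * x, y) should act as multiplication by LAMBDA on the prime-order subgroup, which is the relation `print_glv_params` asserts when deriving these tables. The `ark_test_curves::bls12_381` import paths below are assumed re-exports of the g1 module shown above, and the three-argument `GroupAffine::new(x, y, infinity)` constructor is assumed from the arkworks API of this era.

use ark_ec::{AffineCurve, GLVParameters, ProjectiveCurve};
use ark_ff::PrimeField;
use ark_test_curves::bls12_381::{g1::Parameters, G1Affine};

fn check_bls12_381_glv_endo() {
    // Generator of the prime-order subgroup.
    let g = G1Affine::prime_subgroup_generator();
    // phi(P): scale the x-coordinate by the cube root of unity OMEGA.
    let phi_g = G1Affine::new(g.x * <Parameters as GLVParameters>::OMEGA, g.y, false);
    // [LAMBDA]P via ordinary scalar multiplication.
    let lambda_g = g
        .mul(<Parameters as GLVParameters>::LAMBDA.into_repr())
        .into_affine();
    assert_eq!(phi_g, lambda_g);
}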