diff --git a/Cargo.toml b/Cargo.toml index b90f4199b..bcad6488d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ members = [ "poly-benches", "test-curves", "test-templates", + + "scripts/glv-lattice-basis", ] [profile.release] diff --git a/ec/Cargo.toml b/ec/Cargo.toml index d255e6819..9432e0e88 100644 --- a/ec/Cargo.toml +++ b/ec/Cargo.toml @@ -19,9 +19,25 @@ ark-ff = { path = "../ff", default-features = false } derivative = { version = "2", features = ["use_core"] } num-traits = { version = "0.2", default-features = false } rayon = { version = "1", optional = true } +itertools = { version = "0.9.0", default-features = false } +either = { version = "1.6.0", default-features = false } +thread-id = { version = "3.3.0", optional = true } +backtrace = { version = "0.3", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } +peekmore = "0.5.6" +closure = { version = "0.3.0", optional = true } +lazy_static = { version = "1.4.0", optional = true } +serde_json = { version = "1.0.58", optional = true } +dirs = { version = "1.0.5", optional = true } +log = { version = "0.4.11", optional = true } +paste = "0.1" zeroize = { version = "1", default-features = false, features = ["zeroize_derive"] } +[dev-dependencies] +rand_xorshift = "0.2" + [features] +cuda = [ "std", "parallel", "accel", "lazy_static", "serde_json", "dirs", "closure", "log" ] default = [] std = [ "ark-std/std", "ark-ff/std", "ark-serialize/std" ] parallel = [ "std", "rayon", "ark-std/parallel" ] diff --git a/ec/src/batch_arith.rs b/ec/src/batch_arith.rs new file mode 100644 index 000000000..6a5ba0529 --- /dev/null +++ b/ec/src/batch_arith.rs @@ -0,0 +1,309 @@ +use crate::AffineCurve; +use ark_ff::{biginteger::BigInteger, fields::Field}; +use ark_std::{ops::Neg, vec::Vec}; +use either::Either; +use num_traits::Zero; + +/// We use a batch size that is big enough to amortise the cost of the actual +/// inversion close to zero while not straining the CPU cache by generating and +/// fetching from large w-NAF tables and slices [G] +pub const BATCH_AFFINE_BATCH_SIZE: usize = 4096; + +/// We code this in the second operand for the `batch_add_in_place_read_only` +/// method utilised in the `batch_scalar_mul_in_place` method. +/// 0 == Identity; 1 == Neg; 2 == GLV; 3 == GLV + Neg +pub const ENDO_CODING_BITS: usize = 2; + +#[inline(always)] +pub fn decode_endo_from_u32(index_code: u32) -> (usize, u8) { + ( + index_code as usize >> ENDO_CODING_BITS, + index_code as u8 % 4, + ) +} + +pub trait BatchGroupArithmetic +where + Self: Sized + Clone + Copy + Zero + Neg, +{ + type BaseFieldForBatch: Field; + + // We use the w-NAF method, achieving point density of approximately 1/(w + 1) + // and requiring storage of only 2^(w - 1). + // Refer to e.g. Improved Techniques for Fast Exponentiation, Section 4 + // Bodo M¨oller 2002. 
https://www.bmoeller.de/pdf/fastexp-icisc2002.pdf + + /// Computes [[p_1, 3 * p_1, ..., (2^w - 1) * p_1], ..., [p_n, 3*p_n, ..., + /// (2^w - 1) p_n]] We need to manipulate the offsets when using the + /// table + fn batch_wnaf_tables(bases: &[Self], w: usize) -> Vec { + let half_size = 1 << (w - 1); + let batch_size = bases.len(); + + let mut two_a = bases.to_vec(); + let instr = (0..batch_size).map(|x| x as u32).collect::>(); + Self::batch_double_in_place(&mut two_a, &instr[..], None); + + let mut tables = Vec::::with_capacity(half_size * batch_size); + tables.extend_from_slice(bases); + let mut scratch_space = Vec::>::with_capacity((batch_size - 1) / 2 + 1); + + for i in 1..half_size { + let instr = (0..batch_size) + .map(|x| (((i - 1) * batch_size + x) as u32, x as u32)) + .collect::>(); + Self::batch_add_write_read_self( + &two_a[..], + &instr[..], + &mut tables, + &mut scratch_space, + ); + } + tables + } + + /// Computes the vectorised version of the wnaf integer recoding + /// Optionally takes a slice of booleans which indicate whether that + /// scalar is negative. If so, it negates the recoding. + /// Mutates scalars in place + fn batch_wnaf_opcode_recoding( + scalars: &mut [BigInt], + w: usize, + negate: Option<&[bool]>, + ) -> Vec>> { + debug_assert!(w > 0); + let batch_size = scalars.len(); + let window_size: i16 = 1 << (w + 1); + let half_window_size: i16 = 1 << w; + + let mut op_code_vectorised = Vec::>>::with_capacity(BigInt::NUM_LIMBS * 64); + + let mut all_none = false; + + if negate.is_some() { + debug_assert_eq!(scalars.len(), negate.unwrap().len()); // precompute + } + + let f = false; + while !all_none { + let iter = match negate { + None => Either::Left(core::iter::repeat(&f).take(batch_size)), + Some(bools) => Either::Right(bools.iter()), + }; + let mut opcode_row = Vec::with_capacity(batch_size); + for (s, &neg) in scalars.iter_mut().zip(iter) { + if s.is_zero() { + opcode_row.push(None); + } else { + let op = if s.is_odd() { + let mut z: i16 = (s.as_ref()[0] % (1 << (w + 1))) as i16; + if z < half_window_size { + s.sub_noborrow(&BigInt::from(z as u64)); + } else { + z = z - window_size; + s.add_nocarry(&BigInt::from((-z) as u64)); + } + if neg { + -z + } else { + z + } + } else { + 0 + }; + opcode_row.push(Some(op)); + s.div2(); + } + } + all_none = opcode_row.iter().all(|x| x.is_none()); + if !all_none { + op_code_vectorised.push(opcode_row); + } + } + op_code_vectorised + } + + // We define a series of batched primitive EC ops, each of which is most + // suitable to a given scenario. + // + // We encode the indexes as u32s to save on fetch latency via better cacheing. + // The principle we are applying is that the len of the batch ops should + // never exceed about 2^20, and the table size would never exceed 2^10, so + // 32 bits will always be enough + + /// Mutates bases to be doubled in place + /// Accepts optional scratch space which might help by reducing the + /// number of heap allocations for the Vector-based scratch_space + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + scratch_space: Option<&mut Vec>, + ); + + /// Mutates bases in place and stores result in the first operand. + /// The element corresponding to the second operand becomes junk data. + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]); + + /// Mutates bases in place and stores result in bases. + /// The elements in other become junk data. 
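+    ///
+    /// A hypothetical usage sketch (the slices, indices and the curve type
+    /// `G` below are made up purely for illustration):
+    /// ```ignore
+    /// // Perform bases[0] += other[1] and bases[2] += other[0] as one batch,
+    /// // sharing a single field inversion across both affine additions.
+    /// let index = [(0u32, 1u32), (2u32, 0u32)];
+    /// G::batch_add_in_place(&mut bases[..], &mut other[..], &index[..]);
+    /// // other[0] and other[1] now hold junk data and must not be read.
+    /// ```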
+ fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]); + + /// Adds elements in bases with elements in other (for instance, a table), + /// utilising a scratch space to store intermediate results. + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ); + + /// Lookups up group elements according to index, and either adds and writes + /// or simply writes them to new_elems, using scratch space to store + /// intermediate values. Scratch space is always cleared after use. + + /// No-ops, or copies of the elem in the slice `lookup` in the position of + /// the index of the first operand to the new_elems vector, are encoded + /// as !0u32 in the index for the second operand + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + /// Similar to batch_add_write, only that the lookup for the first operand + /// is performed in new_elems rather than lookup + + /// No-ops, or copies of the elem in the slice `lookup` in the position of + /// the index of the first operand to the new_elems vector, are encoded + /// as !0u32 in the index for the second operand + fn batch_add_write_read_self( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + /// Performs a batch scalar multiplication using the w-NAF encoding + /// utilising the primitive batched ops + fn batch_scalar_mul_in_place( + mut bases: &mut [Self], + scalars: &mut [BigInt], + w: usize, + ) { + let batch_size = bases.len(); + let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); + let tables = Self::batch_wnaf_tables(bases, w); + + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + for opcode_row in opcode_vectorised.iter().rev() { + let index_double: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|x| x.1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place(&mut bases, &index_double[..], None); + + let mut add_ops: Vec = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + tables[(idx as usize) / 2 * batch_size + i].clone() + } else { + tables[(-idx as usize) / 2 * batch_size + i].clone().neg() + } + }) + .collect(); + + let index_add: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|x| x.0) + .enumerate() + .map(|(x, y)| (y as u32, x as u32)) + .collect(); + + Self::batch_add_in_place(&mut bases, &mut add_ops[..], &index_add[..]); + } + } + + /// Chunks vectorised instructions into a size that does not require + /// storing a lot of intermediate state + fn get_chunked_instr(instr: &[T], batch_size: usize) -> Vec> { + let mut res = Vec::new(); + + let rem = instr.chunks_exact(batch_size).remainder(); + let mut chunks = instr.chunks_exact(batch_size).peekable(); + + if chunks.len() == 0 { + res.push(rem.to_vec()); + } + + while let Some(chunk) = chunks.next() { + let chunk = if chunks.peek().is_none() { + [chunk, rem].concat() + } else { + chunk.to_vec() + }; + res.push(chunk); + } + res + } +} + +/// We make the syntax for performing batch ops on slices cleaner +/// by defining a corresponding trait and impl for [G] rather than on G +pub trait BatchGroupArithmeticSlice { + fn batch_double_in_place(&mut self, index: &[u32]); + + fn 
batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]); + + fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]); + + fn batch_add_write( + &self, + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ); + + fn batch_scalar_mul_in_place(&mut self, scalars: &mut [BigInt], w: usize); +} + +impl BatchGroupArithmeticSlice for [G] { + fn batch_double_in_place(&mut self, index: &[u32]) { + G::batch_double_in_place(self, index, None); + } + + fn batch_add_in_place_same_slice(&mut self, index: &[(u32, u32)]) { + G::batch_add_in_place_same_slice(self, index); + } + + fn batch_add_in_place(&mut self, other: &mut Self, index: &[(u32, u32)]) { + G::batch_add_in_place(self, other, index); + } + + fn batch_add_write( + &self, + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + G::batch_add_write(self, index, new_elems, scratch_space); + } + + fn batch_scalar_mul_in_place(&mut self, scalars: &mut [BigInt], w: usize) { + G::batch_scalar_mul_in_place(self, scalars, w); + } +} diff --git a/ec/src/batch_verify.rs b/ec/src/batch_verify.rs new file mode 100644 index 000000000..ad4105edc --- /dev/null +++ b/ec/src/batch_verify.rs @@ -0,0 +1,181 @@ +use crate::{ + batch_bucketed_add, AffineCurve, BatchGroupArithmeticSlice, BucketPosition, PrimeField, + ProjectiveCurve, BATCH_AFFINE_BATCH_SIZE, +}; +use ark_ff::fields::FpParameters; +use ark_std::{cfg_chunks_mut, fmt, vec::Vec, rand::Rng}; +use num_traits::identities::Zero; + +#[cfg(feature = "parallel")] +use {rand::thread_rng, rayon::prelude::*}; + +#[derive(Debug, Clone)] +pub struct VerificationError; + +impl fmt::Display for VerificationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Verification Error. Not in subgroup") + } +} + +fn verify_points( + points: &[C], + num_buckets: usize, + _new_security_param: Option, // Only pass new_security_param if possibly recursing + rng: &mut R, +) -> Result<(), VerificationError> { + let n_points = points.len(); + let mut bucket_assign = Vec::with_capacity(points.len()); + for i in 0..n_points { + bucket_assign.push(BucketPosition { + bucket: rng.gen_range(0, num_buckets) as u32, + position: i as u32, + }); + } + let mut buckets = batch_bucketed_add(num_buckets, &mut points.to_vec(), &mut bucket_assign[..]); + + // We use the batch_scalar_mul to check the subgroup condition if + // there are sufficient number of buckets. For SW curves, the number + // elems for the batch mul to become useful is around 2^24. 
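+    //
+    // In either branch below, the check is the same: for every bucket B we
+    // verify that r * B is the identity, where r = C::ScalarField::modulus()
+    // is the prime subgroup order. Each bucket is a random subset-sum of the
+    // input points, so a non-identity result exposes, with high probability,
+    // at least one point lying outside the prime-order subgroup.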
+ let verification_failure = if num_buckets >= BATCH_AFFINE_BATCH_SIZE { + cfg_chunks_mut!(buckets, BATCH_AFFINE_BATCH_SIZE).for_each(|e| { + let length = e.len(); + e[..].batch_scalar_mul_in_place::<::BigInt>( + &mut vec![C::ScalarField::modulus().into(); length][..], + 4, + ); + }); + !buckets.iter().all(|&p| p.is_zero()) + } else { + !buckets + .iter() + .all(|&b| b.into_projective().mul(C::ScalarField::modulus()).is_zero()) + }; + if verification_failure { + return Err(VerificationError); + } + Ok(()) +} + +fn run_rounds( + points: &[C], + num_buckets: usize, + num_rounds: usize, + new_security_param: Option, + rng: &mut R, +) -> Result<(), VerificationError> { + #[cfg(feature = "parallel")] + if num_rounds > 2 { + use std::sync::Arc; + let ref_points = Arc::new(points.to_vec()); + let mut threads = vec![]; + for _ in 0..num_rounds { + let ref_points_thread = ref_points.clone(); + // We only use std when a multicore environment is available + threads.push(std::thread::spawn( + move || -> Result<(), VerificationError> { + let mut rng = &mut thread_rng(); + verify_points( + &ref_points_thread[..], + num_buckets, + new_security_param, + &mut rng, + )?; + Ok(()) + }, + )); + } + for thread in threads { + thread.join().unwrap()?; + } + } else { + for _ in 0..num_rounds { + verify_points(points, num_buckets, new_security_param, rng)?; + } + } + + #[cfg(not(feature = "parallel"))] + for _ in 0..num_rounds { + verify_points(points, num_buckets, new_security_param, rng)?; + } + + Ok(()) +} + +pub fn batch_verify_in_subgroup( + points: &[C], + security_param: usize, + rng: &mut R, +) -> Result<(), VerificationError> { + #[cfg(feature = "std")] + let cost_estimate = (::Params::MODULUS_BITS as f64 + * (0.5 * 7.0 / 6.0 * 0.8 + 1.0 / 5.0)) + .ceil() as usize; + #[cfg(not(feature = "std"))] + let cost_estimate = ::Params::MODULUS_BITS as usize * 5 / 4; + + let (num_buckets, num_rounds, _) = get_max_bucket( + security_param, + points.len(), + // We estimate the costs of a single scalar multiplication in the batch affine, w-NAF GLV + // case as 7/6 * 0.5 * n_bits * 0.8 (doubling) + 0.5 * 1/(w + 1) * n_bits + // (addition) We take into account that doubling in the batch add model is cheaper + // as it requires less cache use + cost_estimate, + ); + run_rounds(points, num_buckets, num_rounds, None, rng)?; + Ok(()) +} + +/// We get the greatest power of 2 number of buckets such that we minimise the +/// number of rounds while satisfying the constraint that +/// n_rounds * buckets * next_check_per_elem_cost < n +fn get_max_bucket( + security_param: usize, + n_elems: usize, + next_check_per_elem_cost: usize, +) -> (usize, usize, usize) { + #[cfg(feature = "std")] + { + let mut log2_num_buckets = 1f64; + let num_rounds = |log2_num_buckets: f64| -> usize { + (security_param as f64 / log2_num_buckets).ceil() as usize + }; + + while num_rounds(log2_num_buckets) + * next_check_per_elem_cost + * (2f64.powf(log2_num_buckets).ceil() as usize) + < n_elems + && num_rounds(log2_num_buckets + 0.1) > 1 + { + log2_num_buckets += 0.1; + } + ( + 2f64.powf(log2_num_buckets).ceil() as usize, // number of buckets + num_rounds(log2_num_buckets), // number of rounds + log2_num_buckets.ceil() as usize, // new security param + ) + } + + #[cfg(not(feature = "std"))] + { + let mut log2_num_buckets: u32 = 1; + let num_rounds = |log2_num_buckets: u32| -> usize { + (security_param - 1) / (log2_num_buckets as usize) + 1 + }; + + while num_rounds(log2_num_buckets) + * next_check_per_elem_cost + * (2_i32.pow(log2_num_buckets) as usize) + 
< n_elems + && num_rounds(log2_num_buckets + 1) > 1 + { + log2_num_buckets += 1; + } + ( + 2_i32.pow(log2_num_buckets) as usize, // number of buckets + num_rounds(log2_num_buckets), // number of rounds + log2_num_buckets as usize, // new security param + ) + } +} diff --git a/ec/src/bucketed_add.rs b/ec/src/bucketed_add.rs new file mode 100644 index 000000000..e5711d9d0 --- /dev/null +++ b/ec/src/bucketed_add.rs @@ -0,0 +1,213 @@ +use crate::{BatchGroupArithmeticSlice, BATCH_AFFINE_BATCH_SIZE}; + +use ark_std::vec::Vec; + +use crate::AffineCurve; + +#[derive(Copy, Clone, Debug)] +pub struct BucketPosition { + pub bucket: u32, + pub position: u32, +} + +/// The objective of this function is to identify an addition tree of +/// independent elliptic curve group additions for each bucket, and to batch the +/// independent additions using the batch affine inversion method. + +/// The strategy taken is to sort a list of bucket assignments of all the +/// elements (which we can for most intents and purposes, think of as being +/// uniformly random) by bucket, so that indices corresponding to elements that +/// must be added together are physically collocated in memory. Then, in the +/// first round, we proceed to perform independent additions producing +/// intermediate results at the greatest depth for each addition tree (each +/// corresponding to a bucket), and write the result to a new vector. We do so +/// to improve cache locality for future rounds, and take advantage of the +/// CPU-intensive nature of elliptic curve operations along with prfetching to +/// hide the latency of reading from essentially random locations in memory. + +/// Subsequently, we perform the additions in place, and the second operands +/// become junk data. Finally, when we only have the buckets left (no more +/// additions left to perform), we copy the result into a destination `res` +/// slice. +#[inline] +pub fn batch_bucketed_add( + buckets: usize, + elems: &[C], + bucket_positions: &mut [BucketPosition], +) -> Vec { + assert_eq!(elems.len(), bucket_positions.len()); + assert!(elems.len() > 0); + + // We sort the bucket positions so that indices of elements assigned + // to the same bucket are continguous. This way, we can easily identify + // how to construct the addition tree for that bucket. + bucket_positions.sort_unstable_by_key(|x| x.bucket); + + let mut len = bucket_positions.len(); + let mut all_ones = true; + let mut new_len = 0; // len counter + let mut glob = 0; // global counters + let mut loc = 1; // local counter + let mut batch = 0; // batch counter + let mut instr = Vec::<(u32, u32)>::with_capacity(BATCH_AFFINE_BATCH_SIZE); + let mut new_elems = Vec::::with_capacity(elems.len() * 3 / 8); + + let mut scratch_space = Vec::>::with_capacity(BATCH_AFFINE_BATCH_SIZE / 2); + + // In the first loop, we copy the results of the first in place addition tree + // to a local vector, new_elems + // Subsequently, we perform all the operations in place + while glob < len { + let current_bucket = bucket_positions[glob].bucket; + // We are iterating over elements using a global `glob` counter, and counting + // how many in a row are being assigned to the same bucket, using the `loc` + // counter. 
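+        //
+        // For example (made-up data), if the sorted bucket assignments are
+        // [0, 0, 0, 1, 1, 2], the runs found here have loc = 3, 2 and 1:
+        // the first run emits one pairwise-add instruction plus one lone
+        // copy, the second a single pairwise add, and the third only a copy.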
+ while glob + 1 < len && bucket_positions[glob + 1].bucket == current_bucket { + glob += 1; + loc += 1; + } + // If the current bucket exceeds buckets, it encodes a noop + if current_bucket >= buckets as u32 { + loc = 1; + } else if loc > 1 { + // all ones is false if next len is not 1 + + // in other words, we have not reached the terminating + // condition that after the current round of addition + // there is only one element left in each addition tree + + // This would be the case, if each addition tree had at + // most 2 elements in the current round. + if loc > 2 { + all_ones = false; + } + let is_odd = loc % 2 == 1; + let half = loc / 2; + // We encode instructions to add adjacent elements + for i in 0..half { + instr.push(( + bucket_positions[glob - (loc - 1) + 2 * i].position, + bucket_positions[glob - (loc - 1) + 2 * i + 1].position, + )); + // Compactification of buckets + bucket_positions[new_len + i] = BucketPosition { + bucket: current_bucket, + position: (new_len + i) as u32, + }; + } + // If there are an odd number of elements, the lone element + // without a partner will be copied over to the `new_elems` + // vector, a noop which is encoded as !0u32 + if is_odd { + instr.push((bucket_positions[glob].position, !0u32)); + bucket_positions[new_len + half] = BucketPosition { + bucket: current_bucket, + position: (new_len + half) as u32, + }; + } + // Reset the local_counter and update state + + // We compactify the `bucket_positions` data by shifing left + // `new_len` is the len of the current compactified vector. + + // We also update the `batch` counter to decide when it is + // optimal to invoke the batch inversion, i.e. when we have + // accumulated enough independent additions. + new_len += half + (loc % 2); + batch += half; + loc = 1; + + if batch >= BATCH_AFFINE_BATCH_SIZE / 2 { + // We need instructions for copying data in the case + // of noops. We encode noops/copies as !0u32 + elems[..].batch_add_write(&instr[..], &mut new_elems, &mut scratch_space); + + instr.clear(); + batch = 0; + } + } else { + instr.push((bucket_positions[glob].position, !0u32)); + bucket_positions[new_len] = BucketPosition { + bucket: current_bucket, + position: new_len as u32, + }; + new_len += 1; + } + glob += 1; + } + if instr.len() > 0 { + elems[..].batch_add_write(&instr[..], &mut new_elems, &mut scratch_space); + instr.clear(); + } + glob = 0; + batch = 0; + loc = 1; + len = new_len; + new_len = 0; + + // We repeat the above procedure, except, since we are performing the addition + // trees in place, we do not need to encode noops to force a copy to a new + // vector. 
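+    //
+    // For example, a bucket that started with 5 elements was reduced to 3
+    // partial sums by the copying pass above; the in-place rounds below then
+    // reduce it to 2 and finally to 1. `all_ones` only stays true once every
+    // remaining addition tree has at most 2 elements, i.e. the current round
+    // of pairwise additions finishes every bucket.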
+ while !all_ones { + all_ones = true; + while glob < len { + let current_bucket = bucket_positions[glob].bucket; + while glob + 1 < len && bucket_positions[glob + 1].bucket == current_bucket { + glob += 1; + loc += 1; + } + if current_bucket >= buckets as u32 { + loc = 1; + } else if loc > 1 { + // all ones is false if next len is not 1 + if loc != 2 { + all_ones = false; + } + let is_odd = loc % 2 == 1; + let half = loc / 2; + for i in 0..half { + instr.push(( + bucket_positions[glob - (loc - 1) + 2 * i].position, + bucket_positions[glob - (loc - 1) + 2 * i + 1].position, + )); + bucket_positions[new_len + i] = bucket_positions[glob - (loc - 1) + 2 * i]; + } + if is_odd { + bucket_positions[new_len + half] = bucket_positions[glob]; + } + // Reset the local_counter and update state + new_len += half + (loc % 2); + batch += half; + loc = 1; + + if batch >= BATCH_AFFINE_BATCH_SIZE / 2 { + &mut new_elems[..].batch_add_in_place_same_slice(&instr[..]); + instr.clear(); + batch = 0; + } + } else { + bucket_positions[new_len] = bucket_positions[glob]; + new_len += 1; + } + glob += 1; + } + if instr.len() > 0 { + &mut new_elems[..].batch_add_in_place_same_slice(&instr[..]); + instr.clear(); + } + glob = 0; + batch = 0; + loc = 1; + len = new_len; + new_len = 0; + } + + let zero = C::zero(); + let mut res = vec![zero; buckets]; + + for i in 0..len { + let (pos, buc) = (bucket_positions[i].position, bucket_positions[i].bucket); + res[buc as usize] = new_elems[pos as usize]; + } + res +} diff --git a/ec/src/cuda/accel_dummy.rs b/ec/src/cuda/accel_dummy.rs new file mode 100644 index 000000000..6acbe17cf --- /dev/null +++ b/ec/src/cuda/accel_dummy.rs @@ -0,0 +1,9 @@ +use ark_std::vec::Vec; + +pub mod error { + pub type Result = T; +} + +pub struct Context {} + +pub type DeviceMemory = Vec; diff --git a/ec/src/cuda/mod.rs b/ec/src/cuda/mod.rs new file mode 100644 index 000000000..f2dc0829d --- /dev/null +++ b/ec/src/cuda/mod.rs @@ -0,0 +1,6 @@ +#[macro_use] +pub mod scalar_mul; +pub use scalar_mul::*; + +#[cfg(not(feature = "cuda"))] +pub mod accel_dummy; diff --git a/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs b/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs new file mode 100644 index 000000000..b979a0658 --- /dev/null +++ b/ec/src/cuda/scalar_mul/cpu_gpu_macros.rs @@ -0,0 +1,286 @@ +// TODO: make this more generic +#[macro_export] +macro_rules! 
impl_gpu_cpu_run_kernel { + () => { + #[allow(unused_qualifications)] + fn init_gpu_cache_dir() -> Result { + #[cfg(feature = "cuda")] + { + let dir = dirs::cache_dir() + .unwrap() + .join("zexe-algebra") + .join("cuda-scalar-mul-profiler") + .join(P::namespace()); + std::fs::create_dir_all(&dir)?; + Ok(dir.to_str().unwrap().to_string()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_qualifications)] + fn read_profile_data() -> Result { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let data = std::fs::read_to_string(&dir.join("profile_data.txt"))?; + Ok(data) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + fn clear_gpu_profiling_data() -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + std::fs::File::create(&dir.join("profile_data.txt"))?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + #[allow(unused_variables)] + fn write_profile_data(profile_data: &str) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + let dir = std::path::PathBuf::from(Self::init_gpu_cache_dir()?); + let mut file = std::fs::File::create(&dir.join("profile_data.txt"))?; + file.write_all(profile_data.as_bytes())?; + file.sync_all()?; + Ok(()) + } + #[cfg(not(feature = "cuda"))] + Err(crate::CudaScalarMulError::CudaDisabledError) + } + + /// We split up the job statically between the CPU and GPUs + /// based on continuous profiling stored both in a static location in memory + /// that is lost the moment the progam stops running. + /// and also a txt file in the OS' cache dir. + + /// Only one such procedure should be running at any time. + #[allow(unused_variables)] + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), crate::CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + if !Device::init() { + panic!("Do not call this function unless the device has been checked to initialise successfully"); + } + let n_devices = Device::get_count().unwrap(); + let n = bases_h.len(); + // Create references so we can split the slices + let mut res_ref = &mut bases_h[..]; + let mut exps_h_ref = exps_h; + + // Get data for proportion of total throughput achieved by each device + let _ = Self::init_gpu_cache_dir()?; + + let arc_mutex = P::scalar_mul_static_profiler(); + let mut profile_data = arc_mutex.lock().unwrap(); + let mut proportions: Vec = profile_data.0.clone(); + + // If the program has just been initialised, we must check for the existence of existing + // cached profile data. 
If it does not exist, we create a new file + if proportions.is_empty() { + let _ = Self::read_profile_data() + .and_then(|s| { let res = serde_json::from_str(&s).map_err(|_| crate::CudaScalarMulError::ProfilingDeserializationError)?; Ok(res) }) + .and_then(|cached_data| { + *profile_data = cached_data; + proportions = profile_data.0.clone(); + Ok(()) + } + ); + } + + if proportions.is_empty() { + // By default we split the work evenly between devices and host + proportions = vec![1.0 / (n_devices as f64 + 1.0); n_devices]; + } + + assert_eq!(proportions.len(), n_devices); + // Allocate the number of elements in the job to each device/host + let n_gpus = proportions.iter().map(|r| (r * n as f64).round() as usize).collect::>(); + let n_cpu = n - n_gpus.iter().sum::(); + + // Create storage for buffers and contexts for variable number of devices + let mut bases_split = Vec::with_capacity(n_devices); + let mut tables = Vec::with_capacity(n_devices); + let mut exps = Vec::with_capacity(n_devices); + let mut ctxs = Vec::with_capacity(n_devices); + let (mut time_cpu, mut times_gpu) = (0, vec![0; n_devices]); + + // Split data and generate tables and u8 scalar encoding in device memory + for (i, &num) in n_gpus.iter().enumerate() { + let device = Device::nth(i).unwrap(); + let ctx = device.create_context(); + + let (lower, upper) = res_ref.split_at_mut(num); + res_ref = upper; + let lower_exps = &exps_h_ref[..num]; + exps_h_ref = &exps_h_ref[num..]; + + let mut table = DeviceMemory::::zeros(&ctx, num * Self::table_size()); + let mut exp = DeviceMemory::::zeros(&ctx, num * Self::num_u8()); + + Self::generate_tables_and_recoding(lower, &mut table[..], lower_exps, &mut exp[..]); + + ctxs.push((device, ctx)); + bases_split.push(lower); + tables.push(table); + exps.push(exp); + }; + + let jobs_result: std::sync::Arc>> = std::sync::Arc::new(Mutex::new(Ok(()))); + + rayon::scope(|s| { + // Run jobs on GPUs + for (i, (bases_gpu, time_gpu)) in bases_split.iter_mut().zip(times_gpu.iter_mut()).enumerate() { + let n_gpu = n_gpus[i]; + let ctx = &ctxs[i].1; + let table = &tables[i]; + let exp = &exps[i]; + + let jobs_result_inner = jobs_result.clone(); + + s.spawn(move |_| { + let now = std::time::Instant::now(); + + let mut out = DeviceMemory::::zeros(ctx, n_gpu); + let result = P::scalar_mul_kernel( + ctx, + (n_gpu - 1) / cuda_group_size + 1, // grid + cuda_group_size, // block + table.as_ptr(), exp.as_ptr(), out.as_mut_ptr(), n_gpu as isize + ).map_err(|_| crate::CudaScalarMulError::KernelFailedError); + if result.is_err() { + *jobs_result_inner.lock().unwrap() = result; + return; + } + Self::batch_normalization(&mut out[..]); + bases_gpu.clone_from_slice(&out.par_iter().map(|p| p.into_affine()).collect::>()[..]); + *time_gpu = now.elapsed().as_micros(); + }); + } + + // Run on CPU + s.spawn(|_| { + let now = std::time::Instant::now(); + + let exps_mut = &mut exps_h_ref.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in res_ref.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + + time_cpu = now.elapsed().as_micros(); + }); + }); + + // It's safe to do this, since after the rayon scope we only have one reference. 
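+                // Each GPU task stored any kernel failure in the shared
+                // `jobs_result`; unwrapping the Arc and then the Mutex here
+                // lets a recorded error propagate via `?`.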
+ std::sync::Arc::try_unwrap(jobs_result).unwrap().into_inner().unwrap()?; + + // Update global microbenchmarking state + debug!("CUDA old profile_data: {:?}", profile_data); + let cpu_throughput = n_cpu as f64 / time_cpu as f64; + let gpu_throughputs = n_gpus + .iter() + .zip(times_gpu.iter()) + .map(|(n_gpu, time_gpu)| { + *n_gpu as f64 / *time_gpu as f64 + }) + .collect::>(); + let total_throughput = cpu_throughput + gpu_throughputs.iter().sum::(); + let n_data_points = profile_data.1 as f64; + profile_data.1 += 1; + let new_proportions = gpu_throughputs.iter().map(|t| t / total_throughput); + + if !profile_data.0.is_empty() { + profile_data.0 = new_proportions.zip(profile_data.0.clone()).map(|(new, old)| { + (new + n_data_points * old) / profile_data.1 as f64 + }).collect(); + } else { + profile_data.0 = new_proportions.collect(); + } + + // Update cached profiling data on disk + let s: String = serde_json::to_string(&(*profile_data)).map_err(|_| crate::CudaScalarMulError::ProfilingSerializationError)?; + Self::write_profile_data(&s)?; + + debug!("CUDA new profile_data: {:?}", profile_data); + } + + Ok(()) + } + + #[allow(unused_variables)] + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec<::Affine> { + #[cfg(feature = "cuda")] + { + let mut bases_res = bases_h.to_vec(); + let queue = Mutex::new(bases_res.chunks_mut(job_size).zip(exps_h.chunks(job_size)).peekmore()); + + rayon::scope(|s| { + // We launch two concurrent GPU threads that block on waiting for GPU to hide latency + for i in 0..2 { + s.spawn(closure!(move i, ref queue, |_| { + std::thread::sleep(std::time::Duration::from_millis(i * 500)); + let mut iter = queue.lock().unwrap(); + while let Some((bases, exps)) = iter.next() { + iter.peek(); + if iter.peek().is_none() { break; } + let mut proj_res = Self::par_run_kernel_sync(ctx, bases, exps, cuda_group_size, iter); + Self::batch_normalization(&mut proj_res[..]); + bases.clone_from_slice(&proj_res.par_iter().map(|p| p.into_affine()).collect::>()[..]); + iter = queue.lock().unwrap(); + } + })); + } + + s.spawn(|_| { + std::thread::sleep(std::time::Duration::from_millis(20)); + let mut iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + while let Some((bases, exps)) = iter.next() { + let exps_mut = &mut exps.to_vec()[..]; + rayon::scope(|t| { + for (b, s) in bases.chunks_mut(cpu_chunk_size).zip(exps_mut.chunks_mut(cpu_chunk_size)) { + t.spawn(move |_| b[..].batch_scalar_mul_in_place(&mut s[..], 4)); + } + }); + // Sleep to allow other threads to unlock + drop(iter); + debug!("CUDA unlocked cpu"); + std::thread::sleep(std::time::Duration::from_millis(20)); + iter = queue.lock().unwrap(); + debug!("CUDA acquired cpu"); + } + debug!("CUDA cpu finish"); + }); + }); + drop(queue); + bases_res + } + + #[cfg(not(feature = "cuda"))] + Vec::new() + } + } +} diff --git a/ec/src/cuda/scalar_mul/kernel_macros.rs b/ec/src/cuda/scalar_mul/kernel_macros.rs new file mode 100644 index 000000000..005bec24e --- /dev/null +++ b/ec/src/cuda/scalar_mul/kernel_macros.rs @@ -0,0 +1,177 @@ +#[macro_export] +macro_rules! impl_scalar_mul_kernel { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! 
{ + #[cfg(feature = "cuda")] + use {accel::*, ark_std::{sync::{Arc, Mutex}, vec::Vec}}; + + #[cfg(not(feature = "cuda"))] + use ark_ec::accel_dummy::*; + + use ark_ec::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! { + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((Vec::new(), 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("ark_ff" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ff", default_features = false})] + #[dependencies("ark_ec" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ec", default_features = false})] + #[dependencies("curve" = { git = "https://github.com/arkworks-rs/curves", branch = "master", package = $curve_string, features = ["curve"], default_features = false})] + pub mod scalar_mul { + use curve::$ProjCurve; + use ark_ec::{curves::ProjectiveCurve, fields::PrimeField, FpParameters, Zero}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const NUM_U8: isize = (NUM_BITS - 1) / LOG2_W + 1; + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + + for j in 1..NUM_U8 as isize { + for _ in 0..LOG2_W { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + j) as isize)); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! impl_scalar_mul_kernel_glv { + ($curve: ident, $curve_string:expr, $type: expr, $ProjCurve: ident) => { + paste::item! { + #[cfg(feature = "cuda")] + use {accel::*, ark_std::{sync::{Arc, Mutex}, vec::Vec}}; + + #[cfg(not(feature = "cuda"))] + use ark_ec::accel_dummy::*; + + use ark_ec::cuda::scalar_mul::ScalarMulProfiler; + + #[cfg(feature = "cuda")] + lazy_static::lazy_static! 
{ + pub static ref MICROBENCH_CPU_GPU_AVG_RATIO: + Arc, usize)>> = Arc::new(Mutex::new((Vec::new(), 0))); + } + + #[cfg(not(feature = "cuda"))] + static MICROBENCH_CPU_GPU_AVG_RATIO: () = (); + + const NAMESPACE: &'static str = stringify!([<$curve _ $type _cuda_namespace>]); + + #[cfg(feature = "cuda")] + #[kernel_mod(transparent)] + #[name([<$curve _ $type _cuda_namespace>])] + #[dependencies("accel-core" = { git = "https://github.com/jon-chuang/accel", package = "accel-core" })] + #[dependencies("ark_ff" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ff", default_features = false})] + #[dependencies("ark_ec" = { git = "https://github.com/arkworks-rs/algebra", branch = "master", package = "ark-ec", default_features = false})] + #[dependencies("curve" = { git = "https://github.com/arkworks-rs/curves", branch = "master", package = $curve_string, features = ["curve"], default_features = false })] + pub mod scalar_mul { + use curve::$ProjCurve; + use {ark_ec::ProjectiveCurve, ark_ff::{PrimeField, FpParameters, Zero}}; + + const NUM_BITS: isize = + <<<$ProjCurve as ProjectiveCurve>::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as isize; + const LOG2_W: isize = 5; + const TABLE_SIZE: isize = 1 << LOG2_W; + const HALF_TABLE_SIZE: isize = 1 << (LOG2_W - 1); + const NUM_U8: isize = 2 * ((NUM_BITS - 1) / (2 * (LOG2_W - 1)) + 2); + + #[kernel_func] + pub unsafe fn scalar_mul( + #[type_substitute(*const super::$ProjCurve)] + table: *const $ProjCurve, + exps: *const u8, + #[type_substitute(*mut super::$ProjCurve)] + out: *mut $ProjCurve, + n: isize, + ) { + let i = accel_core::index(); + if i < n { + let mut res = $ProjCurve::zero(); + + res += &(*table.offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + HALF_TABLE_SIZE + *exps.offset(i * NUM_U8 + 1) as isize, + )); + + for j in 1..NUM_U8 as isize / 2 { + for _ in 0..(LOG2_W - 1) { + res.double_in_place(); + } + res += &(*table + .offset(i * TABLE_SIZE + *exps.offset(i * NUM_U8 + 2 * j) as isize)); + res += &(*table.offset( + i * TABLE_SIZE + + HALF_TABLE_SIZE + + *exps.offset(i * NUM_U8 + 2 * j + 1) as isize, + )); + } + *out.offset(i) = res; + } + } + } + } + } +} + +#[macro_export] +macro_rules! 
impl_scalar_mul_parameters { + ($ProjCurve:ident) => { + #[allow(unused_variables)] + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const $ProjCurve, + exps: *const u8, + out: *mut $ProjCurve, + n: isize, + ) -> error::Result<()> { + #[cfg(feature = "cuda")] + scalar_mul(ctx, grid, block, (table, exps, out, n)) + } + + fn scalar_mul_static_profiler() -> ScalarMulProfiler { + #[cfg(feature = "cuda")] + return (*MICROBENCH_CPU_GPU_AVG_RATIO).clone(); + + #[cfg(not(feature = "cuda"))] + MICROBENCH_CPU_GPU_AVG_RATIO + } + + fn namespace() -> &'static str { + NAMESPACE + } + }; +} diff --git a/ec/src/cuda/scalar_mul/mod.rs b/ec/src/cuda/scalar_mul/mod.rs new file mode 100644 index 000000000..89855fd48 --- /dev/null +++ b/ec/src/cuda/scalar_mul/mod.rs @@ -0,0 +1,355 @@ +#[cfg(feature = "cuda")] +use std::sync::{Arc, Mutex}; + +use ark_ff::fields::PrimeField; +use ark_std::cfg_chunks_mut; +use core::fmt; + +use crate::{AffineCurve, BatchGroupArithmeticSlice}; +use internal::GPUScalarMulInternal; + +#[macro_use] +mod kernel_macros; +pub use kernel_macros::*; + +#[macro_use] +mod cpu_gpu_macros; + +#[macro_use] +mod run_kernel_macros; + +#[cfg(feature = "cuda")] +pub type ScalarMulProfiler = Arc, usize)>>; +#[cfg(not(feature = "cuda"))] +pub type ScalarMulProfiler = (); + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +pub const MAX_GROUP_ELEM_BYTES: usize = 400; + +#[derive(Debug)] +pub enum CudaScalarMulError { + CudaDisabledError, + IoError, + KernelFailedError, + ProfilingSerializationError, + ProfilingDeserializationError, +} + +#[cfg(feature = "std")] +impl std::error::Error for CudaScalarMulError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +#[cfg(feature = "std")] +impl From for CudaScalarMulError { + fn from(_: std::io::Error) -> Self { + CudaScalarMulError::IoError + } +} + +impl fmt::Display for CudaScalarMulError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + match self { + CudaScalarMulError::CudaDisabledError => write!(f, "CUDA is disabled"), + CudaScalarMulError::IoError => write!(f, "IO error"), + CudaScalarMulError::KernelFailedError => write!(f, "Failed running kernel"), + CudaScalarMulError::ProfilingSerializationError => { + write!(f, "Failed serlializing profiling data") + } + CudaScalarMulError::ProfilingDeserializationError => { + write!(f, "Failed deserializing profiling data") + } + } + } +} + +pub trait GPUScalarMul: GPUScalarMulInternal { + fn clear_gpu_profiling_data() { + #[cfg(feature = "cuda")] + >::clear_gpu_profiling_data() + .expect("Should have cleared GPU profiling data"); + } + + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + elems: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + #[cfg(feature = "cuda")] + { + // CUDA will return ILLEGAL_ADRESS if group elem size is too large. 
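+            // Dispatch sketch: if a CUDA device initialises and the group
+            // element representation fits under MAX_GROUP_ELEM_BYTES, the
+            // work is statically partitioned between the host and the
+            // devices; otherwise we fall back to the pure-CPU batched w-NAF
+            // path in the else branch.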
+ if accel::Device::init() && core::mem::size_of::() < MAX_GROUP_ELEM_BYTES { + ::Projective::cpu_gpu_static_partition_run_kernel( + elems, + exps_h, + cuda_group_size, + cpu_chunk_size, + )?; + } else { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + } + + #[cfg(not(feature = "cuda"))] + { + let mut exps_mut = exps_h.to_vec(); + cfg_chunks_mut!(elems, cpu_chunk_size) + .zip(cfg_chunks_mut!(exps_mut, cpu_chunk_size)) + .for_each(|(b, s)| { + b[..].batch_scalar_mul_in_place(&mut s[..], 4); + }); + } + + Ok(()) + } +} + +impl GPUScalarMul for G::Projective {} + +pub(crate) mod internal { + use ark_std::{string::String, vec::Vec}; + + #[cfg(feature = "cuda")] + use accel::*; + + #[cfg(not(feature = "cuda"))] + use crate::accel_dummy::*; + + use crate::{AffineCurve, CudaScalarMulError}; + use ark_ff::fields::PrimeField; + + #[allow(unused_variables)] + pub trait GPUScalarMulInternal: Sized { + const NUM_BITS: usize; + const LOG2_W: usize; + + fn table_size() -> usize { + 1 << Self::LOG2_W + } + + fn num_u8() -> usize; + + fn init_gpu_cache_dir() -> Result; + fn read_profile_data() -> Result; + fn write_profile_data(profile_data: &str) -> Result<(), CudaScalarMulError>; + fn clear_gpu_profiling_data() -> Result<(), CudaScalarMulError>; + + fn par_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory; + + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory; + + fn generate_tables_and_recoding( + bases_h: &[G], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ); + + fn cpu_gpu_load_balance_run_kernel( + ctx: &Context, + bases_h: &[G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of a single job in the queue e.g. 2 << 14 + job_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Vec; + + fn cpu_gpu_static_partition_run_kernel( + bases_h: &mut [G], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; + } +} + +#[macro_export] +macro_rules! impl_gpu_sw_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective

{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn num_u8() -> usize { + if P::has_glv() { + 2 * ((Self::NUM_BITS - 1) / (2 * (Self::LOG2_W - 1)) + 2) + } else { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + } + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + if P::has_glv() { + let scalar_recode_glv = + |k1: &mut <::ScalarField as PrimeField>::BigInt, k2: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let table_size_glv: u64 = 1u64 << (Self::LOG2_W - 1); + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8() / 2).rev() { + out[2 * i] = (k1.as_ref()[0] % table_size_glv) as u8; + out[2 * i + 1] = (k2.as_ref()[0] % table_size_glv) as u8; + k1.divn(Self::LOG2_W as u32 - 1); + k2.divn(Self::LOG2_W as u32 - 1); + } + assert!(k1.is_zero()); + assert!(k2.is_zero()); + out + }; + + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let ((k1_neg, mut k1), (k2_neg, mut k2)) = + P::glv_scalar_decomposition(*k); + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode_glv(&mut k1, &mut k2)); + + table[0] = Self::zero(); + table[Self::table_size() / 2] = Self::zero(); + + for i in 1..Self::table_size() / 2 { + let mut res = if k1_neg { + table[i - 1] - base + } else { + table[i - 1] + base + }; + table[i] = res; + + P::glv_endomorphism_in_place(&mut res.x); + table[Self::table_size() / 2 + i] = + if k2_neg != k1_neg { res.neg() } else { res }; + } + }); + } else { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + }); + } + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +#[macro_export] +macro_rules! impl_gpu_te_projective { + ($Parameters:ident) => { + impl GPUScalarMulInternal> for GroupProjective

{ + const NUM_BITS: usize = + <<::ScalarField as PrimeField>::Params as FpParameters>::MODULUS_BITS as usize; + const LOG2_W: usize = 5; + + fn generate_tables_and_recoding( + bases_h: &[::Affine], + tables_h: &mut [Self], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + exps_recode_h: &mut [u8], + ) { + let scalar_recode = |k: &mut <::ScalarField as PrimeField>::BigInt| -> Vec { + let mut out = vec![0; Self::num_u8()]; + for i in (0..Self::num_u8()).rev() { + out[i] = (k.as_ref()[0] % Self::table_size() as u64) as u8; + k.divn(Self::LOG2_W as u32); + } + assert!(k.is_zero()); + out + }; + cfg_iter!(exps_h) + .zip(cfg_chunks_mut!(exps_recode_h, Self::num_u8())) + .zip(cfg_chunks_mut!(tables_h, Self::table_size()).zip(cfg_iter!(bases_h))) + .for_each(|((k, exps_chunk), (table, base))| { + let base = base.into_projective(); + exps_chunk.clone_from_slice(&scalar_recode(&mut k.clone())[..]); + + table[0] = Self::zero(); + for i in 1..Self::table_size() { + table[i] = table[i - 1] + base; + } + } + ); + } + + fn num_u8() -> usize { + (Self::NUM_BITS - 1) / Self::LOG2_W + 1 + } + + impl_run_kernel!(); + impl_gpu_cpu_run_kernel!(); + } + }; +} + +pub trait GPUScalarMulSlice { + #[allow(unused_variables)] + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError>; +} + +impl GPUScalarMulSlice for [G] { + fn cpu_gpu_scalar_mul( + &mut self, + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + // size of the batch for cpu scalar mul + cpu_chunk_size: usize, + ) -> Result<(), CudaScalarMulError> { + G::Projective::cpu_gpu_scalar_mul(self, exps_h, cuda_group_size, cpu_chunk_size) + } +} diff --git a/ec/src/cuda/scalar_mul/run_kernel_macros.rs b/ec/src/cuda/scalar_mul/run_kernel_macros.rs new file mode 100644 index 000000000..4545243d0 --- /dev/null +++ b/ec/src/cuda/scalar_mul/run_kernel_macros.rs @@ -0,0 +1,86 @@ +#[macro_export] +macro_rules! 
impl_run_kernel { + () => { + // We drop a lock only after the parallel portion has been handled + #[allow(unused_variables)] + fn par_run_kernel_sync( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + lock: T, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables_h = vec![Self::zero(); n * Self::table_size()]; + let mut exps_recode_h = vec![0u8; n * Self::num_u8()]; + + Self::generate_tables_and_recoding( + bases_h, + &mut tables_h[..], + exps_h, + &mut exps_recode_h[..], + ); + drop(lock); + + let mut out = DeviceMemory::::zeros(&ctx, n); + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + + tables.copy_from_slice(&tables_h); + exps.copy_from_slice(&exps_recode_h); + + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + + #[allow(unused_variables)] + fn par_run_kernel( + ctx: &Context, + bases_h: &[::Affine], + exps_h: &[<::ScalarField as PrimeField>::BigInt], + cuda_group_size: usize, + ) -> DeviceMemory { + #[cfg(feature = "cuda")] + { + assert_eq!(bases_h.len(), exps_h.len()); + let n = bases_h.len(); + + let mut tables = DeviceMemory::::zeros(&ctx, n * Self::table_size()); + let mut exps = DeviceMemory::::zeros(&ctx, n * Self::num_u8()); + let mut out = DeviceMemory::::zeros(&ctx, n); + + Self::generate_tables_and_recoding(bases_h, &mut tables[..], exps_h, &mut exps[..]); + + P::scalar_mul_kernel( + &ctx, + n / cuda_group_size, // grid + cuda_group_size, // block + tables.as_ptr(), + exps.as_ptr(), + out.as_mut_ptr(), + n as isize, + ) + .expect("Kernel call failed"); + out + } + #[cfg(not(feature = "cuda"))] + unreachable!(); + } + }; +} diff --git a/ec/src/glv.rs b/ec/src/glv.rs new file mode 100644 index 000000000..37566741c --- /dev/null +++ b/ec/src/glv.rs @@ -0,0 +1,128 @@ +use crate::ModelParameters; +use ark_ff::{biginteger::BigInteger, fields::PrimeField}; +use core::ops::Neg; + +/// The GLV parameters here require the following conditions to be satisfied: +/// 1. MODULUS_BITS < NUM_LIMBS * 64 - 1. So 2 * n < 1 << (64 * NUM_LIMBS) +/// We also assume that (|b1| + 2) * (|b2| + 2) < 2 * n +/// We also know that either B1 is neg or B2 is. +pub trait GLVParameters: Send + Sync + 'static + ModelParameters { + type WideBigInt: BigInteger; + + const LAMBDA: Self::ScalarField; // lambda in ZZ s.t. 
phi(P) = lambda*P for all P + const OMEGA: Self::BaseField; // phi((x, y)) = (\omega x, y) + const Q1: ::BigInt; // round(R*|b2|/n) + const Q2: ::BigInt; // round(R*|b1|/n) + const B1: ::BigInt; // |b1| + const B2: ::BigInt; // |b2| + const B1_IS_NEG: bool; + + const R_BITS: u32; + + #[inline] + fn glv_scalar_decomposition_inner( + k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + let limbs = ::BigInt::NUM_LIMBS; + let modulus = Self::ScalarField::modulus(); + + // If we are doing a subgroup check, we should multiply by the original scalar + // since the GLV decomposition does not guarantee that we would not be + // adding and subtracting back to zero + if k == modulus { + return ( + (false, k), + (false, ::BigInt::from(0)), + ); + } + + let mut half = Self::WideBigInt::from(1); + half.muln(Self::R_BITS - 1); + + let mut c1_wide = Self::WideBigInt::mul_no_reduce(k.as_ref(), Self::Q1.as_ref()); + // add half to achieve rounding rather than flooring + c1_wide.add_nocarry(&half); + // Approximation to round(|b2|*k/n) + c1_wide.divn(Self::R_BITS); + let c1 = &c1_wide.as_ref()[..limbs]; + + let mut c2_wide = Self::WideBigInt::mul_no_reduce(k.as_ref(), Self::Q2.as_ref()); + c2_wide.add_nocarry(&half); + c2_wide.divn(Self::R_BITS); + let c2 = &c2_wide.as_ref()[..limbs]; + + // We first assume that the final 2 bits of the representation for the modulus + // is not set, so that 2 * n < R = 1 << (64 * NUM_LIMBS). + + // wlog c1 = round(k * round(|b_1|R / n) / R) < ceil(k * ceil(|b_1|* R / n) / R) + // < k * (b_1 * R / n + 1) / R + 1 < b_1 * k / n + 2 < b_1 + 2, so a + // bound like (|b1| + 2) * (|b2| + 2) < 2 * n is good enough for wlog d1 + // < 2 * n + let mut d1 = + ::BigInt::mul_no_reduce_lo(&c1, Self::B1.as_ref()); + if d1 > modulus { + d1.sub_noborrow(&modulus); + } + let mut d2 = + ::BigInt::mul_no_reduce_lo(&c2, Self::B2.as_ref()); + if d2 > modulus { + d2.sub_noborrow(&modulus); + } + // We compute k_2 = -(c1.b1 + c1.b1) = sign(b1)*(c2|b2| - c1|b1|) = sign(b1)(d2 + // - d1) + let k2_field = if !Self::B1_IS_NEG { + Self::ScalarField::from_repr(d2).unwrap() - &Self::ScalarField::from_repr(d1).unwrap() + } else { + Self::ScalarField::from_repr(d1).unwrap() - &Self::ScalarField::from_repr(d2).unwrap() + }; + + let k1 = + (Self::ScalarField::from_repr(k).unwrap() - &(k2_field * &Self::LAMBDA)).into_repr(); + let k2 = k2_field.into_repr(); + + let (neg2, k2) = if k2.num_bits() > Self::R_BITS / 2 + 1 { + (true, k2_field.neg().into_repr()) + } else { + (false, k2) + }; + + let (neg1, k1) = if k1.num_bits() > Self::R_BITS / 2 + 1 { + ( + true, + Self::ScalarField::from_repr(k1).unwrap().neg().into_repr(), + ) + } else { + (false, k1) + }; + + ((neg1, k1), (neg2, k2)) + } +} + +#[macro_export] +macro_rules! 
impl_glv_for_sw { + () => { + #[inline(always)] + fn has_glv() -> bool { + true + } + + #[inline(always)] + fn glv_endomorphism_in_place(elem: &mut Self::BaseField) { + *elem *= &::OMEGA; + } + + #[inline] + fn glv_scalar_decomposition( + k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + ::glv_scalar_decomposition_inner(k) + } + }; +} diff --git a/ec/src/lib.rs b/ec/src/lib.rs index 87712cfc3..ceccaca15 100644 --- a/ec/src/lib.rs +++ b/ec/src/lib.rs @@ -29,11 +29,28 @@ use ark_std::{ use num_traits::Zero; use zeroize::Zeroize; +pub mod batch_verify; +pub use self::batch_verify::*; + +pub mod batch_arith; +pub use self::batch_arith::*; + +pub mod bucketed_add; +pub use self::bucketed_add::*; + +#[macro_use] +pub mod glv; +pub use self::glv::*; + pub mod models; pub use self::models::*; pub mod group; +#[macro_use] +pub mod cuda; +pub use cuda::*; + pub mod msm; pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + PartialEq { @@ -44,6 +61,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G1Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G1. @@ -59,6 +77,7 @@ pub trait PairingEngine: Sized + 'static + Copy + Debug + Sync + Send + Eq + Par type G2Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// The affine representation of an element in G2. @@ -145,6 +164,7 @@ pub trait ProjectiveCurve: + core::iter::Sum + for<'a> core::iter::Sum<&'a Self> + From<::Affine> + + GPUScalarMul<::Affine> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField; @@ -214,6 +234,8 @@ pub trait ProjectiveCurve: self = res; self } + + fn get_x(&mut self) -> &mut Self::BaseField; } /// Affine representation of an elliptic curve point guaranteed to be @@ -238,6 +260,7 @@ pub trait AffineCurve: + Neg + Zeroize + From<::Projective> + + BatchGroupArithmetic::BaseField> { const COFACTOR: &'static [u64]; type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; @@ -245,6 +268,7 @@ pub trait AffineCurve: type Projective: ProjectiveCurve + From + Into + + GPUScalarMul + MulAssign; // needed due to https://github.com/rust-lang/rust/issues/69640 /// Returns a fixed generator of unknown exponent. 
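// Illustrative use of the new public API exposed by the `ec/src/lib.rs`
// changes above. This is a hedged sketch: `G1Affine` stands in for whatever
// affine curve type a downstream crate provides, and the security parameter
// and RNG are placeholders.
//
//     use ark_ec::{batch_verify_in_subgroup, VerificationError};
//
//     fn check_subgroup(points: &[G1Affine]) -> Result<(), VerificationError> {
//         // Any RNG implementing `ark_std::rand::Rng` works here.
//         let mut rng = ark_std::test_rng();
//         // 128 is the target statistical security parameter, in bits.
//         batch_verify_in_subgroup(points, 128, &mut rng)
//     }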
diff --git a/ec/src/models/mod.rs b/ec/src/models/mod.rs index 1769ae08f..ce8cfaa42 100644 --- a/ec/src/models/mod.rs +++ b/ec/src/models/mod.rs @@ -8,53 +8,15 @@ pub mod mnt6; pub mod short_weierstrass_jacobian; pub mod twisted_edwards_extended; +pub use { + short_weierstrass_jacobian::SWModelParameters, + twisted_edwards_extended::{MontgomeryModelParameters, TEModelParameters}, +}; + pub trait ModelParameters: Send + Sync + 'static { type BaseField: Field + SquareRootField; - type ScalarField: PrimeField + SquareRootField + Into<::BigInt>; -} - -pub trait SWModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } - - #[inline(always)] - fn add_b(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy += &Self::COEFF_B; - copy - } -} - -pub trait TEModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_D: Self::BaseField; - const COFACTOR: &'static [u64]; - const COFACTOR_INV: Self::ScalarField; - const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); - - type MontgomeryModelParameters: MontgomeryModelParameters; - - #[inline(always)] - fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { - let mut copy = *elem; - copy *= &Self::COEFF_A; - copy - } -} - -pub trait MontgomeryModelParameters: ModelParameters { - const COEFF_A: Self::BaseField; - const COEFF_B: Self::BaseField; - - type TEModelParameters: TEModelParameters; + type ScalarField: PrimeField + + SquareRootField + + From<::BigInt> + + Into<::BigInt>; } diff --git a/ec/src/models/short_weierstrass_jacobian.rs b/ec/src/models/short_weierstrass_jacobian.rs index 1b91a1132..5f30a100b 100644 --- a/ec/src/models/short_weierstrass_jacobian.rs +++ b/ec/src/models/short_weierstrass_jacobian.rs @@ -1,3 +1,5 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use ark_serialize::{ CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, CanonicalSerializeWithFlags, SWFlags, SerializationError, @@ -7,16 +9,28 @@ use ark_std::{ io::{Read, Result as IoResult, Write}, marker::PhantomData, ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, + string::String, vec::Vec, }; +#[cfg(feature = "cuda")] +use { + crate::BatchGroupArithmeticSlice, accel::*, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; use ark_ff::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, ToConstraintField, UniformRand, }; -use crate::{models::SWModelParameters as Parameters, AffineCurve, ProjectiveCurve}; +use crate::{ + batch_arith::{decode_endo_from_u32, BatchGroupArithmetic, ENDO_CODING_BITS}, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + impl_gpu_cpu_run_kernel, impl_gpu_sw_projective, impl_run_kernel, AffineCurve, ModelParameters, + ProjectiveCurve, +}; use num_traits::{One, Zero}; use zeroize::Zeroize; @@ -29,6 +43,157 @@ use ark_std::rand::{ #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait SWModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + const COFACTOR: &'static [u64]; + const 
COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + #[inline(always)] + fn glv_window_size() -> usize { + 4 + } + + #[inline(always)] + fn add_b(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy += &Self::COEFF_B; + copy + } + + #[inline(always)] + fn has_glv() -> bool { + false + } + + #[inline(always)] + fn glv_endomorphism_in_place(_elem: &mut Self::BaseField) { + unimplemented!() + } + + #[inline(always)] + fn glv_scalar_decomposition( + _k: ::BigInt, + ) -> ( + (bool, ::BigInt), + (bool, ::BigInt), + ) { + unimplemented!() + } + + // CUDA kernels are parameter specific and cannot + // be instantiated generically + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +/// Implements GLV mul for a single element with a wNAF tables +#[macro_export] +macro_rules! impl_glv_mul { + ($Projective: ty, $P: ident, $w: ident, $self_proj: ident, $res: ident, $by: ident) => { + // In the future, make this a GLV parameter entry + let wnaf_recoding = + |s: &mut ::BigInt, is_neg: bool| -> Vec { + let window_size: i16 = 1 << ($w + 1); + let half_window_size: i16 = 1 << $w; + + let mut recoding = Vec::::with_capacity(s.num_bits() as usize / ($w + 1)); + + while !s.is_zero() { + let op = if s.is_odd() { + let mut z: i16 = (s.as_ref()[0] % (1 << ($w + 1))) as i16; + + if z < half_window_size { + s.sub_noborrow(&(z as u64).into()); + } else { + z = z - window_size; + s.add_nocarry(&((-z) as u64).into()); + } + if is_neg { + -z + } else { + z + } + } else { + 0 + }; + recoding.push(op); + s.div2(); + } + recoding + }; + + let ((k1_neg, mut k1), (k2_neg, mut k2)) = $P::glv_scalar_decomposition($by.into()); + let mut wnaf_table_k1 = Vec::<$Projective>::with_capacity(1 << $w); + let double = $self_proj.double(); + wnaf_table_k1.push($self_proj); + for _ in 1..(1 << ($w - 1)) { + wnaf_table_k1.push(*wnaf_table_k1.last().unwrap() + &double); + } + let mut wnaf_table_k2 = wnaf_table_k1.clone(); + wnaf_table_k2 + .iter_mut() + .for_each(|p| $P::glv_endomorphism_in_place(&mut p.x)); + + let k1_ops = wnaf_recoding(&mut k1, k1_neg); + let k2_ops = wnaf_recoding(&mut k2, k2_neg); + + if k1_ops.len() > k2_ops.len() { + for &op in k1_ops[k2_ops.len()..].iter().rev() { + $res.double_in_place(); + if op > 0 { + $res += &wnaf_table_k1[(op as usize) / 2]; + } else if op < 0 { + $res += &wnaf_table_k1[(-op as usize) / 2].neg(); + } + } + } else { + for &op in k2_ops[k1_ops.len()..].iter().rev() { + $res.double_in_place(); + if op > 0 { + $res += &wnaf_table_k2[(op as usize) / 2]; + } else if op < 0 { + $res += &wnaf_table_k2[(-op as usize) / 2].neg(); + } + } + } + for (&op1, &op2) in k1_ops.iter().zip(k2_ops.iter()).rev() { + $res.double_in_place(); + if op1 > 0 { + $res += &wnaf_table_k1[(op1 as usize) / 2]; + } else if op1 < 0 { + $res += &wnaf_table_k1[(-op1 as usize) / 2].neg(); + } + if op2 > 0 { + $res += &wnaf_table_k2[(op2 as usize) / 2]; + } else if op2 < 0 { + $res += &wnaf_table_k2[(-op2 as usize) / 2].neg(); + } + } + }; +} + +use SWModelParameters as Parameters; + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -202,9 +367,17 @@ impl 
<P: Parameters> AffineCurve for GroupAffine<P> { } #[inline] - fn mul<S: Into<<Self::ScalarField as PrimeField>::BigInt>>(&self, by: S) -> GroupProjective<P> { - let bits = BitIteratorBE::new(by.into()); - self.mul_bits(bits) + fn mul<S: Into<<Self::ScalarField as PrimeField>::BigInt>>(&self, by: S) -> Self::Projective { + if P::has_glv() { + let w = P::glv_window_size(); + let mut res = Self::Projective::zero(); + let self_proj = self.into_projective(); + impl_glv_mul!(Self::Projective, P, w, self_proj, res, by); + res + } else { + let bits = BitIteratorBE::new(by.into()); + self.mul_bits(bits) + } } #[inline] @@ -256,6 +429,607 @@ impl<P: Parameters> Default for GroupAffine<P>
{ } } +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + prefetch::(&mut $slice_1[*idp_1 as usize]); + prefetch::(&mut $slice_2[*idp_2 as usize]); + } + }; + + ($slice_1: ident, $prefetch_iter: ident) => { + if let Some((idp_1, _)) = $prefetch_iter.next() { + prefetch::(&mut $slice_1[*idp_1 as usize]); + } + }; +} + +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice_endo { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + let (idp_2, _) = decode_endo_from_u32(*idp_2); + prefetch::(&mut $slice_1[*idp_1 as usize]); + prefetch::(&$slice_2[idp_2]); + } + }; +} + +#[cfg(feature = "prefetch")] +macro_rules! prefetch_slice_write { + ($slice_1: ident, $slice_2: ident, $prefetch_iter: ident) => { + if let Some((idp_1, idp_2)) = $prefetch_iter.next() { + prefetch::(&$slice_1[*idp_1 as usize]); + if *idp_2 != !0u32 { + prefetch::(&$slice_2[*idp_2 as usize]); + } + } + }; +} + +macro_rules! batch_add_loop_1 { + ($a: ident, $b: ident, $half: ident, $inversion_tmp: ident) => { + if $a.is_zero() || $b.is_zero() { + (); + } else if $a.x == $b.x { + $half = match $half { + None => P::BaseField::one().double().inverse(), + _ => $half, + }; + let h = $half.unwrap(); + + // Double + // In our model, we consider self additions rare. + // So we consider it inconsequential to make them more expensive + // This costs 1 modular mul more than a standard squaring, + // and one amortised inversion + if $a.y == $b.y { + let x_sq = $b.x.square(); + $b.x -= &$b.y; // x - y + $a.x = $b.y.double(); // denominator = 2y + $a.y = x_sq.double() + &x_sq + &P::COEFF_A; // numerator = 3x^2 + a + $b.y -= &(h * &$a.y); // y - (3x^2 + $a./2 + $a.y *= &$inversion_tmp; // (3x^2 + a) * tmp + $inversion_tmp *= &$a.x; // update tmp + } else { + // No inversions take place if either operand is zero + $a.infinity = true; + $b.infinity = true; + } + } else { + // We can recover x1 + x2 from this. Note this is never 0. + $a.x -= &$b.x; // denominator = x1 - x2 + $a.y -= &$b.y; // numerator = y1 - y2 + $a.y *= &$inversion_tmp; // (y1 - y2)*tmp + $inversion_tmp *= &$a.x // update tmp + } + }; +} + +macro_rules! batch_add_loop_2 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() { + *$a = $b; + } else if !$b.is_zero() { + let lambda = $a.y * &$inversion_tmp; + $inversion_tmp *= &$a.x; // Remove the top layer of the denominator + + // x3 = l^2 - x1 - x2 or for squaring: 2y + l^2 + 2x - 2y = l^2 - 2x + $a.x += &$b.x.double(); + $a.x = lambda.square() - &$a.x; + // y3 = l*(x2 - x3) - y2 or + // for squaring: (3x^2 + a)/2y(x - y - x3) - (y - (3x^2 + a)/2) = l*(x - x3) - y + $a.y = lambda * &($b.x - &$a.x) - &$b.y; + } + }; +} + +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; + /// This implementation of batch group ops takes particular + /// care to make most use of points fetched from memory to prevent + /// reallocations + + /// It is inspired by Aztec's approach: + /// https://github.com/AztecProtocol/barretenberg/blob/ + /// c358fee3259a949da830f9867df49dc18768fa26/barretenberg/ + /// src/aztec/ecc/curves/bn254/scalar_multiplication/scalar_multiplication. + /// cpp + + // We require extra scratch space, and since we want to prevent allocation/deallocation + // overhead we pass it externally for when this function is called many times + #[inline] + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + scratch_space: Option<&mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + let mut _scratch_space_inner = if scratch_space.is_none() { + Vec::with_capacity(index.len()) + } else { + vec![] + }; + let scratch_space = match scratch_space { + Some(vec) => vec, + None => &mut _scratch_space_inner, + }; + + debug_assert!(scratch_space.len() == 0); + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for idx in index.iter() { + // Prefetch next group into cache + #[cfg(feature = "prefetch")] + if let Some(idp) = prefetch_iter.next() { + prefetch::(&mut bases[*idp as usize]); + } + let mut a = &mut bases[*idx as usize]; + if !a.is_zero() { + if a.y.is_zero() { + a.infinity = true; + } else { + let x_sq = a.x.square(); + let x_sq_3 = x_sq.double() + &x_sq + &P::COEFF_A; // numerator = 3x^2 + a + scratch_space.push(x_sq_3 * &inversion_tmp); // (3x^2 + a) * tmp + inversion_tmp *= &a.y.double(); // update tmp + } + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for idx in index.iter().rev() { + #[cfg(feature = "prefetch")] + if let Some(idp) = prefetch_iter.next() { + prefetch::(&mut bases[*idp as usize]); + } + let mut a = &mut bases[*idx as usize]; + if !a.is_zero() { + let z = scratch_space.pop().unwrap(); + #[cfg(feature = "prefetch")] + if let Some(e) = scratch_space.last() { + prefetch::(e); + } + let lambda = z * &inversion_tmp; + inversion_tmp *= &a.y.double(); // Remove the top layer of the denominator + + // x3 = l^2 + 2x + let x3 = &(lambda.square() - &a.x.double()); + // y3 = l*(x - x3) - y + a.y = lambda * &(a.x - x3) - &a.y; + a.x = *x3; + } + } + + debug_assert!(scratch_space.len() == 0); + + // We reset the vector + // Clearing is really unnecessary, but we can do it anyway + scratch_space.clear(); + } + + #[inline] + fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, other, prefetch_iter); + + let (mut a, mut b) = (&mut bases[*idx as usize], &mut other[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for (idx, idy) in 
index.iter().rev() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, other, prefetch_iter); + let (mut a, b) = (&mut bases[*idx as usize], other[*idy as usize]); + batch_add_loop_2!(a, b, inversion_tmp) + } + } + + #[inline] + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, bases, prefetch_iter); + let (mut a, mut b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], &mut y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], &mut x[*idy as usize]) + }; + batch_add_loop_1!(a, b, half, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + for (idx, idy) in index.iter().rev() { + #[cfg(feature = "prefetch")] + prefetch_slice!(bases, bases, prefetch_iter); + let (mut a, b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], x[*idy as usize]) + }; + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + #[inline] + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (idy, endomorphism) = decode_endo_from_u32(*idy); + #[cfg(feature = "prefetch")] + prefetch_slice_endo!(bases, other, prefetch_iter); + + let mut a = &mut bases[*idx as usize]; + + // Apply endomorphisms according to encoding + let mut b = if endomorphism % 2 == 1 { + other[idy].neg() + } else { + other[idy] + }; + + if P::has_glv() { + if endomorphism >> 1 == 1 { + P::glv_endomorphism_in_place(&mut b.x); + } + } + batch_add_loop_1!(a, b, half, inversion_tmp); + scratch_space.push(b); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter().rev(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + for (idx, _) in index.iter().rev() { + #[cfg(feature = "prefetch")] + { + prefetch_slice!(bases, prefetch_iter); + let len = scratch_space.len(); + if len > 0 { + prefetch::(&mut scratch_space[len - 1]); + } + } + let (mut a, b) = (&mut bases[*idx as usize], scratch_space.pop().unwrap()); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + { + prefetch_iter.next(); + } + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + 
prefetch_slice_write!(lookup, lookup, prefetch_iter); + + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (lookup[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_add_write_read_self( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + let mut half = None; + + #[cfg(feature = "prefetch")] + let mut prefetch_iter = index.iter(); + #[cfg(feature = "prefetch")] + prefetch_iter.next(); + + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + #[cfg(feature = "prefetch")] + prefetch_slice_write!(new_elems, lookup, prefetch_iter); + + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (new_elems[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, half, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_scalar_mul_in_place( + mut bases: &mut [Self], + scalars: &mut [BigInt], + w: usize, + ) { + debug_assert!(bases.len() == scalars.len()); + if bases.len() == 0 { + return; + } + let batch_size = bases.len(); + if P::has_glv() { + use itertools::{EitherOrBoth::*, Itertools}; + let mut scratch_space = Vec::::with_capacity(bases.len()); + let mut scratch_space_group = Vec::::with_capacity(bases.len() / w); + + let k_vec: Vec<_> = scalars + .iter() + .map(|k| { + P::glv_scalar_decomposition(::BigInt::from_slice( + k.as_ref(), + )) + }) + .collect(); + + let mut k1_scalars: Vec<_> = k_vec.iter().map(|x| (x.0).1).collect(); + let k1_negates: Vec<_> = k_vec.iter().map(|x| (x.0).0).collect(); + let mut k2_scalars: Vec<_> = k_vec.iter().map(|x| (x.1).1).collect(); + let k2_negates: Vec<_> = k_vec.iter().map(|x| (x.1).0).collect(); + + let opcode_vectorised_k1 = Self::batch_wnaf_opcode_recoding( + &mut k1_scalars[..], + w, + Some(k1_negates.as_slice()), + ); + let opcode_vectorised_k2 = Self::batch_wnaf_opcode_recoding( + &mut k2_scalars[..], + w, + Some(k2_negates.as_slice()), + ); + + let tables = Self::batch_wnaf_tables(bases, w); + let tables_k2: Vec<_> = tables + .iter() + .map(|&p| { + let mut p = p; + P::glv_endomorphism_in_place(&mut p.x); + p + }) + .collect(); + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + let noop_vec = vec![None; batch_size]; + for (opcode_row_k1, opcode_row_k2) in opcode_vectorised_k1 + .iter() + .zip_longest(opcode_vectorised_k2.iter()) + .map(|x| match x { + Both(a, b) => (a, b), + Left(a) => (a, &noop_vec), + Right(b) => (&noop_vec, b), + }) + .rev() + { + let index_double: Vec<_> = opcode_row_k1 + .iter() + .zip(opcode_row_k2.iter()) + .enumerate() + .filter(|x| (x.1).0.is_some() 
|| (x.1).1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place( + &mut bases, + &index_double[..], + Some(&mut scratch_space), + ); + let index_add_k1: Vec<_> = opcode_row_k1 + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + let op2 = ((idx as usize) / 2 * batch_size + i) as u32; + (i as u32, op2 << ENDO_CODING_BITS) + } else { + let op2 = ((-idx as usize) / 2 * batch_size + i) as u32; + (i as u32, (op2 << ENDO_CODING_BITS) + 1) + } + }) + .collect(); + + Self::batch_add_in_place_read_only( + &mut bases, + &tables[..], + &index_add_k1[..], + &mut scratch_space_group, + ); + let index_add_k2: Vec<_> = opcode_row_k2 + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + let op2 = ((idx as usize) / 2 * batch_size + i) as u32; + (i as u32, op2 << ENDO_CODING_BITS) + } else { + let op2 = ((-idx as usize) / 2 * batch_size + i) as u32; + (i as u32, (op2 << ENDO_CODING_BITS) + 1) + } + }) + .collect(); + + Self::batch_add_in_place_read_only( + &mut bases, + &tables_k2[..], + &index_add_k2[..], + &mut scratch_space_group, + ); + } + } else { + let mut scratch_space = Vec::::with_capacity(bases.len()); + let opcode_vectorised = Self::batch_wnaf_opcode_recoding::(scalars, w, None); + let tables = Self::batch_wnaf_tables(bases, w); + // Set all points to 0; + let zero = Self::zero(); + for p in bases.iter_mut() { + *p = zero; + } + + for opcode_row in opcode_vectorised.iter().rev() { + let index_double: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|x| x.1.is_some()) + .map(|x| x.0 as u32) + .collect(); + + Self::batch_double_in_place( + &mut bases, + &index_double[..], + Some(&mut scratch_space), + ); + + let mut add_ops: Vec = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|(i, op)| { + let idx = op.unwrap(); + if idx > 0 { + tables[(idx as usize) / 2 * batch_size + i].clone() + } else { + tables[(-idx as usize) / 2 * batch_size + i].clone().neg() + } + }) + .collect(); + + let index_add: Vec<_> = opcode_row + .iter() + .enumerate() + .filter(|(_, op)| op.is_some() && op.unwrap() != 0) + .map(|x| x.0) + .enumerate() + .map(|(x, y)| (y as u32, x as u32)) + .collect(); + + Self::batch_add_in_place(&mut bases, &mut add_ops[..], &index_add[..]); + } + } + } +} + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -384,12 +1158,19 @@ impl Zero for GroupProjective

{ } } +impl_gpu_sw_projective!(Parameters); + impl<P: Parameters> ProjectiveCurve for GroupProjective<P> { const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; type Affine = GroupAffine<P>; + #[inline(always)] + fn get_x(&mut self) -> &mut Self::BaseField { + &mut self.x + } + #[inline] fn prime_subgroup_generator() -> Self { GroupAffine::prime_subgroup_generator().into() } @@ -560,6 +1341,26 @@ impl<P: Parameters> ProjectiveCurve for GroupProjective<P>
{ self.z -= &hh; } } + + fn mul<S: AsRef<[u64]>>(mut self, other: S) -> Self { + if P::has_glv() { + let w = P::glv_window_size(); + let mut res = Self::zero(); + let exponent_bigint = <Self::ScalarField as PrimeField>::BigInt::from_slice(other.as_ref()); + impl_glv_mul!(Self, P, w, self, res, exponent_bigint); + res + } else { + let mut res = Self::zero(); + for b in BitIteratorBE::without_leading_zeros(other.as_ref()) { + res.double_in_place(); + if b { + res += self; + } + } + self = res; + self + } + } } impl<P: Parameters> Neg for GroupProjective<P> { @@ -782,6 +1583,15 @@ impl<P: Parameters> CanonicalSerialize for GroupProjective<P> { impl<P: Parameters> CanonicalDeserialize for GroupAffine<P>
{ #[allow(unused_qualifications)] fn deserialize(reader: R) -> Result { + let p = Self::deserialize_unchecked(reader)?; + if !p.is_in_correct_subgroup_assuming_on_curve() { + return Err(SerializationError::InvalidData); + } + Ok(p) + } + + #[allow(unused_qualifications)] + fn deserialize_unchecked(reader: R) -> Result { let (x, flags): (P::BaseField, SWFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(reader)?; if flags.is_infinity() { @@ -789,19 +1599,13 @@ impl CanonicalDeserialize for GroupAffine

{ } else { let p = GroupAffine::<P>
::get_point_from_x(x, flags.is_positive().unwrap()) .ok_or(SerializationError::InvalidData)?; - if !p.is_in_correct_subgroup_assuming_on_curve() { - return Err(SerializationError::InvalidData); - } Ok(p) } } #[allow(unused_qualifications)] - fn deserialize_uncompressed( - reader: R, - ) -> Result { - let p = Self::deserialize_unchecked(reader)?; - + fn deserialize_uncompressed(reader: R) -> Result { + let p = Self::deserialize_uncompressed_unchecked(reader)?; if !p.is_in_correct_subgroup_assuming_on_curve() { return Err(SerializationError::InvalidData); } @@ -809,7 +1613,9 @@ impl CanonicalDeserialize for GroupAffine

{ } #[allow(unused_qualifications)] - fn deserialize_unchecked(mut reader: R) -> Result { + fn deserialize_uncompressed_unchecked( + mut reader: R, + ) -> Result { let x: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; let (y, flags): (P::BaseField, SWFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(&mut reader)?; diff --git a/ec/src/models/twisted_edwards_extended.rs b/ec/src/models/twisted_edwards_extended.rs index d4a5524ec..8922cc401 100644 --- a/ec/src/models/twisted_edwards_extended.rs +++ b/ec/src/models/twisted_edwards_extended.rs @@ -1,6 +1,10 @@ +#[cfg(not(feature = "cuda"))] +use crate::accel_dummy::*; use crate::{ - models::{MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters}, - AffineCurve, ProjectiveCurve, + batch_arith::{decode_endo_from_u32, BatchGroupArithmetic}, + cuda::scalar_mul::{internal::GPUScalarMulInternal, ScalarMulProfiler}, + impl_gpu_cpu_run_kernel, impl_gpu_te_projective, impl_run_kernel, AffineCurve, ModelParameters, + ProjectiveCurve, }; use ark_serialize::{ CanonicalDeserialize, CanonicalDeserializeWithFlags, CanonicalSerialize, @@ -15,20 +19,67 @@ use ark_std::{ io::{Read, Result as IoResult, Write}, marker::PhantomData, ops::{Add, AddAssign, MulAssign, Neg, Sub, SubAssign}, + string::String, vec::Vec, }; use num_traits::{One, Zero}; +#[cfg(feature = "cuda")] +use { + crate::BatchGroupArithmeticSlice, accel::*, closure::closure, log::debug, peekmore::PeekMore, + std::sync::Mutex, +}; use zeroize::Zeroize; use ark_ff::{ + biginteger::BigInteger, bytes::{FromBytes, ToBytes}, - fields::{BitIteratorBE, Field, PrimeField, SquareRootField}, - ToConstraintField, UniformRand, + fields::{BitIteratorBE, Field, FpParameters, PrimeField, SquareRootField}, + impl_additive_ops_from_ref, ToConstraintField, UniformRand, }; #[cfg(feature = "parallel")] use rayon::prelude::*; +pub trait MontgomeryModelParameters: ModelParameters { + const COEFF_A: Self::BaseField; + const COEFF_B: Self::BaseField; + + type TEModelParameters: TEModelParameters; +} + +pub trait TEModelParameters: ModelParameters + Sized { + const COEFF_A: Self::BaseField; + const COEFF_D: Self::BaseField; + const COFACTOR: &'static [u64]; + const COFACTOR_INV: Self::ScalarField; + const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField); + + type MontgomeryModelParameters: MontgomeryModelParameters; + + #[inline(always)] + fn mul_by_a(elem: &Self::BaseField) -> Self::BaseField { + let mut copy = *elem; + copy *= &Self::COEFF_A; + copy + } + + fn scalar_mul_kernel( + ctx: &Context, + grid: usize, + block: usize, + table: *const GroupProjective, + exps: *const u8, + out: *mut GroupProjective, + n: isize, + ) -> error::Result<()>; + + fn scalar_mul_static_profiler() -> ScalarMulProfiler; + + fn namespace() -> &'static str; +} + +use {MontgomeryModelParameters as MontgomeryParameters, TEModelParameters as Parameters}; + #[derive(Derivative)] #[derivative( Copy(bound = "P: Parameters"), @@ -180,7 +231,7 @@ impl Neg for GroupAffine

{ } } -ark_ff::impl_additive_ops_from_ref!(GroupAffine, Parameters); +impl_additive_ops_from_ref!(GroupAffine, Parameters); impl<'a, P: Parameters> Add<&'a Self> for GroupAffine<P>
{ type Output = Self; @@ -291,6 +342,208 @@ mod group_impl { } } +macro_rules! batch_add_loop_1 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() || $b.is_zero() { + continue; + } else { + let y1y2 = $a.y * &$b.y; + let x1x2 = $a.x * &$b.x; + + $a.x = ($a.x + &$a.y) * &($b.x + &$b.y) - &y1y2 - &x1x2; + $a.y = y1y2; + if !P::COEFF_A.is_zero() { + $a.y -= &P::mul_by_a(&x1x2); + } + + let dx1x2y1y2 = P::COEFF_D * &y1y2 * &x1x2; + + let inversion_mul_d = $inversion_tmp * &dx1x2y1y2; + + $a.x *= &($inversion_tmp - &inversion_mul_d); + $a.y *= &($inversion_tmp + &inversion_mul_d); + + $b.x = P::BaseField::one() - &dx1x2y1y2.square(); + + $inversion_tmp *= &$b.x; + } + }; +} + +macro_rules! batch_add_loop_2 { + ($a: ident, $b: ident, $inversion_tmp: ident) => { + if $a.is_zero() { + *$a = $b; + } else if !$b.is_zero() { + $a.x *= &$inversion_tmp; + $a.y *= &$inversion_tmp; + + $inversion_tmp *= &$b.x; + } + }; +} + +impl BatchGroupArithmetic for GroupAffine

{ + type BaseFieldForBatch = P::BaseField; + + fn batch_double_in_place( + bases: &mut [Self], + index: &[u32], + _scratch_space: Option<&mut Vec>, + ) { + Self::batch_add_in_place( + bases, + &mut bases.to_vec()[..], + &index.iter().map(|&x| (x, x)).collect::>()[..], + ); + } + + // Total cost: 12 mul. Projective formulas: 11 mul. + fn batch_add_in_place_same_slice(bases: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (mut a, mut b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], &mut y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], &mut x[*idy as usize]) + }; + batch_add_loop_1!(a, b, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, idy) in index.iter().rev() { + let (a, b) = if idx < idy { + let (x, y) = bases.split_at_mut(*idy as usize); + (&mut x[*idx as usize], y[0]) + } else { + let (x, y) = bases.split_at_mut(*idx as usize); + (&mut y[0], x[*idy as usize]) + }; + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + // Total cost: 12 mul. Projective formulas: 11 mul. + fn batch_add_in_place(bases: &mut [Self], other: &mut [Self], index: &[(u32, u32)]) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (mut a, mut b) = (&mut bases[*idx as usize], &mut other[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, idy) in index.iter().rev() { + let (a, b) = (&mut bases[*idx as usize], other[*idy as usize]); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + #[inline] + fn batch_add_in_place_read_only( + bases: &mut [Self], + other: &[Self], + index: &[(u32, u32)], + scratch_space: &mut Vec, + ) { + let mut inversion_tmp = P::BaseField::one(); + // We run two loops over the data separated by an inversion + for (idx, idy) in index.iter() { + let (idy, endomorphism) = decode_endo_from_u32(*idy); + let mut a = &mut bases[*idx as usize]; + // Apply endomorphisms according to encoding + let mut b = if endomorphism % 2 == 1 { + other[idy].neg() + } else { + other[idy] + }; + + batch_add_loop_1!(a, b, inversion_tmp); + scratch_space.push(b); + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (idx, _) in index.iter().rev() { + let (a, b) = (&mut bases[*idx as usize], scratch_space.pop().unwrap()); + batch_add_loop_2!(a, b, inversion_tmp); + } + } + + fn batch_add_write( + lookup: &[Self], + index: &[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + for (idx, idy) in index.iter() { + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (lookup[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } + + fn batch_add_write_read_self( + lookup: &[Self], + index: 
&[(u32, u32)], + new_elems: &mut Vec, + scratch_space: &mut Vec>, + ) { + let mut inversion_tmp = P::BaseField::one(); + + for (idx, idy) in index.iter() { + if *idy == !0u32 { + new_elems.push(lookup[*idx as usize]); + scratch_space.push(None); + } else { + let (mut a, mut b) = (new_elems[*idx as usize], lookup[*idy as usize]); + batch_add_loop_1!(a, b, inversion_tmp); + new_elems.push(a); + scratch_space.push(Some(b)); + } + } + + inversion_tmp = inversion_tmp.inverse().unwrap(); // this is always in Fp* + + for (a, op_b) in new_elems.iter_mut().rev().zip(scratch_space.iter().rev()) { + match op_b { + Some(b) => { + let b_ = *b; + batch_add_loop_2!(a, b_, inversion_tmp); + } + None => (), + }; + } + scratch_space.clear(); + } +} + ////////////////////////////////////////////////////////////////////////////// /// `GroupProjective` implements Extended Twisted Edwards Coordinates @@ -427,12 +680,19 @@ impl Zero for GroupProjective

{ } } +impl_gpu_te_projective!(Parameters); + impl<P: Parameters> ProjectiveCurve for GroupProjective<P> { const COFACTOR: &'static [u64] = P::COFACTOR; type BaseField = P::BaseField; type ScalarField = P::ScalarField; type Affine = GroupAffine<P>
; + #[inline(always)] + fn get_x(&mut self) -> &mut Self::BaseField { + &mut self.x + } + fn prime_subgroup_generator() -> Self { GroupAffine::prime_subgroup_generator().into() } @@ -708,7 +968,6 @@ impl MontgomeryGroupAffine

{ } } } - impl<P: Parameters> CanonicalSerialize for GroupAffine<P> { #[allow(unused_qualifications)] #[inline] @@ -773,7 +1032,15 @@ impl<P: Parameters> CanonicalSerialize for GroupProjective<P> { impl<P: Parameters> CanonicalDeserialize for GroupAffine<P>
{ #[allow(unused_qualifications)] - fn deserialize(mut reader: R) -> Result { + fn deserialize(reader: R) -> Result { + let p = Self::deserialize_unchecked(reader)?; + if !p.is_in_correct_subgroup_assuming_on_curve() { + return Err(SerializationError::InvalidData); + } + Ok(p) + } + #[allow(unused_qualifications)] + fn deserialize_unchecked(mut reader: R) -> Result { let (x, flags): (P::BaseField, EdwardsFlags) = CanonicalDeserializeWithFlags::deserialize_with_flags(&mut reader)?; if x == P::BaseField::zero() { @@ -781,16 +1048,13 @@ impl CanonicalDeserialize for GroupAffine

{ } else { let p = GroupAffine::<P>
::get_point_from_x(x, flags.is_positive()) .ok_or(SerializationError::InvalidData)?; - if !p.is_in_correct_subgroup_assuming_on_curve() { - return Err(SerializationError::InvalidData); - } Ok(p) } } #[allow(unused_qualifications)] fn deserialize_uncompressed(reader: R) -> Result { - let p = Self::deserialize_unchecked(reader)?; + let p = Self::deserialize_uncompressed_unchecked(reader)?; if !p.is_in_correct_subgroup_assuming_on_curve() { return Err(SerializationError::InvalidData); @@ -799,7 +1063,9 @@ impl CanonicalDeserialize for GroupAffine

{ } #[allow(unused_qualifications)] - fn deserialize_unchecked(mut reader: R) -> Result { + fn deserialize_uncompressed_unchecked( + mut reader: R, + ) -> Result { let x: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; let y: P::BaseField = CanonicalDeserialize::deserialize(&mut reader)?; diff --git a/ff-asm/src/lib.rs b/ff-asm/src/lib.rs index 70442ea8f..a6b75de4f 100644 --- a/ff-asm/src/lib.rs +++ b/ff-asm/src/lib.rs @@ -3,6 +3,7 @@ #![recursion_limit = "128"] use proc_macro::TokenStream; +use quote::quote; use syn::{ parse::{Parse, ParseStream}, Expr, Item, ItemFn, @@ -38,9 +39,9 @@ pub fn unroll_for_loops(_meta: TokenStream, input: TokenStream) -> TokenStream { block: Box::new(new_block), ..item_fn }); - quote::quote! ( #new_item ).into() + quote! ( #new_item ).into() } else { - quote::quote! ( #item ).into() + quote! ( #item ).into() } } diff --git a/ff/Cargo.toml b/ff/Cargo.toml index f9af60818..d9b7063ad 100644 --- a/ff/Cargo.toml +++ b/ff/Cargo.toml @@ -25,6 +25,7 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive [build-dependencies] rustc_version = "0.3" +cc = "1.0" [dev-dependencies] num-bigint = { version = "0.3.0", default-features = false } diff --git a/ff/build.rs b/ff/build.rs index c6a7dad16..fd69959af 100644 --- a/ff/build.rs +++ b/ff/build.rs @@ -15,4 +15,26 @@ fn main() { if should_use_asm { println!("cargo:rustc-cfg=use_asm"); } + + let should_use_bw6_asm = cfg!(any( + all( + feature = "bw6_asm", + target_feature = "bmi2", + target_feature = "adx", + target_arch = "x86_64" + ), + feature = "force_bw6_asm" + )); + if should_use_bw6_asm { + cc::Build::new() + .file("bw6-assembly/modmul768-sos1-adx.S") + .compile("modmul768"); + cc::Build::new() + .file("bw6-assembly/modadd768.S") + .compile("modadd768"); + cc::Build::new() + .file("bw6-assembly/modsub768.S") + .compile("modsub768"); + println!("cargo:rustc-cfg=use_bw6_asm"); + } } diff --git a/ff/bw6-assembly/modadd768.S b/ff/bw6-assembly/modadd768.S new file mode 100644 index 000000000..de291a781 --- /dev/null +++ b/ff/bw6-assembly/modadd768.S @@ -0,0 +1,181 @@ +// void modadd768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 + +# define t2 %rdi +# define t3 %rsi +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx + +# define t2 %r9 +# define t3 %r8 +#endif + +#define t0 %r11 +#define t1 %r10 +#define t4 %r15 +#define t5 %r14 + +#define t6 %r13 +#define t7 %r12 +#define t8 %rbx +#define t9 %rax +#define t10 %rbp +#define t11 x +#define t12 z + +#define y0 0*8(y) +#define y1 1*8(y) +#define y2 2*8(y) +#define y3 3*8(y) +#define y4 4*8(y) +#define y5 5*8(y) +#define y6 6*8(y) +#define y7 7*8(y) +#define y8 8*8(y) +#define y9 9*8(y) +#define y10 10*8(y) +#define y11 11*8(y) + +#define m0 0*8(m) +#define m1 1*8(m) +#define m2 2*8(m) +#define m3 3*8(m) +#define m4 4*8(m) +#define m5 5*8(m) +#define m6 6*8(m) +#define m7 7*8(m) +#define m8 8*8(m) +#define m9 9*8(m) +#define m10 10*8(m) +#define m11 11*8(m) + +// We only use these after replacing y with z + +#define z0 0*8(y) +#define z1 1*8(y) +#define z2 2*8(y) +#define z3 3*8(y) +#define z4 4*8(y) +#define z5 5*8(y) +#define z6 6*8(y) +#define z7 7*8(y) +#define z8 8*8(y) +#define z9 9*8(y) +#define z10 10*8(y) +#define z11 11*8(y) + +.text + +#ifdef __APPLE__ +#define modadd768 _modadd768 +#endif + +.globl modadd768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modadd768, @function +#endif 
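+// modadd768 computes z = (x + y) mod m over twelve 64-bit limbs: x is
+// loaded into registers, y is folded in with a single add/adc carry chain,
+// m is then subtracted with sub/sbb while the unreduced sum is parked in z,
+// and the cmovc sequence keeps the unreduced sum whenever that subtraction
+// borrowed, i.e. whenever x + y < m.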
+#endif + +.p2align 6,,15 +modadd768: + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rsp) + mov %rdi, 2*8(%rsp) +#endif + // Load x + push %r15; mov 0*8(x), t0; mov 1*8(x), t1 + push %r14; mov 2*8(x), t2; mov 3*8(x), t3 + push %r13; mov 4*8(x), t4; mov 5*8(x), t5 + push %r12; mov 6*8(x), t6; mov 7*8(x), t7 + push %rbx; mov 8*8(x), t8; mov 9*8(x), t9 + push %rbp; mov 10*8(x), t10; mov 11*8(x), t11 + push z + + xor t12, t12 + add y0, t0 + adc y1, t1 + adc y2, t2 + adc y3, t3 + adc y4, t4 + adc y5, t5 + adc y6, t6 + adc y7, t7 + adc y8, t8 + adc y9, t9 + adc y10, t10 + adc y11, t11 + adc $0, t12 + + // no more need for y. load z in its place + + pop y + + // Conditional subtraction of m + + mov t0, z0; sub m0, t0 + mov t1, z1; sbb m1, t1 + mov t2, z2; sbb m2, t2 + mov t3, z3; sbb m3, t3 + mov t4, z4; sbb m4, t4 + mov t5, z5; sbb m5, t5 + mov t6, z6; sbb m6, t6 + mov t7, z7; sbb m7, t7 + mov t8, z8; sbb m8, t8 + mov t9, z9; sbb m9, t9 + mov t10, z10; sbb m10, t10 + mov t11, z11; sbb m11, t11 + sbb $0, t12 + + cmovc z0, t0 + cmovc z1, t1 + cmovc z2, t2 + cmovc z3, t3 + cmovc z4, t4 + cmovc z5, t5 + cmovc z6, t6 + cmovc z7, t7 + cmovc z8, t8 + cmovc z9, t9 + cmovc z10, t10 + cmovc z11, t11 + + mov t0, z0 + mov t1, z1 + mov t2, z2 + mov t3, z3 + mov t4, z4 + mov t5, z5 + mov t6, z6 + mov t7, z7 + mov t8, z8 + mov t9, z9 + mov t10, z10 + mov t11, z11 + +#ifdef _WIN64 + mov 7*8(%rsp), %rsi + mov 8*8(%rsp), %rdi +#endif + // Load x + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +// vim: noet ts=8 sw=8 diff --git a/ff/bw6-assembly/modmul768-sos1-adx.S b/ff/bw6-assembly/modmul768-sos1-adx.S new file mode 100644 index 000000000..2d0e7fdd4 --- /dev/null +++ b/ff/bw6-assembly/modmul768-sos1-adx.S @@ -0,0 +1,755 @@ +// void modmul768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +// m[12] contains the least significant word of the negated inverse of the modulus mod 2^768 + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx +#endif + +#define l %rax +#define h %rbx + +#define t0 %rcx +#define t1 %rbp +#define t2 %rsi +#define t3 %rdi +#define t4 %r8 +#define t5 %r9 +#define t6 %r10 +#define t7 %r11 +#define t8 %r12 +#define t9 %r13 +#define t10 %r14 +#define t11 %r15 +#define t12 t0 +#define t13 t1 +#define t14 t2 +#define t15 t3 +#define t16 t4 +#define t17 t5 +#define t18 t6 +#define t19 t7 +#define t20 t8 +#define t21 t9 +#define t22 t10 +#define t23 t11 +#define t24 h + +#define zero 14*8(%rsp) + +#define x0 1*8(%rsp) +#define x1 2*8(%rsp) +#define x2 3*8(%rsp) +#define x3 4*8(%rsp) +#define x4 5*8(%rsp) +#define x5 6*8(%rsp) +#define x6 7*8(%rsp) +#define x7 8*8(%rsp) +#define x8 9*8(%rsp) +#define x9 10*8(%rsp) +#define x10 11*8(%rsp) +#define x11 12*8(%rsp) + +#define m0 x0 +#define m1 x1 +#define m2 x2 +#define m3 x3 +#define m4 x4 +#define m5 x5 +#define m6 x6 +#define m7 x7 +#define m8 x8 +#define m9 x9 +#define m10 x10 +#define m11 x11 +#define inv 13*8(%rsp) + +#define z0 16*8(%rsp) +#define z1 17*8(%rsp) +#define z2 18*8(%rsp) +#define z3 19*8(%rsp) +#define z4 20*8(%rsp) +#define z5 21*8(%rsp) +#define z6 22*8(%rsp) +#define z7 23*8(%rsp) +#define z8 24*8(%rsp) +#define z9 25*8(%rsp) +#define z10 26*8(%rsp) +#define z11 27*8(%rsp) +#define z12 28*8(%rsp) +#define z13 29*8(%rsp) +#define z14 30*8(%rsp) +#define z15 31*8(%rsp) +#define z16 32*8(%rsp) +#define z17 33*8(%rsp) +#define z18 34*8(%rsp) +#define z19 
35*8(%rsp) +#define z20 36*8(%rsp) +#define z21 37*8(%rsp) +#define z22 38*8(%rsp) +#define z23 39*8(%rsp) + +#define y1 z1 +#define y2 z2 +#define y3 z3 +#define y4 z4 +#define y5 z5 +#define y6 z6 +#define y7 z7 +#define y8 z8 +#define y9 z9 +#define y10 z10 +#define y11 z11 + +.text + +#ifdef __APPLE__ +#define modmul768 _modmul768 +#endif + +.globl modmul768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modmul768, @function +#endif +#endif + +.p2align 6,,63 +modmul768: + + // Allocate space on the stack: + // 1 word for padding, to make rsp offsets constant size for x and x*y + // 13 words for x or m + // 2 words for the m and z pointers + // 24 words for x*y (used initially to store y) + // 6 words for callee-saves + + mov %rsp, %rax + sub $8*(1+13+2+24+6), %rsp + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rax) + mov %rdi, 2*8(%rax) +#endif + mov %rbp, -6*8(%rax) + mov %rbx, -5*8(%rax) + mov %r12, -4*8(%rax) + mov %r13, -3*8(%rax) + mov %r14, -2*8(%rax) + mov %r15, -1*8(%rax) + + // m and z pointers + + mov m, 0*8(%rsp) + mov z, 15*8(%rsp) + + // zero + + movq $0, zero + + // Let m point to the stack space for y + + lea 16*8(%rsp), m + + // copy x to the stack + + mov 0*8(x), %rax; mov %rax, x0 + mov 1*8(x), %rax; mov %rax, x1 + mov 2*8(x), %rax; mov %rax, x2 + mov 3*8(x), %rax; mov %rax, x3 + mov 4*8(x), %rax; mov %rax, x4 + mov 5*8(x), %rax; mov %rax, x5 + mov 6*8(x), %rax; mov %rax, x6 + mov 7*8(x), %rax; mov %rax, x7 + mov 8*8(x), %rax; mov %rax, x8 + mov 9*8(x), %rax; mov %rax, x9 + mov 10*8(x), %rax; mov %rax, x10 + mov 11*8(x), %rax; mov %rax, x11 + + xor h, h // For padding + + // copy y to the stack + + mov 11*8(y), %rax; mov %rax, 11*8(m) + mov 10*8(y), %rax; mov %rax, 10*8(m) + mov 9*8(y), %rax; mov %rax, 9*8(m) + mov 8*8(y), %rax; mov %rax, 8*8(m) + mov 7*8(y), %rax; mov %rax, 7*8(m) + mov 6*8(y), %rax; mov %rax, 6*8(m) + mov 5*8(y), %rax; mov %rax, 5*8(m) + mov 4*8(y), %rax; mov %rax, 4*8(m) + mov 3*8(y), %rax; mov %rax, 3*8(m) + mov 2*8(y), %rax; mov %rax, 2*8(m) + mov 1*8(y), %rax; mov %rax, 1*8(m) + mov 0*8(y), %rdx + +// y0 + + // mov y0, %rdx + + xor h, h + + mulx x0, t0, t1 + mov t0, z0 + mulx x2, t2, t3 + mulx x1, l, h; adcx l, t1; adcx h, t2 + mulx x4, t4, t5 + mulx x3, l, h; adcx l, t3; adcx h, t4 + mulx x6, t6, t7 + mulx x5, l, h; adcx l, t5; adcx h, t6 + mulx x8, t8, t9 + mulx x7, l, h; adcx l, t7; adcx h, t8 + mulx x10, t10, t11 + mulx x9, l, h; adcx l, t9; adcx h, t10 + mulx x11, l, t12; adcx l, t11 + +// y1 + + mov y1, %rdx; adc $0, t12 + xor h, h + + mulx x1, l, h; adox l, t2; adox h, t3 + mulx x0, l, h; adcx l, t1; adcx h, t2 + mov t1, z1; mov zero, t13; + mulx x3, l, h; adox l, t4; adox h, t5 + mulx x2, l, h; adcx l, t3; adcx h, t4 + mulx x5, l, h; adox l, t6; adox h, t7 + mulx x4, l, h; adcx l, t5; adcx h, t6 + mulx x7, l, h; adox l, t8; adox h, t9 + mulx x6, l, h; adcx l, t7; adcx h, t8 + mulx x9, l, h; adox l, t10; adox h, t11 + mulx x8, l, h; adcx l, t9; adcx h, t10; nop + mulx x11, l, h; adox l, t12; adox h, t13 + mulx x10, l, h; adcx l, t11; adcx h, t12 + +// y2 + + mov y2, %rdx; adc $0, t13 + xor h, h + + mulx x1, l, h; adox l, t3; adox h, t4 + mulx x0, l, h; adcx l, t2; adcx h, t3 + mov t2, z2; mov zero, t14; + mulx x3, l, h; adox l, t5; adox h, t6 + mulx x2, l, h; adcx l, t4; adcx h, t5 + mulx x5, l, h; adox l, t7; adox h, t8 + mulx x4, l, h; adcx l, t6; adcx h, t7 + mulx x7, l, h; adox l, t9; adox h, t10 + mulx x6, l, h; adcx l, t8; adcx h, t9 + mulx x9, l, h; adox l, t11; adox h, t12 + mulx x8, l, h; adcx l, t10; adcx h, t11; nop + mulx 
x11, l, h; adox l, t13; adox h, t14 + mulx x10, l, h; adcx l, t12; adcx h, t13 + +// y3 + + mov y3, %rdx; adc $0, t14 + xor h, h + + mulx x1, l, h; adox l, t4; adox h, t5 + mulx x0, l, h; adcx l, t3; adcx h, t4 + mov t3, z3; mov zero, t15; + mulx x3, l, h; adox l, t6; adox h, t7 + mulx x2, l, h; adcx l, t5; adcx h, t6 + mulx x5, l, h; adox l, t8; adox h, t9 + mulx x4, l, h; adcx l, t7; adcx h, t8 + mulx x7, l, h; adox l, t10; adox h, t11 + mulx x6, l, h; adcx l, t9; adcx h, t10 + mulx x9, l, h; adox l, t12; adox h, t13 + mulx x8, l, h; adcx l, t11; adcx h, t12; nop + mulx x11, l, h; adox l, t14; adox h, t15 + mulx x10, l, h; adcx l, t13; adcx h, t14 + +// y4 + + mov y4, %rdx; adc $0, t15 + xor h, h + + mulx x1, l, h; adox l, t5; adox h, t6 + mulx x0, l, h; adcx l, t4; adcx h, t5 + mov t4, z4; mov zero, t16; + mulx x3, l, h; adox l, t7; adox h, t8 + mulx x2, l, h; adcx l, t6; adcx h, t7 + mulx x5, l, h; adox l, t9; adox h, t10 + mulx x4, l, h; adcx l, t8; adcx h, t9 + mulx x7, l, h; adox l, t11; adox h, t12 + mulx x6, l, h; adcx l, t10; adcx h, t11 + mulx x9, l, h; adox l, t13; adox h, t14 + mulx x8, l, h; adcx l, t12; adcx h, t13; nop + mulx x11, l, h; adox l, t15; adox h, t16 + mulx x10, l, h; adcx l, t14; adcx h, t15 + +// y5 + + mov y5, %rdx; adc $0, t16 + xor h, h + + mulx x1, l, h; adox l, t6; adox h, t7 + mulx x0, l, h; adcx l, t5; adcx h, t6 + mov t5, z5; mov zero, t17; + mulx x3, l, h; adox l, t8; adox h, t9 + mulx x2, l, h; adcx l, t7; adcx h, t8 + mulx x5, l, h; adox l, t10; adox h, t11 + mulx x4, l, h; adcx l, t9; adcx h, t10 + mulx x7, l, h; adox l, t12; adox h, t13 + mulx x6, l, h; adcx l, t11; adcx h, t12 + mulx x9, l, h; adox l, t14; adox h, t15 + mulx x8, l, h; adcx l, t13; adcx h, t14; nop + mulx x11, l, h; adox l, t16; adox h, t17 + mulx x10, l, h; adcx l, t15; adcx h, t16 + +// y6 + + mov y6, %rdx; adc $0, t17 + xor h, h + + mulx x1, l, h; adox l, t7; adox h, t8 + mulx x0, l, h; adcx l, t6; adcx h, t7 + mov t6, z6; mov zero, t18; + mulx x3, l, h; adox l, t9; adox h, t10 + mulx x2, l, h; adcx l, t8; adcx h, t9 + mulx x5, l, h; adox l, t11; adox h, t12 + mulx x4, l, h; adcx l, t10; adcx h, t11 + mulx x7, l, h; adox l, t13; adox h, t14 + mulx x6, l, h; adcx l, t12; adcx h, t13 + mulx x9, l, h; adox l, t15; adox h, t16 + mulx x8, l, h; adcx l, t14; adcx h, t15; nop + mulx x11, l, h; adox l, t17; adox h, t18 + mulx x10, l, h; adcx l, t16; adcx h, t17 + +// y7 + + mov y7, %rdx; adc $0, t18 + xor h, h + + mulx x1, l, h; adox l, t8; adox h, t9 + mulx x0, l, h; adcx l, t7; adcx h, t8 + mov t7, z7; mov zero, t19; + mulx x3, l, h; adox l, t10; adox h, t11 + mulx x2, l, h; adcx l, t9; adcx h, t10 + mulx x5, l, h; adox l, t12; adox h, t13 + mulx x4, l, h; adcx l, t11; adcx h, t12 + mulx x7, l, h; adox l, t14; adox h, t15 + mulx x6, l, h; adcx l, t13; adcx h, t14 + mulx x9, l, h; adox l, t16; adox h, t17 + mulx x8, l, h; adcx l, t15; adcx h, t16; nop + mulx x11, l, h; adox l, t18; adox h, t19 + mulx x10, l, h; adcx l, t17; adcx h, t18 + +// y8 + + mov y8, %rdx; adc $0, t19 + xor h, h + + mulx x1, l, h; adox l, t9; adox h, t10 + mulx x0, l, h; adcx l, t8; adcx h, t9 + mov t8, z8; mov zero, t20; + mulx x3, l, h; adox l, t11; adox h, t12 + mulx x2, l, h; adcx l, t10; adcx h, t11 + mulx x5, l, h; adox l, t13; adox h, t14 + mulx x4, l, h; adcx l, t12; adcx h, t13 + mulx x7, l, h; adox l, t15; adox h, t16 + mulx x6, l, h; adcx l, t14; adcx h, t15 + mulx x9, l, h; adox l, t17; adox h, t18 + mulx x8, l, h; adcx l, t16; adcx h, t17; nop + mulx x11, l, h; adox l, t19; adox h, t20 + mulx x10, l, 
h; adcx l, t18; adcx h, t19 + +// y9 + + mov y9, %rdx; adc $0, t20 + xor h, h + + mulx x1, l, h; adox l, t10; adox h, t11 + mulx x0, l, h; adcx l, t9; adcx h, t10 + mov t9, z9; mov zero, t21; + mulx x3, l, h; adox l, t12; adox h, t13 + mulx x2, l, h; adcx l, t11; adcx h, t12 + mulx x5, l, h; adox l, t14; adox h, t15 + mulx x4, l, h; adcx l, t13; adcx h, t14 + mulx x7, l, h; adox l, t16; adox h, t17 + mulx x6, l, h; adcx l, t15; adcx h, t16 + mulx x9, l, h; adox l, t18; adox h, t19 + mulx x8, l, h; adcx l, t17; adcx h, t18; nop + mulx x11, l, h; adox l, t20; adox h, t21 + mulx x10, l, h; adcx l, t19; adcx h, t20 + +// y10 + + mov y10, %rdx; adc $0, t21 + xor h, h + + mulx x1, l, h; adox l, t11; adox h, t12 + mulx x0, l, h; adcx l, t10; adcx h, t11 + mov t10, z10; mov zero, t22; + mulx x3, l, h; adox l, t13; adox h, t14 + mulx x2, l, h; adcx l, t12; adcx h, t13 + mulx x5, l, h; adox l, t15; adox h, t16 + mulx x4, l, h; adcx l, t14; adcx h, t15 + mulx x7, l, h; adox l, t17; adox h, t18 + mulx x6, l, h; adcx l, t16; adcx h, t17 + mulx x9, l, h; adox l, t19; adox h, t20 + mulx x8, l, h; adcx l, t18; adcx h, t19; nop + mulx x11, l, h; adox l, t21; adox h, t22 + mulx x10, l, h; adcx l, t20; adcx h, t21 + +// y11 + + mov y11, %rdx; adc $0, t22 + xor h, h + + mulx x1, l, h; adox l, t12; adox h, t13 + mulx x0, l, h; adcx l, t11; adcx h, t12 + mov t11, z11; mov zero, t23; + mulx x3, l, h; adox l, t14; adox h, t15 + mulx x2, l, h; adcx l, t13; adcx h, t14 + mulx x5, l, h; adox l, t16; adox h, t17 + mulx x4, l, h; adcx l, t15; adcx h, t16 + mulx x7, l, h; adox l, t18; adox h, t19 + mulx x6, l, h; adcx l, t17; adcx h, t18 + mulx x9, l, h; adox l, t20; adox h, t21 + mulx x8, l, h; adcx l, t19; adcx h, t20; nop + mulx x11, l, h; adox l, t22; adox h, t23 + mulx x10, l, h; adcx l, t21; adcx h, t22 + + // Copy m to the stack, overwriting x + + mov 0*8(%rsp), h; adc $0, t23 + mov 12*8(h), %rdx; mov %rdx, inv + mov 11*8(h), %rax; mov %rax, m11 + mov 10*8(h), %rax; mov %rax, m10 + mov 9*8(h), %rax; mov %rax, m9 + mov 8*8(h), %rax; mov %rax, m8 + mov 7*8(h), %rax; mov %rax, m7 + mov 6*8(h), %rax; mov %rax, m6 + mov 5*8(h), %rax; mov %rax, m5 + mov 4*8(h), %rax; mov %rax, m4 + mov 3*8(h), %rax; mov %rax, m3 + mov 2*8(h), %rax; mov %rax, m2 + mov 1*8(h), %rax; mov %rax, m1 + mov 0*8(h), %rax; mov %rax, m0 + + // Write out the top half of x*y to the stack, load the low half back in + + mov t12, z12; mov z0, t0 + mov t13, z13; mov z1, t1 + mov t14, z14; mov z2, t2 + mov t15, z15; mov z3, t3 + mov t16, z16; mov z4, t4 + mov t17, z17; mov z5, t5 + mov t18, z18; mov z6, t6 + mov t19, z19; mov z7, t7 + mov t20, z20; mov z8, t8 + mov t21, z21; mov z9, t9 + mov t22, z22; mov z10, t10 + mov t23, z23; mov z11, t11 + +//////////////////////////////////////////////////////////////// +// Reduction +//////////////////////////////////////////////////////////////// + +// z0 + + //mov inv, %rdx + mulx t0, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t1; adox h, t2 + mulx m0, l, h; adcx l, t0; adcx h, t1 + mulx m3, l, h; adox l, t3; adox h, t4 + mulx m2, l, h; adcx l, t2; adcx h, t3 + mulx m5, l, h; adox l, t5; adox h, t6 + mulx m4, l, h; adcx l, t4; adcx h, t5 + mulx m7, l, h; adox l, t7; adox h, t8 + mulx m6, l, h; adcx l, t6; adcx h, t7 + mulx m9, l, h; adox l, t9; adox h, t10 + mulx m8, l, h; adcx l, t8; adcx h, t9 + mulx m11, l, h; adox l, t11; adox h, t12 + mulx m10, l, h + +// z1 + + mov inv, %rdx; adcx l, t10; adcx h, t11; adc $0, t12 + mulx t1, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t2; adox h, t3 + mulx m0, l, h; 
adcx l, t1; adcx h, t2 + mulx m3, l, h; adox l, t4; adox h, t5 + mulx m2, l, h; adcx l, t3; adcx h, t4 + mulx m5, l, h; adox l, t6; adox h, t7 + mulx m4, l, h; adcx l, t5; adcx h, t6 + mulx m7, l, h; adox l, t8; adox h, t9 + mulx m6, l, h; adcx l, t7; adcx h, t8 + mulx m9, l, h; adox l, t10; adox h, t11 + mulx m8, l, h; adcx l, t9; adcx h, t10 + mulx m11, l, h; adox l, t12; adox h, t13 + mulx m10, l, h + +// z2 + + mov inv, %rdx; adcx l, t11; adcx h, t12; adc $0, t13 + mulx t2, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t3; adox h, t4 + mulx m0, l, h; adcx l, t2; adcx h, t3 + mulx m3, l, h; adox l, t5; adox h, t6 + mulx m2, l, h; adcx l, t4; adcx h, t5 + mulx m5, l, h; adox l, t7; adox h, t8 + mulx m4, l, h; adcx l, t6; adcx h, t7 + mulx m7, l, h; adox l, t9; adox h, t10 + mulx m6, l, h; adcx l, t8; adcx h, t9 + mulx m9, l, h; adox l, t11; adox h, t12 + mulx m8, l, h; adcx l, t10; adcx h, t11 + mulx m11, l, h; adox l, t13; adox h, t14 + mulx m10, l, h + +// z3 + + mov inv, %rdx; adcx l, t12; adcx h, t13; adc $0, t14 + mulx t3, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t4; adox h, t5 + mulx m0, l, h; adcx l, t3; adcx h, t4 + mulx m3, l, h; adox l, t6; adox h, t7 + mulx m2, l, h; adcx l, t5; adcx h, t6 + mulx m5, l, h; adox l, t8; adox h, t9 + mulx m4, l, h; adcx l, t7; adcx h, t8 + mulx m7, l, h; adox l, t10; adox h, t11 + mulx m6, l, h; adcx l, t9; adcx h, t10 + mulx m9, l, h; adox l, t12; adox h, t13 + mulx m8, l, h; adcx l, t11; adcx h, t12 + mulx m11, l, h; adox l, t14; adox h, t15 + mulx m10, l, h + +// z4 + + mov inv, %rdx; adcx l, t13; adcx h, t14; adc $0, t15 + mulx t4, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t5; adox h, t6 + mulx m0, l, h; adcx l, t4; adcx h, t5 + mulx m3, l, h; adox l, t7; adox h, t8 + mulx m2, l, h; adcx l, t6; adcx h, t7 + mulx m5, l, h; adox l, t9; adox h, t10 + mulx m4, l, h; adcx l, t8; adcx h, t9 + mulx m7, l, h; adox l, t11; adox h, t12 + mulx m6, l, h; adcx l, t10; adcx h, t11 + mulx m9, l, h; adox l, t13; adox h, t14 + mulx m8, l, h; adcx l, t12; adcx h, t13 + mulx m11, l, h; adox l, t15; adox h, t16 + mulx m10, l, h + +// z5 + + mov inv, %rdx; adcx l, t14; adcx h, t15; adc $0, t16 + mulx t5, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t6; adox h, t7 + mulx m0, l, h; adcx l, t5; adcx h, t6 + mulx m3, l, h; adox l, t8; adox h, t9 + mulx m2, l, h; adcx l, t7; adcx h, t8 + mulx m5, l, h; adox l, t10; adox h, t11 + mulx m4, l, h; adcx l, t9; adcx h, t10 + mulx m7, l, h; adox l, t12; adox h, t13 + mulx m6, l, h; adcx l, t11; adcx h, t12 + mulx m9, l, h; adox l, t14; adox h, t15 + mulx m8, l, h; adcx l, t13; adcx h, t14 + mulx m11, l, h; adox l, t16; adox h, t17 + mulx m10, l, h + +// z6 + + mov inv, %rdx; adcx l, t15; adcx h, t16; adc $0, t17 + mulx t6, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t7; adox h, t8 + mulx m0, l, h; adcx l, t6; adcx h, t7 + mulx m3, l, h; adox l, t9; adox h, t10 + mulx m2, l, h; adcx l, t8; adcx h, t9 + mulx m5, l, h; adox l, t11; adox h, t12 + mulx m4, l, h; adcx l, t10; adcx h, t11 + mulx m7, l, h; adox l, t13; adox h, t14 + mulx m6, l, h; adcx l, t12; adcx h, t13 + mulx m9, l, h; adox l, t15; adox h, t16 + mulx m8, l, h; adcx l, t14; adcx h, t15 + mulx m11, l, h; adox l, t17; adox h, t18 + mulx m10, l, h + +// z7 + + mov inv, %rdx; adcx l, t16; adcx h, t17; adc $0, t18 + mulx t7, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t8; adox h, t9 + mulx m0, l, h; adcx l, t7; adcx h, t8 + mulx m3, l, h; adox l, t10; adox h, t11 + mulx m2, l, h; adcx l, t9; adcx h, t10 + mulx m5, l, h; adox l, t12; adox h, t13 + mulx m4, 
l, h; adcx l, t11; adcx h, t12 + mulx m7, l, h; adox l, t14; adox h, t15 + mulx m6, l, h; adcx l, t13; adcx h, t14 + mulx m9, l, h; adox l, t16; adox h, t17 + mulx m8, l, h; adcx l, t15; adcx h, t16 + mulx m11, l, h; adox l, t18; adox h, t19 + mulx m10, l, h + +// z8 + + mov inv, %rdx; adcx l, t17; adcx h, t18; adc $0, t19 + mulx t8, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t9; adox h, t10 + mulx m0, l, h; adcx l, t8; adcx h, t9 + mulx m3, l, h; adox l, t11; adox h, t12 + mulx m2, l, h; adcx l, t10; adcx h, t11 + mulx m5, l, h; adox l, t13; adox h, t14 + mulx m4, l, h; adcx l, t12; adcx h, t13 + mulx m7, l, h; adox l, t15; adox h, t16 + mulx m6, l, h; adcx l, t14; adcx h, t15 + mulx m9, l, h; adox l, t17; adox h, t18 + mulx m8, l, h; adcx l, t16; adcx h, t17 + mulx m11, l, h; adox l, t19; adox h, t20 + mulx m10, l, h + +// z9 + + mov inv, %rdx; adcx l, t18; adcx h, t19; adc $0, t20 + mulx t9, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t10; adox h, t11 + mulx m0, l, h; adcx l, t9; adcx h, t10 + mulx m3, l, h; adox l, t12; adox h, t13 + mulx m2, l, h; adcx l, t11; adcx h, t12 + mulx m5, l, h; adox l, t14; adox h, t15 + mulx m4, l, h; adcx l, t13; adcx h, t14 + mulx m7, l, h; adox l, t16; adox h, t17 + mulx m6, l, h; adcx l, t15; adcx h, t16 + mulx m9, l, h; adox l, t18; adox h, t19 + mulx m8, l, h; adcx l, t17; adcx h, t18 + mulx m11, l, h; adox l, t20; adox h, t21 + mulx m10, l, h + +// z10 + + mov inv, %rdx; adcx l, t19; adcx h, t20; adc $0, t21 + mulx t10, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t11; adox h, t12 + mulx m0, l, h; adcx l, t10; adcx h, t11 + mulx m3, l, h; adox l, t13; adox h, t14 + mulx m2, l, h; adcx l, t12; adcx h, t13 + mulx m5, l, h; adox l, t15; adox h, t16 + mulx m4, l, h; adcx l, t14; adcx h, t15 + mulx m7, l, h; adox l, t17; adox h, t18 + mulx m6, l, h; adcx l, t16; adcx h, t17 + mulx m9, l, h; adox l, t19; adox h, t20 + mulx m8, l, h; adcx l, t18; adcx h, t19 + mulx m11, l, h; adox l, t21; adox h, t22 + mulx m10, l, h + +// z11 + + mov inv, %rdx; adcx l, t20; adcx h, t21; adc $0, t22 + mulx t11, %rdx, h + xor h, h + + mulx m1, l, h; adox l, t12; adox h, t13 + mulx m0, l, h; adcx l, t11; adcx h, t12 + mulx m3, l, h; adox l, t14; adox h, t15 + mulx m2, l, h; adcx l, t13; adcx h, t14 + mulx m5, l, h; adox l, t16; adox h, t17 + mulx m4, l, h; adcx l, t15; adcx h, t16 + mulx m7, l, h; adox l, t18; adox h, t19 + mulx m6, l, h; adcx l, t17; adcx h, t18 + mulx m9, l, h; adox l, t20; adox h, t21 + mulx m8, l, h; adcx l, t19; adcx h, t20 + mulx m11, l, h; adox l, t22; adox h, t23 + mulx m10, l, h; adcx l, t21; adcx h, t22; adc $0, t23 + + xor t24, t24; lea 8*(1+13+2+24+6)(%rsp), %rax + + add z12, t12 + adc z13, t13 + adc z14, t14 + adc z15, t15 + adc z16, t16 + adc z17, t17 + adc z18, t18 + adc z19, t19 + adc z20, t20 + adc z21, t21 + adc z22, t22 + adc z23, t23 + adc $0, t24; mov 15*8(%rsp), %rdx + + // Conditional subtraction of m + + mov t12, z0; sub m0, t12 + mov t13, z1; sbb m1, t13 + mov t14, z2; sbb m2, t14 + mov t15, z3; sbb m3, t15 + mov t16, z4; sbb m4, t16 + mov t17, z5; sbb m5, t17 + mov t18, z6; sbb m6, t18 + mov t19, z7; sbb m7, t19 + mov t20, z8; sbb m8, t20 + mov t21, z9; sbb m9, t21 + mov t22, z10; sbb m10, t22 + mov t23, z11; sbb m11, t23 + sbb $0, t24 + + cmovc z0, t12 + cmovc z1, t13 + cmovc z2, t14 + cmovc z3, t15 + cmovc z4, t16 + cmovc z5, t17 + cmovc z6, t18 + cmovc z7, t19 + cmovc z8, t20 + cmovc z9, t21 + cmovc z10, t22 + cmovc z11, t23 + + mov t12, 0*8(%rdx) + mov t13, 1*8(%rdx) + mov t14, 2*8(%rdx) + mov t15, 3*8(%rdx) + mov t16, 
4*8(%rdx) + mov t17, 5*8(%rdx) + mov t18, 6*8(%rdx); mov -6*8(%rax), %rbp + mov t19, 7*8(%rdx); mov -5*8(%rax), %rbx + mov t20, 8*8(%rdx); mov -4*8(%rax), %r12 + mov t21, 9*8(%rdx); mov -3*8(%rax), %r13 + mov t22, 10*8(%rdx); mov -2*8(%rax), %r14 + mov t23, 11*8(%rdx); mov -1*8(%rax), %r15 + + add $8*(1+13+2+24+6), %rsp + +#ifdef _WIN64 + mov 1*8(%rax), %rsi + mov 2*8(%rax), %rdi +#endif + ret + +// vim: noet ts=8 sw=8 \ No newline at end of file diff --git a/ff/bw6-assembly/modsub768.S b/ff/bw6-assembly/modsub768.S new file mode 100644 index 000000000..ccc8e7368 --- /dev/null +++ b/ff/bw6-assembly/modsub768.S @@ -0,0 +1,182 @@ +// void modsub768(const uint64_t x[12], const uint64_t y[12], const uint64_t m[13], uint64_t z[12]) + +#ifdef _WIN64 +# define x %rcx +# define y %rdx +# define m %r8 +# define z %r9 + +# define t2 %rdi +# define t3 %rsi +#else +# define x %rdi +# define y %rsi +# define m %rdx +# define z %rcx + +# define t2 %r9 +# define t3 %r8 +#endif + +#define t0 %r11 +#define t1 %r10 +#define t4 %r15 +#define t5 %r14 + +#define t6 %r13 +#define t7 %r12 +#define t8 %rbx +#define t9 %rax +#define t10 %rbp +#define t11 x +#define t12 z + +#define y0 0*8(y) +#define y1 1*8(y) +#define y2 2*8(y) +#define y3 3*8(y) +#define y4 4*8(y) +#define y5 5*8(y) +#define y6 6*8(y) +#define y7 7*8(y) +#define y8 8*8(y) +#define y9 9*8(y) +#define y10 10*8(y) +#define y11 11*8(y) + +#define m0 0*8(m) +#define m1 1*8(m) +#define m2 2*8(m) +#define m3 3*8(m) +#define m4 4*8(m) +#define m5 5*8(m) +#define m6 6*8(m) +#define m7 7*8(m) +#define m8 8*8(m) +#define m9 9*8(m) +#define m10 10*8(m) +#define m11 11*8(m) + +// We only use these after replacing y with z + +#define z0 0*8(y) +#define z1 1*8(y) +#define z2 2*8(y) +#define z3 3*8(y) +#define z4 4*8(y) +#define z5 5*8(y) +#define z6 6*8(y) +#define z7 7*8(y) +#define z8 8*8(y) +#define z9 9*8(y) +#define z10 10*8(y) +#define z11 11*8(y) + +.text + +#ifdef __APPLE__ +#define modsub768 _modsub768 +#endif + +.globl modsub768 +#ifndef __APPLE__ +#ifndef _WIN64 +.type modsub768, @function +#endif +#endif + +.p2align 6,,15 +modsub768: + + // Callee-saves + +#ifdef _WIN64 + mov %rsi, 1*8(%rsp) + mov %rdi, 2*8(%rsp) +#endif + // Load x + push %r15; mov 0*8(x), t0; mov 1*8(x), t1 + push %r14; mov 2*8(x), t2; mov 3*8(x), t3 + push %r13; mov 4*8(x), t4; mov 5*8(x), t5 + push %r12; mov 6*8(x), t6; mov 7*8(x), t7 + push %rbx; mov 8*8(x), t8; mov 9*8(x), t9 + push %rbp; mov 10*8(x), t10; mov 11*8(x), t11 + push z + + xor t12, t12 + sub y0, t0 + sbb y1, t1 + sbb y2, t2 + sbb y3, t3 + sbb y4, t4 + sbb y5, t5 + sbb y6, t6 + sbb y7, t7 + sbb y8, t8 + sbb y9, t9 + sbb y10, t10 + sbb y11, t11 + sbb $0, t12 // -1 if y>x + + // no more need for y. 
load z in its place + + pop y + + // Conditional addition of m + + mov t0, z0; add m0, t0 + mov t1, z1; adc m1, t1 + mov t2, z2; adc m2, t2 + mov t3, z3; adc m3, t3 + mov t4, z4; adc m4, t4 + mov t5, z5; adc m5, t5 + mov t6, z6; adc m6, t6 + mov t7, z7; adc m7, t7 + mov t8, z8; adc m8, t8 + mov t9, z9; adc m9, t9 + mov t10, z10; adc m10, t10 + mov t11, z11; adc m11, t11 + + add $1, t12 // sets carry if adding m is needed + + cmovnc z0, t0 + cmovnc z1, t1 + cmovnc z2, t2 + cmovnc z3, t3 + cmovnc z4, t4 + cmovnc z5, t5 + cmovnc z6, t6 + cmovnc z7, t7 + cmovnc z8, t8 + cmovnc z9, t9 + cmovnc z10, t10 + cmovnc z11, t11 + + mov t0, z0 + mov t1, z1 + mov t2, z2 + mov t3, z3 + mov t4, z4 + mov t5, z5 + mov t6, z6 + mov t7, z7 + mov t8, z8 + mov t9, z9 + mov t10, z10 + mov t11, z11 + +#ifdef _WIN64 + mov 7*8(%rsp), %rsi + mov 8*8(%rsp), %rdi +#endif + // Load x + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +// vim: noet ts=8 sw=8 diff --git a/ff/src/biginteger/macros.rs b/ff/src/biginteger/macros.rs index d1c370016..b71d2bfbc 100644 --- a/ff/src/biginteger/macros.rs +++ b/ff/src/biginteger/macros.rs @@ -277,6 +277,48 @@ macro_rules! bigint_impl { res } + + #[inline] + fn mul_no_reduce(this: &[u64], other: &[u64]) -> Self { + assert!(this.len() == $num_limbs / 2); + assert!(other.len() == $num_limbs / 2); + + let mut r = [0u64; $num_limbs]; + for i in 0..$num_limbs / 2 { + let mut carry = 0u64; + for j in 0..$num_limbs / 2 { + r[j + i] = + mac_with_carry!(r[j + i], this[i], other[j], &mut carry); + } + r[$num_limbs / 2 + i] = carry; + } + Self::new(r) + } + + #[inline] + fn mul_no_reduce_lo(this: &[u64], other: &[u64]) -> Self { + assert!(this.len() == $num_limbs); + assert!(other.len() == $num_limbs); + + let mut r = [0u64; $num_limbs]; + for i in 0..$num_limbs { + let mut carry = 0u64; + for j in 0..($num_limbs - i) { + r[j + i] = + mac_with_carry!(r[j + i], this[i], other[j], &mut carry); + } + } + Self::new(r) + } + + #[inline] + fn from_slice(slice: &[u64]) -> Self { + let mut repr = Self::default(); + for (limb, &value) in repr.0.iter_mut().zip(slice) { + *limb = value; + } + repr + } } impl CanonicalSerialize for $name { diff --git a/ff/src/biginteger/mod.rs b/ff/src/biginteger/mod.rs index 3f342e357..3aa2266cb 100644 --- a/ff/src/biginteger/mod.rs +++ b/ff/src/biginteger/mod.rs @@ -25,6 +25,7 @@ bigint_impl!(BigInteger128, 2); bigint_impl!(BigInteger256, 4); bigint_impl!(BigInteger320, 5); bigint_impl!(BigInteger384, 6); +bigint_impl!(BigInteger512, 6); bigint_impl!(BigInteger768, 12); bigint_impl!(BigInteger832, 13); @@ -137,4 +138,18 @@ pub trait BigInteger: *self = Self::read(reader)?; Ok(()) } + + /// Takes two slices of u64 representing big integers and returns a bigger + /// BigInteger of type Self representing their product. Preferably used + /// only for even NUM_LIMBS. We require the invariant that this.len() == + /// other.len() == NUM_LIMBS / 2 + fn mul_no_reduce(this: &[u64], other: &[u64]) -> Self; + + /// Similar to `mul_no_reduce` but accepts slices of len == NUM_LIMBS and + /// only returns lower half of the result + fn mul_no_reduce_lo(this: &[u64], other: &[u64]) -> Self; + + /// Copies data from a slice to Self in a len agnostic way, + // based on whichever of the two is shorter. 
+ fn from_slice(slice: &[u64]) -> Self; } diff --git a/ff/src/fields/arithmetic.rs b/ff/src/fields/arithmetic.rs index 82fdaef61..85e7977a6 100644 --- a/ff/src/fields/arithmetic.rs +++ b/ff/src/fields/arithmetic.rs @@ -1,14 +1,53 @@ +/// All of these methods store intermediate results on the stack, and so +/// they support overlap of input and output parameters. +#[cfg(use_bw6_asm)] +extern "C" { + pub fn modmul768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); + pub fn modadd768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); + pub fn modsub768(x: *const u64, y: *const u64, m: *const u64, z: *mut u64); +} /// This modular multiplication algorithm uses Montgomery /// reduction for efficient implementation. It also additionally /// uses the "no-carry optimization" outlined /// [here](https://hackmd.io/@zkteam/modular_multiplication) if -/// `P::MODULUS` has (a) a non-zero MSB, and (b) at least one +/// `P::MODULUS` has BOTH (a) a zero MSB, AND (b) at least one /// zero bit in the rest of the modulus. + macro_rules! impl_field_mul_assign { ($limbs:expr) => { #[inline] #[ark_ff_asm::unroll_for_loops] fn mul_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modmul768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } // Checking the modulus at compile time let first_bit_set = P::MODULUS.0[$limbs - 1] >> 63 != 0; // $limbs can be 1, hence we can run into a case with an unused mut. @@ -17,6 +56,7 @@ macro_rules! impl_field_mul_assign { for i in 1..$limbs { all_bits_set &= P::MODULUS.0[$limbs - i - 1] == !0u64; } + let _no_carry: bool = !(first_bit_set || all_bits_set); // No-carry optimisation applied to CIOS @@ -56,6 +96,93 @@ macro_rules! impl_field_mul_assign { }; } +macro_rules! impl_field_add_assign { + ($limbs:expr) => { + #[inline] + #[ark_ff_asm::unroll_for_loops] + fn add_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modadd768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } + // This cannot exceed the backing capacity. + self.0.add_nocarry(&other.0); + // However, it may need to be reduced + self.reduce(); + } + }; +} + +macro_rules! 
impl_field_sub_assign { + ($limbs:expr) => { + #[inline] + #[ark_ff_asm::unroll_for_loops] + fn sub_assign(&mut self, other: &Self) { + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modsub768( + ((self.0).0).as_ptr(), + ((other.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return; + } + } + } + // If `other` is larger than `self`, add the modulus to self first. + if other.0 > self.0 { + self.0.add_nocarry(&P::MODULUS); + } + self.0.sub_noborrow(&other.0); + } + }; +} + macro_rules! impl_field_into_repr { ($limbs:expr, $BigIntegerType:ty) => { #[inline] @@ -91,6 +218,37 @@ macro_rules! impl_field_square_in_place { *self = *self * *self; return self; } + #[cfg(use_bw6_asm)] + #[allow(unsafe_code, unused_mut, unconditional_panic)] + { + if $limbs == 12 { + unsafe { + let modulus_with_inv = [ + P::MODULUS.0[0], + P::MODULUS.0[1], + P::MODULUS.0[2], + P::MODULUS.0[3], + P::MODULUS.0[4], + P::MODULUS.0[5], + P::MODULUS.0[6], + P::MODULUS.0[7], + P::MODULUS.0[8], + P::MODULUS.0[9], + P::MODULUS.0[10], + P::MODULUS.0[11], + P::INV, + ]; + crate::fields::arithmetic::modmul768( + ((self.0).0).as_ptr(), + ((self.0).0).as_ptr(), + modulus_with_inv.as_ptr(), + ((self.0).0).as_mut_ptr(), + ); + return self; + } + } + } + #[cfg(use_asm)] #[allow(unsafe_code, unused_mut)] { diff --git a/ff/src/fields/macros.rs b/ff/src/fields/macros.rs index 8a33dc11f..9965ade4f 100644 --- a/ff/src/fields/macros.rs +++ b/ff/src/fields/macros.rs @@ -658,25 +658,12 @@ macro_rules! impl_Fp { impl_multiplicative_ops_from_ref!($Fp, $FpParameters); impl<'a, P: $FpParameters> AddAssign<&'a Self> for $Fp
<P>
{ - #[inline] - fn add_assign(&mut self, other: &Self) { - // This cannot exceed the backing capacity. - self.0.add_nocarry(&other.0); - // However, it may need to be reduced - self.reduce(); - } + impl_field_add_assign!($limbs); } impl<'a, P: $FpParameters> SubAssign<&'a Self> for $Fp
<P>
{ - #[inline] - fn sub_assign(&mut self, other: &Self) { - // If `other` is larger than `self`, add the modulus to self first. - if other.0 > self.0 { - self.0.add_nocarry(&P::MODULUS); - } - self.0.sub_noborrow(&other.0); - } - } + impl_field_sub_assign!($limbs); + } impl<'a, P: $FpParameters> MulAssign<&'a Self> for $Fp
<P>
{ impl_field_mul_assign!($limbs); diff --git a/ff/src/fields/mod.rs b/ff/src/fields/mod.rs index 678962868..c66fc112d 100644 --- a/ff/src/fields/mod.rs +++ b/ff/src/fields/mod.rs @@ -67,6 +67,31 @@ macro_rules! field_new { }; } +#[macro_export] +macro_rules! field_new_from_raw_repr { + ($name:ident, $c0:expr) => { + $name { + 0: $c0, + 1: core::marker::PhantomData, + } + }; + ($name:ident, $c0:expr, $c1:expr $(,)?) => { + $name { + c0: $c0, + c1: $c1, + _parameters: core::marker::PhantomData, + } + }; + ($name:ident, $c0:expr, $c1:expr, $c2:expr $(,)?) => { + $name { + c0: $c0, + c1: $c1, + c2: $c2, + _parameters: core::marker::PhantomData, + } + }; +} + /// The interface for a generic field. pub trait Field: ToBytes @@ -418,6 +443,11 @@ pub trait PrimeField: Self::Params::T_MINUS_ONE_DIV_TWO } + /// Returns the trace minus one divided by two. + fn modulus() -> Self::BigInt { + Self::Params::MODULUS + } + /// Returns the modulus minus one divided by two. fn modulus_minus_one_div_two() -> Self::BigInt { Self::Params::MODULUS_MINUS_ONE_DIV_TWO diff --git a/scripts/glv-lattice-basis/Cargo.toml b/scripts/glv-lattice-basis/Cargo.toml new file mode 100644 index 000000000..c9a045bda --- /dev/null +++ b/scripts/glv-lattice-basis/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "glv_lattice_basis" +version = "0.1.0" +authors = ["Jonathan Chuang"] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ark-bls12-381 = { git = "https://github.com/arkworks-rs/curves", features = ["curve"], default-features = false } +ark-ff = { path = "../../ff", default-features = false } +ark-ec = { path = "../../ec", default-features = false } +num-traits = { version = "0.2", default-features = false } +num-bigint = "0.4.0" + +[features] +default = [ "std" ] +std = [] diff --git a/scripts/glv-lattice-basis/LICENSE-APACHE b/scripts/glv-lattice-basis/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/scripts/glv-lattice-basis/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/scripts/glv-lattice-basis/LICENSE-MIT b/scripts/glv-lattice-basis/LICENSE-MIT new file mode 100644 index 000000000..72dc60d84 --- /dev/null +++ b/scripts/glv-lattice-basis/LICENSE-MIT @@ -0,0 +1,19 @@ +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/scripts/glv-lattice-basis/examples-rename/main.rs b/scripts/glv-lattice-basis/examples-rename/main.rs new file mode 100644 index 000000000..30fc643f4 --- /dev/null +++ b/scripts/glv-lattice-basis/examples-rename/main.rs @@ -0,0 +1,12 @@ +extern crate ark_bls12_381; + +use ark_bls12_381::G1Projective as GroupProjective; +use ark_ff::{ + BigInteger384 as BaseFieldBigInt, + BigInteger512 as FrWideBigInt, +}; +use glv_lattice_basis::*; + +fn main() { + print_glv_params::(); +} diff --git a/scripts/glv-lattice-basis/src/arithmetic.rs b/scripts/glv-lattice-basis/src/arithmetic.rs new file mode 100644 index 000000000..bf6cb9747 --- /dev/null +++ b/scripts/glv-lattice-basis/src/arithmetic.rs @@ -0,0 +1,34 @@ +use ark_ff::biginteger::BigInteger; + +// Naive long division +pub fn div_with_remainder( + numerator: BigInt, + divisor: BigInt, +) -> (BigInt, BigInt) { + assert!(divisor != BigInt::from(0), "Divisor cannot be zero"); + let mut remainder = numerator; + let mut quotient = BigInt::from(0); + + let div_num_bits = divisor.num_bits(); + + while remainder >= divisor { + let mut current_divisor = divisor; + let mut num_bits = 1 + remainder.num_bits() - div_num_bits; + current_divisor.muln(num_bits); + while current_divisor > remainder { + current_divisor.div2(); + num_bits -= 1; + } + remainder.sub_noborrow(¤t_divisor); + + let mut pow2_quot = BigInt::from(1); + pow2_quot.muln(num_bits); + quotient.add_nocarry(&pow2_quot); + } + + let mut reconstructed_numerator = + BigInt::mul_no_reduce_lo("ient.as_ref(), &divisor.as_ref()); + reconstructed_numerator.add_nocarry(&remainder); + assert_eq!(reconstructed_numerator, numerator); + (quotient, remainder) +} diff --git a/scripts/glv-lattice-basis/src/lib.rs b/scripts/glv-lattice-basis/src/lib.rs new file mode 100644 index 000000000..32ca76380 --- /dev/null +++ b/scripts/glv-lattice-basis/src/lib.rs @@ -0,0 +1,238 @@ +extern crate ark_ff; +extern crate ark_ec; +extern crate num_bigint; +extern crate num_traits; + +mod arithmetic; + +use num_bigint::BigUint; +use ark_ff::{BigInteger, Field, PrimeField}; +use ark_ec::ProjectiveCurve; +pub use arithmetic::*; +use num_traits::Zero; +use std::ops::Neg; + +/// Takes data from two endomorphisms and sorts out which corresponds to which +fn which_endo( + base_roots: (G::BaseField, G::BaseField), + scalar_roots: (G::ScalarField, G::ScalarField), +) -> ( + (G::BaseField, G::ScalarField), + (G::BaseField, G::ScalarField), +) { + // println!("{:?}, {:?}", base_roots, scalar_roots); + let g = G::prime_subgroup_generator(); + + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.0; + + let d1 = if g.mul(scalar_roots.0.into_repr()) == g_endo { + (base_roots.0, scalar_roots.0) + } else { + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.1; + assert!(g.mul(scalar_roots.0.into_repr()) == g_endo); + + (base_roots.1, scalar_roots.0) + }; + + let d2 = if g.mul(scalar_roots.1.into_repr()) == g_endo { + (base_roots.0, scalar_roots.1) + } else { + let mut g_endo = g; + *g_endo.get_x() *= &base_roots.1; + assert!(g.mul(scalar_roots.1.into_repr()) == g_endo); + + (base_roots.1, scalar_roots.1) + }; + + (d1, d2) +} + +fn cube_root_unity() -> (F, F) { + let char = B::from_slice(F::characteristic()); + let deg = F::extension_degree(); + let mut modulus = 
char; + for _ in 1..deg { + modulus = B::mul_no_reduce_lo(&modulus.as_ref(), &char.as_ref()); + } + + modulus.sub_noborrow(&B::from(1)); + let (q, r) = div_with_remainder(modulus, B::from(3)); + assert!(r == B::from(0)); + + let mut g = 2u32; + let mut root1 = F::one(); + loop { + if root1 != F::one() { + break; + } + let x = F::from(g); + root1 = x.pow(q); + g += 1; + } + let root2 = root1 * root1; + assert!(root1.pow(&[3]) == F::one()); + assert!(root2.pow(&[3]) == F::one()); + assert!(root1 != root2); + + (root1, root2) +} + +fn get_endo_data() -> (G::BaseField, G::ScalarField) { + which_endo::( + cube_root_unity::(), + cube_root_unity::::BigInt>(), + ) + .1 +} + +fn to_str(x: B) -> String { + BigUint::from_bytes_be(&x.to_bytes_be()[..]).to_string() +} + +pub fn print_glv_params() { + let (omega, lambda) = get_endo_data::(); + let g = G::prime_subgroup_generator(); + let mut g_endo = g; + *g_endo.get_x() *= ω + assert!(g.mul(lambda.into_repr()) == g_endo); + + println!("const OMEGA: Self::BaseField = {:?};", omega); + let n = ::modulus(); + println!("const LAMBDA: Self::ScalarField = {:?};", to_str(lambda.into_repr())); + + let vecs = get_lattice_basis::(n, lambda.into_repr()); + + // We check that `(|B1| + 2) * (|B2| + 2) < 2n` + // and `B_i^2 < 2n` e.g. `|B_i| < \sqrt{2n}$ + // We use this to prove some bounds later + let wide_modulus = WideBigInt::from_slice(&n.as_ref()[..]); + let two_modulus = WideBigInt::mul_no_reduce_lo( + &wide_modulus.as_ref()[..], + &WideBigInt::from(2).as_ref()[..], + ); + + let mut b1 = ((vecs.0).1).1; + let mut b2 = ((vecs.1).1).1; + let two = ::BigInt::from(2); + let b1b1 = WideBigInt::mul_no_reduce(&b1.as_ref()[..], &b1.as_ref()[..]); + let b2b2 = WideBigInt::mul_no_reduce(&b2.as_ref()[..], &b2.as_ref()[..]); + + b1.add_nocarry(&two); + b2.add_nocarry(&two); + let b1b2 = WideBigInt::mul_no_reduce(&b1.as_ref()[..], &b2.as_ref()[..]); + + assert!(b1b1 < two_modulus); + assert!(b2b2 < two_modulus); + assert!(b1b2 < two_modulus); + + for (i, vec) in [vecs.0, vecs.1].iter().enumerate() { + let (s1, (flag, t1)) = vec; + + let mut t1_big = WideBigInt::from_slice(t1.as_ref()); + let n_big = WideBigInt::from_slice(n.as_ref()); + t1_big.muln(::BigInt::NUM_LIMBS as u32 * 64); + let (g1_big, _) = div_with_remainder::(t1_big, n_big); + let g1 = ::BigInt::from_slice(g1_big.as_ref()); + + println!("/// |round(B{} * R / n)|", i + 1); + println!( + "const Q{}: ::BigInt = {:?};", + ((i + 1) % 2) + 1, + to_str(g1) + ); + println!( + "const B{}: ::BigInt = {:?};", + i + 1, + to_str(*t1) + ); + println!("const B{}_IS_NEG: bool = {:?};", i + 1, flag); + + debug_assert_eq!( + recompose_integer( + G::ScalarField::from_repr(*s1).unwrap(), + if !flag { + G::ScalarField::from_repr(*t1).unwrap() + } else { + G::ScalarField::from_repr(*t1).unwrap().neg() + }, + lambda + ), + G::ScalarField::zero() + ); + } + println!( + "const R_BITS: u32 = {:?};", + ::BigInt::NUM_LIMBS * 64 + ); +} + +// We work on arrays of size 3 +// We assume that |E(F_q)| < R = 2^{ceil(limbs/2) * 64} +pub fn get_lattice_basis( + n: F::BigInt, + lambda: F::BigInt, +) -> ( + (F::BigInt, (bool, F::BigInt)), + (F::BigInt, (bool, F::BigInt)), +) { + let mut r = [n, lambda, n]; + let one = F::one(); + let zero = F::zero(); + let mut t: [F; 3] = [zero, one, zero]; + let max_num_bits_lattice = (F::BigInt::from_slice(F::characteristic()).num_bits() - 1) / 2 + 1; + + // We can use an approximation as we are merely using a heuristic. 
We should + // check that the parameters obtained from this heuristic satisfies the + // required conditions separately. + let sqrt_n = as_f64(n.as_ref()).sqrt(); + + // println!("Log sqrtn: {}", sqrt_n.log2()); + + let mut i = 0; + // While r_i >= sqrt(n), we perform the extended euclidean algorithm so that + // si*n + ti*lambda = ri then return the vectors (r_i, (sign(t_i), |t_i|)), + // (r_i+1, (sign(t_i+1), |t_i+1|)) Notice this makes ri + (-ti)*lambda = 0 + // mod n, which is what we desire for our short lattice basis + while as_f64(r[(i + 1) % 3].as_ref()) >= sqrt_n { + // while i < 20 { + let (q, rem): (F::BigInt, F::BigInt) = + div_with_remainder::(r[i % 3], r[(i + 1) % 3]); + r[(i + 2) % 3] = rem; + let int_q = F::from_repr(q).unwrap(); + t[(i + 2) % 3] = t[i % 3] - int_q * (t[(i + 1) % 3]); + + i += 1; + } + let just_computed = (i + 1) % 3; + // We reverse the signs due to s_i*n = r_i - t_i*LAMBDA + let (neg_flag1, t1) = if t[just_computed].into_repr().num_bits() <= max_num_bits_lattice { + (true, t[just_computed].into_repr()) + } else { + (false, t[just_computed].neg().into_repr()) + }; + let vec_1 = (r[just_computed], (neg_flag1, t1)); + + let prev = i % 3; + let (neg_flag2, t2) = if t[prev].into_repr().num_bits() <= max_num_bits_lattice { + (true, t[prev].into_repr()) + } else { + (false, t[prev].neg().into_repr()) + }; + let vec_2 = (r[prev], (neg_flag2, t2)); + + (vec_1, vec_2) +} + +pub fn recompose_integer(k1: F, k2: F, lambda: F) -> F { + k1 + &(k2 * &lambda) +} + +fn as_f64(bigint_ref: &[u64]) -> f64 { + let mut n_float: f64 = 0.0; + for (i, limb) in bigint_ref.iter().enumerate() { + n_float += (*limb as f64) * 2f64.powf((i as f64) * 64f64) + } + n_float +} diff --git a/scripts/to_dec_str.py b/scripts/to_dec_str.py new file mode 100644 index 000000000..828b4057e --- /dev/null +++ b/scripts/to_dec_str.py @@ -0,0 +1,14 @@ +def from_u64_slice_to_decimal_str(x): + ret = 0 + for i, limb in enumerate(x): + print(i) + print(ret) + ret += 2 ** (i*64) * limb + return ret + +print(from_u64_slice_to_decimal_str([ + 7865245318337523249, + 18346590209729131401, + 15545362854776399464, + 6505881510324251116, +])) diff --git a/serialize/Cargo.toml b/serialize/Cargo.toml index ca7e6eadf..cef3e52b7 100644 --- a/serialize/Cargo.toml +++ b/serialize/Cargo.toml @@ -15,6 +15,7 @@ edition = "2018" [dependencies] ark-serialize-derive = { path = "../serialize-derive", optional = true } ark-std = { git = "https://github.com/arkworks-rs/utils", default-features = false } +paste = "0.1" [features] default = [] diff --git a/serialize/src/lib.rs b/serialize/src/lib.rs index a375c5752..3b5db4fcc 100644 --- a/serialize/src/lib.rs +++ b/serialize/src/lib.rs @@ -12,6 +12,7 @@ use ark_std::{ rc::Rc, string::String, vec::Vec, + vec, }; pub use error::*; pub use flags::*; @@ -58,6 +59,7 @@ pub trait CanonicalSerializeWithFlags: CanonicalSerialize { /// /// If your code depends on `algebra` instead, the example works analogously /// when importing `algebra::serialize::*`. + pub trait CanonicalSerialize { /// Serializes `self` into `writer`. /// It is left up to a particular type for how it strikes the @@ -74,17 +76,26 @@ pub trait CanonicalSerialize { fn serialized_size(&self) -> usize; + /// Serializes `self` into `writer` with compression, and without + /// performing validity checks. Should be used *only* when there is no + /// danger of adversarial manipulation of the output. 
+ #[inline] + fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { + self.serialize(writer) + } + /// Serializes `self` into `writer` without compression. #[inline] fn serialize_uncompressed(&self, writer: W) -> Result<(), SerializationError> { self.serialize(writer) } - /// Serializes `self` into `writer` without compression, and without - /// performing validity checks. Should be used *only* when there is no - /// danger of adversarial manipulation of the output. + /// Serializes `self` into `writer` without compression. #[inline] - fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { + fn serialize_uncompressed_unchecked( + &self, + writer: W, + ) -> Result<(), SerializationError> { self.serialize_uncompressed(writer) } @@ -126,6 +137,13 @@ pub trait CanonicalDeserialize: Sized { /// Reads `Self` from `reader`. fn deserialize(reader: R) -> Result; + /// Reads `self` from `reader` with compression, and without performing + /// validity checks. Should be used *only* when the input is trusted. + #[inline] + fn deserialize_unchecked(reader: R) -> Result { + Self::deserialize(reader) + } + /// Reads `Self` from `reader` without compression. #[inline] fn deserialize_uncompressed(reader: R) -> Result { @@ -135,7 +153,7 @@ pub trait CanonicalDeserialize: Sized { /// Reads `self` from `reader` without compression, and without performing /// validity checks. Should be used *only* when the input is trusted. #[inline] - fn deserialize_unchecked(reader: R) -> Result { + fn deserialize_uncompressed_unchecked(reader: R) -> Result { Self::deserialize_uncompressed(reader) } } @@ -193,6 +211,23 @@ impl CanonicalDeserialize for usize { } } +macro_rules! impl_serialize_for_slice { + ($($name:ident),*) => { + $( + #[inline] + fn $name(&self, mut writer: W) -> Result<(), SerializationError> { + let len = self.len() as u64; + len.serialize(&mut writer)?; + for item in self.iter() { + item.$name(&mut writer)?; + } + Ok(()) + } + )* + } +} + + // Implement Serialization for `String` // It is serialized by obtaining its byte representation as a Vec and // serializing that. This yields an end serialization of @@ -217,16 +252,14 @@ impl CanonicalDeserialize for String { } } + impl CanonicalSerialize for [T] { - #[inline] - fn serialize(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize(&mut writer)?; - } - Ok(()) - } + impl_serialize_for_slice!( + serialize, + serialize_unchecked, + serialize_uncompressed, + serialize_uncompressed_unchecked + ); #[inline] fn serialized_size(&self) -> usize { @@ -236,26 +269,6 @@ impl CanonicalSerialize for [T] { .sum::() } - #[inline] - fn serialize_uncompressed(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize_uncompressed(&mut writer)?; - } - Ok(()) - } - - #[inline] - fn serialize_unchecked(&self, mut writer: W) -> Result<(), SerializationError> { - let len = self.len() as u64; - len.serialize(&mut writer)?; - for item in self.iter() { - item.serialize_unchecked(&mut writer)?; - } - Ok(()) - } - #[inline] fn uncompressed_size(&self) -> usize { 8 + self @@ -265,63 +278,59 @@ impl CanonicalSerialize for [T] { } } -impl CanonicalSerialize for Vec { - #[inline] - fn serialize(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize(writer) +macro_rules! 
impl_serialize_for_vec { + ($($name:ident),*) => { + $( + #[inline] + fn $name(&self, writer: W) -> Result<(), SerializationError> { + self.as_slice().$name(writer) + } + )* } +} + +impl CanonicalSerialize for Vec { + impl_serialize_for_vec!( + serialize, + serialize_unchecked, + serialize_uncompressed, + serialize_uncompressed_unchecked + ); #[inline] fn serialized_size(&self) -> usize { self.as_slice().serialized_size() } - #[inline] - fn serialize_uncompressed(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize_uncompressed(writer) - } - - #[inline] - fn serialize_unchecked(&self, writer: W) -> Result<(), SerializationError> { - self.as_slice().serialize_unchecked(writer) - } - #[inline] fn uncompressed_size(&self) -> usize { self.as_slice().uncompressed_size() } } -impl CanonicalDeserialize for Vec { - #[inline] - fn deserialize(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize(&mut reader)?); - } - Ok(values) - } - - #[inline] - fn deserialize_uncompressed(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize_uncompressed(&mut reader)?); - } - Ok(values) +macro_rules! impl_deserialize_for_vec { + ($($name:ident),*) => { + $( + #[inline] + fn $name(mut reader: R) -> Result { + let len = u64::deserialize(&mut reader)?; + let mut values = vec![]; + for _ in 0..len { + values.push(T::$name(&mut reader)?); + } + Ok(values) + } + )* } +} - #[inline] - fn deserialize_unchecked(mut reader: R) -> Result { - let len = u64::deserialize(&mut reader)?; - let mut values = Vec::new(); - for _ in 0..len { - values.push(T::deserialize_unchecked(&mut reader)?); - } - Ok(values) - } +impl CanonicalDeserialize for Vec { + impl_deserialize_for_vec!( + deserialize, + deserialize_unchecked, + deserialize_uncompressed, + deserialize_uncompressed_unchecked + ); } #[inline] @@ -876,6 +885,19 @@ mod test { } } + macro_rules! impl_test { + ($data:ident, $(($name:ident, $size:ident)),*) => { + $( + paste::item! { + let mut serialized = vec![0; $data.[< $size _size>]()]; + $data.[< serialize_ $name >](&mut serialized[..]).unwrap(); + let de = T::[< deserialize_ $name >](&serialized[..]).unwrap(); + assert_eq!($data, de); + } + )* + } + } + fn test_serialize< T: PartialEq + core::fmt::Debug + CanonicalSerialize + CanonicalDeserialize, >( @@ -886,15 +908,12 @@ mod test { let de = T::deserialize(&serialized[..]).unwrap(); assert_eq!(data, de); - let mut serialized = vec![0; data.uncompressed_size()]; - data.serialize_uncompressed(&mut serialized[..]).unwrap(); - let de = T::deserialize_uncompressed(&serialized[..]).unwrap(); - assert_eq!(data, de); - - let mut serialized = vec![0; data.uncompressed_size()]; - data.serialize_unchecked(&mut serialized[..]).unwrap(); - let de = T::deserialize_unchecked(&serialized[..]).unwrap(); - assert_eq!(data, de); + impl_test!( + data, + (unchecked, serialized), + (uncompressed, uncompressed), + (uncompressed_unchecked, uncompressed) + ); } // Serialize T, randomly mutate the data, and deserialize it. 
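A minimal round-trip sketch of the serialization surface after the hunk above (not part of the patch): `serialize_unchecked`/`deserialize_unchecked` now follow the compressed path, `serialize_uncompressed_unchecked`/`deserialize_uncompressed_unchecked` cover the uncompressed one, and the slice/Vec impls are macro-generated so all four modes stay in sync. The sketch assumes only the method names declared in this diff and the existing `u64` impls, using `&mut [u8]`/`&[u8]` as writer and reader as in the test helpers above.

use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};

fn roundtrip_all_modes() {
    let data: Vec<u64> = vec![1, 2, 3, 4];

    // Compressed, with validity checks on deserialization.
    let mut buf = vec![0u8; data.serialized_size()];
    data.serialize(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize(&buf[..]).unwrap());

    // Compressed, skipping validity checks (trusted input only).
    let mut buf = vec![0u8; data.serialized_size()];
    data.serialize_unchecked(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize_unchecked(&buf[..]).unwrap());

    // Uncompressed, with and without validity checks.
    let mut buf = vec![0u8; data.uncompressed_size()];
    data.serialize_uncompressed(&mut buf[..]).unwrap();
    assert_eq!(data, Vec::<u64>::deserialize_uncompressed(&buf[..]).unwrap());

    let mut buf = vec![0u8; data.uncompressed_size()];
    data.serialize_uncompressed_unchecked(&mut buf[..]).unwrap();
    assert_eq!(
        data,
        Vec::<u64>::deserialize_uncompressed_unchecked(&buf[..]).unwrap()
    );
}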
diff --git a/test-curves/Cargo.toml b/test-curves/Cargo.toml index 61093dfd0..da48978d6 100644 --- a/test-curves/Cargo.toml +++ b/test-curves/Cargo.toml @@ -9,10 +9,15 @@ edition = "2018" publish = false [dependencies] +paste = "0.1" ark-std = { git = "https://github.com/arkworks-rs/utils", default-features = false } ark-ff = { path = "../ff", default-features = false } ark-ec = { path = "../ec", default-features = false } + +lazy_static = { version = "1.4.0", optional = true } +accel = { git = "https://github.com/jon-chuang/accel", package = "accel", optional = true } + [dev-dependencies] ark-serialize = { path = "../serialize", default-features = false } ark-algebra-test-templates = { path = "../test-templates", default-features = false } @@ -20,8 +25,8 @@ ark-algebra-test-templates = { path = "../test-templates", default-features = fa [features] default = [] +cuda = [ "ark-ec/cuda", "accel", "lazy_static", "ark-ec/std" ] asm = [ "ark-ff/asm" ] - parallel = [ "ark-ff/parallel", "ark-ec/parallel", "ark-std/parallel" ] bls12_381_scalar_field = [] @@ -33,4 +38,4 @@ mnt4_753_curve = [ "mnt4_753_scalar_field", "mnt4_753_base_field" ] bn384_small_two_adicity_scalar_field = [] bn384_small_two_adicity_base_field = [] -bn384_small_two_adicity_curve = [ "bn384_small_two_adicity_scalar_field", "bn384_small_two_adicity_base_field" ] \ No newline at end of file +bn384_small_two_adicity_curve = [ "bn384_small_two_adicity_scalar_field", "bn384_small_two_adicity_base_field" ] diff --git a/test-curves/src/bls12_381/g1.rs b/test-curves/src/bls12_381/g1.rs index 0e8391f91..64fabeec3 100644 --- a/test-curves/src/bls12_381/g1.rs +++ b/test-curves/src/bls12_381/g1.rs @@ -1,9 +1,14 @@ use crate::bls12_381::*; use ark_ec::{ + impl_glv_for_sw, impl_scalar_mul_kernel_glv, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, + GLVParameters, +}; +use ark_ff::{ + biginteger::{BigInteger256, BigInteger384, BigInteger512}, + field_new, field_new_from_raw_repr, PrimeField, Zero, }; -use ark_ff::{field_new, Zero}; pub type G1Affine = GroupAffine; pub type G1Projective = GroupProjective; @@ -16,6 +21,41 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel_glv!(bls12_381, "ark-bls12-381", g1, G1Projective); + +impl GLVParameters for Parameters { + type WideBigInt = BigInteger512; + const OMEGA: Self::BaseField = field_new_from_raw_repr!( + Fq, + BigInteger384([ + 3526659474838938856, + 17562030475567847978, + 1632777218702014455, + 14009062335050482331, + 3906511377122991214, + 368068849512964448, + ]) + ); + const LAMBDA: Self::ScalarField = field_new_from_raw_repr!( + Fr, + BigInteger256([ + 7865245318337523249, + 18346590209729131401, + 15545362854776399464, + 6505881510324251116, + ]) + ); + /// |round(B1 * R / n)| + const Q2: ::BigInt = + BigInteger256([7203196592358157870, 8965520006802549469, 1, 0]); + const B1: ::BigInt = + BigInteger256([4294967295, 12413508272118670338, 0, 0]); + const B1_IS_NEG: bool = true; + /// |round(B2 * R / n)| + const Q1: ::BigInt = BigInteger256([2, 0, 0, 0]); + const B2: ::BigInt = BigInteger256([1, 0, 0, 0]); + const R_BITS: u32 = 256; +} impl SWModelParameters for Parameters { /// COEFF_A = 0 const COEFF_A: Fq = field_new!(Fq, "0"); @@ -40,6 +80,9 @@ impl SWModelParameters for Parameters { fn mul_by_a(_: &Self::BaseField) -> Self::BaseField { Self::BaseField::zero() } + + impl_scalar_mul_parameters!(G1Projective); + impl_glv_for_sw!(); } /// G1_GENERATOR_X = diff --git 
a/test-curves/src/bn384_small_two_adicity/g1.rs b/test-curves/src/bn384_small_two_adicity/g1.rs index aadf08151..27fc71349 100644 --- a/test-curves/src/bn384_small_two_adicity/g1.rs +++ b/test-curves/src/bn384_small_two_adicity/g1.rs @@ -1,4 +1,5 @@ use ark_ec::{ + impl_glv_for_sw, impl_scalar_mul_kernel, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, }; @@ -12,6 +13,8 @@ pub type G1Projective = GroupProjective; #[derive(Clone, Default, PartialEq, Eq)] pub struct Parameters; +impl_scalar_mul_kernel!(bn384, "ark-bn384", g1, G1Projective); + impl ModelParameters for Parameters { type BaseField = Fq; type ScalarField = Fr; @@ -38,6 +41,8 @@ impl SWModelParameters for Parameters { fn mul_by_a(_: &Self::BaseField) -> Self::BaseField { Self::BaseField::zero() } + + impl_scalar_mul_parameters!(G1Projective); } /// G1_GENERATOR_X = -1 diff --git a/test-curves/src/mnt4_753/g1.rs b/test-curves/src/mnt4_753/g1.rs index 71863ead0..1bbfe0d80 100644 --- a/test-curves/src/mnt4_753/g1.rs +++ b/test-curves/src/mnt4_753/g1.rs @@ -1,4 +1,5 @@ use ark_ec::{ + impl_scalar_mul_kernel, impl_scalar_mul_parameters, models::{ModelParameters, SWModelParameters}, short_weierstrass_jacobian::*, }; @@ -17,6 +18,8 @@ impl ModelParameters for Parameters { type ScalarField = Fr; } +impl_scalar_mul_kernel!(mnt4_753, "ark-mnt4-753", g1, G1Projective); + impl SWModelParameters for Parameters { /// COEFF_A = 2 #[rustfmt::skip] @@ -38,6 +41,8 @@ impl SWModelParameters for Parameters { /// AFFINE_GENERATOR_COEFFS = (G1_GENERATOR_X, G1_GENERATOR_Y) const AFFINE_GENERATOR_COEFFS: (Self::BaseField, Self::BaseField) = (G1_GENERATOR_X, G1_GENERATOR_Y); + + impl_scalar_mul_parameters!(G1Projective); } // Generator of G1
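A minimal sanity check of the GLV constants added for the bls12-381 G1 parameters earlier in this diff (not part of the patch): the endomorphism phi(x, y) = (OMEGA * x, y) should act as multiplication by LAMBDA on the prime-order subgroup, which is the relation `print_glv_params` asserts when deriving these tables. The `ark_test_curves::bls12_381` import paths below are assumed re-exports of the g1 module shown above, and the three-argument `GroupAffine::new(x, y, infinity)` constructor is assumed from the arkworks API of this era.

use ark_ec::{AffineCurve, GLVParameters, ProjectiveCurve};
use ark_ff::PrimeField;
use ark_test_curves::bls12_381::{g1::Parameters, G1Affine};

fn check_bls12_381_glv_endo() {
    // Generator of the prime-order subgroup.
    let g = G1Affine::prime_subgroup_generator();
    // phi(P): scale the x-coordinate by the cube root of unity OMEGA.
    let phi_g = G1Affine::new(g.x * <Parameters as GLVParameters>::OMEGA, g.y, false);
    // [LAMBDA]P via ordinary scalar multiplication.
    let lambda_g = g
        .mul(<Parameters as GLVParameters>::LAMBDA.into_repr())
        .into_affine();
    assert_eq!(phi_g, lambda_g);
}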