diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b309b9aa..d8ceb0f5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,6 +34,25 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  compat:  # Builds the crate (including tests) for each WASM target in the matrix.
+    if: github.event.pull_request.draft == false
+    name: Wasm-compatibility
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - wasm32-unknown-unknown
+          - wasm32-wasi
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+
+      - name: Download WASM targets
+        run: rustup target add "${{ matrix.target }}"
+      # We run the WASM build with `--tests`: it still compiles the lib, and it
+      # allows us to keep `getrandom` as a dev-dependency only.
+      - name: Build
+        run: cargo build --tests --release --features "bn256-table derive_serde prefetch" --target "${{ matrix.target }}"
   test:
     if: github.event.pull_request.draft == false
     name: Test
@@ -41,8 +60,10 @@ jobs:
     strategy:
       matrix:
         include:
-          - feature:
           - feature: default
+          - feature: bn256-table
+          - feature: derive_serde
+          - feature: asm
     steps:
       - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1
diff --git a/Cargo.toml b/Cargo.toml
index a5c1730e..0810983f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "halo2curves-axiom"
-version = "0.5.3"
+version = "0.6.1"
 authors = ["Privacy Scaling Explorations team", "Taiko Labs", "Intrinsic Technologies"]
 license = "MIT/Apache-2.0"
 edition = "2021"
@@ -19,6 +19,11 @@ hex = "0.4"
 rand_chacha = "0.3.1"
 sha3 = "0.10.8"
 
+# Added to make sure we are able to build the lib for WASM in the CI.
+# Note that, as a dev-dependency, this is never pulled in by crates using this lib as a dependency.
+[target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dev-dependencies]
+getrandom = { version = "0.2", features = ["js"] }
+
 [dependencies]
 subtle = "2.4"
 ff = { version = "0.13.0", default-features = false, features = ["std"] }
@@ -36,13 +41,13 @@ serde = { version = "1.0", default-features = false, optional = true }
 serde_arrays = { version = "0.1.0", optional = true }
 hex = { version = "0.4", optional = true, default-features = false, features = ["alloc", "serde"] }
 blake2b_simd = "1"
-maybe-rayon = { version = "0.1.0", default-features = false }
+rayon = "1.8"
 digest = "0.10.7"
 sha2 = "0.10.8"
+unroll = "0.1.5"
 
 [features]
-default = ["bits", "multicore", "bn256-table", "derive_serde"]
-multicore = ["maybe-rayon/threads"]
+default = ["bits", "bn256-table", "derive_serde"]
 asm = []
 bits = ["ff/bits"]
 bn256-table = []
diff --git a/README.md b/README.md
index 3ede071c..a7057af1 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,19 @@ The implementations were originally ported from [matterlabs/pairing](https://git
 * Various features related to serialization and deserialization of curve points and field elements.
 * Curve-specific optimizations and benchmarking capabilities.
 
+## Controlling parallelism
+
+`halo2curves` currently uses [rayon](https://github.com/rayon-rs/rayon) for parallel
+computation. 
+
+The `RAYON_NUM_THREADS` environment variable can be used to set the number of
+threads.
+
+When compiling to WASM targets, note that since version `1.7`, `rayon` falls back automatically (with no need to handle features) to requiring `getrandom` in order to work.
+More information on WASM compilation is available in the `rayon` documentation.
+
+See: [Rayon: Usage with WebAssembly](https://github.com/rayon-rs/rayon#usage-with-webassembly) for more info.
+
 ## Benchmarks
 
 Benchmarking is supported through the use of Rust's built-in test framework. Benchmarks can be run without assembly optimizations:
diff --git a/rust-toolchain b/rust-toolchain
index 832e9afb..dc87e8af 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1 +1 @@
-1.70.0
+1.74.0
diff --git a/src/arithmetic.rs b/src/arithmetic.rs
index 4575d5e3..b88adeb5 100644
--- a/src/arithmetic.rs
+++ b/src/arithmetic.rs
@@ -45,6 +45,30 @@ pub(crate) const fn macx(a: u64, b: u64, c: u64) -> (u64, u64) {
     (res as u64, (res >> 64) as u64)
 }
 
+/// Returns `true` iff `a >= b`, comparing limbs from index 3 (most significant) down to index 0.
+#[inline(always)]
+pub(crate) const fn bigint_geq(a: &[u64; 4], b: &[u64; 4]) -> bool {
+    if a[3] > b[3] {
+        return true;
+    } else if a[3] < b[3] {
+        return false;
+    }
+    if a[2] > b[2] {
+        return true;
+    } else if a[2] < b[2] {
+        return false;
+    }
+    if a[1] > b[1] {
+        return true;
+    } else if a[1] < b[1] {
+        return false;
+    }
+    if a[0] >= b[0] { // all higher limbs are equal here, so the lowest limb decides (equality counts)
+        return true;
+    }
+    false
+}
+
 /// Compute a * b, returning the result.
 #[inline(always)]
 pub(crate) fn mul_512(a: [u64; 4], b: [u64; 4]) -> [u64; 8] {
diff --git a/src/bn256/fq.rs b/src/bn256/fq.rs
index 23da849a..8f96ded8 100644
--- a/src/bn256/fq.rs
+++ b/src/bn256/fq.rs
@@ -3,7 +3,7 @@ use crate::bn256::assembly::field_arithmetic_asm;
 #[cfg(not(feature = "asm"))]
 use crate::{arithmetic::macx, field_arithmetic, field_specific};
 
-use crate::arithmetic::{adc, mac, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use crate::{
diff --git a/src/bn256/fr.rs b/src/bn256/fr.rs
index c256f488..bd418b14 100644
--- a/src/bn256/fr.rs
+++ b/src/bn256/fr.rs
@@ -18,7 +18,7 @@ pub use table::FR_TABLE;
 #[cfg(not(feature = "bn256-table"))]
 use crate::impl_from_u64;
 
-use crate::arithmetic::{adc, mac, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use crate::{
diff --git a/src/derive/field.rs b/src/derive/field.rs
index 8d4ef783..24f95ccf 100644
--- a/src/derive/field.rs
+++ b/src/derive/field.rs
@@ -63,73 +63,88 @@ macro_rules! field_common {
                 $crate::ff_ext::jacobi::jacobi::<5>(&self.0, &$modulus.0)
             }
 
-            #[cfg(feature = "asm")]
             const fn montgomery_form(val: [u64; 4], r: $field) -> $field {
                 // Converts a 4 64-bit limb value into its congruent field representation.
                 // If `val` representes a 256 bit value then `r` should be R^2,
                 // if `val` represents the 256 MSB of a 512 bit value, then `r` should be R^3.
 
-                let (r0, carry) = mac(0, val[0], r.0[0], 0);
-                let (r1, carry) = mac(0, val[0], r.0[1], carry);
-                let (r2, carry) = mac(0, val[0], r.0[2], carry);
-                let (r3, r4) = mac(0, val[0], r.0[3], carry);
-
-                let (r1, carry) = mac(r1, val[1], r.0[0], 0);
-                let (r2, carry) = mac(r2, val[1], r.0[1], carry);
-                let (r3, carry) = mac(r3, val[1], r.0[2], carry);
-                let (r4, r5) = mac(r4, val[1], r.0[3], carry);
-
-                let (r2, carry) = mac(r2, val[2], r.0[0], 0);
-                let (r3, carry) = mac(r3, val[2], r.0[1], carry);
-                let (r4, carry) = mac(r4, val[2], r.0[2], carry);
-                let (r5, r6) = mac(r5, val[2], r.0[3], carry);
-
-                let (r3, carry) = mac(r3, val[3], r.0[0], 0);
-                let (r4, carry) = mac(r4, val[3], r.0[1], carry);
-                let (r5, carry) = mac(r5, val[3], r.0[2], carry);
-                let (r6, r7) = mac(r6, val[3], r.0[3], carry);
-
-                // Montgomery reduction
-                let k = r0.wrapping_mul($inv);
-                let (_, carry) = mac(r0, k, $modulus.0[0], 0);
-                let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
-                let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
-                let (r4, carry2) = adc(r4, 0, carry);
-
-                let k = r1.wrapping_mul($inv);
-                let (_, carry) = mac(r1, k, $modulus.0[0], 0);
-                let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
-                let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
-                let (r5, carry2) = adc(r5, carry2, carry);
-
-                let k = r2.wrapping_mul($inv);
-                let (_, carry) = mac(r2, k, $modulus.0[0], 0);
-                let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
-                let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
-                let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
-                let (r6, carry2) = adc(r6, carry2, carry);
-
-                let k = r3.wrapping_mul($inv);
-                let (_, carry) = mac(r3, k, $modulus.0[0], 0);
-                let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
-                let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
-                let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
-                let (r7, carry2) = adc(r7, carry2, carry);
-
-                // Result may be within MODULUS of the correct value
-                let (d0, borrow) = sbb(r4, $modulus.0[0], 0);
-                let (d1, borrow) = sbb(r5, $modulus.0[1], borrow);
-                let (d2, borrow) = sbb(r6, $modulus.0[2], borrow);
-                let (d3, borrow) = sbb(r7, $modulus.0[3], borrow);
-                let (_, borrow) = sbb(carry2, 0, borrow);
-                let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0);
-                let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry);
-                let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry);
-                let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry);
+                #[cfg(feature = "asm")]
+                {
+                    let (r0, carry) = mac(0, val[0], r.0[0], 0);
+                    let (r1, carry) = mac(0, val[0], r.0[1], carry);
+                    let (r2, carry) = mac(0, val[0], r.0[2], carry);
+                    let (r3, r4) = mac(0, val[0], r.0[3], carry);
+
+                    let (r1, carry) = mac(r1, val[1], r.0[0], 0);
+                    let (r2, carry) = mac(r2, val[1], r.0[1], carry);
+                    let (r3, carry) = mac(r3, val[1], r.0[2], carry);
+                    let (r4, r5) = mac(r4, val[1], r.0[3], carry);
+
+                    let (r2, carry) = mac(r2, val[2], r.0[0], 0);
+                    let (r3, carry) = mac(r3, val[2], r.0[1], carry);
+                    let (r4, carry) = mac(r4, val[2], r.0[2], carry);
+                    let (r5, r6) = mac(r5, val[2], r.0[3], carry);
+
+                    let (r3, carry) = mac(r3, val[3], r.0[0], 0);
+                    let (r4, carry) = mac(r4, val[3], r.0[1], carry);
+                    let (r5, carry) = mac(r5, val[3], r.0[2], carry);
+                    let (r6, r7) = mac(r6, val[3], r.0[3], carry);
+
+                    // Montgomery reduction
+                    let k = r0.wrapping_mul($inv);
+                    let (_, carry) = mac(r0, k, $modulus.0[0], 0);
+                    let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
+                    let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
+                    let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
+                    let (r4, carry2) = adc(r4, 0, carry);
+
+                    let k = r1.wrapping_mul($inv);
+                    let (_, carry) = mac(r1, k, $modulus.0[0], 0);
+                    let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
+                    let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
+                    let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
+                    let (r5, carry2) = adc(r5, carry2, carry);
+
+                    let k = r2.wrapping_mul($inv);
+                    let (_, carry) = mac(r2, k, $modulus.0[0], 0);
+                    let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
+                    let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
+                    let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
+                    let (r6, carry2) = adc(r6, carry2, carry);
+
+                    let k = r3.wrapping_mul($inv);
+                    let (_, carry) = mac(r3, k, $modulus.0[0], 0);
+                    let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
+                    let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
+                    let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
+                    let (r7, carry2) = adc(r7, carry2, carry);
+
+                    // Result may be within MODULUS of the correct value
+                    let (d0, borrow) = sbb(r4, $modulus.0[0], 0);
+                    let (d1, borrow) = sbb(r5, $modulus.0[1], borrow);
+                    let (d2, borrow) = sbb(r6, $modulus.0[2], borrow);
+                    let (d3, borrow) = sbb(r7, $modulus.0[3], borrow);
+                    let (_, borrow) = sbb(carry2, 0, borrow);
+                    let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0);
+                    let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry);
+                    let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry);
+                    let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry);
+
+                    $field([d0, d1, d2, d3])
+                }
 
-                $field([d0, d1, d2, d3])
+                #[cfg(not(feature = "asm"))]
+                {
+                    let mut val = val;
+                    if bigint_geq(&val, &$modulus.0) {
+                        let mut borrow = 0;
+                        (val[0], borrow) = sbb(val[0], $modulus.0[0], borrow);
+                        (val[1], borrow) = sbb(val[1], $modulus.0[1], borrow);
+                        (val[2], borrow) = sbb(val[2], $modulus.0[2], borrow);
+                        (val[3], _) = sbb(val[3], $modulus.0[3], borrow);
+                    }
+                    $field::mul(&$field(val), &r)
+                }
             }
 
             fn from_u512(limbs: [u64; 8]) -> $field {
@@ -150,27 +165,13 @@ macro_rules! field_common {
                 let lower_256 = [limbs[0], limbs[1], limbs[2], limbs[3]];
                 let upper_256 = [limbs[4], limbs[5], limbs[6], limbs[7]];
 
-                #[cfg(feature = "asm")]
-                {
-                    Self::montgomery_form(lower_256, $r2) + Self::montgomery_form(upper_256, $r3)
-                }
-                #[cfg(not(feature = "asm"))]
-                {
-                    $field(lower_256) * $r2 + $field(upper_256) * $r3
-                }
+                Self::montgomery_form(lower_256, $r2) + Self::montgomery_form(upper_256, $r3)
             }
 
             /// Converts from an integer represented in little endian
             /// into its (congruent) `$field` representation.
             pub const fn from_raw(val: [u64; 4]) -> Self {
-                #[cfg(feature = "asm")]
-                {
-                    Self::montgomery_form(val, $r2)
-                }
-                #[cfg(not(feature = "asm"))]
-                {
-                    (&$field(val)).mul(&$r2)
-                }
+                Self::montgomery_form(val, $r2)
             }
 
             /// Attempts to convert a little-endian byte representation of
@@ -429,31 +430,69 @@ macro_rules! field_arithmetic {
             }
 
             /// Multiplies `rhs` by `self`, returning the result.
-            #[inline]
-            pub const fn mul(&self, rhs: &Self) -> $field {
-                // Schoolbook multiplication
+            #[inline(always)]
+            #[unroll::unroll_for_loops]
+            #[allow(unused_assignments)]
+            pub const fn mul(&self, rhs: &Self) -> Self {
+                // Fast Coarsely Integrated Operand Scanning (CIOS) as described
+                // in Algorithm 2 of EdMSM: https://eprint.iacr.org/2022/1400.pdf
+                //
+                // Cannot use the fast version (algorithm 2) if
+                // modulus_high_word > (WORD_SIZE - 1) / 2 - 1 = (2^64 - 1)/2 - 1
+
+                if $modulus.0[3] < (u64::MAX / 2) {
+                    const N: usize = 4;
+                    let mut t: [u64; N] = [0u64; N];
+                    let mut c_2: u64;
+                    for i in 0..4 {
+                        let mut c: u64 = 0u64;
+                        for j in 0..4 {
+                            (t[j], c) = mac(t[j], self.0[j], rhs.0[i], c);
+                        }
+                        c_2 = c;
+
+                        let m = t[0].wrapping_mul(INV);
+                        (_, c) = macx(t[0], m, $modulus.0[0]);
+
+                        for j in 1..4 {
+                            (t[j - 1], c) = mac(t[j], m, $modulus.0[j], c);
+                        }
+                        (t[N - 1], _) = adc(c_2, c, 0);
+                    }
+
+                    if bigint_geq(&t, &$modulus.0) {
+                        let mut borrow = 0;
+                        (t[0], borrow) = sbb(t[0], $modulus.0[0], borrow);
+                        (t[1], borrow) = sbb(t[1], $modulus.0[1], borrow);
+                        (t[2], borrow) = sbb(t[2], $modulus.0[2], borrow);
+                        (t[3], borrow) = sbb(t[3], $modulus.0[3], borrow);
+                    }
+                    $field(t)
+                } else {
+                    // Schoolbook multiplication
 
-                let (r0, carry) = mac(0, self.0[0], rhs.0[0], 0);
-                let (r1, carry) = mac(0, self.0[0], rhs.0[1], carry);
-                let (r2, carry) = mac(0, self.0[0], rhs.0[2], carry);
-                let (r3, r4) = mac(0, self.0[0], rhs.0[3], carry);
+                    let (r0, carry) = mac(0, self.0[0], rhs.0[0], 0);
+                    let (r1, carry) = mac(0, self.0[0], rhs.0[1], carry);
+                    let (r2, carry) = mac(0, self.0[0], rhs.0[2], carry);
+                    let (r3, r4) = mac(0, self.0[0], rhs.0[3], carry);
 
-                let (r1, carry) = mac(r1, self.0[1], rhs.0[0], 0);
-                let (r2, carry) = mac(r2, self.0[1], rhs.0[1], carry);
-                let (r3, carry) = mac(r3, self.0[1], rhs.0[2], carry);
-                let (r4, r5) = mac(r4, self.0[1], rhs.0[3], carry);
+                    let (r1, carry) = mac(r1, self.0[1], rhs.0[0], 0);
+                    let (r2, carry) = mac(r2, self.0[1], rhs.0[1], carry);
+                    let (r3, carry) = mac(r3, self.0[1], rhs.0[2], carry);
+                    let (r4, r5) = mac(r4, self.0[1], rhs.0[3], carry);
 
-                let (r2, carry) = mac(r2, self.0[2], rhs.0[0], 0);
-                let (r3, carry) = mac(r3, self.0[2], rhs.0[1], carry);
-                let (r4, carry) = mac(r4, self.0[2], rhs.0[2], carry);
-                let (r5, r6) = mac(r5, self.0[2], rhs.0[3], carry);
+                    let (r2, carry) = mac(r2, self.0[2], rhs.0[0], 0);
+                    let (r3, carry) = mac(r3, self.0[2], rhs.0[1], carry);
+                    let (r4, carry) = mac(r4, self.0[2], rhs.0[2], carry);
+                    let (r5, r6) = mac(r5, self.0[2], rhs.0[3], carry);
 
-                let (r3, carry) = mac(r3, self.0[3], rhs.0[0], 0);
-                let (r4, carry) = mac(r4, self.0[3], rhs.0[1], carry);
-                let (r5, carry) = mac(r5, self.0[3], rhs.0[2], carry);
-                let (r6, r7) = mac(r6, self.0[3], rhs.0[3], carry);
+                    let (r3, carry) = mac(r3, self.0[3], rhs.0[0], 0);
+                    let (r4, carry) = mac(r4, self.0[3], rhs.0[1], carry);
+                    let (r5, carry) = mac(r5, self.0[3], rhs.0[2], carry);
+                    let (r6, r7) = mac(r6, self.0[3], rhs.0[3], carry);
 
-                $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7])
+                    $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7])
+                }
             }
 
             /// Subtracts `rhs` from `self`, returning the result.
diff --git a/src/ed25519/fq.rs b/src/ed25519/fq.rs
index fed7e413..5d04442a 100644
--- a/src/ed25519/fq.rs
+++ b/src/ed25519/fq.rs
@@ -8,7 +8,7 @@ use subtle::{Choice, ConditionallySelectable, ConstantTimeEq, CtOption};
 #[cfg(feature = "derive_serde")]
 use serde::{Deserialize, Serialize};
 
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 
 /// This represents an element of $\mathbb{F}_q$ where
 ///
diff --git a/src/ed25519/fr.rs b/src/ed25519/fr.rs
index 4ef3ab4b..e91ed4fe 100644
--- a/src/ed25519/fr.rs
+++ b/src/ed25519/fr.rs
@@ -8,7 +8,7 @@ use subtle::{Choice, ConditionallySelectable, ConstantTimeEq, CtOption};
 #[cfg(feature = "derive_serde")]
 use serde::{Deserialize, Serialize};
 
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 
 /// This represents an element of $\mathbb{F}_q$ where
 ///
diff --git a/src/fft.rs b/src/fft.rs
index 6eb3487e..00eca39a 100644
--- a/src/fft.rs
+++ b/src/fft.rs
@@ -1,4 +1,3 @@
-use crate::multicore;
 pub use crate::{CurveAffine, CurveExt};
 use ff::Field;
 use group::{GroupOpsOwned, ScalarMulOwned};
@@ -38,7 +37,7 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(a: &mut [G], omega: Scalar,
         r
     }
 
-    let threads = multicore::current_num_threads();
+    let threads = rayon::current_num_threads();
     let log_threads = threads.ilog2();
     let n = a.len();
     assert_eq!(n, 1 << log_n);
@@ -107,7 +106,7 @@ pub fn recursive_butterfly_arithmetic<Scalar: Field, G: FftGroup<Scalar>>(
         a[1] -= &t;
     } else {
         let (left, right) = a.split_at_mut(n / 2);
-        multicore::join(
+        rayon::join(
             || recursive_butterfly_arithmetic(left, n / 2, twiddle_chunk * 2, twiddles),
             || recursive_butterfly_arithmetic(right, n / 2, twiddle_chunk * 2, twiddles),
         );
diff --git a/src/lib.rs b/src/lib.rs
index 36f1fcda..3397043d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,6 @@ pub mod ff_ext;
 pub mod fft;
 pub mod hash_to_curve;
 pub mod msm;
-pub mod multicore;
 pub mod serde;
 
 pub mod bls12_381;
diff --git a/src/msm.rs b/src/msm.rs
index 1a3709c1..25af9711 100644
--- a/src/msm.rs
+++ b/src/msm.rs
@@ -1,10 +1,14 @@
 use std::ops::Neg;
 
+use crate::CurveAffine;
+use ff::Field;
 use ff::PrimeField;
 use group::Group;
-use pasta_curves::arithmetic::CurveAffine;
+use rayon::iter::{
+    IndexedParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator,
+};
 
-use crate::multicore;
+const BATCH_SIZE: usize = 64;
 
 fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 {
     // Booth encoding:
@@ -50,6 +54,238 @@ fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 {
     }
 }
 
+fn batch_add<C: CurveAffine>( // Applies the first `size` scheduled affine additions to `buckets` using one shared field inversion.
+    size: usize,
+    buckets: &mut [BucketAffine<C>],
+    points: &[SchedulePoint],
+    bases: &[Affine<C>],
+) { // NOTE(review): all bucket coordinates are read before any is written, so presumably a bucket may appear only once per batch — confirm against the caller
+    let mut t = vec![C::Base::ZERO; size]; // slope numerators, each pre-multiplied by the product of the earlier denominators
+    let mut z = vec![C::Base::ZERO; size]; // slope denominators: bucket.x - base.x
+    let mut acc = C::Base::ONE; // running product of the denominators seen so far
+
+    for (
+        (
+            SchedulePoint {
+                base_idx,
+                buck_idx,
+                sign,
+            },
+            t,
+        ),
+        z,
+    ) in points.iter().zip(t.iter_mut()).zip(z.iter_mut()) // zipping with the `size`-long scratch vectors stops after `size` points
+    {
+        *z = buckets[*buck_idx].x() - bases[*base_idx].x;
+        if *sign { // sign == true: add the base; otherwise add its negation (base.y flipped)
+            *t = acc * (buckets[*buck_idx].y() - bases[*base_idx].y);
+        } else {
+            *t = acc * (buckets[*buck_idx].y() + bases[*base_idx].y);
+        }
+        acc *= *z;
+    }
+
+    acc = acc.invert().unwrap(); // the only inversion for the whole batch; panics if `invert()` yields None (e.g. a zero denominator)
+
+    for (
+        (
+            SchedulePoint {
+                base_idx,
+                buck_idx,
+                sign,
+            },
+            t,
+        ),
+        z,
+    ) in points.iter().zip(t.iter()).zip(z.iter()).rev() // walk the same entries backwards to peel off one denominator at a time
+    {
+        let lambda = acc * t; // slope of this addition: numerator / denominator
+        acc *= z; // cancel this denominator so `acc` is the inverse of the product of the earlier ones
+
+        let x = lambda.square() - (buckets[*buck_idx].x() + bases[*base_idx].x); // x3 = lambda^2 - x1 - x2
+        if *sign {
+            buckets[*buck_idx].set_y(&((lambda * (bases[*base_idx].x - x)) - bases[*base_idx].y));
+        } else {
+            buckets[*buck_idx].set_y(&((lambda * (bases[*base_idx].x - x)) + bases[*base_idx].y));
+        }
+        buckets[*buck_idx].set_x(&x);
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+struct Affine<C: CurveAffine> { // bare (x, y) coordinate pair over `C::Base`
+    x: C::Base,
+    y: C::Base,
+}
+
+impl<C: CurveAffine> Affine<C> {
+    fn from(point: &C) -> Self { // copies the coordinates out of `point`
+        let coords = point.coordinates().unwrap(); // panics when `coordinates()` is None (presumably the identity point — confirm)
+
+        Self {
+            x: *coords.x(),
+            y: *coords.y(),
+        }
+    }
+
+    fn neg(&self) -> Self { // same x, negated y
+        Self {
+            x: self.x,
+            y: -self.y,
+        }
+    }
+
+    fn eval(&self) -> C { // rebuilds a `C` from the stored coordinates
+        C::from_xy(self.x, self.y).unwrap() // panics if `from_xy` rejects the pair
+    }
+}
+
+#[derive(Debug, Clone)]
+enum BucketAffine<C: CurveAffine> { // bucket accumulator kept as raw affine coordinates
+    None, // no point stored in this bucket
+    Point(Affine<C>), // the bucket's current point
+}
+
+#[derive(Debug, Clone)]
+enum Bucket<C: CurveAffine> { // bucket accumulator holding a `C::Curve` value
+    None, // no point stored in this bucket
+    Point(C::Curve), // the bucket's current point
+}
+
+impl<C: CurveAffine> Bucket<C> {
+    fn add_assign(&mut self, point: &C, sign: bool) { // adds `point` when `sign` is true, subtracts it otherwise
+        *self = match *self {
+            Bucket::None => Bucket::Point({
+                if sign {
+                    point.to_curve()
+                } else {
+                    point.to_curve().neg()
+                }
+            }),
+            Bucket::Point(a) => {
+                if sign {
+                    Self::Point(a + point)
+                } else {
+                    Self::Point(a - point)
+                }
+            }
+        }
+    }
+
+    fn add(&self, other: &BucketAffine<C>) -> C::Curve { // sum of both buckets; an empty bucket contributes nothing
+        match (self, other) {
+            (Self::Point(this), BucketAffine::Point(other)) => *this + other.eval(),
+            (Self::Point(this), BucketAffine::None) => *this,
+            (Self::None, BucketAffine::Point(other)) => other.eval().to_curve(),
+            (Self::None, BucketAffine::None) => C::Curve::identity(),
+        }
+    }
+}
+
+impl<C: CurveAffine> BucketAffine<C> {
+    fn assign(&mut self, point: &Affine<C>, sign: bool) -> bool { // fills an empty bucket and returns true; returns false (no change) if occupied
+        match *self {
+            Self::None => {
+                *self = Self::Point(if sign { *point } else { point.neg() }); // store `point`, negated when `sign` is false
+                true
+            }
+            Self::Point(_) => false,
+        }
+    }
+
+    fn x(&self) -> C::Base { // x coordinate of the stored point; panics on an empty bucket
+        match self {
+            Self::None => panic!("::x None"),
+            Self::Point(a) => a.x,
+        }
+    }
+
+    fn y(&self) -> C::Base { // y coordinate of the stored point; panics on an empty bucket
+        match self {
+            Self::None => panic!("::y None"),
+            Self::Point(a) => a.y,
+        }
+    }
+
+    fn set_x(&mut self, x: &C::Base) { // overwrites the stored x coordinate; panics on an empty bucket
+        match self {
+            Self::None => panic!("::set_x None"),
+            Self::Point(ref mut a) => a.x = *x,
+        }
+    }
+
+    fn set_y(&mut self, y: &C::Base) { // overwrites the stored y coordinate; panics on an empty bucket
+        match self {
+            Self::None => panic!("::set_y None"),
+            Self::Point(ref mut a) => a.y = *y,
+        }
+    }
+}
+
+struct Schedule<C: CurveAffine> { // affine buckets plus a fixed-size queue of pending additions
+    buckets: Vec<BucketAffine<C>>,
+    set: [SchedulePoint; BATCH_SIZE], // queued additions; slots at index >= `ptr` hold default values
+    ptr: usize, // number of queued entries in `set`
+}
+
+#[derive(Debug, Clone, Default)]
+struct SchedulePoint { // one queued addition: which base goes into which bucket, and with what sign
+    base_idx: usize, // index into the `bases` slice
+    buck_idx: usize, // index into the bucket slice
+    sign: bool, // true: add the base; false: add its negation
+}
+
+impl SchedulePoint {
+    fn new(base_idx: usize, buck_idx: usize, sign: bool) -> Self { // plain field-wise constructor
+        Self {
+            base_idx,
+            buck_idx,
+            sign,
+        }
+    }
+}
+
+impl<C: CurveAffine> Schedule<C> {
+    fn new(c: usize) -> Self { // creates 2^(c-1) empty buckets and an empty queue of pending additions
+        let set = (0..BATCH_SIZE)
+            .map(|_| SchedulePoint::default())
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+
+        Self {
+            buckets: vec![BucketAffine::None; 1 << (c - 1)],
+            set,
+            ptr: 0,
+        }
+    }
+
+    fn contains(&self, buck_idx: usize) -> bool { // true iff an addition into `buck_idx` is currently queued
+        self.set[..self.ptr].iter().any(|s| s.buck_idx == buck_idx) // only queued slots: unused ones default to buck_idx 0
+    }
+
+    fn execute(&mut self, bases: &[Affine<C>]) { // flushes every queued addition through `batch_add`
+        if self.ptr != 0 {
+            batch_add(self.ptr, &mut self.buckets, &self.set, bases);
+            self.ptr = 0;
+            self.set
+                .iter_mut()
+                .for_each(|sch| *sch = SchedulePoint::default());
+        }
+    }
+
+    fn add(&mut self, bases: &[Affine<C>], base_idx: usize, buck_idx: usize, sign: bool) { // puts bases[base_idx] into bucket `buck_idx`
+        if !self.buckets[buck_idx].assign(&bases[base_idx], sign) { // an empty bucket takes the point directly; otherwise queue it
+            self.set[self.ptr] = SchedulePoint::new(base_idx, buck_idx, sign);
+            self.ptr += 1;
+        }
+
+        if self.ptr == self.set.len() { // queue is full: flush it now
+            self.execute(bases);
+        }
+    }
+}
+
 pub fn multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) {
     let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect();
 
@@ -123,30 +359,6 @@ pub fn multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &
     }
 }
 
-/// Performs a small multi-exponentiation operation.
-/// Uses the double-and-add algorithm with doublings shared across points.
-pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
-    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect();
-    let mut acc = C::Curve::identity();
-
-    // for byte idx
-    for byte_idx in (0..32).rev() {
-        // for bit idx
-        for bit_idx in (0..8).rev() {
-            acc = acc.double();
-            // for each coeff
-            for coeff_idx in 0..coeffs.len() {
-                let byte = coeffs[coeff_idx].as_ref()[byte_idx];
-                if ((byte >> bit_idx) & 1) != 0 {
-                    acc += bases[coeff_idx];
-                }
-            }
-        }
-    }
-
-    acc
-}
-
 /// Performs a multi-exponentiation operation.
 ///
 /// This function will panic if coeffs and bases have a different length.
@@ -155,12 +367,12 @@ pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::C
 pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
     assert_eq!(coeffs.len(), bases.len());
 
-    let num_threads = multicore::current_num_threads();
+    let num_threads = rayon::current_num_threads();
     if coeffs.len() > num_threads {
         let chunk = coeffs.len() / num_threads;
         let num_chunks = coeffs.chunks(chunk).len();
         let mut results = vec![C::Curve::identity(); num_chunks];
-        multicore::scope(|scope| {
+        rayon::scope(|scope| {
             let chunk = coeffs.len() / num_threads;
 
             for ((coeffs, bases), acc) in coeffs
@@ -180,142 +392,96 @@ pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Cu
         acc
     }
 }
+/// Performs a multi-exponentiation operation, falling back to `best_multiexp` for small window sizes.
+/// This function will panic if coeffs and bases have a different length.
+///
+/// This will use multithreading if beneficial.
+pub fn best_multiexp_independent_points<C: CurveAffine>(
+    coeffs: &[C::Scalar],
+    bases: &[C],
+) -> C::Curve {
+    assert_eq!(coeffs.len(), bases.len());
 
-#[cfg(test)]
-mod test {
-
-    use std::ops::Neg;
-
-    use crate::{
-        bn256::{Fr, G1Affine, G1},
-        multicore,
+    // TODO: consider tuning the window size `c` with empirical data?
+    let c = if bases.len() < 4 {
+        1
+    } else if bases.len() < 32 {
+        3
+    } else {
+        (f64::from(bases.len() as u32)).ln().ceil() as usize
     };
-    use ark_std::{end_timer, start_timer};
-    use ff::{Field, PrimeField};
-    use group::{Curve, Group};
-    use pasta_curves::arithmetic::CurveAffine;
-    use rand_core::OsRng;
-
-    // keeping older implementation it here for baseline comparison, debugging & benchmarking
-    fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
-        assert_eq!(coeffs.len(), bases.len());
 
-        let num_threads = multicore::current_num_threads();
-        if coeffs.len() > num_threads {
-            let chunk = coeffs.len() / num_threads;
-            let num_chunks = coeffs.chunks(chunk).len();
-            let mut results = vec![C::Curve::identity(); num_chunks];
-            multicore::scope(|scope| {
-                let chunk = coeffs.len() / num_threads;
-
-                for ((coeffs, bases), acc) in coeffs
-                    .chunks(chunk)
-                    .zip(bases.chunks(chunk))
-                    .zip(results.iter_mut())
-                {
-                    scope.spawn(move |_| {
-                        multiexp_serial(coeffs, bases, acc);
-                    });
-                }
-            });
-            results.iter().fold(C::Curve::identity(), |a, b| a + b)
-        } else {
-            let mut acc = C::Curve::identity();
-            multiexp_serial(coeffs, bases, &mut acc);
-            acc
-        }
+    if c < 10 {
+        return best_multiexp(coeffs, bases);
     }
 
-    // keeping older implementation it here for baseline comparision, debugging & benchmarking
-    fn multiexp_serial<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C], acc: &mut C::Curve) {
-        let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_repr()).collect();
-
-        let c = if bases.len() < 4 {
-            1
-        } else if bases.len() < 32 {
-            3
-        } else {
-            (f64::from(bases.len() as u32)).ln().ceil() as usize
-        };
-
-        fn get_at<F: PrimeField>(segment: usize, c: usize, bytes: &F::Repr) -> usize {
-            let skip_bits = segment * c;
-            let skip_bytes = skip_bits / 8;
+    // coeffs to byte representation
+    let coeffs: Vec<_> = coeffs.par_iter().map(|a| a.to_repr()).collect();
+    // copy bases into `Affine` to skip the on-curve check on every access
+    let bases_local: Vec<_> = bases.par_iter().map(Affine::from).collect();
 
-            if skip_bytes >= 32 {
-                return 0;
-            }
-
-            let mut v = [0; 8];
-            for (v, o) in v.iter_mut().zip(bytes.as_ref()[skip_bytes..].iter()) {
-                *v = *o;
+    // number of windows
+    let number_of_windows = C::Scalar::NUM_BITS as usize / c + 1;
+    // accumulator for each window
+    let mut acc = vec![C::Curve::identity(); number_of_windows];
+    acc.par_iter_mut().enumerate().rev().for_each(|(w, acc)| {
+        // jacobian buckets for already scheduled points
+        let mut j_bucks = vec![Bucket::<C>::None; 1 << (c - 1)];
+
+        // scheduler for affine addition
+        let mut sched = Schedule::new(c);
+
+        for (base_idx, coeff) in coeffs.iter().enumerate() {
+            let buck_idx = get_booth_index(w, c, coeff.as_ref());
+
+            if buck_idx != 0 {
+                // parse bucket index
+                let sign = buck_idx.is_positive();
+                let buck_idx = buck_idx.unsigned_abs() as usize - 1;
+
+                if sched.contains(buck_idx) {
+                    // greedy accumulation
+                    // we use original bases here
+                    j_bucks[buck_idx].add_assign(&bases[base_idx], sign);
+                } else {
+                    // also flushes the schedule if full
+                    sched.add(&bases_local, base_idx, buck_idx, sign);
+                }
             }
-
-            let mut tmp = u64::from_le_bytes(v);
-            tmp >>= skip_bits - (skip_bytes * 8);
-            tmp %= 1 << c;
-
-            tmp as usize
         }
 
-        let segments = (256 / c) + 1;
-
-        for current_segment in (0..segments).rev() {
-            for _ in 0..c {
-                *acc = acc.double();
-            }
-
-            #[derive(Clone, Copy)]
-            enum Bucket<C: CurveAffine> {
-                None,
-                Affine(C),
-                Projective(C::Curve),
-            }
+        // flush the schedule
+        sched.execute(&bases_local);
 
-            impl<C: CurveAffine> Bucket<C> {
-                fn add_assign(&mut self, other: &C) {
-                    *self = match *self {
-                        Bucket::None => Bucket::Affine(*other),
-                        Bucket::Affine(a) => Bucket::Projective(a + *other),
-                        Bucket::Projective(mut a) => {
-                            a += *other;
-                            Bucket::Projective(a)
-                        }
-                    }
-                }
+        // summation by parts
+        // e.g. 3a + 2b + 1c = a +
+        //                    (a) + b +
+        //                    ((a) + b) + c
+        let mut running_sum = C::Curve::identity();
+        for (j_buck, a_buck) in j_bucks.iter().zip(sched.buckets.iter()).rev() {
+            running_sum += j_buck.add(a_buck);
+            *acc += running_sum;
+        }
 
-                fn add(self, mut other: C::Curve) -> C::Curve {
-                    match self {
-                        Bucket::None => other,
-                        Bucket::Affine(a) => {
-                            other += a;
-                            other
-                        }
-                        Bucket::Projective(a) => other + a,
-                    }
-                }
-            }
+        // shift accumulator to the window position
+        for _ in 0..c * w {
+            *acc = acc.double();
+        }
+    });
+    acc.into_iter().sum::<_>()
+}
 
-            let mut buckets: Vec<Bucket<C>> = vec![Bucket::None; (1 << c) - 1];
+#[cfg(test)]
+mod test {
 
-            for (coeff, base) in coeffs.iter().zip(bases.iter()) {
-                let coeff = get_at::<C::Scalar>(current_segment, c, coeff);
-                if coeff != 0 {
-                    buckets[coeff - 1].add_assign(base);
-                }
-            }
+    use std::ops::Neg;
 
-            // Summation by parts
-            // e.g. 3a + 2b + 1c = a +
-            //                    (a) + b +
-            //                    ((a) + b) + c
-            let mut running_sum = C::Curve::identity();
-            for exp in buckets.into_iter().rev() {
-                running_sum = exp.add(running_sum);
-                *acc += &running_sum;
-            }
-        }
-    }
+    use crate::bn256::{Fr, G1Affine, G1};
+    use ark_std::{end_timer, start_timer};
+    use ff::{Field, PrimeField};
+    use group::{Curve, Group};
+    use pasta_curves::arithmetic::CurveAffine;
+    use rand_core::OsRng;
 
     #[test]
     fn test_booth_encoding() {
@@ -379,21 +545,19 @@ mod test {
             let points = &points[..1 << k];
             let scalars = &scalars[..1 << k];
 
-            let t0 = start_timer!(|| format!("w/  booth k={}", k));
-            let e0 = super::best_multiexp(scalars, points);
+            let t0 = start_timer!(|| format!("cyclone k={}", k));
+            let e0 = super::best_multiexp_independent_points(scalars, points);
             end_timer!(t0);
 
-            let t1 = start_timer!(|| format!("w/o booth k={}", k));
-            let e1 = best_multiexp(scalars, points);
+            let t1 = start_timer!(|| format!("older k={}", k));
+            let e1 = super::best_multiexp(scalars, points);
             end_timer!(t1);
-
             assert_eq!(e0, e1);
         }
     }
 
     #[test]
     fn test_msm_cross() {
-        run_msm_cross::<G1Affine>(10, 18);
-        // run_msm_cross::<G1Affine>(19, 23);
+        run_msm_cross::<G1Affine>(14, 22);
     }
 }
diff --git a/src/multicore.rs b/src/multicore.rs
deleted file mode 100644
index d8323553..00000000
--- a/src/multicore.rs
+++ /dev/null
@@ -1,16 +0,0 @@
-pub use maybe_rayon::{
-    iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator},
-    join, scope, Scope,
-};
-
-#[cfg(feature = "multicore")]
-pub use maybe_rayon::{
-    current_num_threads,
-    iter::{IndexedParallelIterator, IntoParallelRefIterator},
-    slice::ParallelSliceMut,
-};
-
-#[cfg(not(feature = "multicore"))]
-pub fn current_num_threads() -> usize {
-    1
-}
diff --git a/src/secp256k1/fp.rs b/src/secp256k1/fp.rs
index 1538544f..cb3493f0 100644
--- a/src/secp256k1/fp.rs
+++ b/src/secp256k1/fp.rs
@@ -1,4 +1,4 @@
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use crate::{
diff --git a/src/secp256k1/fq.rs b/src/secp256k1/fq.rs
index 09087227..f013f61a 100644
--- a/src/secp256k1/fq.rs
+++ b/src/secp256k1/fq.rs
@@ -1,4 +1,4 @@
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use crate::{
diff --git a/src/secp256r1/fp.rs b/src/secp256r1/fp.rs
index f3497c81..6669de11 100644
--- a/src/secp256r1/fp.rs
+++ b/src/secp256r1/fp.rs
@@ -1,4 +1,4 @@
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use crate::{
diff --git a/src/secp256r1/fq.rs b/src/secp256r1/fq.rs
index 86005d35..b96f05d5 100644
--- a/src/secp256r1/fq.rs
+++ b/src/secp256r1/fq.rs
@@ -1,4 +1,4 @@
-use crate::arithmetic::{adc, mac, macx, sbb};
+use crate::arithmetic::{adc, bigint_geq, mac, macx, sbb};
 use crate::extend_field_legendre;
 use crate::ff::{FromUniformBytes, PrimeField, WithSmallOrderMulGroup};
 use core::fmt;