diff --git a/src/lib.rs b/src/lib.rs
index da796894..151c8b2a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,12 +27,15 @@ pub mod traits;
 
 use once_cell::sync::OnceCell;
 
-use crate::bellpepper::{
-  r1cs::{NovaShape, NovaWitness},
-  shape_cs::ShapeCS,
-  solver::SatisfyingAssignment,
-};
 use crate::digest::{DigestComputer, SimpleDigestible};
+use crate::{
+  bellpepper::{
+    r1cs::{NovaShape, NovaWitness},
+    shape_cs::ShapeCS,
+    solver::SatisfyingAssignment,
+  },
+  r1cs::R1CSResult,
+};
 use bellpepper_core::ConstraintSystem;
 use circuit::{NovaAugmentedCircuit, NovaAugmentedCircuitInputs, NovaAugmentedCircuitParams};
 use constants::{BN_LIMB_WIDTH, BN_N_LIMBS, NUM_FE_WITHOUT_IO_FOR_CRHF, NUM_HASH_BITS};
@@ -225,6 +228,21 @@ where
   }
 }
 
+/// A resource buffer for [`RecursiveSNARK`] that stores scratch values computed by `prove_step`,
+/// so their memory allocations are reused across steps instead of being re-allocated in the critical section.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(bound = "")]
+pub struct ResourceBuffer<E: Engine> {
+  l_w: Option<R1CSWitness<E>>, // NOTE(review): only ever set to `None` in the code visible in this patch -- confirm it is needed
+  l_u: Option<R1CSInstance<E>>, // NOTE(review): only ever set to `None` in the code visible in this patch -- confirm it is needed
+
+  ABC_Z_1: R1CSResult<E>, // handed to `NIFS::prove_mut` as its `ABC_Z_1` scratch argument
+  ABC_Z_2: R1CSResult<E>, // handed to `NIFS::prove_mut` as its `ABC_Z_2` scratch argument
+
+  /// buffer for the cross-term `T` that `commit_T_into` fills
+  T: Vec<E::Scalar>,
+}
+
 /// A SNARK that proves the correct execution of an incremental computation
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(bound = "")]
@@ -243,6 +261,12 @@ where
   r_U_secondary: RelaxedR1CSInstance<E2>,
   l_w_secondary: R1CSWitness<E2>,
   l_u_secondary: R1CSInstance<E2>,
+
+  /// Buffer for memory needed by the primary fold-step
+  buffer_primary: ResourceBuffer<E1>,
+  /// Buffer for memory needed by the secondary fold-step
+  buffer_secondary: ResourceBuffer<E2>,
+
   i: usize,
   zi_primary: Vec<E1::Scalar>,
   zi_secondary: Vec<E2::Scalar>,
@@ -268,6 +292,9 @@ where
       return Err(NovaError::InvalidInitialInputLength);
     }
 
+    let r1cs_primary = &pp.r1cs_shape_primary;
+    let r1cs_secondary = &pp.r1cs_shape_secondary;
+
     // base case for the primary
     let mut cs_primary = SatisfyingAssignment::<E1>::new();
     let inputs_primary: NovaAugmentedCircuitInputs<E2> = NovaAugmentedCircuitInputs::new(
@@ -331,9 +358,8 @@ where
     // IVC proof for the secondary circuit
     let l_w_secondary = w_secondary;
     let l_u_secondary = u_secondary;
-    let r_W_secondary = RelaxedR1CSWitness::<E2>::default(&pp.r1cs_shape_secondary);
-    let r_U_secondary =
-      RelaxedR1CSInstance::<E2>::default(&pp.ck_secondary, &pp.r1cs_shape_secondary);
+    let r_W_secondary = RelaxedR1CSWitness::<E2>::default(r1cs_secondary);
+    let r_U_secondary = RelaxedR1CSInstance::<E2>::default(&pp.ck_secondary, r1cs_secondary);
 
     assert!(
       !(zi_primary.len() != pp.F_arity_primary || zi_secondary.len() != pp.F_arity_secondary),
@@ -352,6 +378,22 @@ where
       .collect::<Result<Vec<<E2 as Engine>::Scalar>, NovaError>>()
       .expect("Nova error synthesis");
 
+    let buffer_primary = ResourceBuffer {
+      l_w: None,
+      l_u: None,
+      ABC_Z_1: R1CSResult::default(r1cs_primary),
+      ABC_Z_2: R1CSResult::default(r1cs_primary),
+      T: r1cs::default_T(r1cs_primary),
+    };
+
+    let buffer_secondary = ResourceBuffer {
+      l_w: None,
+      l_u: None,
+      ABC_Z_1: R1CSResult::default(r1cs_secondary),
+      ABC_Z_2: R1CSResult::default(r1cs_secondary),
+      T: r1cs::default_T(r1cs_secondary),
+    };
+
     Ok(Self {
       z0_primary: z0_primary.to_vec(),
       z0_secondary: z0_secondary.to_vec(),
@@ -361,6 +403,9 @@ where
       r_U_secondary,
       l_w_secondary,
       l_u_secondary,
+
+      buffer_primary,
+      buffer_secondary,
       i: 0,
       zi_primary,
       zi_secondary,
@@ -382,16 +427,24 @@ where
       return Ok(());
     }
 
+    // save the inputs before proceeding to the `i+1`th step
+    let r_U_primary_i = self.r_U_primary.clone();
+    let r_U_secondary_i = self.r_U_secondary.clone();
+    let l_u_secondary_i = self.l_u_secondary.clone();
+
     // fold the secondary circuit's instance
-    let (nifs_secondary, (r_U_secondary, r_W_secondary)) = NIFS::prove(
+    let nifs_secondary = NIFS::prove_mut(
       &pp.ck_secondary,
       &pp.ro_consts_secondary,
       &scalar_as_base::<E1>(pp.digest()),
       &pp.r1cs_shape_secondary,
-      &self.r_U_secondary,
-      &self.r_W_secondary,
+      &mut self.r_U_secondary,
+      &mut self.r_W_secondary,
       &self.l_u_secondary,
       &self.l_w_secondary,
+      &mut self.buffer_secondary.T,
+      &mut self.buffer_secondary.ABC_Z_1,
+      &mut self.buffer_secondary.ABC_Z_2,
     )
     .expect("Unable to fold secondary");
 
@@ -404,8 +457,8 @@ where
       E1::Scalar::from(self.i as u64),
       self.z0_primary.to_vec(),
       Some(self.zi_primary.clone()),
-      Some(self.r_U_secondary.clone()),
-      Some(self.l_u_secondary.clone()),
+      Some(r_U_secondary_i),
+      Some(l_u_secondary_i),
       Some(Commitment::<E2>::decompress(&nifs_secondary.comm_T)?),
     );
 
@@ -425,15 +478,18 @@ where
       .expect("Nova error unsat");
 
     // fold the primary circuit's instance
-    let (nifs_primary, (r_U_primary, r_W_primary)) = NIFS::prove(
+    let nifs_primary = NIFS::prove_mut(
       &pp.ck_primary,
       &pp.ro_consts_primary,
       &pp.digest(),
       &pp.r1cs_shape_primary,
-      &self.r_U_primary,
-      &self.r_W_primary,
+      &mut self.r_U_primary,
+      &mut self.r_W_primary,
       &l_u_primary,
       &l_w_primary,
+      &mut self.buffer_primary.T,
+      &mut self.buffer_primary.ABC_Z_1,
+      &mut self.buffer_primary.ABC_Z_2,
     )
     .expect("Unable to fold primary");
 
@@ -446,7 +502,7 @@ where
       E2::Scalar::from(self.i as u64),
       self.z0_secondary.to_vec(),
       Some(self.zi_secondary.clone()),
-      Some(self.r_U_primary.clone()),
+      Some(r_U_primary_i),
       Some(l_u_primary),
       Some(Commitment::<E1>::decompress(&nifs_primary.comm_T)?),
     );
@@ -478,14 +534,8 @@ where
     self.l_u_secondary = l_u_secondary;
     self.l_w_secondary = l_w_secondary;
 
-    self.r_U_primary = r_U_primary;
-    self.r_W_primary = r_W_primary;
-
     self.i += 1;
 
-    self.r_U_secondary = r_U_secondary;
-    self.r_W_secondary = r_W_secondary;
-
     Ok(())
   }
 
diff --git a/src/nifs.rs b/src/nifs.rs
index 878ae05f..bfa1ff19 100644
--- a/src/nifs.rs
+++ b/src/nifs.rs
@@ -4,7 +4,9 @@
 use crate::{
   constants::{NUM_CHALLENGE_BITS, NUM_FE_FOR_RO},
   errors::NovaError,
-  r1cs::{R1CSInstance, R1CSShape, R1CSWitness, RelaxedR1CSInstance, RelaxedR1CSWitness},
+  r1cs::{
+    R1CSInstance, R1CSResult, R1CSShape, R1CSWitness, RelaxedR1CSInstance, RelaxedR1CSWitness,
+  },
   scalar_as_base,
   traits::{commitment::CommitmentTrait, AbsorbInROTrait, Engine, ROTrait},
   Commitment, CommitmentKey, CompressedCommitment,
@@ -73,6 +75,56 @@ impl<E: Engine> NIFS<E> {
     ))
   }
 
+  /// Takes as input a Relaxed R1CS instance-witness tuple `(U1, W1)` and an R1CS instance-witness tuple
+  /// `(U2, W2)` with the same shape `S` and defined with respect to the same `ck`, and updates `(U1, W1)`
+  /// in place by folding `(U2, W2)` into it, with the guarantee that the updated witness `W` satisfies
+  /// the updated instance `U` if and only if `W1` satisfies `U1` and `W2` satisfies `U2`.
+  /// Returns the compressed cross-term commitment; errors from `commit_T_into` or the witness fold are propagated via `?`.
+  #[allow(clippy::too_many_arguments)]
+  pub fn prove_mut(
+    ck: &CommitmentKey<E>,
+    ro_consts: &ROConstants<E>,
+    pp_digest: &E::Scalar,
+    S: &R1CSShape<E>,
+    U1: &mut RelaxedR1CSInstance<E>,
+    W1: &mut RelaxedR1CSWitness<E>,
+    U2: &R1CSInstance<E>,
+    W2: &R1CSWitness<E>,
+    T: &mut Vec<E::Scalar>, // scratch buffer: overwritten with the cross-term
+    ABC_Z_1: &mut R1CSResult<E>, // scratch buffer: overwritten by `commit_T_into`
+    ABC_Z_2: &mut R1CSResult<E>, // scratch buffer: overwritten by `commit_T_into`
+  ) -> Result<NIFS<E>, NovaError> {
+    // initialize a new RO
+    let mut ro = E::RO::new(ro_consts.clone(), NUM_FE_FOR_RO);
+
+    // append the digest of pp to the transcript
+    ro.absorb(scalar_as_base::<E>(*pp_digest));
+
+    // append U1 and U2 to transcript
+    U1.absorb_in_ro(&mut ro);
+    U2.absorb_in_ro(&mut ro);
+
+    // compute a commitment to the cross-term
+    let comm_T = S.commit_T_into(ck, U1, W1, U2, W2, T, ABC_Z_1, ABC_Z_2)?;
+
+    // append `comm_T` to the transcript and obtain a challenge
+    comm_T.absorb_in_ro(&mut ro);
+
+    // compute a challenge from the RO
+    let r = ro.squeeze(NUM_CHALLENGE_BITS);
+
+    // fold the witness first (the only fallible fold) using `r` and `T`, so an `Err` return leaves `U1` unfolded
+    W1.fold_mut(W2, T, &r)?;
+
+    // fold the instance using `r` and `comm_T`
+    U1.fold_mut(U2, &comm_T, &r);
+
+    // return the commitment
+    Ok(Self {
+      comm_T: comm_T.compress(),
+    })
+  }
+
   /// Takes as input a relaxed R1CS instance `U1` and R1CS instance `U2`
   /// with the same shape and defined with respect to the same parameters,
   /// and outputs a folded instance `U` with the same shape,
diff --git a/src/r1cs/mod.rs b/src/r1cs/mod.rs
index f34bd133..a94345a2 100644
--- a/src/r1cs/mod.rs
+++ b/src/r1cs/mod.rs
@@ -47,6 +47,14 @@ pub struct R1CSShape<E: Engine> {
 
 impl<E: Engine> SimpleDigestible for R1CSShape<E> {}
 
+/// A type that holds the result of an R1CS multiplication: the products of the shape's `A`, `B` and `C` matrices with a witness
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct R1CSResult<E: Engine> {
+  pub(crate) AZ: Vec<E::Scalar>, // filled by `A.multiply_witness_into`
+  pub(crate) BZ: Vec<E::Scalar>, // filled by `B.multiply_witness_into`
+  pub(crate) CZ: Vec<E::Scalar>, // filled by `C.multiply_witness_into`
+}
+
 /// A type that holds a witness for a given R1CS instance
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub struct R1CSWitness<E: Engine> {
@@ -205,6 +213,32 @@ impl<E: Engine> R1CSShape<E> {
     Ok((Az, Bz, Cz))
   }
 
+  pub(crate) fn multiply_witness_into(
+    &self,
+    W: &[E::Scalar],
+    u: &E::Scalar,
+    X: &[E::Scalar],
+    ABC_Z: &mut R1CSResult<E>,
+  ) -> Result<(), NovaError> {
+    if X.len() != self.num_io || W.len() != self.num_vars {
+      return Err(NovaError::InvalidWitnessLength); // inputs do not match this shape's `num_io` / `num_vars`
+    }
+
+    let R1CSResult { AZ, BZ, CZ } = ABC_Z; // destructure so each closure below borrows its own output vector
+
+    rayon::join(
+      || self.A.multiply_witness_into(W, u, X, AZ), // writes the `A` product into `ABC_Z.AZ`
+      || {
+        rayon::join(
+          || self.B.multiply_witness_into(W, u, X, BZ), // writes the `B` product into `ABC_Z.BZ`
+          || self.C.multiply_witness_into(W, u, X, CZ), // writes the `C` product into `ABC_Z.CZ`
+        )
+      },
+    );
+
+    Ok(())
+  }
+
   /// Checks if the Relaxed R1CS instance is satisfiable given a witness and its shape
   pub fn is_sat_relaxed(
     &self,
@@ -317,6 +351,53 @@ impl<E: Engine> R1CSShape<E> {
     Ok((T, comm_T))
   }
 
+  /// A method to compute a commitment to the cross-term `T` given a
+  /// Relaxed R1CS instance-witness pair `(U1, W1)` and an R1CS instance-witness pair `(U2, W2)`
+  ///
+  /// This is [`R1CSShape::commit_T`] but into a buffer: `T`, `ABC_Z_1` and `ABC_Z_2` are overwritten, and errors from `multiply_witness_into` are propagated.
+  pub fn commit_T_into(
+    &self,
+    ck: &CommitmentKey<E>,
+    U1: &RelaxedR1CSInstance<E>,
+    W1: &RelaxedR1CSWitness<E>,
+    U2: &R1CSInstance<E>,
+    W2: &R1CSWitness<E>,
+    T: &mut Vec<E::Scalar>,
+    ABC_Z_1: &mut R1CSResult<E>,
+    ABC_Z_2: &mut R1CSResult<E>,
+  ) -> Result<Commitment<E>, NovaError> {
+    self.multiply_witness_into(&W1.W, &U1.u, &U1.X, ABC_Z_1)?; // matrix products for the relaxed pair `(U1, W1)`
+
+    let R1CSResult {
+      AZ: AZ_1,
+      BZ: BZ_1,
+      CZ: CZ_1,
+    } = ABC_Z_1;
+
+    self.multiply_witness_into(&W2.W, &E::Scalar::ONE, &U2.X, ABC_Z_2)?; // matrix products for `(U2, W2)`, with `u` fixed to one
+
+    let R1CSResult {
+      AZ: AZ_2,
+      BZ: BZ_2,
+      CZ: CZ_2,
+    } = ABC_Z_2;
+
+    // reusing `T` doesn't allocate memory but has bad temporal cache locality -- should test to see which is faster
+    T.clear(); // NOTE(review): probably redundant -- rayon documents `collect_into_vec` as resetting its target; confirm
+
+    (0..AZ_1.len())
+      .into_par_iter()
+      .map(|i| {
+        let AZ_1_circ_BZ_2 = AZ_1[i] * BZ_2[i];
+        let AZ_2_circ_BZ_1 = AZ_2[i] * BZ_1[i];
+        let u_1_cdot_Cz_2_plus_Cz_1 = U1.u * CZ_2[i] + CZ_1[i];
+        AZ_1_circ_BZ_2 + AZ_2_circ_BZ_1 - u_1_cdot_Cz_2_plus_Cz_1 // `i`-th entry of the cross-term
+      })
+      .collect_into_vec(T);
+
+    Ok(CE::<E>::commit(ck, T)) // commit to the cross-term just written into `T`
+  }
+
   /// Pads the `R1CSShape` so that the number of variables is a power of two
   /// Renumbers variables to accommodate padded variables
   pub fn pad(&self) -> Self {
@@ -379,6 +460,17 @@ impl<E: Engine> R1CSShape<E> {
   }
 }
 
+impl<E: Engine> R1CSResult<E> {
+  /// Produces an all-zero `R1CSResult` whose `AZ`, `BZ` and `CZ` vectors each hold `S.num_cons` entries
+  pub fn default(S: &R1CSShape<E>) -> R1CSResult<E> {
+    R1CSResult {
+      AZ: vec![E::Scalar::ZERO; S.num_cons],
+      BZ: vec![E::Scalar::ZERO; S.num_cons],
+      CZ: vec![E::Scalar::ZERO; S.num_cons],
+    }
+  }
+}
+
 impl<E: Engine> R1CSWitness<E> {
   /// A method to create a witness object using a vector of scalars
   pub fn new(S: &R1CSShape<E>, W: Vec<E::Scalar>) -> Result<R1CSWitness<E>, NovaError> {
@@ -468,6 +560,31 @@ impl<E: Engine> RelaxedR1CSWitness<E> {
     Ok(RelaxedR1CSWitness { W, E })
   }
 
+  /// Mutably folds an incoming `R1CSWitness` into the current one (`W += r * W2.W`, `E += r * T`); on a length mismatch returns `InvalidWitnessLength` without modifying `self`
+  pub fn fold_mut(
+    &mut self,
+    W2: &R1CSWitness<E>,
+    T: &[E::Scalar],
+    r: &E::Scalar,
+  ) -> Result<(), NovaError> {
+    if self.W.len() != W2.W.len() || self.E.len() != T.len() {
+      return Err(NovaError::InvalidWitnessLength); // checked up front: `zip_eq` below would panic on a mismatch, after `W` may already be updated
+    }
+
+    self
+      .W
+      .par_iter_mut()
+      .zip_eq(&W2.W)
+      .for_each(|(a, b)| *a += *r * *b);
+    self
+      .E
+      .par_iter_mut()
+      .zip_eq(T)
+      .for_each(|(a, b)| *a += *r * *b);
+
+    Ok(())
+  }
+
   /// Pads the provided witness to the correct length
   pub fn pad(&self, S: &R1CSShape<E>) -> RelaxedR1CSWitness<E> {
     let mut W = self.W.clone();
@@ -549,6 +666,19 @@ impl<E: Engine> RelaxedR1CSInstance<E> {
       u,
     }
   }
+
+  /// Mutably folds an incoming `R1CSInstance` `U2` into the current relaxed instance, using the challenge `r` and the cross-term commitment `comm_T`
+  pub fn fold_mut(&mut self, U2: &R1CSInstance<E>, comm_T: &Commitment<E>, r: &E::Scalar) {
+    let (X2, comm_W_2) = (&U2.X, &U2.comm_W);
+
+    // in-place weighted sum: X += r * X2, comm_W += r * comm_W_2, comm_E += r * comm_T, u += r
+    self.X.par_iter_mut().zip_eq(X2).for_each(|(a, b)| {
+      *a += *r * *b;
+    });
+    self.comm_W = self.comm_W + *comm_W_2 * *r;
+    self.comm_E = self.comm_E + *comm_T * *r;
+    self.u += *r;
+  }
 }
 
 impl<E: Engine> TranscriptReprTrait<E::GE> for RelaxedR1CSInstance<E> {
@@ -579,6 +709,11 @@ impl<E: Engine> AbsorbInROTrait<E> for RelaxedR1CSInstance<E> {
   }
 }
 
+/// Empty buffer for `commit_T_into`: a zero-length vector with capacity reserved for `shape.num_cons` scalars
+pub fn default_T<E: Engine>(shape: &R1CSShape<E>) -> Vec<E::Scalar> {
+  Vec::with_capacity(shape.num_cons)
+}
+
 #[cfg(test)]
 mod tests {
   use ff::Field;
diff --git a/src/r1cs/sparse.rs b/src/r1cs/sparse.rs
index 94634834..ce1db8ee 100644
--- a/src/r1cs/sparse.rs
+++ b/src/r1cs/sparse.rs
@@ -107,9 +107,23 @@ impl<F: PrimeField> SparseMatrix<F> {
   /// Multiply by a witness representing a dense vector; uses rayon to parallelize.
   /// This does not check that the shape of the matrix/vector are compatible.
   pub fn multiply_witness_unchecked(&self, W: &[F], u: &F, X: &[F]) -> Vec<F> {
-    let num_vars = W.len();
     // preallocate the result vector
-    let mut result = Vec::with_capacity(self.indptr.len() - 1);
+    let mut sink = Vec::with_capacity(self.indptr.len() - 1);
+    self.multiply_witness_into_unchecked(W, u, X, &mut sink);
+    sink
+  }
+
+  /// Multiply by a witness representing a dense vector, writing the product into `sink`; panics with "invalid shape" unless `self.cols == W.len() + X.len() + 1`.
+  pub fn multiply_witness_into(&self, W: &[F], u: &F, X: &[F], sink: &mut Vec<F>) {
+    assert_eq!(self.cols, W.len() + X.len() + 1, "invalid shape");
+
+    self.multiply_witness_into_unchecked(W, u, X, sink); // shape verified above, so delegate to the unchecked variant
+  }
+
+  /// Multiply by a witness representing a dense vector; uses rayon to parallelize.
+  /// This does not check that the shape of the matrix/vector are compatible.
+  pub fn multiply_witness_into_unchecked(&self, W: &[F], u: &F, X: &[F], sink: &mut Vec<F>) {
+    let num_vars = W.len();
     self
       .indptr
       .par_windows(2)
@@ -125,8 +139,7 @@ impl<F: PrimeField> SparseMatrix<F> {
             acc + val
           })
       })
-      .collect_into_vec(&mut result);
-    result
+      .collect_into_vec(sink);
   }
 
   /// number of non-zero entries