bigint: Save one modular doubling in Montgomery RR setup.

Eliminate one modular doubling in Montgomery RR setup. This saves one modular doubling (modulo the public modulus) per RSA signature verification, at the cost of approximately one public-modulus-wide XOR. RsaKeyPair sees similar savings per Modulus.
2023-11-11 16:30:25 -08:00 · 2023-11-11 16:30:25 -08:00 · 81e17e4b10
commit 81e17e4b10
parent 0349d2a332
2 changed files with 26 additions and 4 deletions
--- a/src/arithmetic/bigint.rs
+++ b/src/arithmetic/bigint.rs
@ -289,10 +289,19 @@ impl<M> One<M, RR> {
        let m_bits = m.len_bits().as_usize_bits();
        let r = (m_bits + (LIMB_BITS - 1)) / LIMB_BITS * LIMB_BITS;
-        // base = 2**(lg m - 1).
+        // base = 2**r - m.
        let bit = m_bits - 1;
        let mut base = m.zero();
-        base.limbs[bit / LIMB_BITS] = 1 << (bit % LIMB_BITS);
+        limb::limbs_negative_odd(&mut base.limbs, m.limbs());
        // Correct base to 2**(lg m) (mod m).
        let lg_m = m.len_bits().as_usize_bits();
        let leading_zero_bits_in_m = r - lg_m;
        if leading_zero_bits_in_m != 0 {
            debug_assert!(leading_zero_bits_in_m < LIMB_BITS);
            // `limbs_negative_odd` flipped all the leading zero bits to ones.
            // Flip them back.
            *base.limbs.last_mut().unwrap() &= (!0) >> leading_zero_bits_in_m;
        }
        // Double `base` so that base == R == 2**r (mod m). For normal moduli
        // that have the high bit of the highest limb set, this requires one
@ -312,7 +321,7 @@ impl<M> One<M, RR> {
        const LG_BASE: usize = 2; // Doubling vs. squaring trade-off.
        debug_assert_eq!(LG_BASE.count_ones(), 1); // Must be 2**n for n >= 0.
-        let doublings = r - bit + LG_BASE;
+        let doublings = leading_zero_bits_in_m + LG_BASE;
        // `m_bits >= LG_BASE` (for the currently chosen value of `LG_BASE`)
        // since we require the modulus to have at least `MODULUS_MIN_LIMBS`
        // limbs. `r >= m_bits` as seen above. So `r >= LG_BASE` and thus
--- a/src/limb.rs
+++ b/src/limb.rs
@ -350,6 +350,19 @@ pub(crate) fn limbs_add_assign_mod(a: &mut [Limb], b: &[Limb], m: &[Limb]) {
    unsafe { LIMBS_add_mod(a.as_mut_ptr(), a.as_ptr(), b.as_ptr(), m.as_ptr(), m.len()) }
 }
 /// Stores the two's complement negation of `a` into `r` (`*r = -a`).
 ///
 /// The caller must pass an odd `a`; the shortcut used for the "add one"
 /// step below is only valid in that case. `r` and `a` are expected to hold
 /// the same number of limbs, which is verified in debug builds only.
 pub(crate) fn limbs_negative_odd(r: &mut [Limb], a: &[Limb]) {
    debug_assert_eq!(r.len(), a.len());

    // Negating in two's complement means "invert every bit, then add one".
    // Inversion first; this is a simple element-wise pass that the compiler
    // is expected to be able to vectorize.
    for (negated, &limb) in r.iter_mut().zip(a) {
        *negated = !limb;
    }

    // Now the "+ 1". An odd `a` has its lowest bit set, so the inverted value
    // has that bit clear; setting it is then the same as adding one, and no
    // carry can propagate into the other limbs.
    r[0] |= 1;
 }
 prefixed_extern! {
    fn LIMBS_are_zero(a: *const Limb, num_limbs: c::size_t) -> LimbMask;
    fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::size_t) -> LimbMask;