bigint: Split Modulus (and PartialModulus) into a submodule.

Better encapsulate `Modulus` and `PartialModulus`. To review the moved code, run `git diff HEAD^1:src/arithmetic/bigint.rs src/arithmetic/bigint/modulus.rs`.
2022-11-28 17:30:02 -08:00 · 2022-11-28 17:30:02 -08:00 · 7ab206e423
commit 7ab206e423
parent dc47d5f3a7
2 changed files with 319 additions and 262 deletions
--- a/src/arithmetic/bigint.rs
+++ b/src/arithmetic/bigint.rs
@ -36,13 +36,16 @@
 //! [Static checking of units in Servo]:
 //!     https://blog.mozilla.org/research/2014/06/23/static-checking-of-units-in-servo/

-pub(crate) use self::private_exponent::PrivateExponent;
+pub(crate) use self::{
+    modulus::{Modulus, PartialModulus, MODULUS_MAX_LIMBS},
+    private_exponent::PrivateExponent,
+};
 pub(crate) use super::nonnegative::Nonnegative;
 use crate::{
    arithmetic::montgomery::*,
    bits, bssl, c, cpu, error,
    limb::{self, Limb, LimbMask, LIMB_BITS, LIMB_BYTES},
-    polyfill::{u64_from_usize, LeadingZerosStripped},
+    polyfill::u64_from_usize,
 };
 use alloc::{borrow::ToOwned as _, boxed::Box, vec};
 use core::{
@ -52,6 +55,7 @@ use core::{
 };

 mod bn_mul_mont_fallback;
+mod modulus;
 mod private_exponent;

 /// A prime modulus.
@ -140,7 +144,7 @@ impl<M> BoxedLimbs<M> {
    ) -> Result<Self, error::Unspecified> {
        let mut r = Self::zero(m.width());
        limb::parse_big_endian_and_pad_consttime(input, &mut r)?;
-        if limb::limbs_less_than_limbs_consttime(&r, &m.limbs) != LimbMask::True {
+        if limb::limbs_less_than_limbs_consttime(&r, m.limbs()) != LimbMask::True {
            return Err(error::Unspecified);
        }
        Ok(r)
@ -205,239 +209,6 @@ pub unsafe trait NotMuchSmallerModulus<L>: SmallerModulus<L> {}

 pub trait PublicModulus {}

-/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
-/// limbs. For a long time we have required 4 limbs for all targets, though
-/// this may be unnecessary. TODO: Replace this with
-/// `n.len() < 256 / LIMB_BITS` so that 32-bit and 64-bit platforms behave the
-/// same.
-pub const MODULUS_MIN_LIMBS: usize = 4;
-
-pub const MODULUS_MAX_LIMBS: usize = 8192 / LIMB_BITS;
-
-/// The modulus *m* for a ring ℤ/mℤ, along with the precomputed values needed
-/// for efficient Montgomery multiplication modulo *m*. The value must be odd
-/// and larger than 2. The larger-than-1 requirement is imposed, at least, by
-/// the modular inversion code.
-pub struct Modulus<M> {
-    limbs: BoxedLimbs<M>, // Also `value >= 3`.
-
-    // n0 * N == -1 (mod r).
-    //
-    // r == 2**(N0_LIMBS_USED * LIMB_BITS) and LG_LITTLE_R == lg(r). This
-    // ensures that we can do integer division by |r| by simply ignoring
-    // `N0_LIMBS_USED` limbs. Similarly, we can calculate values modulo `r` by
-    // just looking at the lowest `N0_LIMBS_USED` limbs. This is what makes
-    // Montgomery multiplication efficient.
-    //
-    // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography
-    // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a
-    // multi-limb Montgomery multiplication of a * b (mod n), given the
-    // unreduced product t == a * b, we repeatedly calculate:
-    //
-    //    t1 := t % r         |t1| is |t|'s lowest limb (see previous paragraph).
-    //    t2 := t1*n0*n
-    //    t3 := t + t2
-    //    t := t3 / r         copy all limbs of |t3| except the lowest to |t|.
-    //
-    // In the last step, it would only make sense to ignore the lowest limb of
-    // |t3| if it were zero. The middle steps ensure that this is the case:
-    //
-    //                            t3 ==  0 (mod r)
-    //                        t + t2 ==  0 (mod r)
-    //                   t + t1*n0*n ==  0 (mod r)
-    //                       t1*n0*n == -t (mod r)
-    //                        t*n0*n == -t (mod r)
-    //                          n0*n == -1 (mod r)
-    //                            n0 == -1/n (mod r)
-    //
-    // Thus, in each iteration of the loop, we multiply by the constant factor
-    // n0, the negative inverse of n (mod r).
-    //
-    // TODO(perf): Not all 32-bit platforms actually make use of n0[1]. For the
-    // ones that don't, we could use a shorter `R` value and use faster `Limb`
-    // calculations instead of double-precision `u64` calculations.
-    n0: N0,
-
-    oneRR: One<M, RR>,
-
-    cpu_features: cpu::Features,
-}
-
-impl<M: PublicModulus> Clone for Modulus<M> {
-    fn clone(&self) -> Self {
-        Self {
-            limbs: self.limbs.clone(),
-            n0: self.n0.clone(),
-            oneRR: self.oneRR.clone(),
-            cpu_features: self.cpu_features,
-        }
-    }
-}
-
-impl<M: PublicModulus> core::fmt::Debug for Modulus<M> {
-    fn fmt(&self, fmt: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> {
-        fmt.debug_struct("Modulus")
-            // TODO: Print modulus value.
-            .finish()
-    }
-}
-
-impl<M> Modulus<M> {
-    pub(crate) fn from_be_bytes_with_bit_length(
-        input: untrusted::Input,
-        cpu_features: cpu::Features,
-    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
-        let limbs = BoxedLimbs::positive_minimal_width_from_be_bytes(input)?;
-        Self::from_boxed_limbs(limbs, cpu_features)
-    }
-
-    pub(crate) fn from_nonnegative_with_bit_length(
-        n: Nonnegative,
-        cpu_features: cpu::Features,
-    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
-        let limbs = BoxedLimbs {
-            limbs: n.into_limbs(),
-            m: PhantomData,
-        };
-        Self::from_boxed_limbs(limbs, cpu_features)
-    }
-
-    fn from_boxed_limbs(
-        n: BoxedLimbs<M>,
-        cpu_features: cpu::Features,
-    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
-        if n.len() > MODULUS_MAX_LIMBS {
-            return Err(error::KeyRejected::too_large());
-        }
-        if n.len() < MODULUS_MIN_LIMBS {
-            return Err(error::KeyRejected::unexpected_error());
-        }
-        if limb::limbs_are_even_constant_time(&n) != LimbMask::False {
-            return Err(error::KeyRejected::invalid_component());
-        }
-        if limb::limbs_less_than_limb_constant_time(&n, 3) != LimbMask::False {
-            return Err(error::KeyRejected::unexpected_error());
-        }
-
-        // n_mod_r = n % r. As explained in the documentation for `n0`, this is
-        // done by taking the lowest `N0_LIMBS_USED` limbs of `n`.
-        #[allow(clippy::useless_conversion)]
-        let n0 = {
-            prefixed_extern! {
-                fn bn_neg_inv_mod_r_u64(n: u64) -> u64;
-            }
-
-            // XXX: u64::from isn't guaranteed to be constant time.
-            let mut n_mod_r: u64 = u64::from(n[0]);
-
-            if N0_LIMBS_USED == 2 {
-                // XXX: If we use `<< LIMB_BITS` here then 64-bit builds
-                // fail to compile because of `deny(exceeding_bitshifts)`.
-                debug_assert_eq!(LIMB_BITS, 32);
-                n_mod_r |= u64::from(n[1]) << 32;
-            }
-            N0::from(unsafe { bn_neg_inv_mod_r_u64(n_mod_r) })
-        };
-
-        let bits = limb::limbs_minimal_bits(&n.limbs);
-        let oneRR = {
-            let partial = PartialModulus {
-                limbs: &n.limbs,
-                n0: n0.clone(),
-                m: PhantomData,
-                cpu_features,
-            };
-
-            One::newRR(&partial, bits)
-        };
-
-        Ok((
-            Self {
-                limbs: n,
-                n0,
-                oneRR,
-                cpu_features,
-            },
-            bits,
-        ))
-    }
-
-    #[inline]
-    fn width(&self) -> Width<M> {
-        self.limbs.width()
-    }
-
-    fn zero<E>(&self) -> Elem<M, E> {
-        Elem {
-            limbs: BoxedLimbs::zero(self.width()),
-            encoding: PhantomData,
-        }
-    }
-
-    // TODO: Get rid of this
-    fn one(&self) -> Elem<M, Unencoded> {
-        let mut r = self.zero();
-        r.limbs[0] = 1;
-        r
-    }
-
-    pub fn oneRR(&self) -> &One<M, RR> {
-        &self.oneRR
-    }
-
-    pub fn to_elem<L>(&self, l: &Modulus<L>) -> Elem<L, Unencoded>
-    where
-        M: SmallerModulus<L>,
-    {
-        // TODO: Encode this assertion into the `where` above.
-        assert_eq!(self.width().num_limbs, l.width().num_limbs);
-        let limbs = self.limbs.clone();
-        Elem {
-            limbs: BoxedLimbs {
-                limbs: limbs.limbs,
-                m: PhantomData,
-            },
-            encoding: PhantomData,
-        }
-    }
-
-    pub(crate) fn as_partial(&self) -> PartialModulus<M> {
-        PartialModulus {
-            limbs: &self.limbs,
-            n0: self.n0.clone(),
-            m: PhantomData,
-            cpu_features: self.cpu_features,
-        }
-    }
-}
-
-impl<M: PublicModulus> Modulus<M> {
-    pub fn be_bytes(&self) -> LeadingZerosStripped<impl ExactSizeIterator<Item = u8> + Clone + '_> {
-        LeadingZerosStripped::new(limb::unstripped_be_bytes(&self.limbs))
-    }
-}
-
-pub(crate) struct PartialModulus<'a, M> {
-    limbs: &'a [Limb],
-    n0: N0,
-    m: PhantomData<M>,
-    cpu_features: cpu::Features,
-}
-
-impl<M> PartialModulus<'_, M> {
-    // TODO: XXX Avoid duplication with `Modulus`.
-    fn zero(&self) -> Elem<M, R> {
-        let width = Width {
-            num_limbs: self.limbs.len(),
-            m: PhantomData,
-        };
-        Elem {
-            limbs: BoxedLimbs::zero(width),
-            encoding: PhantomData,
-        }
-    }
-}
-
 /// Elements of ℤ/mℤ for some modulus *m*.
 //
 // Defaulting `E` to `Unencoded` is a convenience for callers from outside this
@ -480,7 +251,7 @@ impl<M, E: ReductionEncoding> Elem<M, E> {
        let mut one = [0; MODULUS_MAX_LIMBS];
        one[0] = 1;
        let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
-        limbs_mont_mul(&mut limbs, one, &m.limbs, &m.n0, m.cpu_features);
+        limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
        Elem {
            limbs,
            encoding: PhantomData,
@ -547,7 +318,7 @@ fn elem_mul_<M, AF, BF>(
 where
    (AF, BF): ProductEncoding,
 {
-    limbs_mont_mul(&mut b.limbs, &a.limbs, m.limbs, &m.n0, m.cpu_features);
+    limbs_mont_mul(&mut b.limbs, &a.limbs, m.limbs(), m.n0(), m.cpu_features());
    Elem {
        limbs: b.limbs,
        encoding: PhantomData,
@ -562,8 +333,8 @@ fn elem_mul_by_2<M, AF>(a: &mut Elem<M, AF>, m: &PartialModulus<M>) {
        LIMBS_shl_mod(
            a.limbs.as_mut_ptr(),
            a.limbs.as_ptr(),
-            m.limbs.as_ptr(),
-            m.limbs.len(),
+            m.limbs().as_ptr(),
+            m.limbs().len(),
        );
    }
 }
@ -573,8 +344,8 @@ pub fn elem_reduced_once<Larger, Smaller: SlightlySmallerModulus<Larger>>(
    m: &Modulus<Smaller>,
 ) -> Elem<Smaller, Unencoded> {
    let mut r = a.limbs.clone();
-    assert!(r.len() <= m.limbs.len());
-    limb::limbs_reduce_once_constant_time(&mut r, &m.limbs);
+    assert!(r.len() <= m.limbs().len());
+    limb::limbs_reduce_once_constant_time(&mut r, m.limbs());
    Elem {
        limbs: BoxedLimbs {
            limbs: r.limbs,
@ -594,7 +365,7 @@ pub fn elem_reduced<Larger, Smaller: NotMuchSmallerModulus<Larger>>(
    tmp.copy_from_slice(&a.limbs);

    let mut r = m.zero();
-    limbs_from_mont_in_place(&mut r.limbs, tmp, &m.limbs, &m.n0);
+    limbs_from_mont_in_place(&mut r.limbs, tmp, m.limbs(), m.n0());
    r
 }

@ -605,7 +376,7 @@ fn elem_squared<M, E>(
 where
    (E, E): ProductEncoding,
 {
-    limbs_mont_square(&mut a.limbs, m.limbs, &m.n0, m.cpu_features);
+    limbs_mont_square(&mut a.limbs, m.limbs(), m.n0(), m.cpu_features());
    Elem {
        limbs: a.limbs,
        encoding: PhantomData,
@ -623,7 +394,7 @@ pub fn elem_widen<Larger, Smaller: SmallerModulus<Larger>>(

 // TODO: Document why this works for all Montgomery factors.
 pub fn elem_add<M, E>(mut a: Elem<M, E>, b: Elem<M, E>, m: &Modulus<M>) -> Elem<M, E> {
-    limb::limbs_add_assign_mod(&mut a.limbs, &b.limbs, &m.limbs);
+    limb::limbs_add_assign_mod(&mut a.limbs, &b.limbs, m.limbs());
    a
 }

@ -644,8 +415,8 @@ pub fn elem_sub<M, E>(mut a: Elem<M, E>, b: &Elem<M, E>, m: &Modulus<M>) -> Elem
            a.limbs.as_mut_ptr(),
            a.limbs.as_ptr(),
            b.limbs.as_ptr(),
-            m.limbs.as_ptr(),
-            m.limbs.len(),
+            m.limbs().as_ptr(),
+            m.limbs().len(),
        );
    }
    a
@ -780,7 +551,7 @@ pub fn elem_exp_consttime<M>(
    const WINDOW_BITS: usize = 5;
    const TABLE_ENTRIES: usize = 1 << WINDOW_BITS;

-    let num_limbs = m.limbs.len();
+    let num_limbs = m.limbs().len();

    let mut table = vec![0; TABLE_ENTRIES * num_limbs];

@ -823,7 +594,6 @@ pub fn elem_exp_consttime<M>(
    fn entry_mut(table: &mut [Limb], i: usize, num_limbs: usize) -> &mut [Limb] {
        &mut table[(i * num_limbs)..][..num_limbs]
    }
-    let num_limbs = m.limbs.len();
    entry_mut(&mut table, 0, num_limbs).copy_from_slice(&tmp.limbs);
    entry_mut(&mut table, 1, num_limbs).copy_from_slice(&base.limbs);
    for i in 2..TABLE_ENTRIES {
@ -836,7 +606,7 @@ pub fn elem_exp_consttime<M>(
        let src1 = entry(previous, src1, num_limbs);
        let src2 = entry(previous, src2, num_limbs);
        let dst = entry_mut(rest, 0, num_limbs);
-        limbs_mont_product(dst, src1, src2, &m.limbs, &m.n0, m.cpu_features);
+        limbs_mont_product(dst, src1, src2, m.limbs(), m.n0(), m.cpu_features());
    }

    let (r, _) = limb::fold_5_bit_windows(
@ -874,7 +644,7 @@ pub fn elem_exp_consttime<M>(
    // Pretty much all the math here requires CPU feature detection to have
    // been done. `cpu_features` isn't threaded through all the internal
    // functions, so just make it clear that it has been done at this point.
-    let _ = m.cpu_features;
+    let cpu_features = m.cpu_features();

    // The x86_64 assembly was written under the assumption that the input data
    // is aligned to `MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH` bytes, which was/is
@ -890,7 +660,7 @@ pub fn elem_exp_consttime<M>(
    const WINDOW_BITS: usize = 5;
    const TABLE_ENTRIES: usize = 1 << WINDOW_BITS;

-    let num_limbs = m.limbs.len();
+    let num_limbs = m.limbs().len();

    const ALIGNMENT: usize = 64;
    assert_eq!(ALIGNMENT % LIMB_BYTES, 0);
@ -914,7 +684,7 @@ pub fn elem_exp_consttime<M>(
    const M: usize = BASE + 1; // `np` in OpenSSL

    entry_mut(state, BASE, num_limbs).copy_from_slice(&base.limbs);
-    entry_mut(state, M, num_limbs).copy_from_slice(&m.limbs);
+    entry_mut(state, M, num_limbs).copy_from_slice(m.limbs());

    fn scatter(table: &mut [Limb], state: &[Limb], i: Window, num_limbs: usize) {
        prefixed_extern! {
@ -1013,7 +783,7 @@ pub fn elem_exp_consttime<M>(
    {
        let acc = entry_mut(state, ACC, num_limbs);
        acc[0] = 1;
-        limbs_mont_mul(acc, &m.oneRR.0.limbs, &m.limbs, &m.n0, m.cpu_features);
+        limbs_mont_mul(acc, &m.oneRR().0.limbs, m.limbs(), m.n0(), cpu_features);
    }
    scatter(table, state, 0, num_limbs);

@ -1024,9 +794,9 @@ pub fn elem_exp_consttime<M>(
    for i in 2..(TABLE_ENTRIES as Window) {
        if i % 2 == 0 {
            // TODO: Optimize this to avoid gathering
-            gather_square(table, state, &m.n0, i / 2, num_limbs, m.cpu_features);
+            gather_square(table, state, m.n0(), i / 2, num_limbs, cpu_features);
        } else {
-            gather_mul_base(table, state, &m.n0, i - 1, num_limbs)
+            gather_mul_base(table, state, m.n0(), i - 1, num_limbs)
        };
        scatter(table, state, i, num_limbs);
    }
@ -1038,7 +808,7 @@ pub fn elem_exp_consttime<M>(
            state
        },
        |state, window| {
-            power(table, state, &m.n0, window, num_limbs);
+            power(table, state, m.n0(), window, num_limbs);
            state
        },
    );
@ -1059,7 +829,7 @@ pub fn elem_exp_consttime<M>(
            entry(state, ACC, num_limbs).as_ptr(),
            core::ptr::null(),
            entry(state, M, num_limbs).as_ptr(),
-            &m.n0,
+            m.n0(),
            num_limbs,
        )
    })?;
@ -1106,11 +876,11 @@ impl Nonnegative {
    }

    pub fn verify_less_than_modulus<M>(&self, m: &Modulus<M>) -> Result<(), error::Unspecified> {
-        if self.limbs().len() > m.limbs.len() {
+        if self.limbs().len() > m.limbs().len() {
            return Err(error::Unspecified);
        }
-        if self.limbs().len() == m.limbs.len() {
-            if limb::limbs_less_than_limbs_consttime(self.limbs(), &m.limbs) != LimbMask::True {
+        if self.limbs().len() == m.limbs().len() {
+            if limb::limbs_less_than_limbs_consttime(self.limbs(), m.limbs()) != LimbMask::True {
                return Err(error::Unspecified);
            }
        }
@ -1275,7 +1045,7 @@ prefixed_extern! {

 #[cfg(test)]
 mod tests {
-    use super::*;
+    use super::{modulus::MODULUS_MIN_LIMBS, *};
    use crate::test;
    use alloc::format;

--- a/src/arithmetic/bigint/modulus.rs
+++ b/src/arithmetic/bigint/modulus.rs
@ -0,0 +1,287 @@
+// Copyright 2015-2023 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use super::{
+    super::montgomery::{Unencoded, R, RR},
+    BoxedLimbs, Elem, Nonnegative, One, PublicModulus, SmallerModulus, Width, N0, N0_LIMBS_USED,
+};
+use crate::{
+    bits, cpu, error,
+    limb::{self, Limb, LimbMask, LIMB_BITS},
+    polyfill::LeadingZerosStripped,
+};
+use core::marker::PhantomData;
+
+/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
+/// limbs. For a long time we have required 4 limbs for all targets, though
+/// this may be unnecessary. TODO: Replace this with
+/// `n.len() < 256 / LIMB_BITS` so that 32-bit and 64-bit platforms behave the
+/// same.
+pub const MODULUS_MIN_LIMBS: usize = 4;
+
+pub const MODULUS_MAX_LIMBS: usize = 8192 / LIMB_BITS;
+
+/// The modulus *m* for a ring ℤ/mℤ, along with the precomputed values needed
+/// for efficient Montgomery multiplication modulo *m*. The value must be odd
+/// and at least 3; for an odd value that is the same as being larger than 1,
+/// which is required, at least, by the modular inversion code.
+pub struct Modulus<M> {
+    limbs: BoxedLimbs<M>, // Also `value >= 3`.
+
+    // n0 * n == -1 (mod r).
+    //
+    // r == 2**(N0_LIMBS_USED * LIMB_BITS) and LG_LITTLE_R == lg(r). This
+    // ensures that we can do integer division by |r| by simply ignoring
+    // `N0_LIMBS_USED` limbs. Similarly, we can calculate values modulo `r` by
+    // just looking at the lowest `N0_LIMBS_USED` limbs. This is what makes
+    // Montgomery multiplication efficient.
+    //
+    // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography
+    // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a
+    // multi-limb Montgomery multiplication of a * b (mod n), given the
+    // unreduced product t == a * b, we repeatedly calculate:
+    //
+    //    t1 := t % r         |t1| is |t|'s lowest limb (see previous paragraph).
+    //    t2 := t1*n0*n
+    //    t3 := t + t2
+    //    t := t3 / r         copy all limbs of |t3| except the lowest to |t|.
+    //
+    // In the last step, it would only make sense to ignore the lowest limb of
+    // |t3| if it were zero. The middle steps ensure that this is the case:
+    //
+    //                            t3 ==  0 (mod r)
+    //                        t + t2 ==  0 (mod r)
+    //                   t + t1*n0*n ==  0 (mod r)
+    //                       t1*n0*n == -t (mod r)
+    //                        t*n0*n == -t (mod r)
+    //                          n0*n == -1 (mod r)
+    //                            n0 == -1/n (mod r)
+    //
+    // Thus, in each iteration of the loop, we multiply by the constant factor
+    // n0, the negative inverse of n (mod r).
+    //
+    // TODO(perf): Not all 32-bit platforms actually make use of n0[1]. For the
+    // ones that don't, we could use a shorter `R` value and use faster `Limb`
+    // calculations instead of double-precision `u64` calculations.
+    n0: N0,
+
+    oneRR: One<M, RR>,
+
+    cpu_features: cpu::Features,
+}
+
+impl<M: PublicModulus> Clone for Modulus<M> {
+    fn clone(&self) -> Self {
+        Self {
+            limbs: self.limbs.clone(),
+            n0: self.n0.clone(),
+            oneRR: self.oneRR.clone(),
+            cpu_features: self.cpu_features,
+        }
+    }
+}
+
+impl<M: PublicModulus> core::fmt::Debug for Modulus<M> {
+    fn fmt(&self, fmt: &mut ::core::fmt::Formatter) -> Result<(), ::core::fmt::Error> {
+        fmt.debug_struct("Modulus")
+            // TODO: Print modulus value.
+            .finish()
+    }
+}
+
+impl<M> Modulus<M> {
+    pub(crate) fn from_be_bytes_with_bit_length(
+        input: untrusted::Input,
+        cpu_features: cpu::Features,
+    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
+        let limbs = BoxedLimbs::positive_minimal_width_from_be_bytes(input)?;
+        Self::from_boxed_limbs(limbs, cpu_features)
+    }
+
+    pub(crate) fn from_nonnegative_with_bit_length(
+        n: Nonnegative,
+        cpu_features: cpu::Features,
+    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
+        let limbs = BoxedLimbs {
+            limbs: n.into_limbs(),
+            m: PhantomData,
+        };
+        Self::from_boxed_limbs(limbs, cpu_features)
+    }
+
+    pub(super) fn from_boxed_limbs(
+        n: BoxedLimbs<M>,
+        cpu_features: cpu::Features,
+    ) -> Result<(Self, bits::BitLength), error::KeyRejected> {
+        if n.len() > MODULUS_MAX_LIMBS {
+            return Err(error::KeyRejected::too_large());
+        }
+        if n.len() < MODULUS_MIN_LIMBS {
+            return Err(error::KeyRejected::unexpected_error());
+        }
+        if limb::limbs_are_even_constant_time(&n) != LimbMask::False {
+            return Err(error::KeyRejected::invalid_component());
+        }
+        if limb::limbs_less_than_limb_constant_time(&n, 3) != LimbMask::False {
+            return Err(error::KeyRejected::unexpected_error());
+        }
+
+        // n_mod_r = n % r. As explained in the documentation for `n0`, this is
+        // done by taking the lowest `N0_LIMBS_USED` limbs of `n`.
+        #[allow(clippy::useless_conversion)]
+        let n0 = {
+            prefixed_extern! {
+                fn bn_neg_inv_mod_r_u64(n: u64) -> u64;
+            }
+
+            // XXX: u64::from isn't guaranteed to be constant time.
+            let mut n_mod_r: u64 = u64::from(n[0]);
+
+            if N0_LIMBS_USED == 2 {
+                // XXX: If we use `<< LIMB_BITS` here then 64-bit builds
+                // fail to compile because of `deny(exceeding_bitshifts)`.
+                debug_assert_eq!(LIMB_BITS, 32);
+                n_mod_r |= u64::from(n[1]) << 32;
+            }
+            N0::from(unsafe { bn_neg_inv_mod_r_u64(n_mod_r) })
+        };
+
+        let bits = limb::limbs_minimal_bits(&n.limbs);
+        let oneRR = {
+            let partial = PartialModulus {
+                limbs: &n.limbs,
+                n0: n0.clone(),
+                m: PhantomData,
+                cpu_features,
+            };
+
+            One::newRR(&partial, bits)
+        };
+
+        Ok((
+            Self {
+                limbs: n,
+                n0,
+                oneRR,
+                cpu_features,
+            },
+            bits,
+        ))
+    }
+
+    #[inline]
+    pub(super) fn cpu_features(&self) -> cpu::Features {
+        self.cpu_features
+    }
+
+    #[inline]
+    pub(super) fn limbs(&self) -> &[Limb] {
+        &self.limbs
+    }
+
+    #[inline]
+    pub(super) fn n0(&self) -> &N0 {
+        &self.n0
+    }
+
+    #[inline]
+    pub(super) fn width(&self) -> Width<M> {
+        self.limbs.width()
+    }
+
+    pub(super) fn zero<E>(&self) -> Elem<M, E> {
+        Elem {
+            limbs: BoxedLimbs::zero(self.width()),
+            encoding: PhantomData,
+        }
+    }
+
+    // TODO: Get rid of this
+    pub(super) fn one(&self) -> Elem<M, Unencoded> {
+        let mut r = self.zero();
+        r.limbs[0] = 1;
+        r
+    }
+
+    pub fn oneRR(&self) -> &One<M, RR> {
+        &self.oneRR
+    }
+
+    pub fn to_elem<L>(&self, l: &Modulus<L>) -> Elem<L, Unencoded>
+    where
+        M: SmallerModulus<L>,
+    {
+        // TODO: Encode this assertion into the `where` above.
+        assert_eq!(self.width().num_limbs, l.width().num_limbs);
+        let limbs = self.limbs.clone();
+        Elem {
+            limbs: BoxedLimbs {
+                limbs: limbs.limbs,
+                m: PhantomData,
+            },
+            encoding: PhantomData,
+        }
+    }
+
+    pub(crate) fn as_partial(&self) -> PartialModulus<M> {
+        PartialModulus {
+            limbs: &self.limbs,
+            n0: self.n0.clone(),
+            m: PhantomData,
+            cpu_features: self.cpu_features,
+        }
+    }
+}
+
+impl<M: PublicModulus> Modulus<M> {
+    pub fn be_bytes(&self) -> LeadingZerosStripped<impl ExactSizeIterator<Item = u8> + Clone + '_> {
+        LeadingZerosStripped::new(limb::unstripped_be_bytes(&self.limbs))
+    }
+}
+
+pub(crate) struct PartialModulus<'a, M> {
+    limbs: &'a [Limb],
+    n0: N0,
+    m: PhantomData<M>,
+    cpu_features: cpu::Features,
+}
+
+impl<M> PartialModulus<'_, M> {
+    // TODO: XXX Avoid duplication with `Modulus`.
+    pub(super) fn zero(&self) -> Elem<M, R> {
+        let width = Width {
+            num_limbs: self.limbs.len(),
+            m: PhantomData,
+        };
+        Elem {
+            limbs: BoxedLimbs::zero(width),
+            encoding: PhantomData,
+        }
+    }
+
+    #[inline]
+    pub(super) fn limbs(&self) -> &[Limb] {
+        self.limbs
+    }
+
+    #[inline]
+    pub(super) fn n0(&self) -> &N0 {
+        &self.n0
+    }
+
+    #[inline]
+    pub fn cpu_features(&self) -> cpu::Features {
+        self.cpu_features
+    }
+}