worldfnd · xrvdg · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 25, 2026
diff --git a/skyscraper/bn254-multiplier/gen_multiples.py b/skyscraper/bn254-multiplier/gen_multiples.py
@@ -0,0 +1,35 @@
+"""Generate Rust const lookup tables for multiples of the BN254 scalar field prime."""
+
+p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
+
+
+def int_to_limbs(size, n, count):
+    mask = 2**size - 1
+    limbs = []
+    for _ in range(count):
+        limbs.append(n & mask)
+        n >>= size
+    return limbs
+
+
+def main():
+    multiples_64 = [int_to_limbs(64, k * p, 4) for k in range(0, 6)]
+    multiples_51 = [int_to_limbs(51, k * p, 5) for k in range(0, 6)]
+
+    # Print 64-bit table (for constants.rs)
+    print("pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [")
+    for k, limbs in enumerate(multiples_64):
+        fmt = ", ".join(f"0x{l:016x}" for l in limbs)
+        print(f"    [{fmt}], // {k}P")
+    print("];")
+
+    # Print 51-bit table (for rne/constants.rs)
+    print("\npub const U51_P_MULTIPLES: [[u64; 5]; 6] = [")
+    for k, limbs in enumerate(multiples_51):
+        fmt = ", ".join(f"0x{l:013x}" for l in limbs)
+        print(f"    [{fmt}], // {k}P")
+    print("];")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs
@@ -1,17 +1,47 @@
-pub const U64_NP0: u64 = 0xc2e1f593efffffff;
+pub const U64_P: [u64; 4] = U64_P_MULTIPLES[1];
 
-pub const U64_P: [u64; 4] = [
-    0x43e1f593f0000001,
-    0x2833e84879b97091,
-    0xb85045b68181585d,
-    0x30644e72e131a029,
-];
+pub const U64_2P: [u64; 4] = U64_P_MULTIPLES[2];
 
-pub const U64_2P: [u64; 4] = [
-    0x87c3eb27e0000002,
-    0x5067d090f372e122,
-    0x70a08b6d0302b0ba,
-    0x60c89ce5c2634053,
+/// Lookup table: `U64_P_MULTIPLES[k]` = `k * P` for k in 0..=5.
+/// Index 0 is all-zeros; use as `x - U64_P_MULTIPLES[k]` to subtract k copies
+/// of P.
+pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [
+    [
+        0x0000000000000000,
+        0x0000000000000000,
+        0x0000000000000000,
+        0x0000000000000000,
+    ], // 0P
+    [
+        0x43e1f593f0000001,
+        0x2833e84879b97091,
+        0xb85045b68181585d,
+        0x30644e72e131a029,
+    ], // 1P
+    [
+        0x87c3eb27e0000002,
+        0x5067d090f372e122,
+        0x70a08b6d0302b0ba,
+        0x60c89ce5c2634053,
+    ], // 2P
+    [
+        0xcba5e0bbd0000003,
+        0x789bb8d96d2c51b3,
+        0x28f0d12384840917,
+        0x912ceb58a394e07d,
+    ], // 3P
+    [
+        0x0f87d64fc0000004,
+        0xa0cfa121e6e5c245,
+        0xe14116da06056174,
+        0xc19139cb84c680a6,
+    ], // 4P
+    [
+        0x5369cbe3b0000005,
+        0xc903896a609f32d6,
+        0x99915c908786b9d1,
+        0xf1f5883e65f820d0,
+    ], // 5P
 ];
 
 // R mod P

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
@@ -13,7 +13,7 @@ pub mod rtz;
 pub mod constants;
 pub mod rne;
 mod scalar;
-mod utils;
+pub mod utils;
 
 #[cfg(not(target_arch = "wasm32"))] // Proptest not supported on WASI
 mod test_utils;

diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs
@@ -6,12 +6,53 @@ use crate::pow_2;
 pub const U51_NP0: u64 = 0x1f593efffffff;
 
 /// BN254 scalar field prime
-pub const U51_P: [u64; 5] = [
-    0x1f593f0000001,
-    0x10f372e12287c,
-    0x6056174a0cfa1,
-    0x014dc2822db40,
-    0x30644e72e131a,
+pub const U51_P: [u64; 5] = U51_P_MULTIPLES[1];
+
+/// Lookup table: `U51_P_MULTIPLES[k]` = `k * P` for k in 0..=5, in 51-bit
+/// limbs.
+pub const U51_P_MULTIPLES: [[u64; 5]; 6] = [
+    [
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+    ], // 0P
+    [
+        0x1f593f0000001,
+        0x10f372e12287c,
+        0x6056174a0cfa1,
+        0x014dc2822db40,
+        0x30644e72e131a,
+    ], // 1P
+    [
+        0x3eb27e0000002,
+        0x21e6e5c2450f8,
+        0x40ac2e9419f42,
+        0x029b85045b681,
+        0x60c89ce5c2634,
+    ], // 2P
+    [
+        0x5e0bbd0000003,
+        0x32da58a367974,
+        0x210245de26ee3,
+        0x03e94786891c2,
+        0x112ceb58a394e,
+    ], // 3P
+    [
+        0x7d64fc0000004,
+        0x43cdcb848a1f0,
+        0x01585d2833e84,
+        0x05370a08b6d03,
+        0x419139cb84c68,
+    ], // 4P
+    [
+        0x1cbe3b0000005,
+        0x54c13e65aca6d,
+        0x61ae747240e25,
+        0x0684cc8ae4843,
+        0x71f5883e65f82,
+    ], // 5P
 ];
 
 /// Bit mask for 51-bit limbs.

diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs
@@ -1,4 +1,4 @@
-use crate::constants::U64_2P;
+use crate::constants::{self, U64_2P};
 
 /// Macro to extract a subarray from an array.
 ///
@@ -97,3 +97,136 @@ pub const fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64
     let c: u128 = widening_mul(a, b) + carry as u128 + add as u128;
     (c as u64, (c >> 64) as u64)
 }
+
+/// Precomputed magic constants for fast approximate floor division by the
+/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by
+/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the
+/// declared bit range.
+pub struct MulShift {
+    mul:   u64,
+    shift: u32,
+}
+
+impl MulShift {
+    /// Computes magic multiplication and shift constants for approximate floor
+    /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within
+    /// `max_bit_size` bits. `precision` controls how many of the top bits of
+    /// `val` are used — higher precision yields a tighter approximation.
+    pub const fn new(max_bit_size: u32, precision: u32) -> Self {
+        // Generate magic numbers for division by bn254's prime for a range of top-bit
+        // widths.
+
+        // Based on Warren's "Hacker's Delight" (integer
+        // division by constants) to find a magic multiplier for each bit-width.
+
+        use crate::constants::U64_P;
+
+        let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w))
+        let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1
+        let mut s = precision; // start at precision; s < precision values are skipped
+        let m;
+        loop {
+            // s = shift exponent
+            if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) {
+                m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier
+                break;
+            }
+            s += 1;
+            assert!(s < 64, "no magic multiplier found");
+        }
+
+        MulShift { mul: m, shift: s }
+    }
+
+    #[inline(always)]
+    /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient).
+    /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`].
+    pub const fn div_p(&self, val: u64) -> u64 {
+        // assumes systems can handle multiplication by 64 bits without performance
+        // penalty.
+        (val * self.mul) >> self.shift
+    }
+}
+
+/// Approximate floor division by P using the upper 6 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable without multiplication on ARM64 and x86.
+///
+/// Tradeoff: due the limited range of this division \[0,4\] (instead of \[0,5\]
+/// for u256) will lead to a larger value after subtraction reduction.
+/// subtraction reduction output: [0, 1+ε] with ε < 0.3.
+#[inline(always)]
+pub fn div_p_6b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 6);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 6);
+    MULSHIFT.div_p(upper_bits)
+}
+
+/// Approximate floor division by P using the upper 32 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable with a 32bx32b->64b multiplier.
+#[inline(always)]
+pub fn div_p_32b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 32);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 32);
+    MULSHIFT.div_p(upper_bits)
+}
+
+/// Subtracts an approximate multiple of P from `x` using `div_p` on the high
+/// limb.
+///
+/// The result is not fully reduced; the output range depends on the precision
+/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`].
+#[inline(always)]
+pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4] {
+    // No clamping as the max value of x can't go past 5. Which is the maximum of
+    // the table.
+    let q = div_p(x[3]) as usize;
+    sub(x, constants::U64_P_MULTIPLES[q])
+}
+
+#[cfg(kani)]
+mod proofs {
+    use {
+        super::{constants::U64_2P, div_p_32b, div_p_6b},
+        crate::constants::U64_P_MULTIPLES,
+    };
+
+    /// Lexicographic ≤ on little-endian 256-bit integers.
+    fn le256(a: [u64; 4], b: [u64; 4]) -> bool {
+        for i in (0..4).rev() {
+            if a[i] != b[i] {
+                return a[i] < b[i];
+            }
+        }
+        true
+    }
+
+    /// TODO: tighter bounds
+    #[kani::proof]
+    fn div_p_32b_underapprox() {
+        let x: u64 = kani::any();
+        let q = div_p_32b(x);
+
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x >= r);
+        assert!(le256([0, 0, 0, x - r], U64_2P));
+    }
+
+    #[kani::proof]
+    // TODO tighter bounds
+    fn div_p_6b_underapprox() {
+        let x: u64 = kani::any();
+        let q = div_p_6b(x);
+
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x >= r);
+        assert!(le256([0, 0, 0, x - r], U64_2P));
+    }
+}