From b4c916ec20d8b4a33c308f8083362b2522996cf6 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Feb 2026 16:37:05 +0800
Subject: [PATCH 01/13] sub reduction: analysis

---
 skyscraper/bn254-multiplier/sub_reduce.py | 230 ++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 skyscraper/bn254-multiplier/sub_reduce.py

diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py
new file mode 100644
index 000000000..998734aa7
--- /dev/null
+++ b/skyscraper/bn254-multiplier/sub_reduce.py
@@ -0,0 +1,230 @@
+import math
+from math import ceil
+
+p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
+
+U51_i1 = pow(
+    2**51,
+    -1,
+    p,
+)
+U51_i2 = pow(
+    2**51,
+    -2,
+    p,
+)
+U51_i3 = pow(
+    2**51,
+    -3,
+    p,
+)
+U51_i4 = pow(
+    2**51,
+    -4,
+    p,
+)
+
+
+def rho(sign, k):
+    sign = 1 if sign else -1
+
+    return pow(sign * 2**51, -1 * k, p)
+
+
+def rho_combinations(input):
+    for i in range(0, 64):
+        bitstring = bin(i)[2:].zfill(6)
+        # 1 is positive, 0 is negative
+        sel1 = int(bitstring[1 - 1])
+        sel2 = int(bitstring[2 - 1])
+        sel3 = int(bitstring[3 - 1])
+        sel4 = int(bitstring[4 - 1])
+        sel5 = int(bitstring[5 - 1])
+        sel6 = int(bitstring[6 - 1])
+
+        # max_val = input**2 >> 4 * 51
+        max_val = input * (p - 1) >> 4 * 51
+        # maximum value with these inclusion. Only add the once that are supposed to be added
+        # values that are meant to be subtracted are ignored
+        max_val += (
+            sel1 * rho(sel1, 1)
+            + sel1 * rho(sel1, 2)
+            + sel3 * rho(sel3, 3)
+            + sel4 * rho(sel4, 4)
+            + sel5 * p
+        ) * (2**51 - 1)
+
+        max_val >>= 51
+        max_val += sel6 * p
+        max_val >>= 1
+
+        # minimum value with these inclusion. Only add the once that are supposed to be added
+        # values that are supposed to be added are ignored
+        min_val = 0
+        min_val += (
+            -(1 - sel1) * rho(sel1, 1)
+            - (1 - sel2) * rho(sel2, 2)
+            - (1 - sel3) * rho(sel3, 3)
+            - (1 - sel4) * rho(sel4, 4)
+            - (1 - sel5) * p
+        ) * (2**51 - 1)
+
+        min_val = min_val >> 51
+        # could be that it doesn't need any compensation for final reduction
+        min_val -= (1 - sel6) * p
+        min_val >>= 1
+        diff = max_val.bit_length() - min_val.bit_length()
+
+        subp = max_val / p
+
+        print(
+            bitstring,
+            f"{max_val.bit_length():03} {int(math.copysign(1, min_val)):2} {min_val.bit_length():03} {diff:3}\t  {(input + max_val).bit_length()} {(input + ceil(subp) * p).bit_length()} {(input + ceil(subp) * p) / p} {subp}",
+        )
+
+
+def shift_mul():
+    print("shift_mul")
+    neq = 0
+    nlp = 0
+    neg_max = 0
+    neg_min = 0
+
+    power = 5
+    for i in range(0, 2**power):
+        val_min = i << (256 - power)
+        subp = val_min // p
+        shift_subp = (val_min >> (256 - power)) // ((p >> (256 - power)) + 1)
+
+        val_max = val_min + (1 << (256 - power)) - 1
+        rem_max = val_max - (shift_subp) * p
+
+        rem_min = val_min - (shift_subp) * p
+
+        if subp != shift_subp:
+            neq += 1
+
+        if rem_max >= 1.7 * p:
+            print(hex(rem_max))
+            nlp += 1
+
+        if rem_max < 0:
+            neg_max += 1
+
+        if rem_min < 0:
+            neg_min += 1
+
+    print(neq, nlp, neg_max, neg_min)
+
+
+def shift_count():
+    print("shift count")
+    neq = 0
+    nlp = 0
+    neg_max = 0
+    neg_min = 0
+
+    power = 2
+
+    for i in range(0, 2**power):
+        val_min = i << (256 - power)
+        subp = val_min // p
+        shift_subp = i
+        # shift_subp = ((i >> 2) & i) + (i >> 1)
+
+        val_max = val_min + (1 << (256 - power)) - 1
+        rem_max = val_max - (shift_subp) * p
+
+        rem_min = val_min - (shift_subp) * p
+
+        if subp != shift_subp:
+            print(hex(val_min), subp, shift_subp)
+            neq += 1
+
+        if rem_max >= p + 1.5 * p:
+            print(hex(rem_max))
+            nlp += 1
+
+        if rem_max < 0:
+            neg_max += 1
+
+        if rem_min < 0:
+            neg_min += 1
+
+    print(neq, nlp, neg_max, neg_min)
+
+
+def shift_sum():
+    print("shift sum")
+    neq = 0
+    nlp = 0
+    neg_max = 0
+    neg_min = 0
+
+    power = 3
+
+    for i in range(0, 2**power):
+        val_min = i << (256 - power)
+        subp = val_min // p
+        shift_subp = ((i >> 2) & i) + (i >> 1)
+        print(bin(i)[2:].zfill(power), shift_subp)
+
+        val_max = val_min + (1 << (256 - power)) - 1
+        rem_max = val_max - (shift_subp) * p
+
+        rem_min = val_min - (shift_subp) * p
+
+        if subp != shift_subp:
+            print(hex(val_min), subp, shift_subp)
+            neq += 1
+
+        if rem_max >= p + 0.75 * p:
+            print(hex(rem_max))
+            nlp += 1
+
+        if rem_max < 0:
+            neg_max += 1
+
+        if rem_min < 0:
+            neg_min += 1
+
+    print(neq, nlp, neg_max, neg_min)
+
+
+def m_calculation():
+    res = list()
+    for w in range(0, 65):
+        # increase by +1 otherwise the shift results in a smaller divider than the original problem. For us it's important to not overshoot
+        # ceil
+        d = (p >> (256 - w)) + 1
+        nc = 2**w - 1 - (2**w % d)
+        for i in range(0, 128):
+            if 2**i > nc * (d - 1 - (2**i - 1) % d):
+                if i < w:
+                    print(w, i, d.bit_length())
+                m = (2**i + d - 1 - (2**i - 1) % d) // d
+                neg = False
+                # comment out if the bits fit in the register.
+                if m >= 2**w:
+                    neg = True
+                    m = m - 2**w
+                res.append(
+                    (
+                        w,
+                        m.bit_length(),
+                        neg,
+                        i,
+                        m,
+                    )
+                )
+                break
+    return res
+
+
+if __name__ == "__main__":
+    shift_mul()
+    shift_count()
+    shift_sum()
+
+    for b in m_calculation():
+        print(b)

From f9fc899c5e4e4d91eed2333ebb6ea72327f20815 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Feb 2026 16:38:27 +0800
Subject: [PATCH 02/13] subred: remove rho experiments

---
 skyscraper/bn254-multiplier/sub_reduce.py | 85 +----------------------
 1 file changed, 1 insertion(+), 84 deletions(-)

diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py
index 998734aa7..aad16fa1a 100644
--- a/skyscraper/bn254-multiplier/sub_reduce.py
+++ b/skyscraper/bn254-multiplier/sub_reduce.py
@@ -1,89 +1,6 @@
-import math
-from math import ceil
-
 p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
 
-U51_i1 = pow(
-    2**51,
-    -1,
-    p,
-)
-U51_i2 = pow(
-    2**51,
-    -2,
-    p,
-)
-U51_i3 = pow(
-    2**51,
-    -3,
-    p,
-)
-U51_i4 = pow(
-    2**51,
-    -4,
-    p,
-)
-
-
-def rho(sign, k):
-    sign = 1 if sign else -1
-
-    return pow(sign * 2**51, -1 * k, p)
-
-
-def rho_combinations(input):
-    for i in range(0, 64):
-        bitstring = bin(i)[2:].zfill(6)
-        # 1 is positive, 0 is negative
-        sel1 = int(bitstring[1 - 1])
-        sel2 = int(bitstring[2 - 1])
-        sel3 = int(bitstring[3 - 1])
-        sel4 = int(bitstring[4 - 1])
-        sel5 = int(bitstring[5 - 1])
-        sel6 = int(bitstring[6 - 1])
-
-        # max_val = input**2 >> 4 * 51
-        max_val = input * (p - 1) >> 4 * 51
-        # maximum value with these inclusion. Only add the once that are supposed to be added
-        # values that are meant to be subtracted are ignored
-        max_val += (
-            sel1 * rho(sel1, 1)
-            + sel1 * rho(sel1, 2)
-            + sel3 * rho(sel3, 3)
-            + sel4 * rho(sel4, 4)
-            + sel5 * p
-        ) * (2**51 - 1)
-
-        max_val >>= 51
-        max_val += sel6 * p
-        max_val >>= 1
-
-        # minimum value with these inclusion. Only add the once that are supposed to be added
-        # values that are supposed to be added are ignored
-        min_val = 0
-        min_val += (
-            -(1 - sel1) * rho(sel1, 1)
-            - (1 - sel2) * rho(sel2, 2)
-            - (1 - sel3) * rho(sel3, 3)
-            - (1 - sel4) * rho(sel4, 4)
-            - (1 - sel5) * p
-        ) * (2**51 - 1)
-
-        min_val = min_val >> 51
-        # could be that it doesn't need any compensation for final reduction
-        min_val -= (1 - sel6) * p
-        min_val >>= 1
-        diff = max_val.bit_length() - min_val.bit_length()
-
-        subp = max_val / p
-
-        print(
-            bitstring,
-            f"{max_val.bit_length():03} {int(math.copysign(1, min_val)):2} {min_val.bit_length():03} {diff:3}\t  {(input + max_val).bit_length()} {(input + ceil(subp) * p).bit_length()} {(input + ceil(subp) * p) / p} {subp}",
-        )
-
-
-def shift_mul():
+ def shift_mul():
     print("shift_mul")
     neq = 0
     nlp = 0

From a3954f6e8af34e497479531d06510f655c84551e Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Feb 2026 16:45:29 +0800
Subject: [PATCH 03/13] subred: remove approaches superseeded by warren_magic

---
 skyscraper/bn254-multiplier/sub_reduce.py | 155 ++++++++--------------
 1 file changed, 59 insertions(+), 96 deletions(-)

diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py
index aad16fa1a..e85e661c6 100644
--- a/skyscraper/bn254-multiplier/sub_reduce.py
+++ b/skyscraper/bn254-multiplier/sub_reduce.py
@@ -1,136 +1,98 @@
-p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
-
- def shift_mul():
-    print("shift_mul")
-    neq = 0
-    nlp = 0
-    neg_max = 0
-    neg_min = 0
-
-    power = 5
-    for i in range(0, 2**power):
-        val_min = i << (256 - power)
-        subp = val_min // p
-        shift_subp = (val_min >> (256 - power)) // ((p >> (256 - power)) + 1)
-
-        val_max = val_min + (1 << (256 - power)) - 1
-        rem_max = val_max - (shift_subp) * p
-
-        rem_min = val_min - (shift_subp) * p
-
-        if subp != shift_subp:
-            neq += 1
-
-        if rem_max >= 1.7 * p:
-            print(hex(rem_max))
-            nlp += 1
-
-        if rem_max < 0:
-            neg_max += 1
+"""Sub-reduction strategies for bn254 modular arithmetic."""
 
-        if rem_min < 0:
-            neg_min += 1
+p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
 
-    print(neq, nlp, neg_max, neg_min)
 
+def shift_sum():
+    """
+    Generate truth table for top 3 bit reduction strategy.
 
-def shift_count():
-    print("shift count")
+    The formula ((i >> 2) & i) + (i >> 1) computes this
+    approximation using only shifts and adds — no multiplication needed.
+    """
+    # Keep track of potential erroneous conditions
     neq = 0
-    nlp = 0
     neg_max = 0
     neg_min = 0
 
-    power = 2
+    power = 3
+
+    max_pval = 0
+    truth_table = list()
 
     for i in range(0, 2**power):
         val_min = i << (256 - power)
         subp = val_min // p
-        shift_subp = i
-        # shift_subp = ((i >> 2) & i) + (i >> 1)
+        shift_subp = ((i >> 2) & i) + (i >> 1)
 
         val_max = val_min + (1 << (256 - power)) - 1
         rem_max = val_max - (shift_subp) * p
 
         rem_min = val_min - (shift_subp) * p
 
+        # Validation: track the maximum remainder relative to p
+        max_pval = max(max_pval, rem_max)
+
+        truth_table.append((bin(i)[2:].zfill(power), shift_subp))
+
+        # Check for erroneous situations
         if subp != shift_subp:
             print(hex(val_min), subp, shift_subp)
             neq += 1
 
-        if rem_max >= p + 1.5 * p:
-            print(hex(rem_max))
-            nlp += 1
-
         if rem_max < 0:
             neg_max += 1
 
         if rem_min < 0:
             neg_min += 1
 
-    print(neq, nlp, neg_max, neg_min)
-
-
-def shift_sum():
-    print("shift sum")
-    neq = 0
-    nlp = 0
-    neg_max = 0
-    neg_min = 0
-
-    power = 3
-
-    for i in range(0, 2**power):
-        val_min = i << (256 - power)
-        subp = val_min // p
-        shift_subp = ((i >> 2) & i) + (i >> 1)
-        print(bin(i)[2:].zfill(power), shift_subp)
-
-        val_max = val_min + (1 << (256 - power)) - 1
-        rem_max = val_max - (shift_subp) * p
+    print(f"{'bits':>5} {'subtractions':>12}")
+    for e, r in truth_table:
+        print(f"{e:>5} {r:>12}")
 
-        rem_min = val_min - (shift_subp) * p
+    print(
+        f"\nmax_remainder/p={max_pval / p:.4f}  mismatches={neq}  neg_max={neg_max}  neg_min={neg_min}"
+    )
 
-        if subp != shift_subp:
-            print(hex(val_min), subp, shift_subp)
-            neq += 1
 
-        if rem_max >= p + 0.75 * p:
-            print(hex(rem_max))
-            nlp += 1
+def warren_magic():
+    """
+    Generate magic numbers for division by bn254's prime for a range of top-bit widths.
 
-        if rem_max < 0:
-            neg_max += 1
-
-        if rem_min < 0:
-            neg_min += 1
+    Based on Warren's "Hacker's Delight" (integer
+    division by constants) to find a magic multiplier for each bit-width.
 
-    print(neq, nlp, neg_max, neg_min)
-
-
-def m_calculation():
+    Returns a list of tuples (w, m_bits, sub, shift, m) where:
+    - w: number of bits of the dividend (top bits of the value)
+    - m_bits: number of bits in the magic multiplier
+    - sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
+      so we store m - 2^w and compensate at runtime)
+    - shift: the number of bits to right-shift the product (called 's' in Warren)
+    - m: the magic multiplier
+    """
     res = list()
     for w in range(0, 65):
-        # increase by +1 otherwise the shift results in a smaller divider than the original problem. For us it's important to not overshoot
-        # ceil
-        d = (p >> (256 - w)) + 1
-        nc = 2**w - 1 - (2**w % d)
-        for i in range(0, 128):
-            if 2**i > nc * (d - 1 - (2**i - 1) % d):
-                if i < w:
-                    print(w, i, d.bit_length())
-                m = (2**i + d - 1 - (2**i - 1) % d) // d
-                neg = False
-                # comment out if the bits fit in the register.
+        d = (p >> (256 - w)) + 1  # d = divisor = ceil(p / 2^(256-w))
+        nc = 2**w - 1 - (2**w % d)  # nc = largest value s.t. nc mod d != d-1
+        for s in range(0, 128):  # s = shift exponent
+            if 2**s > nc * (d - 1 - (2**s - 1) % d):
+                if s < w:
+                    print(w, s, d.bit_length())
+                m = (2**s + d - 1 - (2**s - 1) % d) // d  # m = magic multiplier
+                sub = False
+
+                # "Subtract and shift" variant: when m >= 2^w it won't fit in w bits,
+                # so subtract 2^w and compensate at runtime.
+                # Comment out if the register can hold more bits than w.
                 if m >= 2**w:
-                    neg = True
+                    sub = True
                     m = m - 2**w
                 res.append(
                     (
                         w,
                         m.bit_length(),
-                        neg,
-                        i,
+                        sub,
+                        s,
                         m,
                     )
                 )
@@ -139,9 +101,10 @@ def m_calculation():
 
 
 if __name__ == "__main__":
-    shift_mul()
-    shift_count()
+    print("shift sum")
     shift_sum()
 
-    for b in m_calculation():
-        print(b)
+    print("\n warren")
+    print(f"{'w':>3} {'m_bits':>6} {'sub':>5} {'shift':>5} {'m':>20}")
+    for w, m_bits, sub, shift, m in warren_magic():
+        print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {m:>20}")

From bcd1bbcc9e3c20d8bc3a559371c2a4503e633109 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 16:38:05 +0800
Subject: [PATCH 04/13] sub_reduce: take size of input into account

---
 skyscraper/bn254-multiplier/sub_reduce.py | 32 +++++++++++++++++------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py
index e85e661c6..fddc95ffc 100644
--- a/skyscraper/bn254-multiplier/sub_reduce.py
+++ b/skyscraper/bn254-multiplier/sub_reduce.py
@@ -1,5 +1,7 @@
 """Sub-reduction strategies for bn254 modular arithmetic."""
 
+import argparse
+
 p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
 
 
@@ -55,7 +57,7 @@ def shift_sum():
     )
 
 
-def warren_magic():
+def warren_magic(max_bit_size, apply_sub: bool = False):
     """
     Generate magic numbers for division by bn254's prime for a range of top-bit widths.
 
@@ -63,7 +65,7 @@ def warren_magic():
     division by constants) to find a magic multiplier for each bit-width.
 
     Returns a list of tuples (w, m_bits, sub, shift, m) where:
-    - w: number of bits of the dividend (top bits of the value)
+    - w: number of top bits of the value
     - m_bits: number of bits in the magic multiplier
     - sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
       so we store m - 2^w and compensate at runtime)
@@ -72,8 +74,8 @@ def warren_magic():
     """
     res = list()
     for w in range(0, 65):
-        d = (p >> (256 - w)) + 1  # d = divisor = ceil(p / 2^(256-w))
-        nc = 2**w - 1 - (2**w % d)  # nc = largest value s.t. nc mod d != d-1
+        d = (p >> (max_bit_size - w)) + 1  # d = divisor = ceil(p / 2^(max_bit_size-w))
+        nc = 2**w - 1 - (2**w % d)  # nc = largest value s.t. nc mod d == d-1
         for s in range(0, 128):  # s = shift exponent
             if 2**s > nc * (d - 1 - (2**s - 1) % d):
                 if s < w:
@@ -83,8 +85,8 @@ def warren_magic():
 
                 # "Subtract and shift" variant: when m >= 2^w it won't fit in w bits,
                 # so subtract 2^w and compensate at runtime.
-                # Comment out if the register can hold more bits than w.
-                if m >= 2**w:
+                # Skip this part via apply_sub if the register can hold more bits than w.
+                if apply_sub and m >= 2**w:
                     sub = True
                     m = m - 2**w
                 res.append(
@@ -101,10 +103,24 @@ def warren_magic():
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Sub-reduction strategies for bn254.")
+    parser.add_argument(
+        "--sub",
+        action="store_true",
+        help="Apply the 'subtract and shift' variant when m >= 2^w.",
+    )
+    args = parser.parse_args()
+
     print("shift sum")
     shift_sum()
 
     print("\n warren")
     print(f"{'w':>3} {'m_bits':>6} {'sub':>5} {'shift':>5} {'m':>20}")
-    for w, m_bits, sub, shift, m in warren_magic():
-        print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {m:>20}")
+    for w, m_bits, sub, shift, m in warren_magic(257, apply_sub=args.sub):
+        print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}")
+
+    for w, m_bits, sub, shift, m in warren_magic(256, apply_sub=args.sub):
+        print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}")
+
+    for w, m_bits, sub, shift, m in warren_magic(255, apply_sub=args.sub):
+        print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}")

From e1593567450f3a0b87c363c53eceea18cad847ef Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 16:44:00 +0800
Subject: [PATCH 05/13] subrec: lookup table for subtraction + generator code

---
 skyscraper/bn254-multiplier/gen_multiples.py  | 35 ++++++++++++++
 skyscraper/bn254-multiplier/src/constants.rs  | 42 +++++++++++++++++
 .../bn254-multiplier/src/rne/constants.rs     | 47 +++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 skyscraper/bn254-multiplier/gen_multiples.py

diff --git a/skyscraper/bn254-multiplier/gen_multiples.py b/skyscraper/bn254-multiplier/gen_multiples.py
new file mode 100644
index 000000000..67c86127e
--- /dev/null
+++ b/skyscraper/bn254-multiplier/gen_multiples.py
@@ -0,0 +1,35 @@
+"""Generate Rust const lookup tables for multiples of the BN254 scalar field prime."""
+
+p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001
+
+
+def int_to_limbs(size, n, count):
+    mask = 2**size - 1
+    limbs = []
+    for _ in range(count):
+        limbs.append(n & mask)
+        n >>= size
+    return limbs
+
+
+def main():
+    multiples_64 = [int_to_limbs(64, k * p, 4) for k in range(0, 6)]
+    multiples_51 = [int_to_limbs(51, k * p, 5) for k in range(0, 6)]
+
+    # Print 64-bit table (for constants.rs)
+    print("pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [")
+    for k, limbs in enumerate(multiples_64):
+        fmt = ", ".join(f"0x{l:016x}" for l in limbs)
+        print(f"    [{fmt}], // {k}P")
+    print("];")
+
+    # Print 51-bit table (for rne/constants.rs)
+    print("\npub const U51_P_MULTIPLES: [[u64; 5]; 6] = [")
+    for k, limbs in enumerate(multiples_51):
+        fmt = ", ".join(f"0x{l:013x}" for l in limbs)
+        print(f"    [{fmt}], // {k}P")
+    print("];")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs
index b49971136..fe026869a 100644
--- a/skyscraper/bn254-multiplier/src/constants.rs
+++ b/skyscraper/bn254-multiplier/src/constants.rs
@@ -14,6 +14,48 @@ pub const U64_2P: [u64; 4] = [
     0x60c89ce5c2634053,
 ];
 
+/// Lookup table: `U64_P_MULTIPLES[k]` = `k * P` for k in 0..=5.
+/// Index 0 is all-zeros; use as `x - U64_P_MULTIPLES[k]` to subtract k copies
+/// of P.
+pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [
+    [
+        0x0000000000000000,
+        0x0000000000000000,
+        0x0000000000000000,
+        0x0000000000000000,
+    ], // 0P
+    [
+        0x43e1f593f0000001,
+        0x2833e84879b97091,
+        0xb85045b68181585d,
+        0x30644e72e131a029,
+    ], // 1P
+    [
+        0x87c3eb27e0000002,
+        0x5067d090f372e122,
+        0x70a08b6d0302b0ba,
+        0x60c89ce5c2634053,
+    ], // 2P
+    [
+        0xcba5e0bbd0000003,
+        0x789bb8d96d2c51b3,
+        0x28f0d12384840917,
+        0x912ceb58a394e07d,
+    ], // 3P
+    [
+        0x0f87d64fc0000004,
+        0xa0cfa121e6e5c245,
+        0xe14116da06056174,
+        0xc19139cb84c680a6,
+    ], // 4P
+    [
+        0x5369cbe3b0000005,
+        0xc903896a609f32d6,
+        0x99915c908786b9d1,
+        0xf1f5883e65f820d0,
+    ], // 5P
+];
+
 // R mod P
 pub const U64_R: [u64; 4] = [
     0xac96341c4ffffffb,
diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs
index 77862defc..f1cf0fe06 100644
--- a/skyscraper/bn254-multiplier/src/rne/constants.rs
+++ b/skyscraper/bn254-multiplier/src/rne/constants.rs
@@ -14,6 +14,53 @@ pub const U51_P: [u64; 5] = [
     0x30644e72e131a,
 ];
 
+/// Lookup table: `U51_P_MULTIPLES[k]` = `k * P` for k in 0..=5, in 51-bit
+/// limbs.
+pub const U51_P_MULTIPLES: [[u64; 5]; 6] = [
+    [
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+        0x0000000000000,
+    ], // 0P
+    [
+        0x1f593f0000001,
+        0x10f372e12287c,
+        0x6056174a0cfa1,
+        0x014dc2822db40,
+        0x30644e72e131a,
+    ], // 1P
+    [
+        0x3eb27e0000002,
+        0x21e6e5c2450f8,
+        0x40ac2e9419f42,
+        0x029b85045b681,
+        0x60c89ce5c2634,
+    ], // 2P
+    [
+        0x5e0bbd0000003,
+        0x32da58a367974,
+        0x210245de26ee3,
+        0x03e94786891c2,
+        0x112ceb58a394e,
+    ], // 3P
+    [
+        0x7d64fc0000004,
+        0x43cdcb848a1f0,
+        0x01585d2833e84,
+        0x05370a08b6d03,
+        0x419139cb84c68,
+    ], // 4P
+    [
+        0x1cbe3b0000005,
+        0x54c13e65aca6d,
+        0x61ae747240e25,
+        0x0684cc8ae4843,
+        0x71f5883e65f82,
+    ], // 5P
+];
+
 /// Bit mask for 51-bit limbs.
 pub const MASK51: u64 = 2_u64.pow(51) - 1;
 

From 990b85371cdbe959c3d8902cd499267d7ada2e4b Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 17:23:36 +0800
Subject: [PATCH 06/13] mulshift: documentation + tests

---
 skyscraper/bn254-multiplier/src/lib.rs | 102 +++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
index 454d01945..b3dc74727 100644
--- a/skyscraper/bn254-multiplier/src/lib.rs
+++ b/skyscraper/bn254-multiplier/src/lib.rs
@@ -33,3 +33,105 @@ const fn pow_2(n: u32) -> f64 {
     let exp = (n as u64 + 1023) << 52;
     f64::from_bits(exp)
 }
+
+/// Precomputed magic constants for fast approximate floor division by the
+/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by
+/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the
+/// declared bit range.
+pub struct MulShift {
+    mul:   u64,
+    shift: u32,
+}
+
+impl MulShift {
+    /// Computes magic multiplication and shift constants for approximate floor
+    /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within
+    /// `max_bit_size` bits. `precision` controls how many of the top bits of
+    /// `val` are used — higher precision yields a tighter approximation.
+    pub const fn new(max_bit_size: u32, precision: u32) -> Self {
+        // Generate magic numbers for division by bn254's prime for a range of top-bit
+        // widths.
+
+        // Based on Warren's "Hacker's Delight" (integer
+        // division by constants) to find a magic multiplier for each bit-width.
+
+        // Returns a list of tuples (w, m_bits, sub, shift, m) where:
+        //- w: number of top bits of the value
+        //- m_bits: number of bits in the magic multiplier
+        //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
+        //  so we store m - 2^w and compensate at runtime)
+        //- shift: the number of bits to right-shift the product (called 's' in Warren)
+        //- m: the magic multiplier
+        use crate::constants::U64_P;
+
+        let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w))
+        let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1
+        let mut s = precision; // start at precision; s < precision values are skipped
+        let m;
+        loop {
+            // s = shift exponent
+            if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) {
+                m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier
+                break;
+            }
+            s += 1;
+            assert!(s < 64, "no magic multiplier found");
+        }
+
+        MulShift { mul: m, shift: s }
+    }
+
+    #[inline(always)]
+    /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient).
+    /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`].
+    pub const fn div_p(&self, val: u64) -> u64 {
+        // assumes systems can handle multiplication by 64 bits without performance
+        // penalty.
+        (val * self.mul) >> self.shift
+    }
+}
+
+/// Approximate floor division by P using the upper 6 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable without multiplication on ARM64 and x86.
+///
+/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
+/// u256) will to a larger value after subtraction reduction.
+/// subtraction reduction output: [0, 1+ε] with ε < 0.3.
+pub fn div_p_6b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 6);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 6);
+    MULSHIFT.div_p(upper_bits)
+}
+
+/// Approximate floor division by P using the upper 32 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable with a 32bx32b->64b multiplier.
+pub fn div_p_32b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 32);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 32);
+    MULSHIFT.div_p(upper_bits)
+}
+
+#[cfg(kani)]
+mod proofs {
+    use super::div_p_32b;
+
+    #[inline(never)]
+    fn div32(x: u64) -> u32 {
+        // cast to not create the 64bit magic number
+        return (x >> 32) as u32 / 0x30644e73 as u32;
+    }
+
+    #[kani::proof]
+    fn div_p_32b_matches_div32() {
+        let x: u64 = kani::any();
+        assert_eq!(div_p_32b(x), div32(x) as u64);
+    }
+}

From f4b310b3132c16acaaa637d0587c8d58b0ca896a Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 17:25:01 +0800
Subject: [PATCH 07/13] constants: synonyms for table constants

---
 skyscraper/bn254-multiplier/src/constants.rs     | 14 ++------------
 skyscraper/bn254-multiplier/src/rne/constants.rs |  8 +-------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs
index fe026869a..d515fc8fd 100644
--- a/skyscraper/bn254-multiplier/src/constants.rs
+++ b/skyscraper/bn254-multiplier/src/constants.rs
@@ -1,18 +1,8 @@
 pub const U64_NP0: u64 = 0xc2e1f593efffffff;
 
-pub const U64_P: [u64; 4] = [
-    0x43e1f593f0000001,
-    0x2833e84879b97091,
-    0xb85045b68181585d,
-    0x30644e72e131a029,
-];
+pub const U64_P: [u64; 4] = U64_P_MULTIPLES[1];
 
-pub const U64_2P: [u64; 4] = [
-    0x87c3eb27e0000002,
-    0x5067d090f372e122,
-    0x70a08b6d0302b0ba,
-    0x60c89ce5c2634053,
-];
+pub const U64_2P: [u64; 4] = U64_P_MULTIPLES[2];
 
 /// Lookup table: `U64_P_MULTIPLES[k]` = `k * P` for k in 0..=5.
 /// Index 0 is all-zeros; use as `x - U64_P_MULTIPLES[k]` to subtract k copies
diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs
index f1cf0fe06..462e8938e 100644
--- a/skyscraper/bn254-multiplier/src/rne/constants.rs
+++ b/skyscraper/bn254-multiplier/src/rne/constants.rs
@@ -6,13 +6,7 @@ use crate::pow_2;
 pub const U51_NP0: u64 = 0x1f593efffffff;
 
 /// BN254 scalar field prime
-pub const U51_P: [u64; 5] = [
-    0x1f593f0000001,
-    0x10f372e12287c,
-    0x6056174a0cfa1,
-    0x014dc2822db40,
-    0x30644e72e131a,
-];
+pub const U51_P: [u64; 5] = U51_P_MULTIPLES[1];
 
 /// Lookup table: `U51_P_MULTIPLES[k]` = `k * P` for k in 0..=5, in 51-bit
 /// limbs.

From 8d55f2aaf9373922f758c3662c27e4ac346f2595 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 18:41:44 +0800
Subject: [PATCH 08/13] subrec: under approximation test

---
 skyscraper/bn254-multiplier/src/lib.rs | 36 +++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
index b3dc74727..885ddd1de 100644
--- a/skyscraper/bn254-multiplier/src/lib.rs
+++ b/skyscraper/bn254-multiplier/src/lib.rs
@@ -121,17 +121,39 @@ pub fn div_p_32b(x: u64) -> u64 {
 
 #[cfg(kani)]
 mod proofs {
-    use super::div_p_32b;
+    use super::{constants::U64_P, div_p_32b};
+
+    /// Compute q * P as (4-limb little-endian result, overflow carry).
+    fn mul_small_by_p(q: u64) -> ([u64; 4], u64) {
+        let q = q as u128;
+        let t0 = q * U64_P[0] as u128;
+        let t1 = q * U64_P[1] as u128 + (t0 >> 64);
+        let t2 = q * U64_P[2] as u128 + (t1 >> 64);
+        let t3 = q * U64_P[3] as u128 + (t2 >> 64);
+        (
+            [t0 as u64, t1 as u64, t2 as u64, t3 as u64],
+            (t3 >> 64) as u64,
+        )
+    }
 
-    #[inline(never)]
-    fn div32(x: u64) -> u32 {
-        // cast to not create the 64bit magic number
-        return (x >> 32) as u32 / 0x30644e73 as u32;
+    /// Lexicographic ≤ on little-endian 256-bit integers.
+    fn le256(a: [u64; 4], b: [u64; 4]) -> bool {
+        for i in (0..4).rev() {
+            if a[i] != b[i] {
+                return a[i] < b[i];
+            }
+        }
+        true
     }
 
+    /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192.
     #[kani::proof]
-    fn div_p_32b_matches_div32() {
+    fn div_p_32b_underapprox() {
         let x: u64 = kani::any();
-        assert_eq!(div_p_32b(x), div32(x) as u64);
+        let q = div_p_32b(x);
+
+        let (q_times_p, carry) = mul_small_by_p(q);
+        assert_eq!(carry, 0);
+        assert!(le256(q_times_p, [0, 0, 0, x]));
     }
 }

From 95d569b2cd87d4f3973ef88fe5675909a17792eb Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 19:01:00 +0800
Subject: [PATCH 09/13] subrec: test bounds approach instead of approximation

---
 skyscraper/bn254-multiplier/src/lib.rs | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
index 885ddd1de..776b235fc 100644
--- a/skyscraper/bn254-multiplier/src/lib.rs
+++ b/skyscraper/bn254-multiplier/src/lib.rs
@@ -121,7 +121,13 @@ pub fn div_p_32b(x: u64) -> u64 {
 
 #[cfg(kani)]
 mod proofs {
-    use super::{constants::U64_P, div_p_32b};
+    use {
+        super::{
+            constants::{U64_2P, U64_P},
+            div_p_32b, div_p_6b, MulShift,
+        },
+        crate::constants::U64_P_MULTIPLES,
+    };
 
     /// Compute q * P as (4-limb little-endian result, overflow carry).
     fn mul_small_by_p(q: u64) -> ([u64; 4], u64) {
@@ -147,13 +153,23 @@ mod proofs {
     }
 
     /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192.
+    /// TODO: encode tighter bounds
     #[kani::proof]
     fn div_p_32b_underapprox() {
         let x: u64 = kani::any();
         let q = div_p_32b(x);
 
-        let (q_times_p, carry) = mul_small_by_p(q);
-        assert_eq!(carry, 0);
-        assert!(le256(q_times_p, [0, 0, 0, x]));
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(le256([0, 0, 0, x - r], U64_2P));
+    }
+
+    #[kani::proof]
+    // TODO tighter bounds
+    fn div_p_6b_underapprox() {
+        let x: u64 = kani::any();
+        let q = div_p_6b(x);
+
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(le256([0, 0, 0, x - r], U64_2P));
     }
 }

From 77fa31092a8d4f48f7275709d38cc48732781f8f Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Feb 2026 19:12:40 +0800
Subject: [PATCH 10/13] subrec: subtraction reduce function

---
 skyscraper/bn254-multiplier/src/lib.rs | 33 ++++++++++++++------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
index 776b235fc..c720b3eb9 100644
--- a/skyscraper/bn254-multiplier/src/lib.rs
+++ b/skyscraper/bn254-multiplier/src/lib.rs
@@ -100,6 +100,7 @@ impl MulShift {
 /// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
 /// u256) will to a larger value after subtraction reduction.
 /// subtraction reduction output: [0, 1+ε] with ε < 0.3.
+#[inline(always)]
 pub fn div_p_6b(x: u64) -> u64 {
     let upper_bits = x >> (64 - 6);
     // const to force compile time evaluation
@@ -112,6 +113,7 @@ pub fn div_p_6b(x: u64) -> u64 {
 ///
 /// Returns a value ≤ ⌊x / P⌋. This is the most precise
 /// approximation achievable with a 32bx32b->64b multiplier.
+#[inline(always)]
 pub fn div_p_32b(x: u64) -> u64 {
     let upper_bits = x >> (64 - 32);
     // const to force compile time evaluation
@@ -119,6 +121,19 @@ pub fn div_p_32b(x: u64) -> u64 {
     MULSHIFT.div_p(upper_bits)
 }
 
+/// Subtracts an approximate multiple of P from `x` using `div_p` on the high
+/// limb.
+///
+/// The result is not fully reduced; the output range depends on the precision
+/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`].
+#[inline(always)]
+pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4] {
+    // No clamping as the max value of x can't go passed 5. Which is the maximum of
+    // the table.
+    let q = div_p(x[3]) as usize;
+    utils::sub(x, constants::U64_P_MULTIPLES[q])
+}
+
 #[cfg(kani)]
 mod proofs {
     use {
@@ -129,19 +144,6 @@ mod proofs {
         crate::constants::U64_P_MULTIPLES,
     };
 
-    /// Compute q * P as (4-limb little-endian result, overflow carry).
-    fn mul_small_by_p(q: u64) -> ([u64; 4], u64) {
-        let q = q as u128;
-        let t0 = q * U64_P[0] as u128;
-        let t1 = q * U64_P[1] as u128 + (t0 >> 64);
-        let t2 = q * U64_P[2] as u128 + (t1 >> 64);
-        let t3 = q * U64_P[3] as u128 + (t2 >> 64);
-        (
-            [t0 as u64, t1 as u64, t2 as u64, t3 as u64],
-            (t3 >> 64) as u64,
-        )
-    }
-
     /// Lexicographic ≤ on little-endian 256-bit integers.
     fn le256(a: [u64; 4], b: [u64; 4]) -> bool {
         for i in (0..4).rev() {
@@ -152,14 +154,14 @@ mod proofs {
         true
     }
 
-    /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192.
-    /// TODO: encode tighter bounds
+    /// TODO: tighter bounds
     #[kani::proof]
     fn div_p_32b_underapprox() {
         let x: u64 = kani::any();
         let q = div_p_32b(x);
 
         let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x - r >= 0);
         assert!(le256([0, 0, 0, x - r], U64_2P));
     }
 
@@ -170,6 +172,7 @@ mod proofs {
         let q = div_p_6b(x);
 
         let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x - r >= 0);
         assert!(le256([0, 0, 0, x - r], U64_2P));
     }
 }

From cca45ade5f74fc4a34b18d8e35498f9ce201ca0e Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 10 Mar 2026 16:52:11 +0800
Subject: [PATCH 11/13] subrec: move mulsub

---
 skyscraper/bn254-multiplier/src/lib.rs   | 145 +----------------------
 skyscraper/bn254-multiplier/src/utils.rs | 145 ++++++++++++++++++++++-
 2 files changed, 145 insertions(+), 145 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs
index c720b3eb9..3b4d99bfa 100644
--- a/skyscraper/bn254-multiplier/src/lib.rs
+++ b/skyscraper/bn254-multiplier/src/lib.rs
@@ -13,7 +13,7 @@ pub mod rtz;
 pub mod constants;
 pub mod rne;
 mod scalar;
-mod utils;
+pub mod utils;
 
 #[cfg(not(target_arch = "wasm32"))] // Proptest not supported on WASI
 mod test_utils;
@@ -33,146 +33,3 @@ const fn pow_2(n: u32) -> f64 {
     let exp = (n as u64 + 1023) << 52;
     f64::from_bits(exp)
 }
-
-/// Precomputed magic constants for fast approximate floor division by the
-/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by
-/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the
-/// declared bit range.
-pub struct MulShift {
-    mul:   u64,
-    shift: u32,
-}
-
-impl MulShift {
-    /// Computes magic multiplication and shift constants for approximate floor
-    /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within
-    /// `max_bit_size` bits. `precision` controls how many of the top bits of
-    /// `val` are used — higher precision yields a tighter approximation.
-    pub const fn new(max_bit_size: u32, precision: u32) -> Self {
-        // Generate magic numbers for division by bn254's prime for a range of top-bit
-        // widths.
-
-        // Based on Warren's "Hacker's Delight" (integer
-        // division by constants) to find a magic multiplier for each bit-width.
-
-        // Returns a list of tuples (w, m_bits, sub, shift, m) where:
-        //- w: number of top bits of the value
-        //- m_bits: number of bits in the magic multiplier
-        //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
-        //  so we store m - 2^w and compensate at runtime)
-        //- shift: the number of bits to right-shift the product (called 's' in Warren)
-        //- m: the magic multiplier
-        use crate::constants::U64_P;
-
-        let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w))
-        let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1
-        let mut s = precision; // start at precision; s < precision values are skipped
-        let m;
-        loop {
-            // s = shift exponent
-            if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) {
-                m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier
-                break;
-            }
-            s += 1;
-            assert!(s < 64, "no magic multiplier found");
-        }
-
-        MulShift { mul: m, shift: s }
-    }
-
-    #[inline(always)]
-    /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient).
-    /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`].
-    pub const fn div_p(&self, val: u64) -> u64 {
-        // assumes systems can handle multiplication by 64 bits without performance
-        // penalty.
-        (val * self.mul) >> self.shift
-    }
-}
-
-/// Approximate floor division by P using the upper 6 bits of `x`.
-/// `x` must be the upper limb of a u256 in 64-bit radix.
-///
-/// Returns a value ≤ ⌊x / P⌋. This is the most precise
-/// approximation achievable without multiplication on ARM64 and x86.
-///
-/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
-/// u256) will to a larger value after subtraction reduction.
-/// subtraction reduction output: [0, 1+ε] with ε < 0.3.
-#[inline(always)]
-pub fn div_p_6b(x: u64) -> u64 {
-    let upper_bits = x >> (64 - 6);
-    // const to force compile time evaluation
-    const MULSHIFT: MulShift = MulShift::new(256, 6);
-    MULSHIFT.div_p(upper_bits)
-}
-
-/// Approximate floor division by P using the upper 32 bits of `x`.
-/// `x` must be the upper limb of a u256 in 64-bit radix.
-///
-/// Returns a value ≤ ⌊x / P⌋. This is the most precise
-/// approximation achievable with a 32bx32b->64b multiplier.
-#[inline(always)]
-pub fn div_p_32b(x: u64) -> u64 {
-    let upper_bits = x >> (64 - 32);
-    // const to force compile time evaluation
-    const MULSHIFT: MulShift = MulShift::new(256, 32);
-    MULSHIFT.div_p(upper_bits)
-}
-
-/// Subtracts an approximate multiple of P from `x` using `div_p` on the high
-/// limb.
-///
-/// The result is not fully reduced; the output range depends on the precision
-/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`].
-#[inline(always)]
-pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4] {
-    // No clamping as the max value of x can't go passed 5. Which is the maximum of
-    // the table.
-    let q = div_p(x[3]) as usize;
-    utils::sub(x, constants::U64_P_MULTIPLES[q])
-}
-
-#[cfg(kani)]
-mod proofs {
-    use {
-        super::{
-            constants::{U64_2P, U64_P},
-            div_p_32b, div_p_6b, MulShift,
-        },
-        crate::constants::U64_P_MULTIPLES,
-    };
-
-    /// Lexicographic ≤ on little-endian 256-bit integers.
-    fn le256(a: [u64; 4], b: [u64; 4]) -> bool {
-        for i in (0..4).rev() {
-            if a[i] != b[i] {
-                return a[i] < b[i];
-            }
-        }
-        true
-    }
-
-    /// TODO: tighter bounds
-    #[kani::proof]
-    fn div_p_32b_underapprox() {
-        let x: u64 = kani::any();
-        let q = div_p_32b(x);
-
-        let r = U64_P_MULTIPLES[q as usize][3];
-        assert!(x - r >= 0);
-        assert!(le256([0, 0, 0, x - r], U64_2P));
-    }
-
-    #[kani::proof]
-    // TODO tighter bounds
-    fn div_p_6b_underapprox() {
-        let x: u64 = kani::any();
-        let q = div_p_6b(x);
-
-        let r = U64_P_MULTIPLES[q as usize][3];
-        assert!(x - r >= 0);
-        assert!(le256([0, 0, 0, x - r], U64_2P));
-    }
-}
diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs
index ee3ac57b7..066282dcb 100644
--- a/skyscraper/bn254-multiplier/src/utils.rs
+++ b/skyscraper/bn254-multiplier/src/utils.rs
@@ -1,4 +1,4 @@
-use crate::constants::U64_2P;
+use crate::constants::{self, U64_2P};
 
 /// Macro to extract a subarray from an array.
 ///
@@ -97,3 +97,146 @@ pub const fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64
     let c: u128 = widening_mul(a, b) + carry as u128 + add as u128;
     (c as u64, (c >> 64) as u64)
 }
+
+/// Precomputed magic constants for fast approximate floor division by the
+/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by
+/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the
+/// declared bit range.
+pub struct MulShift {
+    mul:   u64,
+    shift: u32,
+}
+
+impl MulShift {
+    /// Computes magic multiplication and shift constants for approximate floor
+    /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within
+    /// `max_bit_size` bits. `precision` controls how many of the top bits of
+    /// `val` are used — higher precision yields a tighter approximation.
+    pub const fn new(max_bit_size: u32, precision: u32) -> Self {
+        // Generate magic numbers for division by bn254's prime for a range of top-bit
+        // widths.
+
+        // Based on Warren's "Hacker's Delight" (integer
+        // division by constants) to find a magic multiplier for each bit-width.
+
+        // Returns a list of tuples (w, m_bits, sub, shift, m) where:
+        //- w: number of top bits of the value
+        //- m_bits: number of bits in the magic multiplier
+        //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
+        //  so we store m - 2^w and compensate at runtime)
+        //- shift: the number of bits to right-shift the product (called 's' in Warren)
+        //- m: the magic multiplier
+        use crate::constants::U64_P;
+
+        let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w))
+        let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1
+        let mut s = precision; // start at precision; s < precision values are skipped
+        let m;
+        loop {
+            // s = shift exponent
+            if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) {
+                m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier
+                break;
+            }
+            s += 1;
+            assert!(s < 64, "no magic multiplier found");
+        }
+
+        MulShift { mul: m, shift: s }
+    }
+
+    #[inline(always)]
+    /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient).
+    /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`].
+    pub const fn div_p(&self, val: u64) -> u64 {
+        // assumes systems can handle multiplication by 64 bits without performance
+        // penalty.
+        (val * self.mul) >> self.shift
+    }
+}
+
+/// Approximate floor division by P using the upper 6 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable without multiplication on ARM64 and x86.
+///
+/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
+/// u256) will to a larger value after subtraction reduction.
+/// subtraction reduction output: [0, 1+ε] with ε < 0.3.
+#[inline(always)]
+pub fn div_p_6b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 6);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 6);
+    MULSHIFT.div_p(upper_bits)
+}
+
+/// Approximate floor division by P using the upper 32 bits of `x`.
+/// `x` must be the upper limb of a u256 in 64-bit radix.
+///
+/// Returns a value ≤ ⌊x / P⌋. This is the most precise
+/// approximation achievable with a 32bx32b->64b multiplier.
+#[inline(always)]
+pub fn div_p_32b(x: u64) -> u64 {
+    let upper_bits = x >> (64 - 32);
+    // const to force compile time evaluation
+    const MULSHIFT: MulShift = MulShift::new(256, 32);
+    MULSHIFT.div_p(upper_bits)
+}
+
+/// Subtracts an approximate multiple of P from `x` using `div_p` on the high
+/// limb.
+///
+/// The result is not fully reduced; the output range depends on the precision
+/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`].
+#[inline(always)]
+pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4] {
+    // No clamping as the max value of x can't go passed 5. Which is the maximum of
+    // the table.
+    let q = div_p(x[3]) as usize;
+    sub(x, constants::U64_P_MULTIPLES[q])
+}
+
+#[cfg(kani)]
+mod proofs {
+    use {
+        super::{
+            constants::{U64_2P, U64_P},
+            div_p_32b, div_p_6b, MulShift,
+        },
+        crate::constants::U64_P_MULTIPLES,
+    };
+
+    /// Lexicographic ≤ on little-endian 256-bit integers.
+    fn le256(a: [u64; 4], b: [u64; 4]) -> bool {
+        for i in (0..4).rev() {
+            if a[i] != b[i] {
+                return a[i] < b[i];
+            }
+        }
+        true
+    }
+
+    /// TODO: tighter bounds
+    #[kani::proof]
+    fn div_p_32b_underapprox() {
+        let x: u64 = kani::any();
+        let q = div_p_32b(x);
+
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x - r >= 0);
+        assert!(le256([0, 0, 0, x - r], U64_2P));
+    }
+
+    #[kani::proof]
+    // TODO tighter bounds
+    fn div_p_6b_underapprox() {
+        let x: u64 = kani::any();
+        let q = div_p_6b(x);
+
+        let r = U64_P_MULTIPLES[q as usize][3];
+        assert!(x - r >= 0);
+        assert!(le256([0, 0, 0, x - r], U64_2P));
+    }
+}

From f9a4f4d5ab219d25099c0904cfa55f2757d7cc1b Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 10 Mar 2026 17:13:58 +0800
Subject: [PATCH 12/13] subrec: minor improvements

---
 skyscraper/bn254-multiplier/src/constants.rs |  2 --
 skyscraper/bn254-multiplier/src/utils.rs     | 20 +++++---------------
 2 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs
index d515fc8fd..b831593f3 100644
--- a/skyscraper/bn254-multiplier/src/constants.rs
+++ b/skyscraper/bn254-multiplier/src/constants.rs
@@ -1,5 +1,3 @@
-pub const U64_NP0: u64 = 0xc2e1f593efffffff;
-
 pub const U64_P: [u64; 4] = U64_P_MULTIPLES[1];
 
 pub const U64_2P: [u64; 4] = U64_P_MULTIPLES[2];
diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs
index 066282dcb..2c33263e8 100644
--- a/skyscraper/bn254-multiplier/src/utils.rs
+++ b/skyscraper/bn254-multiplier/src/utils.rs
@@ -119,13 +119,6 @@ impl MulShift {
         // Based on Warren's "Hacker's Delight" (integer
         // division by constants) to find a magic multiplier for each bit-width.
 
-        // Returns a list of tuples (w, m_bits, sub, shift, m) where:
-        //- w: number of top bits of the value
-        //- m_bits: number of bits in the magic multiplier
-        //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w,
-        //  so we store m - 2^w and compensate at runtime)
-        //- shift: the number of bits to right-shift the product (called 's' in Warren)
-        //- m: the magic multiplier
         use crate::constants::U64_P;
 
         let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w))
@@ -162,7 +155,7 @@ impl MulShift {
 /// approximation achievable without multiplication on ARM64 and x86.
 ///
 /// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
-/// u256) will to a larger value after subtraction reduction.
+/// u256) will lead to a larger value after subtraction reduction.
 /// subtraction reduction output: [0, 1+ε] with ε < 0.3.
 #[inline(always)]
 pub fn div_p_6b(x: u64) -> u64 {
@@ -192,7 +185,7 @@ pub fn div_p_32b(x: u64) -> u64 {
 /// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`].
 #[inline(always)]
 pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4] {
-    // No clamping as the max value of x can't go passed 5. Which is the maximum of
+    // No clamping as the max value of x can't go past 5. Which is the maximum of
     // the table.
     let q = div_p(x[3]) as usize;
     sub(x, constants::U64_P_MULTIPLES[q])
@@ -201,10 +194,7 @@ pub fn subtraction_reduce<F: Fn(u64) -> u64>(div_p: F, x: [u64; 4]) -> [u64; 4]
 #[cfg(kani)]
 mod proofs {
     use {
-        super::{
-            constants::{U64_2P, U64_P},
-            div_p_32b, div_p_6b, MulShift,
-        },
+        super::{constants::U64_2P, div_p_32b, div_p_6b},
         crate::constants::U64_P_MULTIPLES,
     };
 
@@ -225,7 +215,7 @@ mod proofs {
         let q = div_p_32b(x);
 
         let r = U64_P_MULTIPLES[q as usize][3];
-        assert!(x - r >= 0);
+        assert!(x >= r);
         assert!(le256([0, 0, 0, x - r], U64_2P));
     }
 
@@ -236,7 +226,7 @@ mod proofs {
         let q = div_p_6b(x);
 
         let r = U64_P_MULTIPLES[q as usize][3];
-        assert!(x - r >= 0);
+        assert!(x >= r);
         assert!(le256([0, 0, 0, x - r], U64_2P));
     }
 }

From 7b08e2c797ac79b3910d9eb8bbef6f6d8436c596 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Mar 2026 18:20:24 +0800
Subject: [PATCH 13/13] fix escaping documentation

---
 skyscraper/bn254-multiplier/src/utils.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs
index 2c33263e8..c8cc68ef1 100644
--- a/skyscraper/bn254-multiplier/src/utils.rs
+++ b/skyscraper/bn254-multiplier/src/utils.rs
@@ -154,8 +154,8 @@ impl MulShift {
 /// Returns a value ≤ ⌊x / P⌋. This is the most precise
 /// approximation achievable without multiplication on ARM64 and x86.
 ///
-/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for
-/// u256) will lead to a larger value after subtraction reduction.
+/// Tradeoff: due the limited range of this division \[0,4\] (instead of \[0,5\]
+/// for u256) will lead to a larger value after subtraction reduction.
 /// subtraction reduction output: [0, 1+ε] with ε < 0.3.
 #[inline(always)]
 pub fn div_p_6b(x: u64) -> u64 {