From b4c916ec20d8b4a33c308f8083362b2522996cf6 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Feb 2026 16:37:05 +0800 Subject: [PATCH 01/13] sub reduction: analysis --- skyscraper/bn254-multiplier/sub_reduce.py | 230 ++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 skyscraper/bn254-multiplier/sub_reduce.py diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py new file mode 100644 index 000000000..998734aa7 --- /dev/null +++ b/skyscraper/bn254-multiplier/sub_reduce.py @@ -0,0 +1,230 @@ +import math +from math import ceil + +p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 + +U51_i1 = pow( + 2**51, + -1, + p, +) +U51_i2 = pow( + 2**51, + -2, + p, +) +U51_i3 = pow( + 2**51, + -3, + p, +) +U51_i4 = pow( + 2**51, + -4, + p, +) + + +def rho(sign, k): + sign = 1 if sign else -1 + + return pow(sign * 2**51, -1 * k, p) + + +def rho_combinations(input): + for i in range(0, 64): + bitstring = bin(i)[2:].zfill(6) + # 1 is positive, 0 is negative + sel1 = int(bitstring[1 - 1]) + sel2 = int(bitstring[2 - 1]) + sel3 = int(bitstring[3 - 1]) + sel4 = int(bitstring[4 - 1]) + sel5 = int(bitstring[5 - 1]) + sel6 = int(bitstring[6 - 1]) + + # max_val = input**2 >> 4 * 51 + max_val = input * (p - 1) >> 4 * 51 + # maximum value with these inclusion. Only add the once that are supposed to be added + # values that are meant to be subtracted are ignored + max_val += ( + sel1 * rho(sel1, 1) + + sel1 * rho(sel1, 2) + + sel3 * rho(sel3, 3) + + sel4 * rho(sel4, 4) + + sel5 * p + ) * (2**51 - 1) + + max_val >>= 51 + max_val += sel6 * p + max_val >>= 1 + + # minimum value with these inclusion. Only add the once that are supposed to be added + # values that are supposed to be added are ignored + min_val = 0 + min_val += ( + -(1 - sel1) * rho(sel1, 1) + - (1 - sel2) * rho(sel2, 2) + - (1 - sel3) * rho(sel3, 3) + - (1 - sel4) * rho(sel4, 4) + - (1 - sel5) * p + ) * (2**51 - 1) + + min_val = min_val >> 51 + # could be that it doesn't need any compensation for final reduction + min_val -= (1 - sel6) * p + min_val >>= 1 + diff = max_val.bit_length() - min_val.bit_length() + + subp = max_val / p + + print( + bitstring, + f"{max_val.bit_length():03} {int(math.copysign(1, min_val)):2} {min_val.bit_length():03} {diff:3}\t {(input + max_val).bit_length()} {(input + ceil(subp) * p).bit_length()} {(input + ceil(subp) * p) / p} {subp}", + ) + + +def shift_mul(): + print("shift_mul") + neq = 0 + nlp = 0 + neg_max = 0 + neg_min = 0 + + power = 5 + for i in range(0, 2**power): + val_min = i << (256 - power) + subp = val_min // p + shift_subp = (val_min >> (256 - power)) // ((p >> (256 - power)) + 1) + + val_max = val_min + (1 << (256 - power)) - 1 + rem_max = val_max - (shift_subp) * p + + rem_min = val_min - (shift_subp) * p + + if subp != shift_subp: + neq += 1 + + if rem_max >= 1.7 * p: + print(hex(rem_max)) + nlp += 1 + + if rem_max < 0: + neg_max += 1 + + if rem_min < 0: + neg_min += 1 + + print(neq, nlp, neg_max, neg_min) + + +def shift_count(): + print("shift count") + neq = 0 + nlp = 0 + neg_max = 0 + neg_min = 0 + + power = 2 + + for i in range(0, 2**power): + val_min = i << (256 - power) + subp = val_min // p + shift_subp = i + # shift_subp = ((i >> 2) & i) + (i >> 1) + + val_max = val_min + (1 << (256 - power)) - 1 + rem_max = val_max - (shift_subp) * p + + rem_min = val_min - (shift_subp) * p + + if subp != shift_subp: + print(hex(val_min), subp, shift_subp) + neq += 1 + + if rem_max >= p + 1.5 * p: + print(hex(rem_max)) + nlp += 1 + + if rem_max < 0: + neg_max += 1 + + if rem_min < 0: + neg_min += 1 + + print(neq, nlp, neg_max, neg_min) + + +def shift_sum(): + print("shift sum") + neq = 0 + nlp = 0 + neg_max = 0 + neg_min = 0 + + power = 3 + + for i in range(0, 2**power): + val_min = i << (256 - power) + subp = val_min // p + shift_subp = ((i >> 2) & i) + (i >> 1) + print(bin(i)[2:].zfill(power), shift_subp) + + val_max = val_min + (1 << (256 - power)) - 1 + rem_max = val_max - (shift_subp) * p + + rem_min = val_min - (shift_subp) * p + + if subp != shift_subp: + print(hex(val_min), subp, shift_subp) + neq += 1 + + if rem_max >= p + 0.75 * p: + print(hex(rem_max)) + nlp += 1 + + if rem_max < 0: + neg_max += 1 + + if rem_min < 0: + neg_min += 1 + + print(neq, nlp, neg_max, neg_min) + + +def m_calculation(): + res = list() + for w in range(0, 65): + # increase by +1 otherwise the shift results in a smaller divider than the original problem. For us it's important to not overshoot + # ceil + d = (p >> (256 - w)) + 1 + nc = 2**w - 1 - (2**w % d) + for i in range(0, 128): + if 2**i > nc * (d - 1 - (2**i - 1) % d): + if i < w: + print(w, i, d.bit_length()) + m = (2**i + d - 1 - (2**i - 1) % d) // d + neg = False + # comment out if the bits fit in the register. + if m >= 2**w: + neg = True + m = m - 2**w + res.append( + ( + w, + m.bit_length(), + neg, + i, + m, + ) + ) + break + return res + + +if __name__ == "__main__": + shift_mul() + shift_count() + shift_sum() + + for b in m_calculation(): + print(b) From f9fc899c5e4e4d91eed2333ebb6ea72327f20815 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Feb 2026 16:38:27 +0800 Subject: [PATCH 02/13] subred: remove rho experiments --- skyscraper/bn254-multiplier/sub_reduce.py | 85 +---------------------- 1 file changed, 1 insertion(+), 84 deletions(-) diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py index 998734aa7..aad16fa1a 100644 --- a/skyscraper/bn254-multiplier/sub_reduce.py +++ b/skyscraper/bn254-multiplier/sub_reduce.py @@ -1,89 +1,6 @@ -import math -from math import ceil - p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 -U51_i1 = pow( - 2**51, - -1, - p, -) -U51_i2 = pow( - 2**51, - -2, - p, -) -U51_i3 = pow( - 2**51, - -3, - p, -) -U51_i4 = pow( - 2**51, - -4, - p, -) - - -def rho(sign, k): - sign = 1 if sign else -1 - - return pow(sign * 2**51, -1 * k, p) - - -def rho_combinations(input): - for i in range(0, 64): - bitstring = bin(i)[2:].zfill(6) - # 1 is positive, 0 is negative - sel1 = int(bitstring[1 - 1]) - sel2 = int(bitstring[2 - 1]) - sel3 = int(bitstring[3 - 1]) - sel4 = int(bitstring[4 - 1]) - sel5 = int(bitstring[5 - 1]) - sel6 = int(bitstring[6 - 1]) - - # max_val = input**2 >> 4 * 51 - max_val = input * (p - 1) >> 4 * 51 - # maximum value with these inclusion. Only add the once that are supposed to be added - # values that are meant to be subtracted are ignored - max_val += ( - sel1 * rho(sel1, 1) - + sel1 * rho(sel1, 2) - + sel3 * rho(sel3, 3) - + sel4 * rho(sel4, 4) - + sel5 * p - ) * (2**51 - 1) - - max_val >>= 51 - max_val += sel6 * p - max_val >>= 1 - - # minimum value with these inclusion. Only add the once that are supposed to be added - # values that are supposed to be added are ignored - min_val = 0 - min_val += ( - -(1 - sel1) * rho(sel1, 1) - - (1 - sel2) * rho(sel2, 2) - - (1 - sel3) * rho(sel3, 3) - - (1 - sel4) * rho(sel4, 4) - - (1 - sel5) * p - ) * (2**51 - 1) - - min_val = min_val >> 51 - # could be that it doesn't need any compensation for final reduction - min_val -= (1 - sel6) * p - min_val >>= 1 - diff = max_val.bit_length() - min_val.bit_length() - - subp = max_val / p - - print( - bitstring, - f"{max_val.bit_length():03} {int(math.copysign(1, min_val)):2} {min_val.bit_length():03} {diff:3}\t {(input + max_val).bit_length()} {(input + ceil(subp) * p).bit_length()} {(input + ceil(subp) * p) / p} {subp}", - ) - - -def shift_mul(): + def shift_mul(): print("shift_mul") neq = 0 nlp = 0 From a3954f6e8af34e497479531d06510f655c84551e Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Feb 2026 16:45:29 +0800 Subject: [PATCH 03/13] subred: remove approaches superseeded by warren_magic --- skyscraper/bn254-multiplier/sub_reduce.py | 155 ++++++++-------------- 1 file changed, 59 insertions(+), 96 deletions(-) diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py index aad16fa1a..e85e661c6 100644 --- a/skyscraper/bn254-multiplier/sub_reduce.py +++ b/skyscraper/bn254-multiplier/sub_reduce.py @@ -1,136 +1,98 @@ -p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 - - def shift_mul(): - print("shift_mul") - neq = 0 - nlp = 0 - neg_max = 0 - neg_min = 0 - - power = 5 - for i in range(0, 2**power): - val_min = i << (256 - power) - subp = val_min // p - shift_subp = (val_min >> (256 - power)) // ((p >> (256 - power)) + 1) - - val_max = val_min + (1 << (256 - power)) - 1 - rem_max = val_max - (shift_subp) * p - - rem_min = val_min - (shift_subp) * p - - if subp != shift_subp: - neq += 1 - - if rem_max >= 1.7 * p: - print(hex(rem_max)) - nlp += 1 - - if rem_max < 0: - neg_max += 1 +"""Sub-reduction strategies for bn254 modular arithmetic.""" - if rem_min < 0: - neg_min += 1 +p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 - print(neq, nlp, neg_max, neg_min) +def shift_sum(): + """ + Generate truth table for top 3 bit reduction strategy. -def shift_count(): - print("shift count") + The formula ((i >> 2) & i) + (i >> 1) computes this + approximation using only shifts and adds — no multiplication needed. + """ + # Keep track of potential erroneous conditions neq = 0 - nlp = 0 neg_max = 0 neg_min = 0 - power = 2 + power = 3 + + max_pval = 0 + truth_table = list() for i in range(0, 2**power): val_min = i << (256 - power) subp = val_min // p - shift_subp = i - # shift_subp = ((i >> 2) & i) + (i >> 1) + shift_subp = ((i >> 2) & i) + (i >> 1) val_max = val_min + (1 << (256 - power)) - 1 rem_max = val_max - (shift_subp) * p rem_min = val_min - (shift_subp) * p + # Validation: track the maximum remainder relative to p + max_pval = max(max_pval, rem_max) + + truth_table.append((bin(i)[2:].zfill(power), shift_subp)) + + # Check for erroneous situations if subp != shift_subp: print(hex(val_min), subp, shift_subp) neq += 1 - if rem_max >= p + 1.5 * p: - print(hex(rem_max)) - nlp += 1 - if rem_max < 0: neg_max += 1 if rem_min < 0: neg_min += 1 - print(neq, nlp, neg_max, neg_min) - - -def shift_sum(): - print("shift sum") - neq = 0 - nlp = 0 - neg_max = 0 - neg_min = 0 - - power = 3 - - for i in range(0, 2**power): - val_min = i << (256 - power) - subp = val_min // p - shift_subp = ((i >> 2) & i) + (i >> 1) - print(bin(i)[2:].zfill(power), shift_subp) - - val_max = val_min + (1 << (256 - power)) - 1 - rem_max = val_max - (shift_subp) * p + print(f"{'bits':>5} {'subtractions':>12}") + for e, r in truth_table: + print(f"{e:>5} {r:>12}") - rem_min = val_min - (shift_subp) * p + print( + f"\nmax_remainder/p={max_pval / p:.4f} mismatches={neq} neg_max={neg_max} neg_min={neg_min}" + ) - if subp != shift_subp: - print(hex(val_min), subp, shift_subp) - neq += 1 - if rem_max >= p + 0.75 * p: - print(hex(rem_max)) - nlp += 1 +def warren_magic(): + """ + Generate magic numbers for division by bn254's prime for a range of top-bit widths. - if rem_max < 0: - neg_max += 1 - - if rem_min < 0: - neg_min += 1 + Based on Warren's "Hacker's Delight" (integer + division by constants) to find a magic multiplier for each bit-width. - print(neq, nlp, neg_max, neg_min) - - -def m_calculation(): + Returns a list of tuples (w, m_bits, sub, shift, m) where: + - w: number of bits of the dividend (top bits of the value) + - m_bits: number of bits in the magic multiplier + - sub: whether the "subtract and shift" variant is used (m exceeded 2^w, + so we store m - 2^w and compensate at runtime) + - shift: the number of bits to right-shift the product (called 's' in Warren) + - m: the magic multiplier + """ res = list() for w in range(0, 65): - # increase by +1 otherwise the shift results in a smaller divider than the original problem. For us it's important to not overshoot - # ceil - d = (p >> (256 - w)) + 1 - nc = 2**w - 1 - (2**w % d) - for i in range(0, 128): - if 2**i > nc * (d - 1 - (2**i - 1) % d): - if i < w: - print(w, i, d.bit_length()) - m = (2**i + d - 1 - (2**i - 1) % d) // d - neg = False - # comment out if the bits fit in the register. + d = (p >> (256 - w)) + 1 # d = divisor = ceil(p / 2^(256-w)) + nc = 2**w - 1 - (2**w % d) # nc = largest value s.t. nc mod d != d-1 + for s in range(0, 128): # s = shift exponent + if 2**s > nc * (d - 1 - (2**s - 1) % d): + if s < w: + print(w, s, d.bit_length()) + m = (2**s + d - 1 - (2**s - 1) % d) // d # m = magic multiplier + sub = False + + # "Subtract and shift" variant: when m >= 2^w it won't fit in w bits, + # so subtract 2^w and compensate at runtime. + # Comment out if the register can hold more bits than w. if m >= 2**w: - neg = True + sub = True m = m - 2**w res.append( ( w, m.bit_length(), - neg, - i, + sub, + s, m, ) ) @@ -139,9 +101,10 @@ def m_calculation(): if __name__ == "__main__": - shift_mul() - shift_count() + print("shift sum") shift_sum() - for b in m_calculation(): - print(b) + print("\n warren") + print(f"{'w':>3} {'m_bits':>6} {'sub':>5} {'shift':>5} {'m':>20}") + for w, m_bits, sub, shift, m in warren_magic(): + print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {m:>20}") From bcd1bbcc9e3c20d8bc3a559371c2a4503e633109 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 16:38:05 +0800 Subject: [PATCH 04/13] sub_reduce: take size of input into account --- skyscraper/bn254-multiplier/sub_reduce.py | 32 +++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/skyscraper/bn254-multiplier/sub_reduce.py b/skyscraper/bn254-multiplier/sub_reduce.py index e85e661c6..fddc95ffc 100644 --- a/skyscraper/bn254-multiplier/sub_reduce.py +++ b/skyscraper/bn254-multiplier/sub_reduce.py @@ -1,5 +1,7 @@ """Sub-reduction strategies for bn254 modular arithmetic.""" +import argparse + p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 @@ -55,7 +57,7 @@ def shift_sum(): ) -def warren_magic(): +def warren_magic(max_bit_size, apply_sub: bool = False): """ Generate magic numbers for division by bn254's prime for a range of top-bit widths. @@ -63,7 +65,7 @@ def warren_magic(): division by constants) to find a magic multiplier for each bit-width. Returns a list of tuples (w, m_bits, sub, shift, m) where: - - w: number of bits of the dividend (top bits of the value) + - w: number of top bits of the value - m_bits: number of bits in the magic multiplier - sub: whether the "subtract and shift" variant is used (m exceeded 2^w, so we store m - 2^w and compensate at runtime) @@ -72,8 +74,8 @@ def warren_magic(): """ res = list() for w in range(0, 65): - d = (p >> (256 - w)) + 1 # d = divisor = ceil(p / 2^(256-w)) - nc = 2**w - 1 - (2**w % d) # nc = largest value s.t. nc mod d != d-1 + d = (p >> (max_bit_size - w)) + 1 # d = divisor = ceil(p / 2^(max_bit_size-w)) + nc = 2**w - 1 - (2**w % d) # nc = largest value s.t. nc mod d == d-1 for s in range(0, 128): # s = shift exponent if 2**s > nc * (d - 1 - (2**s - 1) % d): if s < w: @@ -83,8 +85,8 @@ def warren_magic(): # "Subtract and shift" variant: when m >= 2^w it won't fit in w bits, # so subtract 2^w and compensate at runtime. - # Comment out if the register can hold more bits than w. - if m >= 2**w: + # Skip this part via apply_sub if the register can hold more bits than w. + if apply_sub and m >= 2**w: sub = True m = m - 2**w res.append( @@ -101,10 +103,24 @@ def warren_magic(): if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Sub-reduction strategies for bn254.") + parser.add_argument( + "--sub", + action="store_true", + help="Apply the 'subtract and shift' variant when m >= 2^w.", + ) + args = parser.parse_args() + print("shift sum") shift_sum() print("\n warren") print(f"{'w':>3} {'m_bits':>6} {'sub':>5} {'shift':>5} {'m':>20}") - for w, m_bits, sub, shift, m in warren_magic(): - print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {m:>20}") + for w, m_bits, sub, shift, m in warren_magic(257, apply_sub=args.sub): + print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}") + + for w, m_bits, sub, shift, m in warren_magic(256, apply_sub=args.sub): + print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}") + + for w, m_bits, sub, shift, m in warren_magic(255, apply_sub=args.sub): + print(f"{w:>3} {m_bits:>6} {str(sub):>5} {shift:>5} {hex(m):>20}") From e1593567450f3a0b87c363c53eceea18cad847ef Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 16:44:00 +0800 Subject: [PATCH 05/13] subrec: lookup table for subtraction + generator code --- skyscraper/bn254-multiplier/gen_multiples.py | 35 ++++++++++++++ skyscraper/bn254-multiplier/src/constants.rs | 42 +++++++++++++++++ .../bn254-multiplier/src/rne/constants.rs | 47 +++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 skyscraper/bn254-multiplier/gen_multiples.py diff --git a/skyscraper/bn254-multiplier/gen_multiples.py b/skyscraper/bn254-multiplier/gen_multiples.py new file mode 100644 index 000000000..67c86127e --- /dev/null +++ b/skyscraper/bn254-multiplier/gen_multiples.py @@ -0,0 +1,35 @@ +"""Generate Rust const lookup tables for multiples of the BN254 scalar field prime.""" + +p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 + + +def int_to_limbs(size, n, count): + mask = 2**size - 1 + limbs = [] + for _ in range(count): + limbs.append(n & mask) + n >>= size + return limbs + + +def main(): + multiples_64 = [int_to_limbs(64, k * p, 4) for k in range(0, 6)] + multiples_51 = [int_to_limbs(51, k * p, 5) for k in range(0, 6)] + + # Print 64-bit table (for constants.rs) + print("pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [") + for k, limbs in enumerate(multiples_64): + fmt = ", ".join(f"0x{l:016x}" for l in limbs) + print(f" [{fmt}], // {k}P") + print("];") + + # Print 51-bit table (for rne/constants.rs) + print("\npub const U51_P_MULTIPLES: [[u64; 5]; 6] = [") + for k, limbs in enumerate(multiples_51): + fmt = ", ".join(f"0x{l:013x}" for l in limbs) + print(f" [{fmt}], // {k}P") + print("];") + + +if __name__ == "__main__": + main() diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs index b49971136..fe026869a 100644 --- a/skyscraper/bn254-multiplier/src/constants.rs +++ b/skyscraper/bn254-multiplier/src/constants.rs @@ -14,6 +14,48 @@ pub const U64_2P: [u64; 4] = [ 0x60c89ce5c2634053, ]; +/// Lookup table: `U64_P_MULTIPLES[k]` = `k * P` for k in 0..=5. +/// Index 0 is all-zeros; use as `x - U64_P_MULTIPLES[k]` to subtract k copies +/// of P. +pub const U64_P_MULTIPLES: [[u64; 4]; 6] = [ + [ + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + 0x0000000000000000, + ], // 0P + [ + 0x43e1f593f0000001, + 0x2833e84879b97091, + 0xb85045b68181585d, + 0x30644e72e131a029, + ], // 1P + [ + 0x87c3eb27e0000002, + 0x5067d090f372e122, + 0x70a08b6d0302b0ba, + 0x60c89ce5c2634053, + ], // 2P + [ + 0xcba5e0bbd0000003, + 0x789bb8d96d2c51b3, + 0x28f0d12384840917, + 0x912ceb58a394e07d, + ], // 3P + [ + 0x0f87d64fc0000004, + 0xa0cfa121e6e5c245, + 0xe14116da06056174, + 0xc19139cb84c680a6, + ], // 4P + [ + 0x5369cbe3b0000005, + 0xc903896a609f32d6, + 0x99915c908786b9d1, + 0xf1f5883e65f820d0, + ], // 5P +]; + // R mod P pub const U64_R: [u64; 4] = [ 0xac96341c4ffffffb, diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs index 77862defc..f1cf0fe06 100644 --- a/skyscraper/bn254-multiplier/src/rne/constants.rs +++ b/skyscraper/bn254-multiplier/src/rne/constants.rs @@ -14,6 +14,53 @@ pub const U51_P: [u64; 5] = [ 0x30644e72e131a, ]; +/// Lookup table: `U51_P_MULTIPLES[k]` = `k * P` for k in 0..=5, in 51-bit +/// limbs. +pub const U51_P_MULTIPLES: [[u64; 5]; 6] = [ + [ + 0x0000000000000, + 0x0000000000000, + 0x0000000000000, + 0x0000000000000, + 0x0000000000000, + ], // 0P + [ + 0x1f593f0000001, + 0x10f372e12287c, + 0x6056174a0cfa1, + 0x014dc2822db40, + 0x30644e72e131a, + ], // 1P + [ + 0x3eb27e0000002, + 0x21e6e5c2450f8, + 0x40ac2e9419f42, + 0x029b85045b681, + 0x60c89ce5c2634, + ], // 2P + [ + 0x5e0bbd0000003, + 0x32da58a367974, + 0x210245de26ee3, + 0x03e94786891c2, + 0x112ceb58a394e, + ], // 3P + [ + 0x7d64fc0000004, + 0x43cdcb848a1f0, + 0x01585d2833e84, + 0x05370a08b6d03, + 0x419139cb84c68, + ], // 4P + [ + 0x1cbe3b0000005, + 0x54c13e65aca6d, + 0x61ae747240e25, + 0x0684cc8ae4843, + 0x71f5883e65f82, + ], // 5P +]; + /// Bit mask for 51-bit limbs. pub const MASK51: u64 = 2_u64.pow(51) - 1; From 990b85371cdbe959c3d8902cd499267d7ada2e4b Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 17:23:36 +0800 Subject: [PATCH 06/13] mulshift: documentation + tests --- skyscraper/bn254-multiplier/src/lib.rs | 102 +++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs index 454d01945..b3dc74727 100644 --- a/skyscraper/bn254-multiplier/src/lib.rs +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -33,3 +33,105 @@ const fn pow_2(n: u32) -> f64 { let exp = (n as u64 + 1023) << 52; f64::from_bits(exp) } + +/// Precomputed magic constants for fast approximate floor division by the +/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by +/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the +/// declared bit range. +pub struct MulShift { + mul: u64, + shift: u32, +} + +impl MulShift { + /// Computes magic multiplication and shift constants for approximate floor + /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within + /// `max_bit_size` bits. `precision` controls how many of the top bits of + /// `val` are used — higher precision yields a tighter approximation. + pub const fn new(max_bit_size: u32, precision: u32) -> Self { + // Generate magic numbers for division by bn254's prime for a range of top-bit + // widths. + + // Based on Warren's "Hacker's Delight" (integer + // division by constants) to find a magic multiplier for each bit-width. + + // Returns a list of tuples (w, m_bits, sub, shift, m) where: + //- w: number of top bits of the value + //- m_bits: number of bits in the magic multiplier + //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w, + // so we store m - 2^w and compensate at runtime) + //- shift: the number of bits to right-shift the product (called 's' in Warren) + //- m: the magic multiplier + use crate::constants::U64_P; + + let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w)) + let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1 + let mut s = precision; // start at precision; s < precision values are skipped + let m; + loop { + // s = shift exponent + if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) { + m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier + break; + } + s += 1; + assert!(s < 64, "no magic multiplier found"); + } + + MulShift { mul: m, shift: s } + } + + #[inline(always)] + /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient). + /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`]. + pub const fn div_p(&self, val: u64) -> u64 { + // assumes systems can handle multiplication by 64 bits without performance + // penalty. + (val * self.mul) >> self.shift + } +} + +/// Approximate floor division by P using the upper 6 bits of `x`. +/// `x` must be the upper limb of a u256 in 64-bit radix. +/// +/// Returns a value ≤ ⌊x / P⌋. This is the most precise +/// approximation achievable without multiplication on ARM64 and x86. +/// +/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for +/// u256) will to a larger value after subtraction reduction. +/// subtraction reduction output: [0, 1+ε] with ε < 0.3. +pub fn div_p_6b(x: u64) -> u64 { + let upper_bits = x >> (64 - 6); + // const to force compile time evaluation + const MULSHIFT: MulShift = MulShift::new(256, 6); + MULSHIFT.div_p(upper_bits) +} + +/// Approximate floor division by P using the upper 32 bits of `x`. +/// `x` must be the upper limb of a u256 in 64-bit radix. +/// +/// Returns a value ≤ ⌊x / P⌋. This is the most precise +/// approximation achievable with a 32bx32b->64b multiplier. +pub fn div_p_32b(x: u64) -> u64 { + let upper_bits = x >> (64 - 32); + // const to force compile time evaluation + const MULSHIFT: MulShift = MulShift::new(256, 32); + MULSHIFT.div_p(upper_bits) +} + +#[cfg(kani)] +mod proofs { + use super::div_p_32b; + + #[inline(never)] + fn div32(x: u64) -> u32 { + // cast to not create the 64bit magic number + return (x >> 32) as u32 / 0x30644e73 as u32; + } + + #[kani::proof] + fn div_p_32b_matches_div32() { + let x: u64 = kani::any(); + assert_eq!(div_p_32b(x), div32(x) as u64); + } +} From f4b310b3132c16acaaa637d0587c8d58b0ca896a Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 17:25:01 +0800 Subject: [PATCH 07/13] constants: synonyms for table constants --- skyscraper/bn254-multiplier/src/constants.rs | 14 ++------------ skyscraper/bn254-multiplier/src/rne/constants.rs | 8 +------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs index fe026869a..d515fc8fd 100644 --- a/skyscraper/bn254-multiplier/src/constants.rs +++ b/skyscraper/bn254-multiplier/src/constants.rs @@ -1,18 +1,8 @@ pub const U64_NP0: u64 = 0xc2e1f593efffffff; -pub const U64_P: [u64; 4] = [ - 0x43e1f593f0000001, - 0x2833e84879b97091, - 0xb85045b68181585d, - 0x30644e72e131a029, -]; +pub const U64_P: [u64; 4] = U64_P_MULTIPLES[1]; -pub const U64_2P: [u64; 4] = [ - 0x87c3eb27e0000002, - 0x5067d090f372e122, - 0x70a08b6d0302b0ba, - 0x60c89ce5c2634053, -]; +pub const U64_2P: [u64; 4] = U64_P_MULTIPLES[2]; /// Lookup table: `U64_P_MULTIPLES[k]` = `k * P` for k in 0..=5. /// Index 0 is all-zeros; use as `x - U64_P_MULTIPLES[k]` to subtract k copies diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs index f1cf0fe06..462e8938e 100644 --- a/skyscraper/bn254-multiplier/src/rne/constants.rs +++ b/skyscraper/bn254-multiplier/src/rne/constants.rs @@ -6,13 +6,7 @@ use crate::pow_2; pub const U51_NP0: u64 = 0x1f593efffffff; /// BN254 scalar field prime -pub const U51_P: [u64; 5] = [ - 0x1f593f0000001, - 0x10f372e12287c, - 0x6056174a0cfa1, - 0x014dc2822db40, - 0x30644e72e131a, -]; +pub const U51_P: [u64; 5] = U51_P_MULTIPLES[1]; /// Lookup table: `U51_P_MULTIPLES[k]` = `k * P` for k in 0..=5, in 51-bit /// limbs. From 8d55f2aaf9373922f758c3662c27e4ac346f2595 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 18:41:44 +0800 Subject: [PATCH 08/13] subrec: under approximation test --- skyscraper/bn254-multiplier/src/lib.rs | 36 +++++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs index b3dc74727..885ddd1de 100644 --- a/skyscraper/bn254-multiplier/src/lib.rs +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -121,17 +121,39 @@ pub fn div_p_32b(x: u64) -> u64 { #[cfg(kani)] mod proofs { - use super::div_p_32b; + use super::{constants::U64_P, div_p_32b}; + + /// Compute q * P as (4-limb little-endian result, overflow carry). + fn mul_small_by_p(q: u64) -> ([u64; 4], u64) { + let q = q as u128; + let t0 = q * U64_P[0] as u128; + let t1 = q * U64_P[1] as u128 + (t0 >> 64); + let t2 = q * U64_P[2] as u128 + (t1 >> 64); + let t3 = q * U64_P[3] as u128 + (t2 >> 64); + ( + [t0 as u64, t1 as u64, t2 as u64, t3 as u64], + (t3 >> 64) as u64, + ) + } - #[inline(never)] - fn div32(x: u64) -> u32 { - // cast to not create the 64bit magic number - return (x >> 32) as u32 / 0x30644e73 as u32; + /// Lexicographic ≤ on little-endian 256-bit integers. + fn le256(a: [u64; 4], b: [u64; 4]) -> bool { + for i in (0..4).rev() { + if a[i] != b[i] { + return a[i] < b[i]; + } + } + true } + /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192. #[kani::proof] - fn div_p_32b_matches_div32() { + fn div_p_32b_underapprox() { let x: u64 = kani::any(); - assert_eq!(div_p_32b(x), div32(x) as u64); + let q = div_p_32b(x); + + let (q_times_p, carry) = mul_small_by_p(q); + assert_eq!(carry, 0); + assert!(le256(q_times_p, [0, 0, 0, x])); } } From 95d569b2cd87d4f3973ef88fe5675909a17792eb Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 19:01:00 +0800 Subject: [PATCH 09/13] subrec: test bounds approach instead of approximation --- skyscraper/bn254-multiplier/src/lib.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs index 885ddd1de..776b235fc 100644 --- a/skyscraper/bn254-multiplier/src/lib.rs +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -121,7 +121,13 @@ pub fn div_p_32b(x: u64) -> u64 { #[cfg(kani)] mod proofs { - use super::{constants::U64_P, div_p_32b}; + use { + super::{ + constants::{U64_2P, U64_P}, + div_p_32b, div_p_6b, MulShift, + }, + crate::constants::U64_P_MULTIPLES, + }; /// Compute q * P as (4-limb little-endian result, overflow carry). fn mul_small_by_p(q: u64) -> ([u64; 4], u64) { @@ -147,13 +153,23 @@ mod proofs { } /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192. + /// TODO: encode tighter bounds #[kani::proof] fn div_p_32b_underapprox() { let x: u64 = kani::any(); let q = div_p_32b(x); - let (q_times_p, carry) = mul_small_by_p(q); - assert_eq!(carry, 0); - assert!(le256(q_times_p, [0, 0, 0, x])); + let r = U64_P_MULTIPLES[q as usize][3]; + assert!(le256([0, 0, 0, x - r], U64_2P)); + } + + #[kani::proof] + // TODO tighter bounds + fn div_p_6b_underapprox() { + let x: u64 = kani::any(); + let q = div_p_6b(x); + + let r = U64_P_MULTIPLES[q as usize][3]; + assert!(le256([0, 0, 0, x - r], U64_2P)); } } From 77fa31092a8d4f48f7275709d38cc48732781f8f Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Feb 2026 19:12:40 +0800 Subject: [PATCH 10/13] subrec: subtraction reduce function --- skyscraper/bn254-multiplier/src/lib.rs | 33 ++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs index 776b235fc..c720b3eb9 100644 --- a/skyscraper/bn254-multiplier/src/lib.rs +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -100,6 +100,7 @@ impl MulShift { /// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for /// u256) will to a larger value after subtraction reduction. /// subtraction reduction output: [0, 1+ε] with ε < 0.3. +#[inline(always)] pub fn div_p_6b(x: u64) -> u64 { let upper_bits = x >> (64 - 6); // const to force compile time evaluation @@ -112,6 +113,7 @@ pub fn div_p_6b(x: u64) -> u64 { /// /// Returns a value ≤ ⌊x / P⌋. This is the most precise /// approximation achievable with a 32bx32b->64b multiplier. +#[inline(always)] pub fn div_p_32b(x: u64) -> u64 { let upper_bits = x >> (64 - 32); // const to force compile time evaluation @@ -119,6 +121,19 @@ pub fn div_p_32b(x: u64) -> u64 { MULSHIFT.div_p(upper_bits) } +/// Subtracts an approximate multiple of P from `x` using `div_p` on the high +/// limb. +/// +/// The result is not fully reduced; the output range depends on the precision +/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`]. +#[inline(always)] +pub fn subtraction_reduce u64>(div_p: F, x: [u64; 4]) -> [u64; 4] { + // No clamping as the max value of x can't go passed 5. Which is the maximum of + // the table. + let q = div_p(x[3]) as usize; + utils::sub(x, constants::U64_P_MULTIPLES[q]) +} + #[cfg(kani)] mod proofs { use { @@ -129,19 +144,6 @@ mod proofs { crate::constants::U64_P_MULTIPLES, }; - /// Compute q * P as (4-limb little-endian result, overflow carry). - fn mul_small_by_p(q: u64) -> ([u64; 4], u64) { - let q = q as u128; - let t0 = q * U64_P[0] as u128; - let t1 = q * U64_P[1] as u128 + (t0 >> 64); - let t2 = q * U64_P[2] as u128 + (t1 >> 64); - let t3 = q * U64_P[3] as u128 + (t2 >> 64); - ( - [t0 as u64, t1 as u64, t2 as u64, t3 as u64], - (t3 >> 64) as u64, - ) - } - /// Lexicographic ≤ on little-endian 256-bit integers. fn le256(a: [u64; 4], b: [u64; 4]) -> bool { for i in (0..4).rev() { @@ -152,14 +154,14 @@ mod proofs { true } - /// For every 64-bit x, div_p_32b(x) * P ≤ x * 2^192. - /// TODO: encode tighter bounds + /// TODO: tighter bounds #[kani::proof] fn div_p_32b_underapprox() { let x: u64 = kani::any(); let q = div_p_32b(x); let r = U64_P_MULTIPLES[q as usize][3]; + assert!(x - r >= 0); assert!(le256([0, 0, 0, x - r], U64_2P)); } @@ -170,6 +172,7 @@ mod proofs { let q = div_p_6b(x); let r = U64_P_MULTIPLES[q as usize][3]; + assert!(x - r >= 0); assert!(le256([0, 0, 0, x - r], U64_2P)); } } From cca45ade5f74fc4a34b18d8e35498f9ce201ca0e Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Tue, 10 Mar 2026 16:52:11 +0800 Subject: [PATCH 11/13] subrec: move mulsub --- skyscraper/bn254-multiplier/src/lib.rs | 145 +---------------------- skyscraper/bn254-multiplier/src/utils.rs | 145 ++++++++++++++++++++++- 2 files changed, 145 insertions(+), 145 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs index c720b3eb9..3b4d99bfa 100644 --- a/skyscraper/bn254-multiplier/src/lib.rs +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -13,7 +13,7 @@ pub mod rtz; pub mod constants; pub mod rne; mod scalar; -mod utils; +pub mod utils; #[cfg(not(target_arch = "wasm32"))] // Proptest not supported on WASI mod test_utils; @@ -33,146 +33,3 @@ const fn pow_2(n: u32) -> f64 { let exp = (n as u64 + 1023) << 52; f64::from_bits(exp) } - -/// Precomputed magic constants for fast approximate floor division by the -/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by -/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the -/// declared bit range. -pub struct MulShift { - mul: u64, - shift: u32, -} - -impl MulShift { - /// Computes magic multiplication and shift constants for approximate floor - /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within - /// `max_bit_size` bits. `precision` controls how many of the top bits of - /// `val` are used — higher precision yields a tighter approximation. - pub const fn new(max_bit_size: u32, precision: u32) -> Self { - // Generate magic numbers for division by bn254's prime for a range of top-bit - // widths. - - // Based on Warren's "Hacker's Delight" (integer - // division by constants) to find a magic multiplier for each bit-width. - - // Returns a list of tuples (w, m_bits, sub, shift, m) where: - //- w: number of top bits of the value - //- m_bits: number of bits in the magic multiplier - //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w, - // so we store m - 2^w and compensate at runtime) - //- shift: the number of bits to right-shift the product (called 's' in Warren) - //- m: the magic multiplier - use crate::constants::U64_P; - - let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w)) - let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1 - let mut s = precision; // start at precision; s < precision values are skipped - let m; - loop { - // s = shift exponent - if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) { - m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier - break; - } - s += 1; - assert!(s < 64, "no magic multiplier found"); - } - - MulShift { mul: m, shift: s } - } - - #[inline(always)] - /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient). - /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`]. - pub const fn div_p(&self, val: u64) -> u64 { - // assumes systems can handle multiplication by 64 bits without performance - // penalty. - (val * self.mul) >> self.shift - } -} - -/// Approximate floor division by P using the upper 6 bits of `x`. -/// `x` must be the upper limb of a u256 in 64-bit radix. -/// -/// Returns a value ≤ ⌊x / P⌋. This is the most precise -/// approximation achievable without multiplication on ARM64 and x86. -/// -/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for -/// u256) will to a larger value after subtraction reduction. -/// subtraction reduction output: [0, 1+ε] with ε < 0.3. -#[inline(always)] -pub fn div_p_6b(x: u64) -> u64 { - let upper_bits = x >> (64 - 6); - // const to force compile time evaluation - const MULSHIFT: MulShift = MulShift::new(256, 6); - MULSHIFT.div_p(upper_bits) -} - -/// Approximate floor division by P using the upper 32 bits of `x`. -/// `x` must be the upper limb of a u256 in 64-bit radix. -/// -/// Returns a value ≤ ⌊x / P⌋. This is the most precise -/// approximation achievable with a 32bx32b->64b multiplier. -#[inline(always)] -pub fn div_p_32b(x: u64) -> u64 { - let upper_bits = x >> (64 - 32); - // const to force compile time evaluation - const MULSHIFT: MulShift = MulShift::new(256, 32); - MULSHIFT.div_p(upper_bits) -} - -/// Subtracts an approximate multiple of P from `x` using `div_p` on the high -/// limb. -/// -/// The result is not fully reduced; the output range depends on the precision -/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`]. -#[inline(always)] -pub fn subtraction_reduce u64>(div_p: F, x: [u64; 4]) -> [u64; 4] { - // No clamping as the max value of x can't go passed 5. Which is the maximum of - // the table. - let q = div_p(x[3]) as usize; - utils::sub(x, constants::U64_P_MULTIPLES[q]) -} - -#[cfg(kani)] -mod proofs { - use { - super::{ - constants::{U64_2P, U64_P}, - div_p_32b, div_p_6b, MulShift, - }, - crate::constants::U64_P_MULTIPLES, - }; - - /// Lexicographic ≤ on little-endian 256-bit integers. - fn le256(a: [u64; 4], b: [u64; 4]) -> bool { - for i in (0..4).rev() { - if a[i] != b[i] { - return a[i] < b[i]; - } - } - true - } - - /// TODO: tighter bounds - #[kani::proof] - fn div_p_32b_underapprox() { - let x: u64 = kani::any(); - let q = div_p_32b(x); - - let r = U64_P_MULTIPLES[q as usize][3]; - assert!(x - r >= 0); - assert!(le256([0, 0, 0, x - r], U64_2P)); - } - - #[kani::proof] - // TODO tighter bounds - fn div_p_6b_underapprox() { - let x: u64 = kani::any(); - let q = div_p_6b(x); - - let r = U64_P_MULTIPLES[q as usize][3]; - assert!(x - r >= 0); - assert!(le256([0, 0, 0, x - r], U64_2P)); - } -} diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs index ee3ac57b7..066282dcb 100644 --- a/skyscraper/bn254-multiplier/src/utils.rs +++ b/skyscraper/bn254-multiplier/src/utils.rs @@ -1,4 +1,4 @@ -use crate::constants::U64_2P; +use crate::constants::{self, U64_2P}; /// Macro to extract a subarray from an array. /// @@ -97,3 +97,146 @@ pub const fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64 let c: u128 = widening_mul(a, b) + carry as u128 + add as u128; (c as u64, (c >> 64) as u64) } + +/// Precomputed magic constants for fast approximate floor division by the +/// BN254 prime P, following Warren's "Hacker's Delight" (integer division by +/// constants). Guarantees `div_p(val) ≤ ⌊val / P⌋` for all inputs within the +/// declared bit range. +pub struct MulShift { + mul: u64, + shift: u32, +} + +impl MulShift { + /// Computes magic multiplication and shift constants for approximate floor + /// division by P, such that `div_p(val) ≤ ⌊val / P⌋` for all `val` within + /// `max_bit_size` bits. `precision` controls how many of the top bits of + /// `val` are used — higher precision yields a tighter approximation. + pub const fn new(max_bit_size: u32, precision: u32) -> Self { + // Generate magic numbers for division by bn254's prime for a range of top-bit + // widths. + + // Based on Warren's "Hacker's Delight" (integer + // division by constants) to find a magic multiplier for each bit-width. + + // Returns a list of tuples (w, m_bits, sub, shift, m) where: + //- w: number of top bits of the value + //- m_bits: number of bits in the magic multiplier + //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w, + // so we store m - 2^w and compensate at runtime) + //- shift: the number of bits to right-shift the product (called 's' in Warren) + //- m: the magic multiplier + use crate::constants::U64_P; + + let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w)) + let nc = 2_u64.pow(precision) - 1 - (2_u64.pow(precision) % d); // nc = largest value s.t. nc mod d == d-1 + let mut s = precision; // start at precision; s < precision values are skipped + let m; + loop { + // s = shift exponent + if 2_u64.pow(s) > nc * (d - 1 - (2_u64.pow(s) - 1) % d) { + m = (2_u64.pow(s) + d - 1 - (2_u64.pow(s) - 1) % d) / d; // m = magic multiplier + break; + } + s += 1; + assert!(s < 64, "no magic multiplier found"); + } + + MulShift { mul: m, shift: s } + } + + #[inline(always)] + /// Returns an under-approximation of ⌊val / P⌋ (result ≤ true quotient). + /// `val` must fit within the `max_bit_size` passed to [`MulShift::new`]. + pub const fn div_p(&self, val: u64) -> u64 { + // assumes systems can handle multiplication by 64 bits without performance + // penalty. + (val * self.mul) >> self.shift + } +} + +/// Approximate floor division by P using the upper 6 bits of `x`. +/// `x` must be the upper limb of a u256 in 64-bit radix. +/// +/// Returns a value ≤ ⌊x / P⌋. This is the most precise +/// approximation achievable without multiplication on ARM64 and x86. +/// +/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for +/// u256) will to a larger value after subtraction reduction. +/// subtraction reduction output: [0, 1+ε] with ε < 0.3. +#[inline(always)] +pub fn div_p_6b(x: u64) -> u64 { + let upper_bits = x >> (64 - 6); + // const to force compile time evaluation + const MULSHIFT: MulShift = MulShift::new(256, 6); + MULSHIFT.div_p(upper_bits) +} + +/// Approximate floor division by P using the upper 32 bits of `x`. +/// `x` must be the upper limb of a u256 in 64-bit radix. +/// +/// Returns a value ≤ ⌊x / P⌋. This is the most precise +/// approximation achievable with a 32bx32b->64b multiplier. +#[inline(always)] +pub fn div_p_32b(x: u64) -> u64 { + let upper_bits = x >> (64 - 32); + // const to force compile time evaluation + const MULSHIFT: MulShift = MulShift::new(256, 32); + MULSHIFT.div_p(upper_bits) +} + +/// Subtracts an approximate multiple of P from `x` using `div_p` on the high +/// limb. +/// +/// The result is not fully reduced; the output range depends on the precision +/// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`]. +#[inline(always)] +pub fn subtraction_reduce u64>(div_p: F, x: [u64; 4]) -> [u64; 4] { + // No clamping as the max value of x can't go passed 5. Which is the maximum of + // the table. + let q = div_p(x[3]) as usize; + sub(x, constants::U64_P_MULTIPLES[q]) +} + +#[cfg(kani)] +mod proofs { + use { + super::{ + constants::{U64_2P, U64_P}, + div_p_32b, div_p_6b, MulShift, + }, + crate::constants::U64_P_MULTIPLES, + }; + + /// Lexicographic ≤ on little-endian 256-bit integers. + fn le256(a: [u64; 4], b: [u64; 4]) -> bool { + for i in (0..4).rev() { + if a[i] != b[i] { + return a[i] < b[i]; + } + } + true + } + + /// TODO: tighter bounds + #[kani::proof] + fn div_p_32b_underapprox() { + let x: u64 = kani::any(); + let q = div_p_32b(x); + + let r = U64_P_MULTIPLES[q as usize][3]; + assert!(x - r >= 0); + assert!(le256([0, 0, 0, x - r], U64_2P)); + } + + #[kani::proof] + // TODO tighter bounds + fn div_p_6b_underapprox() { + let x: u64 = kani::any(); + let q = div_p_6b(x); + + let r = U64_P_MULTIPLES[q as usize][3]; + assert!(x - r >= 0); + assert!(le256([0, 0, 0, x - r], U64_2P)); + } +} From f9a4f4d5ab219d25099c0904cfa55f2757d7cc1b Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Tue, 10 Mar 2026 17:13:58 +0800 Subject: [PATCH 12/13] subrec: minor improvements --- skyscraper/bn254-multiplier/src/constants.rs | 2 -- skyscraper/bn254-multiplier/src/utils.rs | 20 +++++--------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs index d515fc8fd..b831593f3 100644 --- a/skyscraper/bn254-multiplier/src/constants.rs +++ b/skyscraper/bn254-multiplier/src/constants.rs @@ -1,5 +1,3 @@ -pub const U64_NP0: u64 = 0xc2e1f593efffffff; - pub const U64_P: [u64; 4] = U64_P_MULTIPLES[1]; pub const U64_2P: [u64; 4] = U64_P_MULTIPLES[2]; diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs index 066282dcb..2c33263e8 100644 --- a/skyscraper/bn254-multiplier/src/utils.rs +++ b/skyscraper/bn254-multiplier/src/utils.rs @@ -119,13 +119,6 @@ impl MulShift { // Based on Warren's "Hacker's Delight" (integer // division by constants) to find a magic multiplier for each bit-width. - // Returns a list of tuples (w, m_bits, sub, shift, m) where: - //- w: number of top bits of the value - //- m_bits: number of bits in the magic multiplier - //- sub: whether the "subtract and shift" variant is used (m exceeded 2^w, - // so we store m - 2^w and compensate at runtime) - //- shift: the number of bits to right-shift the product (called 's' in Warren) - //- m: the magic multiplier use crate::constants::U64_P; let d = (U64_P[3] >> (max_bit_size - 192 - precision)) + 1; // d = divisor = ceil(p / 2^(max_bit_size-w)) @@ -162,7 +155,7 @@ impl MulShift { /// approximation achievable without multiplication on ARM64 and x86. /// /// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for -/// u256) will to a larger value after subtraction reduction. +/// u256) will lead to a larger value after subtraction reduction. /// subtraction reduction output: [0, 1+ε] with ε < 0.3. #[inline(always)] pub fn div_p_6b(x: u64) -> u64 { @@ -192,7 +185,7 @@ pub fn div_p_32b(x: u64) -> u64 { /// of the supplied `div_p` — see [`div_p_6b`] and [`div_p_32b`]. #[inline(always)] pub fn subtraction_reduce u64>(div_p: F, x: [u64; 4]) -> [u64; 4] { - // No clamping as the max value of x can't go passed 5. Which is the maximum of + // No clamping as the max value of x can't go past 5. Which is the maximum of // the table. let q = div_p(x[3]) as usize; sub(x, constants::U64_P_MULTIPLES[q]) @@ -201,10 +194,7 @@ pub fn subtraction_reduce u64>(div_p: F, x: [u64; 4]) -> [u64; 4] #[cfg(kani)] mod proofs { use { - super::{ - constants::{U64_2P, U64_P}, - div_p_32b, div_p_6b, MulShift, - }, + super::{constants::U64_2P, div_p_32b, div_p_6b}, crate::constants::U64_P_MULTIPLES, }; @@ -225,7 +215,7 @@ mod proofs { let q = div_p_32b(x); let r = U64_P_MULTIPLES[q as usize][3]; - assert!(x - r >= 0); + assert!(x >= r); assert!(le256([0, 0, 0, x - r], U64_2P)); } @@ -236,7 +226,7 @@ mod proofs { let q = div_p_6b(x); let r = U64_P_MULTIPLES[q as usize][3]; - assert!(x - r >= 0); + assert!(x >= r); assert!(le256([0, 0, 0, x - r], U64_2P)); } } From 7b08e2c797ac79b3910d9eb8bbef6f6d8436c596 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Mar 2026 18:20:24 +0800 Subject: [PATCH 13/13] fix escaping documentation --- skyscraper/bn254-multiplier/src/utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skyscraper/bn254-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs index 2c33263e8..c8cc68ef1 100644 --- a/skyscraper/bn254-multiplier/src/utils.rs +++ b/skyscraper/bn254-multiplier/src/utils.rs @@ -154,8 +154,8 @@ impl MulShift { /// Returns a value ≤ ⌊x / P⌋. This is the most precise /// approximation achievable without multiplication on ARM64 and x86. /// -/// Tradeoff: due the limited range of this division [0,4] (instead of [0,5] for -/// u256) will lead to a larger value after subtraction reduction. +/// Tradeoff: due the limited range of this division \[0,4\] (instead of \[0,5\] +/// for u256) will lead to a larger value after subtraction reduction. /// subtraction reduction output: [0, 1+ε] with ε < 0.3. #[inline(always)] pub fn div_p_6b(x: u64) -> u64 {