
Commit 83dcd98

FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives
* reorg mul -> limbs_double_width, ConstantineASM -> CttASM
* Implement squaring specialized scalar path (22% faster than mul — see the sketch below the change summary)
* Implement "portable" assembly for squaring
* stash part of the changes
* Reorg montgomery reduction - prepare to introduce Comba optimization
* Implement comba Montgomery reduce (but it's slower!)
* rename t -> a
* 30% performance improvement by avoiding toOpenArray!
* variable renaming
* Fix 32-bit imports
* slightly better assembly for sub2x
* There is an annoying bottleneck
* use out-of-place Fp assembly instead of in-place
* diffAlias is unneeded now
* cosmetic
* speedup fpDbl sub by 20%
* Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster
* Stash ADCX/ADOX squaring
1 parent d12d5fa commit 83dcd98

28 files changed: +1048 −470 lines
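
The headline item in the commit message — the specialized squaring scalar path, 22% faster than a generic multiplication — relies on a classic observation: in a·a every cross product a[i]·a[j] with i ≠ j appears twice, so it can be computed once and doubled, removing roughly half of the partial products. Below is a minimal, self-contained sketch of that idea; it is not Constantine's code, and the 32-bit limb size, type names and three-pass structure are purely illustrative.

```nim
# Minimal sketch of why a dedicated squaring path beats a generic multiplication.
# NOT Constantine's implementation: limb size, names and structure are illustrative.

const N = 12                       # e.g. a 384-bit operand as 12 x 32-bit limbs

type
  Sgl = array[N, uint32]           # single-width operand
  Dbl = array[2 * N, uint32]       # double-width result

func squareSketch(r: var Dbl, a: Sgl) =
  for limb in r.mitems: limb = 0'u32

  # 1. Accumulate each cross product a[i]*a[j], j > i, exactly once.
  for i in 0 ..< N:
    var carry = 0'u64
    for j in i + 1 ..< N:
      let t = uint64(a[i]) * uint64(a[j]) + uint64(r[i + j]) + carry
      r[i + j] = uint32(t and 0xFFFF_FFFF'u64)
      carry = t shr 32
    r[i + N] = uint32(carry and 0xFFFF_FFFF'u64)

  # 2. Double the accumulated cross products: one left shift across all limbs.
  var msb = 0'u32
  for i in 0 ..< 2 * N:
    let nextMsb = r[i] shr 31
    r[i] = (r[i] shl 1) or msb
    msb = nextMsb

  # 3. Add the diagonal terms a[i]^2 (computed once each, never doubled).
  var carry = 0'u64
  for i in 0 ..< N:
    let sq = uint64(a[i]) * uint64(a[i])
    var t = uint64(r[2*i]) + (sq and 0xFFFF_FFFF'u64) + carry
    r[2*i] = uint32(t and 0xFFFF_FFFF'u64)
    t = uint64(r[2*i + 1]) + (sq shr 32) + (t shr 32)
    r[2*i + 1] = uint32(t and 0xFFFF_FFFF'u64)
    carry = t shr 32
```

The dedicated squaring entry point is what the new `square2xBench` added below exercises: `r.square(a)` on a 384-bit input producing a 768-bit result.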

README.md (+1 −1)

@@ -42,7 +42,7 @@ generated incorrect add-with-carry code.
 
 On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
 and also ensure constant-time code.
-This can be deactivated with `"-d:ConstantineASM=false"`:
+This can be deactivated with `"-d:CttASM=false"`:
 - at a significant performance cost with GCC (~50% slower than Clang).
 - at misssed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang).
 - There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly.
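
For readers wondering how a `-d:CttASM=false` switch is consumed on the Nim side: a build define carrying a value is typically read through a `{.booldefine.}` constant. The snippet below is a generic sketch of that pattern, not Constantine's actual wiring; the constant names and branch bodies are illustrative only.

```nim
# Generic sketch of consuming a boolean build define such as -d:CttASM=false.
# Names and branch bodies are illustrative, not Constantine's code.
const CttASM {.booldefine.} = true        # `nim c -d:CttASM=false` overrides this

const UseAsmBackend = CttASM and defined(amd64)

when UseAsmBackend:
  echo "building with the x86-64 inline-assembly kernels"
else:
  echo "building with the portable fallback implementation"
```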

benchmarks/bench_fp_double_width.nim (+7)

@@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
   bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
     r.prod(a, b)
 
+proc square2xBench*(rLen, aLen: static int, iters: int) =
+  var r: BigInt[rLen]
+  let a = rng.random_unsafe(BigInt[aLen])
+  bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
+    r.square(a)
+
 proc reduce2x*(T: typedesc, iters: int) =
   var r: T
   var t: doubleWidth(T)
@@ -189,6 +195,7 @@ proc main() =
   diff2x(Fp[BLS12_381], iters = 10_000_000)
   diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
   mul2xBench(768, 384, 384, iters = 10_000_000)
+  square2xBench(768, 384, iters = 10_000_000)
   reduce2x(Fp[BLS12_381], iters = 10_000_000)
   separator()
 
constantine.nimble (+2 −2)

@@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
   if compiler != "":
     cc = "--cc:" & compiler
   if not useAsm:
-    cc &= " -d:ConstantineASM=false"
+    cc &= " -d:CttASM=false"
   exec "nim c " & cc &
     " -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
     " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
@@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM
     if not(td.useGMP and not requireGMP):
       var flags = ""
       if not testASM:
-        flags &= " -d:ConstantineASM=false"
+        flags &= " -d:CttASM=false"
       if test32bit:
         flags &= " -d:Constantine32"
       if td.path in useDebug:

constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim (+39 −30)

@@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
 # Substraction
 # ------------------------------------------------------------
 
-macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
+macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
   ## Generate an optimized out-of-place double-width substraction kernel
 
   result = newStmtList()
 
   var ctx = init(Assembler_x86, BaseType)
   let
-    N2 = N div 2
+    H = N div 2
 
-    arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
-    # We reuse the reg used for B for overflow detection
-    arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
-    # We could force M as immediate by specializing per moduli
-    arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
+    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
+    # We reuse the reg used for b for overflow detection
+    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    # We could force m as immediate by specializing per moduli
+    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
     # If N is too big, we need to spill registers. TODO.
-    arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
-    arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
+    u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
+    v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
+
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
+  result.add quote do:
+    var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
+    staticFor i, 0, `H`:
+      `usym`[i] = `A`[i]
+    staticFor i, `H`, `N`:
+      `vsym`[i-`H`] = `A`[i]
 
   # Substraction
-  for i in 0 ..< N:
-    ctx.mov arrT[i mod N2], arrA[i]
+  # u = a[0..<H] - b[0..<H], v = a[H..<N]
+  for i in 0 ..< H:
     if i == 0:
-      ctx.sub arrT[0], arrB[0]
+      ctx.sub u[0], b[0]
     else:
-      ctx.sbb arrT[i mod N2], arrB[i]
-    ctx.mov arrA[i], arrT[i mod N2]
-    # Interleaved copy the modulus to hide SBB latencies
-    if i < N2:
-      ctx.mov arrTadd[i], arrM[i]
+      ctx.sbb u[i], b[i]
+
+  # Everything should be hot in cache now so movs are cheaper
+  # we can try using 2 per SBB
+  # v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
+  for i in H ..< N:
+    ctx.mov r[i-H], u[i-H]
+    ctx.sbb v[i-H], b[i]
+    ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
 
   # Mask: underflowed contains 0xFFFF or 0x0000
-  let underflowed = arrB.reuseRegister()
+  let underflowed = b.reuseRegister()
   ctx.sbb underflowed, underflowed
 
   # Now mask the adder, with 0 or the modulus limbs
-  for i in 0 ..< N2:
-    ctx.`and` arrTadd[i], underflowed
+  for i in 0 ..< H:
+    ctx.`and` u[i], underflowed
 
   # Add the masked modulus
-  for i in 0 ..< N2:
+  for i in 0 ..< H:
     if i == 0:
-      ctx.add arrT[0], arrTadd[0]
+      ctx.add u[0], v[0]
     else:
-      ctx.adc arrT[i], arrTadd[i]
-    ctx.mov arrA[i+N2], arrT[i]
+      ctx.adc u[i], v[i]
+    ctx.mov r[i+H], u[i]
 
-  let t = arrT.nimSymbol
-  let tadd = arrTadd.nimSymbol
-  result.add quote do:
-    var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
   result.add ctx.generate
 
-func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
+func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
   ## Constant-time double-width substraction
-  sub2x_gen(a, b, M)
+  sub2x_gen(r, a, b, M)
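
Stepping back from the register allocation, the kernel above computes an out-of-place double-width subtraction r = a − b over 2·H limbs, and repairs an underflow by adding the modulus M aligned to the upper half of the limbs, through a masked add rather than a data-dependent branch. A portable sketch of that logic follows; the word size, type names and readable branching are illustrative only, not the constant-time assembly generated above.

```nim
# Portable sketch of the out-of-place double-width subtraction kernel above.
# Types, sizes and the branching borrow handling are illustrative only; the
# generated assembly is branchless (SUB/SBB chain plus a masked ADD/ADC chain).

const
  H = 6          # modulus size in limbs (e.g. 384 bits with 64-bit words)
  N = 2 * H      # double-width operand size

type
  Word      = uint64
  DblLimbs  = array[N, Word]   # double-width value
  HalfLimbs = array[H, Word]   # modulus

func sub2xSketch(r: var DblLimbs, a, b: DblLimbs, M: HalfLimbs) =
  # 1. Full 2*H-limb subtraction r = a - b, tracking the borrow out of the top limb.
  var borrow = 0'u64
  for i in 0 ..< N:
    let d = a[i] - b[i] - borrow               # unsigned wrap-around is intended
    borrow = if a[i] < b[i] or (a[i] == b[i] and borrow == 1): 1'u64 else: 0'u64
    r[i] = d

  # 2. Mask is all-ones on underflow, all-zeroes otherwise.
  let mask = 0'u64 - borrow

  # 3. Conditionally add the modulus aligned to the upper half of the limbs,
  #    i.e. add M * 2^(64*H), which is where the kernel adds its masked limbs.
  var carry = 0'u64
  for i in 0 ..< H:
    let m  = M[i] and mask
    let s1 = r[H + i] + m
    let c1 = if s1 < m: 1'u64 else: 0'u64      # carry out of first addition
    let s2 = s1 + carry
    let c2 = if s2 < carry: 1'u64 else: 0'u64  # carry out of second addition
    r[H + i] = s2
    carry = c1 + c2                            # at most one of c1, c2 is set
```

The `mask = 0 - borrow` step corresponds to the kernel's `sbb underflowed, underflowed`, which the comment in the diff describes as producing either all-ones or all-zeroes.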
