@@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
 # Subtraction
 # ------------------------------------------------------------
 
-macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
+macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
   ## Generate an optimized out-of-place double-width subtraction kernel
 
   result = newStmtList()
 
   var ctx = init(Assembler_x86, BaseType)
   let
-    N2 = N div 2
+    H = N div 2
 
-    arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
-    # We reuse the reg used for B for overflow detection
-    arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
-    # We could force M as immediate by specializing per moduli
-    arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
+    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
+    # We reuse the reg used for b for overflow detection
+    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    # We could force m as immediate by specializing per moduli
+    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
     # If N is too big, we need to spill registers. TODO.
-    arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
-    arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
+    u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
+    v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
+
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
+  result.add quote do:
+    var `usym`{.noInit.}, `vsym`{.noInit.}: typeof(`A`)
+    staticFor i, 0, `H`:
+      `usym`[i] = `A`[i]
+    staticFor i, `H`, `N`:
+      `vsym`[i-`H`] = `A`[i]
 
   # Subtraction
-  for i in 0 ..< N:
-    ctx.mov arrT[i mod N2], arrA[i]
+  # u = a[0..<H] - b[0..<H], v = a[H..<N]
+  for i in 0 ..< H:
     if i == 0:
-      ctx.sub arrT[0], arrB[0]
+      ctx.sub u[0], b[0]
     else:
-      ctx.sbb arrT[i mod N2], arrB[i]
-    ctx.mov arrA[i], arrT[i mod N2]
-    # Interleaved copy the modulus to hide SBB latencies
-    if i < N2:
-      ctx.mov arrTadd[i], arrM[i]
+      ctx.sbb u[i], b[i]
+
+  # Everything should be hot in cache now so movs are cheaper,
+  # we can try using 2 per SBB
+  # v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
+  for i in H ..< N:
+    ctx.mov r[i-H], u[i-H]
+    ctx.sbb v[i-H], b[i]
+    ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
 
   # Mask: underflowed contains 0xFFFF or 0x0000
-  let underflowed = arrB.reuseRegister()
+  let underflowed = b.reuseRegister()
   ctx.sbb underflowed, underflowed
 
   # Now mask the adder, with 0 or the modulus limbs
-  for i in 0 ..< N2:
-    ctx.`and` arrTadd[i], underflowed
+  for i in 0 ..< H:
+    ctx.`and` u[i], underflowed
 
   # Add the masked modulus
-  for i in 0 ..< N2:
+  for i in 0 ..< H:
     if i == 0:
-      ctx.add arrT[0], arrTadd[0]
+      ctx.add u[0], v[0]
     else:
-      ctx.adc arrT[i], arrTadd[i]
-    ctx.mov arrA[i+N2], arrT[i]
+      ctx.adc u[i], v[i]
+    ctx.mov r[i+H], u[i]
 
-  let t = arrT.nimSymbol
-  let tadd = arrTadd.nimSymbol
-  result.add quote do:
-    var `t`{.noInit.}, `tadd`{.noInit.}: typeof(`a`)
   result.add ctx.generate
 
-func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
+func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
   ## Constant-time double-width subtraction
-  sub2x_gen(a, b, M)
+  sub2x_gen(r, a, b, M)
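
For readers following the generated code: the kernel subtracts across all N = 2*H limbs with a SUB/SBB chain, converts the final borrow into an all-ones or all-zero mask (sbb underflowed, underflowed), and adds the masked modulus back into the upper H limbs only. The sketch below is a plain-Nim illustration of that flow, not part of the commit: it uses bare uint64 arrays instead of Constantine's Limbs/SecretWord types, and the name sub2xReference is hypothetical.

# Reference sketch only (hypothetical name, plain uint64 limbs, no assembly).
func sub2xReference[N: static int](
       r: var array[N, uint64],
       a, b: array[N, uint64],
       M: array[N div 2, uint64]) =
  ## r = a - b over N = 2*H limbs; if the subtraction borrows out of the
  ## top limb, add M back into the upper H limbs (i.e. add M shifted left
  ## by H limbs), mirroring the SUB/SBB + masked ADD/ADC chains above.
  const H = N div 2
  var borrow = 0'u64

  # Full-width subtraction with borrow propagation (the SUB then SBB chain)
  for i in 0 ..< N:
    let d = a[i] - b[i] - borrow   # wraps modulo 2^64, like the hardware
    borrow = if b[i] > a[i] or (b[i] == a[i] and borrow == 1): 1'u64 else: 0'u64
    r[i] = d

  # Branchless mask, the role played by `sbb underflowed, underflowed` above:
  # all ones if the whole subtraction underflowed, all zeros otherwise
  let mask = 0'u64 - borrow

  # Add the masked modulus to the upper half (the ADD then ADC chain)
  var carry = 0'u64
  for i in 0 ..< H:
    let m = M[i] and mask
    let s = r[i+H] + m + carry
    carry = if s < r[i+H] or (carry == 1 and s == r[i+H]): 1'u64 else: 0'u64
    r[i+H] = s

Because the correction goes through the mask rather than a branch, no control flow depends on whether the subtraction underflowed, which is what the constant-time claim in sub2x_asm's doc comment relies on.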