
Commit 83dcd98

FpDbl revisited (#144) - 7% perf improvement everywhere, up to 30% in double-width primitives
* reorg mul -> limbs_double_width, ConstantineASM -> CttASM
* Implement squaring specialized scalar path (22% faster than mul — see the sketch below the change summary)
* Implement "portable" assembly for squaring
* stash part of the changes
* Reorg montgomery reduction - prepare to introduce Comba optimization
* Implement comba Montgomery reduce (but it's slower!)
* rename t -> a
* 30% performance improvement by avoiding toOpenArray!
* variable renaming
* Fix 32-bit imports
* slightly better assembly for sub2x
* There is an annoying bottleneck
* use out-of-place Fp assembly instead of in-place
* diffAlias is unneeded now
* cosmetic
* speedup fpDbl sub by 20%
* Fix Fp2 -> Fp6 -> Fp12 towering. It seems 5% faster
* Stash ADCX/ADOX squaring
1 parent d12d5fa commit 83dcd98

28 files changed: +1048 −470 lines
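
The headline item in the commit message — the specialized squaring scalar path, 22% faster than a generic multiplication — relies on a classic observation: in a·a every cross product a[i]·a[j] with i ≠ j appears twice, so it can be computed once and doubled, removing roughly half of the partial products. Below is a minimal, self-contained sketch of that idea; it is not Constantine's code, and the 32-bit limb size, type names and three-pass structure are purely illustrative.

```nim
# Minimal sketch of why a dedicated squaring path beats a generic multiplication.
# NOT Constantine's implementation: limb size, names and structure are illustrative.

const N = 12                       # e.g. a 384-bit operand as 12 x 32-bit limbs

type
  Sgl = array[N, uint32]           # single-width operand
  Dbl = array[2 * N, uint32]       # double-width result

func squareSketch(r: var Dbl, a: Sgl) =
  for limb in r.mitems: limb = 0'u32

  # 1. Accumulate each cross product a[i]*a[j], j > i, exactly once.
  for i in 0 ..< N:
    var carry = 0'u64
    for j in i + 1 ..< N:
      let t = uint64(a[i]) * uint64(a[j]) + uint64(r[i + j]) + carry
      r[i + j] = uint32(t and 0xFFFF_FFFF'u64)
      carry = t shr 32
    r[i + N] = uint32(carry and 0xFFFF_FFFF'u64)

  # 2. Double the accumulated cross products: one left shift across all limbs.
  var msb = 0'u32
  for i in 0 ..< 2 * N:
    let nextMsb = r[i] shr 31
    r[i] = (r[i] shl 1) or msb
    msb = nextMsb

  # 3. Add the diagonal terms a[i]^2 (computed once each, never doubled).
  var carry = 0'u64
  for i in 0 ..< N:
    let sq = uint64(a[i]) * uint64(a[i])
    var t = uint64(r[2*i]) + (sq and 0xFFFF_FFFF'u64) + carry
    r[2*i] = uint32(t and 0xFFFF_FFFF'u64)
    t = uint64(r[2*i + 1]) + (sq shr 32) + (t shr 32)
    r[2*i + 1] = uint32(t and 0xFFFF_FFFF'u64)
    carry = t shr 32
```

The dedicated squaring entry point is what the new `square2xBench` added below exercises: `r.square(a)` on a 384-bit input producing a 768-bit result.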

README.md (+1 −1)

@@ -42,7 +42,7 @@ generated incorrect add-with-carry code.
 
 On x86-64, inline assembly is used to workaround compilers having issues optimizing large integer arithmetic,
 and also ensure constant-time code.
-This can be deactivated with `"-d:ConstantineASM=false"`:
+This can be deactivated with `"-d:CttASM=false"`:
 - at a significant performance cost with GCC (~50% slower than Clang).
 - at misssed opportunity on recent CPUs that support MULX/ADCX/ADOX instructions (~60% faster than Clang).
 - There is a 2.4x perf ratio between using plain GCC vs GCC with inline assembly.
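
For readers wondering how a `-d:CttASM=false` switch is consumed on the Nim side: a build define carrying a value is typically read through a `{.booldefine.}` constant. The snippet below is a generic sketch of that pattern, not Constantine's actual wiring; the constant names and branch bodies are illustrative only.

```nim
# Generic sketch of consuming a boolean build define such as -d:CttASM=false.
# Names and branch bodies are illustrative, not Constantine's code.
const CttASM {.booldefine.} = true        # `nim c -d:CttASM=false` overrides this

const UseAsmBackend = CttASM and defined(amd64)

when UseAsmBackend:
  echo "building with the x86-64 inline-assembly kernels"
else:
  echo "building with the portable fallback implementation"
```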

benchmarks/bench_fp_double_width.nim (+7)

@@ -172,6 +172,12 @@ proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
   bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
     r.prod(a, b)
 
+proc square2xBench*(rLen, aLen: static int, iters: int) =
+  var r: BigInt[rLen]
+  let a = rng.random_unsafe(BigInt[aLen])
+  bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
+    r.square(a)
+
 proc reduce2x*(T: typedesc, iters: int) =
   var r: T
   var t: doubleWidth(T)
@@ -189,6 +195,7 @@ proc main() =
   diff2x(Fp[BLS12_381], iters = 10_000_000)
   diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
   mul2xBench(768, 384, 384, iters = 10_000_000)
+  square2xBench(768, 384, iters = 10_000_000)
   reduce2x(Fp[BLS12_381], iters = 10_000_000)
   separator()
 
constantine.nimble (+2 −2)

@@ -232,7 +232,7 @@ proc buildBench(benchName: string, compiler = "", useAsm = true, run = false) =
   if compiler != "":
     cc = "--cc:" & compiler
   if not useAsm:
-    cc &= " -d:ConstantineASM=false"
+    cc &= " -d:CttASM=false"
   exec "nim c " & cc &
     " -d:danger --verbosity:0 -o:build/bench/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
     " --nimcache:nimcache/" & benchName & "_" & compiler & "_" & (if useAsm: "useASM" else: "noASM") &
@@ -246,7 +246,7 @@ proc runTests(requireGMP: bool, dumpCmdFile = false, test32bit = false, testASM
     if not(td.useGMP and not requireGMP):
       var flags = ""
       if not testASM:
-        flags &= " -d:ConstantineASM=false"
+        flags &= " -d:CttASM=false"
       if test32bit:
         flags &= " -d:Constantine32"
       if td.path in useDebug:

constantine/arithmetic/assembly/limbs_asm_modular_dbl_width_x86.nim (+39 −30)

@@ -27,58 +27,67 @@ static: doAssert UseASM_X86_64
 # Substraction
 # ------------------------------------------------------------
 
-macro sub2x_gen[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]): untyped =
+macro sub2x_gen[N: static int](R: var Limbs[N], A, B: Limbs[N], m: Limbs[N div 2]): untyped =
   ## Generate an optimized out-of-place double-width substraction kernel
 
   result = newStmtList()
 
   var ctx = init(Assembler_x86, BaseType)
   let
-    N2 = N div 2
+    H = N div 2
 
-    arrA = init(OperandArray, nimSymbol = a, N, PointerInReg, InputOutput)
-    # We reuse the reg used for B for overflow detection
-    arrB = init(OperandArray, nimSymbol = b, N, PointerInReg, InputOutput)
-    # We could force M as immediate by specializing per moduli
-    arrM = init(OperandArray, nimSymbol = M, N, PointerInReg, Input)
+    r = init(OperandArray, nimSymbol = R, N, PointerInReg, InputOutput)
+    # We reuse the reg used for b for overflow detection
+    b = init(OperandArray, nimSymbol = B, N, PointerInReg, InputOutput)
+    # We could force m as immediate by specializing per moduli
+    M = init(OperandArray, nimSymbol = m, N, PointerInReg, Input)
     # If N is too big, we need to spill registers. TODO.
-    arrT = init(OperandArray, nimSymbol = ident"t", N2, ElemsInReg, Output_EarlyClobber)
-    arrTadd = init(OperandArray, nimSymbol = ident"tadd", N2, ElemsInReg, Output_EarlyClobber)
+    u = init(OperandArray, nimSymbol = ident"U", H, ElemsInReg, InputOutput)
+    v = init(OperandArray, nimSymbol = ident"V", H, ElemsInReg, InputOutput)
+
+  let usym = u.nimSymbol
+  let vsym = v.nimSymbol
+  result.add quote do:
+    var `usym`{.noinit.}, `vsym` {.noInit.}: typeof(`A`)
+    staticFor i, 0, `H`:
+      `usym`[i] = `A`[i]
+    staticFor i, `H`, `N`:
+      `vsym`[i-`H`] = `A`[i]
 
   # Substraction
-  for i in 0 ..< N:
-    ctx.mov arrT[i mod N2], arrA[i]
+  # u = a[0..<H] - b[0..<H], v = a[H..<N]
+  for i in 0 ..< H:
     if i == 0:
-      ctx.sub arrT[0], arrB[0]
+      ctx.sub u[0], b[0]
     else:
-      ctx.sbb arrT[i mod N2], arrB[i]
-    ctx.mov arrA[i], arrT[i mod N2]
-    # Interleaved copy the modulus to hide SBB latencies
-    if i < N2:
-      ctx.mov arrTadd[i], arrM[i]
+      ctx.sbb u[i], b[i]
+
+  # Everything should be hot in cache now so movs are cheaper
+  # we can try using 2 per SBB
+  # v = a[H..<N] - b[H..<N], a[0..<H] = u, u = M
+  for i in H ..< N:
+    ctx.mov r[i-H], u[i-H]
+    ctx.sbb v[i-H], b[i]
+    ctx.mov u[i-H], M[i-H] # TODO, bottleneck 17% perf: prefetch or inline modulus?
 
   # Mask: underflowed contains 0xFFFF or 0x0000
-  let underflowed = arrB.reuseRegister()
+  let underflowed = b.reuseRegister()
   ctx.sbb underflowed, underflowed
 
   # Now mask the adder, with 0 or the modulus limbs
-  for i in 0 ..< N2:
-    ctx.`and` arrTadd[i], underflowed
+  for i in 0 ..< H:
+    ctx.`and` u[i], underflowed
 
   # Add the masked modulus
-  for i in 0 ..< N2:
+  for i in 0 ..< H:
     if i == 0:
-      ctx.add arrT[0], arrTadd[0]
+      ctx.add u[0], v[0]
     else:
-      ctx.adc arrT[i], arrTadd[i]
-    ctx.mov arrA[i+N2], arrT[i]
+      ctx.adc u[i], v[i]
+    ctx.mov r[i+H], u[i]
 
-  let t = arrT.nimSymbol
-  let tadd = arrTadd.nimSymbol
-  result.add quote do:
-    var `t`{.noinit.}, `tadd` {.noInit.}: typeof(`a`)
   result.add ctx.generate
 
-func sub2x_asm*[N: static int](a: var Limbs[N], b: Limbs[N], M: Limbs[N div 2]) =
+func sub2x_asm*[N: static int](r: var Limbs[N], a, b: Limbs[N], M: Limbs[N div 2]) =
   ## Constant-time double-width substraction
-  sub2x_gen(a, b, M)
+  sub2x_gen(r, a, b, M)
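
Stepping back from the register allocation, the kernel above computes an out-of-place double-width subtraction r = a − b over 2·H limbs, and repairs an underflow by adding the modulus M aligned to the upper half of the limbs, through a masked add rather than a data-dependent branch. A portable sketch of that logic follows; the word size, type names and readable branching are illustrative only, not the constant-time assembly generated above.

```nim
# Portable sketch of the out-of-place double-width subtraction kernel above.
# Types, sizes and the branching borrow handling are illustrative only; the
# generated assembly is branchless (SUB/SBB chain plus a masked ADD/ADC chain).

const
  H = 6          # modulus size in limbs (e.g. 384 bits with 64-bit words)
  N = 2 * H      # double-width operand size

type
  Word      = uint64
  DblLimbs  = array[N, Word]   # double-width value
  HalfLimbs = array[H, Word]   # modulus

func sub2xSketch(r: var DblLimbs, a, b: DblLimbs, M: HalfLimbs) =
  # 1. Full 2*H-limb subtraction r = a - b, tracking the borrow out of the top limb.
  var borrow = 0'u64
  for i in 0 ..< N:
    let d = a[i] - b[i] - borrow               # unsigned wrap-around is intended
    borrow = if a[i] < b[i] or (a[i] == b[i] and borrow == 1): 1'u64 else: 0'u64
    r[i] = d

  # 2. Mask is all-ones on underflow, all-zeroes otherwise.
  let mask = 0'u64 - borrow

  # 3. Conditionally add the modulus aligned to the upper half of the limbs,
  #    i.e. add M * 2^(64*H), which is where the kernel adds its masked limbs.
  var carry = 0'u64
  for i in 0 ..< H:
    let m  = M[i] and mask
    let s1 = r[H + i] + m
    let c1 = if s1 < m: 1'u64 else: 0'u64      # carry out of first addition
    let s2 = s1 + carry
    let c2 = if s2 < carry: 1'u64 else: 0'u64  # carry out of second addition
    r[H + i] = s2
    carry = c1 + c2                            # at most one of c1, c2 is set
```

The `mask = 0 - borrow` step corresponds to the kernel's `sbb underflowed, underflowed`, which the comment in the diff describes as producing either all-ones or all-zeroes.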
