Skip to content

Commit 56a7c5b

Browse files
committed
math/big: add partial arm64 assembly support
benchmark old ns/op new ns/op delta BenchmarkAddVV_1 18.7 14.8 -20.86% BenchmarkAddVV_2 21.8 16.6 -23.85% BenchmarkAddVV_3 26.1 17.1 -34.48% BenchmarkAddVV_4 30.4 21.9 -27.96% BenchmarkAddVV_5 35.5 19.8 -44.23% BenchmarkAddVV_1e1 63.0 28.3 -55.08% BenchmarkAddVV_1e2 593 178 -69.98% BenchmarkAddVV_1e3 5691 1490 -73.82% BenchmarkAddVV_1e4 56868 20761 -63.49% BenchmarkAddVV_1e5 569062 207679 -63.51% BenchmarkAddVW_1 15.8 12.6 -20.25% BenchmarkAddVW_2 17.8 13.1 -26.40% BenchmarkAddVW_3 21.2 13.9 -34.43% BenchmarkAddVW_4 23.6 14.7 -37.71% BenchmarkAddVW_5 26.0 15.8 -39.23% BenchmarkAddVW_1e1 41.3 21.6 -47.70% BenchmarkAddVW_1e2 383 145 -62.14% BenchmarkAddVW_1e3 3703 1264 -65.87% BenchmarkAddVW_1e4 36920 14359 -61.11% BenchmarkAddVW_1e5 370345 143046 -61.37% BenchmarkAddMulVVW_1 33.2 32.5 -2.11% BenchmarkAddMulVVW_2 58.0 57.2 -1.38% BenchmarkAddMulVVW_3 95.2 93.9 -1.37% BenchmarkAddMulVVW_4 108 106 -1.85% BenchmarkAddMulVVW_5 159 156 -1.89% BenchmarkAddMulVVW_1e1 344 340 -1.16% BenchmarkAddMulVVW_1e2 3644 3624 -0.55% BenchmarkAddMulVVW_1e3 37344 37208 -0.36% BenchmarkAddMulVVW_1e4 373295 372170 -0.30% BenchmarkAddMulVVW_1e5 3438116 3425606 -0.36% BenchmarkBitLen0 7.21 4.32 -40.08% BenchmarkBitLen1 6.49 4.32 -33.44% BenchmarkBitLen2 7.23 4.32 -40.25% BenchmarkBitLen3 6.49 4.32 -33.44% BenchmarkBitLen4 7.22 4.32 -40.17% BenchmarkBitLen5 6.52 4.33 -33.59% BenchmarkBitLen8 7.22 4.32 -40.17% BenchmarkBitLen9 6.49 4.32 -33.44% BenchmarkBitLen16 8.66 4.32 -50.12% BenchmarkBitLen17 7.95 4.32 -45.66% BenchmarkBitLen31 8.69 4.32 -50.29% BenchmarkGCD10x10 5021 5033 +0.24% BenchmarkGCD10x100 5571 5572 +0.02% BenchmarkGCD10x1000 6707 6729 +0.33% BenchmarkGCD10x10000 13526 13419 -0.79% BenchmarkGCD10x100000 85668 83242 -2.83% BenchmarkGCD100x100 24196 23936 -1.07% BenchmarkGCD100x1000 28802 27309 -5.18% BenchmarkGCD100x10000 64111 51704 -19.35% BenchmarkGCD100x100000 385840 274385 -28.89% BenchmarkGCD1000x1000 262892 236269 -10.13% BenchmarkGCD1000x10000 371393 277883 -25.18% BenchmarkGCD1000x100000 1311795 589055 -55.10% BenchmarkGCD10000x10000 9596740 6123930 -36.19% BenchmarkGCD10000x100000 16404000 7269610 -55.68% BenchmarkGCD100000x100000 776660000 419270000 -46.02% BenchmarkHilbert 13478980 13402270 -0.57% BenchmarkBinomial 9802 9440 -3.69% BenchmarkBitset 142 142 +0.00% BenchmarkBitsetNeg 328 279 -14.94% BenchmarkBitsetOrig 853 861 +0.94% BenchmarkBitsetNegOrig 1489 1444 -3.02% BenchmarkMul 420949000 410481000 -2.49% BenchmarkExp3Power0x10 1148 1229 +7.06% BenchmarkExp3Power0x40 1322 1376 +4.08% BenchmarkExp3Power0x100 2437 2486 +2.01% BenchmarkExp3Power0x400 9456 9346 -1.16% BenchmarkExp3Power0x1000 113623 108701 -4.33% BenchmarkExp3Power0x4000 1134933 1101481 -2.95% BenchmarkExp3Power0x10000 10773570 10396160 -3.50% BenchmarkExp3Power0x40000 101362100 97788300 -3.53% BenchmarkExp3Power0x100000 921114000 885249000 -3.89% BenchmarkExp3Power0x400000 8323094000 7969020000 -4.25% BenchmarkFibo 322021600 92554450 -71.26% BenchmarkScanPi 1264583 321065 -74.61% BenchmarkStringPiParallel 1644661 554216 -66.30% BenchmarkScan10Base2 1111 1080 -2.79% BenchmarkScan100Base2 6645 6345 -4.51% BenchmarkScan1000Base2 84084 62405 -25.78% BenchmarkScan10000Base2 3105998 932551 -69.98% BenchmarkScan100000Base2 257234800 40113333 -84.41% BenchmarkScan10Base8 571 573 +0.35% BenchmarkScan100Base8 2810 2543 -9.50% BenchmarkScan1000Base8 47383 25834 -45.48% BenchmarkScan10000Base8 2739518 567203 -79.30% BenchmarkScan100000Base8 253952400 36495680 -85.63% BenchmarkScan10Base10 553 556 +0.54% BenchmarkScan100Base10 2640 2385 -9.66% BenchmarkScan1000Base10 50865 24049 -52.72% BenchmarkScan10000Base10 3279916 549313 -83.25% BenchmarkScan100000Base10 309121000 36213140 -88.29% BenchmarkScan10Base16 478 483 +1.05% BenchmarkScan100Base16 2353 2144 -8.88% BenchmarkScan1000Base16 48091 24246 -49.58% BenchmarkScan10000Base16 2858886 586475 -79.49% BenchmarkScan100000Base16 266320000 38190500 -85.66% BenchmarkString10Base2 736 730 -0.82% BenchmarkString100Base2 2695 2707 +0.45% BenchmarkString1000Base2 20549 20388 -0.78% BenchmarkString10000Base2 212638 210782 -0.87% BenchmarkString100000Base2 1944963 1938033 -0.36% BenchmarkString10Base8 524 517 -1.34% BenchmarkString100Base8 1326 1320 -0.45% BenchmarkString1000Base8 8213 8249 +0.44% BenchmarkString10000Base8 72204 72092 -0.16% BenchmarkString100000Base8 769068 765993 -0.40% BenchmarkString10Base10 1018 982 -3.54% BenchmarkString100Base10 3485 3206 -8.01% BenchmarkString1000Base10 37102 18935 -48.97% BenchmarkString10000Base10 188633 88637 -53.01% BenchmarkString100000Base10 124490300 19700940 -84.17% BenchmarkString10Base16 509 502 -1.38% BenchmarkString100Base16 1084 1098 +1.29% BenchmarkString1000Base16 5641 5650 +0.16% BenchmarkString10000Base16 46900 46745 -0.33% BenchmarkString100000Base16 508957 505840 -0.61% BenchmarkLeafSize0 8934320 8149465 -8.78% BenchmarkLeafSize1 237666 118381 -50.19% BenchmarkLeafSize2 237807 117854 -50.44% BenchmarkLeafSize3 1688640 353494 -79.07% BenchmarkLeafSize4 235676 116196 -50.70% BenchmarkLeafSize5 2121896 430325 -79.72% BenchmarkLeafSize6 1682306 351775 -79.09% BenchmarkLeafSize7 1051847 251436 -76.10% BenchmarkLeafSize8 232697 115674 -50.29% BenchmarkLeafSize9 2403616 488443 -79.68% BenchmarkLeafSize10 2120975 429545 -79.75% BenchmarkLeafSize11 2023789 426525 -78.92% BenchmarkLeafSize12 1684830 351985 -79.11% BenchmarkLeafSize13 1465529 337906 -76.94% BenchmarkLeafSize14 1050498 253872 -75.83% BenchmarkLeafSize15 683228 197384 -71.11% BenchmarkLeafSize16 232496 116026 -50.10% BenchmarkLeafSize32 245841 126671 -48.47% BenchmarkLeafSize64 301728 190285 -36.93% Change-Id: I63e63297896d96b89c9a275b893c2b405a7e105d Reviewed-on: https://go-review.googlesource.com/9260 Reviewed-by: David Crawshaw <[email protected]>
1 parent 1f65c9c commit 56a7c5b

File tree

1 file changed

+139
-8
lines changed

1 file changed

+139
-8
lines changed

src/math/big/arith_arm64.s

+139-8
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,169 @@
99
// This file provides fast assembly versions for the elementary
1010
// arithmetic operations on vectors implemented in arith.go.
1111

12+
// TODO: Consider re-implementing using Advanced SIMD
13+
// once the assembler supports those instructions.
14+
15+
// func mulWW(x, y Word) (z1, z0 Word)
1216
TEXT ·mulWW(SB),NOSPLIT,$0
13-
B ·mulWW_g(SB)
17+
MOVD x+0(FP), R0
18+
MOVD y+8(FP), R1
19+
MUL R0, R1, R2
20+
UMULH R0, R1, R3
21+
MOVD R3, z1+16(FP)
22+
MOVD R2, z0+24(FP)
23+
RET
24+
1425

26+
// func divWW(x1, x0, y Word) (q, r Word)
1527
TEXT ·divWW(SB),NOSPLIT,$0
16-
B ·divWW_g(SB)
28+
B ·divWW_g(SB) // ARM64 has no multiword division
29+
1730

31+
// func addVV(z, x, y []Word) (c Word)
1832
TEXT ·addVV(SB),NOSPLIT,$0
19-
B ·addVV_g(SB)
33+
MOVD z+0(FP), R3
34+
MOVD z_len+8(FP), R0
35+
MOVD x+24(FP), R1
36+
MOVD y+48(FP), R2
37+
ADDS $0, R0 // clear carry flag
38+
loop:
39+
CBZ R0, done // careful not to touch the carry flag
40+
MOVD.P 8(R1), R4
41+
MOVD.P 8(R2), R5
42+
ADCS R4, R5
43+
MOVD.P R5, 8(R3)
44+
SUB $1, R0
45+
B loop
46+
done:
47+
CSET HS, R0 // extract carry flag
48+
MOVD R0, c+72(FP)
49+
RET
2050

51+
52+
// func subVV(z, x, y []Word) (c Word)
2153
TEXT ·subVV(SB),NOSPLIT,$0
22-
B ·subVV_g(SB)
54+
MOVD z+0(FP), R3
55+
MOVD z_len+8(FP), R0
56+
MOVD x+24(FP), R1
57+
MOVD y+48(FP), R2
58+
CMP R0, R0 // set carry flag
59+
loop:
60+
CBZ R0, done // careful not to touch the carry flag
61+
MOVD.P 8(R1), R4
62+
MOVD.P 8(R2), R5
63+
SBCS R5, R4
64+
MOVD.P R4, 8(R3)
65+
SUB $1, R0
66+
B loop
67+
done:
68+
CSET LO, R0 // extract carry flag
69+
MOVD R0, c+72(FP)
70+
RET
71+
2372

73+
// func addVW(z, x []Word, y Word) (c Word)
2474
TEXT ·addVW(SB),NOSPLIT,$0
25-
B ·addVW_g(SB)
75+
MOVD z+0(FP), R3
76+
MOVD z_len+8(FP), R0
77+
MOVD x+24(FP), R1
78+
MOVD y+48(FP), R2
79+
CBZ R0, return_y
80+
MOVD.P 8(R1), R4
81+
ADDS R2, R4
82+
MOVD.P R4, 8(R3)
83+
SUB $1, R0
84+
loop:
85+
CBZ R0, done // careful not to touch the carry flag
86+
MOVD.P 8(R1), R4
87+
ADCS $0, R4
88+
MOVD.P R4, 8(R3)
89+
SUB $1, R0
90+
B loop
91+
done:
92+
CSET HS, R0 // extract carry flag
93+
MOVD R0, c+56(FP)
94+
RET
95+
return_y: // z is empty; copy y to c
96+
MOVD R2, c+56(FP)
97+
RET
98+
2699

100+
// func subVW(z, x []Word, y Word) (c Word)
27101
TEXT ·subVW(SB),NOSPLIT,$0
28-
B ·subVW_g(SB)
102+
MOVD z+0(FP), R3
103+
MOVD z_len+8(FP), R0
104+
MOVD x+24(FP), R1
105+
MOVD y+48(FP), R2
106+
CBZ R0, rety
107+
MOVD.P 8(R1), R4
108+
SUBS R2, R4
109+
MOVD.P R4, 8(R3)
110+
SUB $1, R0
111+
loop:
112+
CBZ R0, done // careful not to touch the carry flag
113+
MOVD.P 8(R1), R4
114+
SBCS $0, R4
115+
MOVD.P R4, 8(R3)
116+
SUB $1, R0
117+
B loop
118+
done:
119+
CSET LO, R0 // extract carry flag
120+
MOVD R0, c+56(FP)
121+
RET
122+
rety: // z is empty; copy y to c
123+
MOVD R2, c+56(FP)
124+
RET
29125

126+
127+
// func shlVU(z, x []Word, s uint) (c Word)
30128
TEXT ·shlVU(SB),NOSPLIT,$0
31129
B ·shlVU_g(SB)
32130

131+
132+
// func shrVU(z, x []Word, s uint) (c Word)
33133
TEXT ·shrVU(SB),NOSPLIT,$0
34134
B ·shrVU_g(SB)
35135

136+
137+
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
36138
TEXT ·mulAddVWW(SB),NOSPLIT,$0
37-
B ·mulAddVWW_g(SB)
139+
MOVD z+0(FP), R1
140+
MOVD z_len+8(FP), R0
141+
MOVD x+24(FP), R2
142+
MOVD y+48(FP), R3
143+
MOVD r+56(FP), R4
144+
loop:
145+
CBZ R0, done
146+
MOVD.P 8(R2), R5
147+
UMULH R5, R3, R7
148+
MUL R5, R3, R6
149+
ADDS R4, R6
150+
ADC $0, R7
151+
MOVD.P R6, 8(R1)
152+
MOVD R7, R4
153+
SUB $1, R0
154+
B loop
155+
done:
156+
MOVD R4, c+64(FP)
157+
RET
38158

159+
160+
// func addMulVVW(z, x []Word, y Word) (c Word)
39161
TEXT ·addMulVVW(SB),NOSPLIT,$0
40162
B ·addMulVVW_g(SB)
41163

164+
165+
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
42166
TEXT ·divWVW(SB),NOSPLIT,$0
43167
B ·divWVW_g(SB)
44168

169+
170+
// func bitLen(x Word) (n int)
45171
TEXT ·bitLen(SB),NOSPLIT,$0
46-
B ·bitLen_g(SB)
172+
MOVD x+0(FP), R0
173+
CLZ R0, R0
174+
MOVD $64, R1
175+
SUB R0, R1, R0
176+
MOVD R0, n+8(FP)
177+
RET

0 commit comments

Comments
 (0)