Skip to content

Commit 9480496

Browse files
author
Shigeki Ohtsu
committed
deps: update asm files for openssl-1.0.2b
asm files are generated as follows: in `deps/openssl/asm/`, run make with CC=gcc and ASM=nasm; in `deps/openssl/asm_obsolute/`, run make with no compiler environment variables set. Fixes: #1921 PR-URL: #1950 Reviewed-By: Fedor Indutny <[email protected]> Reviewed-By: Ben Noordhuis <[email protected]>
1 parent 3844491 commit 9480496

File tree

31 files changed

+7274
-2737
lines changed

31 files changed

+7274
-2737
lines changed

deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S

+114-92
Large diffs are not rendered by default.

deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S

+1-1
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ gcm_ghash_neon:
495495
veor q10,q10,q9 @
496496
vshl.i64 q9,q0,#63
497497
veor q10, q10, q9 @
498-
veor d1,d1,d20 @
498+
veor d1,d1,d20 @
499499
veor d4,d4,d21
500500

501501
vshr.u64 q10,q0,#1 @ 2nd phase

deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S

+158-44
Original file line numberDiff line numberDiff line change
@@ -7,109 +7,223 @@
77
.type gcm_init_v8,%function
88
.align 4
99
gcm_init_v8:
10-
vld1.64 {q9},[r1] @ load H
11-
vmov.i8 q8,#0xe1
10+
vld1.64 {q9},[r1] @ load input H
11+
vmov.i8 q11,#0xe1
12+
vshl.i64 q11,q11,#57 @ 0xc2.0
1213
vext.8 q3,q9,q9,#8
13-
vshl.i64 q8,q8,#57
14-
vshr.u64 q10,q8,#63
15-
vext.8 q8,q10,q8,#8 @ t0=0xc2....01
14+
vshr.u64 q10,q11,#63
1615
vdup.32 q9,d18[1]
17-
vshr.u64 q11,q3,#63
16+
vext.8 q8,q10,q11,#8 @ t0=0xc2....01
17+
vshr.u64 q10,q3,#63
1818
vshr.s32 q9,q9,#31 @ broadcast carry bit
19-
vand q11,q11,q8
19+
vand q10,q10,q8
2020
vshl.i64 q3,q3,#1
21-
vext.8 q11,q11,q11,#8
21+
vext.8 q10,q10,q10,#8
2222
vand q8,q8,q9
23-
vorr q3,q3,q11 @ H<<<=1
24-
veor q3,q3,q8 @ twisted H
25-
vst1.64 {q3},[r0]
23+
vorr q3,q3,q10 @ H<<<=1
24+
veor q12,q3,q8 @ twisted H
25+
vst1.64 {q12},[r0]! @ store Htable[0]
26+
27+
@ calculate H^2
28+
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
29+
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
30+
veor q8,q8,q12
31+
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
32+
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
33+
34+
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
35+
veor q10,q0,q2
36+
veor q1,q1,q9
37+
veor q1,q1,q10
38+
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
39+
40+
vmov d4,d3 @ Xh|Xm - 256-bit result
41+
vmov d3,d0 @ Xm is rotated Xl
42+
veor q0,q1,q10
43+
44+
vext.8 q10,q0,q0,#8 @ 2nd phase
45+
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
46+
veor q10,q10,q2
47+
veor q14,q0,q10
48+
49+
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
50+
veor q9,q9,q14
51+
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
52+
vst1.64 {q13-q14},[r0] @ store Htable[1..2]
2653

2754
bx lr
2855
.size gcm_init_v8,.-gcm_init_v8
29-
3056
.global gcm_gmult_v8
3157
.type gcm_gmult_v8,%function
3258
.align 4
3359
gcm_gmult_v8:
3460
vld1.64 {q9},[r0] @ load Xi
3561
vmov.i8 q11,#0xe1
36-
vld1.64 {q12},[r1] @ load twisted H
62+
vld1.64 {q12-q13},[r1] @ load twisted H, ...
3763
vshl.u64 q11,q11,#57
3864
#ifndef __ARMEB__
3965
vrev64.8 q9,q9
4066
#endif
41-
vext.8 q13,q12,q12,#8
42-
mov r3,#0
4367
vext.8 q3,q9,q9,#8
44-
mov r12,#0
45-
veor q13,q13,q12 @ Karatsuba pre-processing
46-
mov r2,r0
47-
b .Lgmult_v8
48-
.size gcm_gmult_v8,.-gcm_gmult_v8
4968

69+
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
70+
veor q9,q9,q3 @ Karatsuba pre-processing
71+
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
72+
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
73+
74+
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
75+
veor q10,q0,q2
76+
veor q1,q1,q9
77+
veor q1,q1,q10
78+
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
79+
80+
vmov d4,d3 @ Xh|Xm - 256-bit result
81+
vmov d3,d0 @ Xm is rotated Xl
82+
veor q0,q1,q10
83+
84+
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
85+
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
86+
veor q10,q10,q2
87+
veor q0,q0,q10
88+
89+
#ifndef __ARMEB__
90+
vrev64.8 q0,q0
91+
#endif
92+
vext.8 q0,q0,q0,#8
93+
vst1.64 {q0},[r0] @ write out Xi
94+
95+
bx lr
96+
.size gcm_gmult_v8,.-gcm_gmult_v8
5097
.global gcm_ghash_v8
5198
.type gcm_ghash_v8,%function
5299
.align 4
53100
gcm_ghash_v8:
101+
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
54102
vld1.64 {q0},[r0] @ load [rotated] Xi
55-
subs r3,r3,#16
103+
@ "[rotated]" means that
104+
@ loaded value would have
105+
@ to be rotated in order to
106+
@ make it appear as in
107+
@ algorithm specification
108+
subs r3,r3,#32 @ see if r3 is 32 or larger
109+
mov r12,#16 @ r12 is used as post-
110+
@ increment for input pointer;
111+
@ as loop is modulo-scheduled
112+
@ r12 is zeroed just in time
113+
@ to preclude overstepping
114+
@ inp[len], which means that
115+
@ last block[s] are actually
116+
@ loaded twice, but last
117+
@ copy is not processed
118+
vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2
56119
vmov.i8 q11,#0xe1
57-
mov r12,#16
58-
vld1.64 {q12},[r1] @ load twisted H
59-
moveq r12,#0
60-
vext.8 q0,q0,q0,#8
61-
vshl.u64 q11,q11,#57
62-
vld1.64 {q9},[r2],r12 @ load [rotated] inp
63-
vext.8 q13,q12,q12,#8
120+
vld1.64 {q14},[r1]
121+
moveq r12,#0 @ is it time to zero r12?
122+
vext.8 q0,q0,q0,#8 @ rotate Xi
123+
vld1.64 {q8},[r2]! @ load [rotated] I[0]
124+
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
64125
#ifndef __ARMEB__
126+
vrev64.8 q8,q8
65127
vrev64.8 q0,q0
128+
#endif
129+
vext.8 q3,q8,q8,#8 @ rotate I[0]
130+
blo .Lodd_tail_v8 @ r3 was less than 32
131+
vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
132+
#ifndef __ARMEB__
66133
vrev64.8 q9,q9
67134
#endif
68-
veor q13,q13,q12 @ Karatsuba pre-processing
69-
vext.8 q3,q9,q9,#8
70-
b .Loop_v8
135+
vext.8 q7,q9,q9,#8
136+
veor q3,q3,q0 @ I[i]^=Xi
137+
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
138+
veor q9,q9,q7 @ Karatsuba pre-processing
139+
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
140+
b .Loop_mod2x_v8
71141

72142
.align 4
73-
.Loop_v8:
143+
.Loop_mod2x_v8:
144+
vext.8 q10,q3,q3,#8
145+
subs r3,r3,#32 @ is there more data?
146+
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
147+
movlo r12,#0 @ is it time to zero r12?
148+
149+
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
150+
veor q10,q10,q3 @ Karatsuba pre-processing
151+
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
152+
veor q0,q0,q4 @ accumulate
153+
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
154+
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
155+
156+
veor q2,q2,q6
157+
moveq r12,#0 @ is it time to zero r12?
158+
veor q1,q1,q5
159+
160+
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
161+
veor q10,q0,q2
162+
veor q1,q1,q9
163+
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
164+
#ifndef __ARMEB__
165+
vrev64.8 q8,q8
166+
#endif
167+
veor q1,q1,q10
168+
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
169+
170+
#ifndef __ARMEB__
171+
vrev64.8 q9,q9
172+
#endif
173+
vmov d4,d3 @ Xh|Xm - 256-bit result
174+
vmov d3,d0 @ Xm is rotated Xl
175+
vext.8 q7,q9,q9,#8
176+
vext.8 q3,q8,q8,#8
177+
veor q0,q1,q10
178+
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
179+
veor q3,q3,q2 @ accumulate q3 early
180+
181+
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
182+
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
183+
veor q3,q3,q10
184+
veor q9,q9,q7 @ Karatsuba pre-processing
185+
veor q3,q3,q0
186+
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
187+
bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
188+
189+
veor q2,q2,q10
190+
vext.8 q3,q8,q8,#8 @ re-construct q3
191+
adds r3,r3,#32 @ re-construct r3
192+
veor q0,q0,q2 @ re-construct q0
193+
beq .Ldone_v8 @ is r3 zero?
194+
.Lodd_tail_v8:
74195
vext.8 q10,q0,q0,#8
75196
veor q3,q3,q0 @ inp^=Xi
76-
veor q9,q9,q10 @ q9 is rotated inp^Xi
197+
veor q9,q8,q10 @ q9 is rotated inp^Xi
77198

78-
.Lgmult_v8:
79199
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
80200
veor q9,q9,q3 @ Karatsuba pre-processing
81201
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
82-
subs r3,r3,#16
83202
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
84-
moveq r12,#0
85203

86204
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
87205
veor q10,q0,q2
88206
veor q1,q1,q9
89-
vld1.64 {q9},[r2],r12 @ load [rotated] inp
90207
veor q1,q1,q10
91-
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
208+
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
92209

93210
vmov d4,d3 @ Xh|Xm - 256-bit result
94211
vmov d3,d0 @ Xm is rotated Xl
95-
#ifndef __ARMEB__
96-
vrev64.8 q9,q9
97-
#endif
98212
veor q0,q1,q10
99-
vext.8 q3,q9,q9,#8
100213

101-
vext.8 q10,q0,q0,#8 @ 2nd phase
214+
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
102215
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
103216
veor q10,q10,q2
104217
veor q0,q0,q10
105-
bhs .Loop_v8
106218

219+
.Ldone_v8:
107220
#ifndef __ARMEB__
108221
vrev64.8 q0,q0
109222
#endif
110223
vext.8 q0,q0,q0,#8
111224
vst1.64 {q0},[r0] @ write out Xi
112225

226+
vldmia sp!,{d8-d15} @ 32-bit ABI says so
113227
bx lr
114228
.size gcm_ghash_v8,.-gcm_ghash_v8
115229
.asciz "GHASH for ARMv8, CRYPTOGAMS by <[email protected]>"

0 commit comments

Comments
 (0)