Skip to content

Commit 7db50db

Browse files
committed
Add more noinit and templates.
1 parent 86fdf1a commit 7db50db

File tree

4 files changed

+52
-54
lines changed

4 files changed

+52
-54
lines changed

nimcrypto/sha2/sha2_avx.nim

+32-34
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,8 @@ when defined(amd64):
111111
w[7] = w[6]; w[6] = w[5]; w[5] = w[4]; w[4] = w[3]
112112
w[3] = w[2]; w[2] = w[1]; w[1] = w[0]; w[0] = tmp
113113

114-
proc sha256UpdateAvx(x: var array[4, m128i], k256i: int,
115-
loMask, hiMask: m128i): m128i {.inline, noinit.} =
116-
var t {.align(64).}: array[4, m128i]
114+
template sha256UpdateAvx(x, k256i, loMask, hiMask: untyped): m128i =
115+
var t {.align(32), noinit.}: array[4, m128i]
117116

118117
t[0] = mm_alignr_epi8(x[1], x[0], 4)
119118
t[3] = mm_alignr_epi8(x[3], x[2], 4)
@@ -150,9 +149,8 @@ when defined(amd64):
150149

151150
mm_add_epi32(x[3], m128i.load(K0D, k256i))
152151

153-
proc sha512UpdateAvx(x: var array[8, m128i], k512i: int): m128i {.
154-
inline, noinit.} =
155-
var t {.align(64).}: array[4, m128i]
152+
template sha512UpdateAvx(x, k512i: untyped): m128i =
153+
var t {.align(32), noinit.}: array[4, m128i]
156154

157155
t[0] = mm_alignr_epi8(x[1], x[0], 8)
158156
t[3] = mm_alignr_epi8(x[5], x[4], 8)
@@ -188,10 +186,10 @@ when defined(amd64):
188186

189187
mm_add_epi64(x[7], m128i.load(K1D, k512i))
190188

191-
proc loadData32(x: var array[4, m128i],
192-
ms: var array[16, uint32], data: openArray[byte]) {.
193-
inline, noinit.} =
194-
let shuffleMask =
189+
# x: var array[4, m128i]
190+
# ms: var array[16, uint32]
191+
template loadData32(x, ms: untyped, data: openArray[byte]) =
192+
let shuffleMask {.align(32).} =
195193
mm_setr_epi32(0x00010203'u32, 0x04050607'u32,
196194
0x08090a0b'u32, 0x0c0d0e0f'u32)
197195
x[0] = m128i.load(data, 0)
@@ -210,10 +208,10 @@ when defined(amd64):
210208
x[3] = mm_shuffle_epi8(x[3], shuffleMask)
211209
m128i.store(ms, 12, mm_add_epi32(x[3], m128i.load(K0D, 12)))
212210

213-
proc loadData64(x: var array[8, m128i],
214-
ms: var array[16, uint64], data: openArray[byte]) {.
215-
inline, noinit.} =
216-
let shuffleMask =
211+
# x: var array[8, m128i]
212+
# ms: var array[16, uint64]
213+
template loadData64(x, ms: untyped, data: openArray[byte]) =
214+
let shuffleMask {.align(32).}=
217215
mm_setr_epi32(0x04050607'u32, 0x00010203'u32,
218216
0x0c0d0e0f'u32, 0x08090a0b'u32)
219217

@@ -253,17 +251,17 @@ when defined(amd64):
253251
data: openArray[byte],
254252
blocks: int) {.inline, noinit.} =
255253
let
256-
loMask =
254+
loMask {.align(32).} =
257255
mm_setr_epi32(0x03020100'u32, 0x0b0a0908'u32, 0xffffffff'u32,
258256
0xffffffff'u32)
259-
hiMask =
257+
hiMask {.align(32).} =
260258
mm_setr_epi32(0xffffffff'u32, 0xffffffff'u32, 0x03020100'u32,
261259
0x0b0a0908'u32)
262260

263261
var
264-
ms {.align(64).}: array[16, uint32]
265-
x {.align(64).}: array[4, m128i]
266-
cs {.align(64).}: array[8, uint32]
262+
ms {.align(32), noinit.}: array[16, uint32]
263+
x {.align(32), noinit.}: array[4, m128i]
264+
cs {.align(32), noinit.}: array[8, uint32]
267265
blocksCount = blocks
268266
offset = 0
269267

@@ -275,86 +273,86 @@ when defined(amd64):
275273
offset + sha256.sizeBlock() - 1))
276274

277275
block:
278-
let s0 = sha256UpdateAvx(x, 16, loMask, hiMask)
276+
let s0 {.align(32).} = sha256UpdateAvx(x, 16, loMask, hiMask)
279277
ROUND256(cs, ms[0])
280278
ROUND256(cs, ms[1])
281279
ROUND256(cs, ms[2])
282280
ROUND256(cs, ms[3])
283281
m128i.store(ms, 0, s0)
284282

285-
let s1 = sha256UpdateAvx(x, 20, loMask, hiMask)
283+
let s1 {.align(32).} = sha256UpdateAvx(x, 20, loMask, hiMask)
286284
ROUND256(cs, ms[4])
287285
ROUND256(cs, ms[5])
288286
ROUND256(cs, ms[6])
289287
ROUND256(cs, ms[7])
290288
m128i.store(ms, 4, s1)
291289

292-
let s2 = sha256UpdateAvx(x, 24, loMask, hiMask)
290+
let s2 {.align(32).} = sha256UpdateAvx(x, 24, loMask, hiMask)
293291
ROUND256(cs, ms[8])
294292
ROUND256(cs, ms[9])
295293
ROUND256(cs, ms[10])
296294
ROUND256(cs, ms[11])
297295
m128i.store(ms, 8, s2)
298296

299-
let s3 = sha256UpdateAvx(x, 28, loMask, hiMask)
297+
let s3 {.align(32).} = sha256UpdateAvx(x, 28, loMask, hiMask)
300298
ROUND256(cs, ms[12])
301299
ROUND256(cs, ms[13])
302300
ROUND256(cs, ms[14])
303301
ROUND256(cs, ms[15])
304302
m128i.store(ms, 12, s3)
305303

306304
block:
307-
let s0 = sha256UpdateAvx(x, 32, loMask, hiMask)
305+
let s0 {.align(32).} = sha256UpdateAvx(x, 32, loMask, hiMask)
308306
ROUND256(cs, ms[0])
309307
ROUND256(cs, ms[1])
310308
ROUND256(cs, ms[2])
311309
ROUND256(cs, ms[3])
312310
m128i.store(ms, 0, s0)
313311

314-
let s1 = sha256UpdateAvx(x, 36, loMask, hiMask)
312+
let s1 {.align(32).} = sha256UpdateAvx(x, 36, loMask, hiMask)
315313
ROUND256(cs, ms[4])
316314
ROUND256(cs, ms[5])
317315
ROUND256(cs, ms[6])
318316
ROUND256(cs, ms[7])
319317
m128i.store(ms, 4, s1)
320318

321-
let s2 = sha256UpdateAvx(x, 40, loMask, hiMask)
319+
let s2 {.align(32).} = sha256UpdateAvx(x, 40, loMask, hiMask)
322320
ROUND256(cs, ms[8])
323321
ROUND256(cs, ms[9])
324322
ROUND256(cs, ms[10])
325323
ROUND256(cs, ms[11])
326324
m128i.store(ms, 8, s2)
327325

328-
let s3 = sha256UpdateAvx(x, 44, loMask, hiMask)
326+
let s3 {.align(32).} = sha256UpdateAvx(x, 44, loMask, hiMask)
329327
ROUND256(cs, ms[12])
330328
ROUND256(cs, ms[13])
331329
ROUND256(cs, ms[14])
332330
ROUND256(cs, ms[15])
333331
m128i.store(ms, 12, s3)
334332

335333
block:
336-
let s0 = sha256UpdateAvx(x, 48, loMask, hiMask)
334+
let s0 {.align(32).} = sha256UpdateAvx(x, 48, loMask, hiMask)
337335
ROUND256(cs, ms[0])
338336
ROUND256(cs, ms[1])
339337
ROUND256(cs, ms[2])
340338
ROUND256(cs, ms[3])
341339
m128i.store(ms, 0, s0)
342340

343-
let s1 = sha256UpdateAvx(x, 52, loMask, hiMask)
341+
let s1 {.align(32).} = sha256UpdateAvx(x, 52, loMask, hiMask)
344342
ROUND256(cs, ms[4])
345343
ROUND256(cs, ms[5])
346344
ROUND256(cs, ms[6])
347345
ROUND256(cs, ms[7])
348346
m128i.store(ms, 4, s1)
349347

350-
let s2 = sha256UpdateAvx(x, 56, loMask, hiMask)
348+
let s2 {.align(32).} = sha256UpdateAvx(x, 56, loMask, hiMask)
351349
ROUND256(cs, ms[8])
352350
ROUND256(cs, ms[9])
353351
ROUND256(cs, ms[10])
354352
ROUND256(cs, ms[11])
355353
m128i.store(ms, 8, s2)
356354

357-
let s3 = sha256UpdateAvx(x, 60, loMask, hiMask)
355+
let s3 {.align(32).} = sha256UpdateAvx(x, 60, loMask, hiMask)
358356
ROUND256(cs, ms[12])
359357
ROUND256(cs, ms[13])
360358
ROUND256(cs, ms[14])
@@ -388,9 +386,9 @@ when defined(amd64):
388386
data: openArray[byte],
389387
blocks: int) {.inline, noinit.} =
390388
var
391-
ms {.align(64).}: array[16, uint64]
392-
x {.align(64).}: array[8, m128i]
393-
cs {.align(64).}: array[8, uint64]
389+
ms {.align(32), noinit.}: array[16, uint64]
390+
x {.align(32), noinit.}: array[8, m128i]
391+
cs {.align(32), noinit.}: array[8, uint64]
394392
blocksCount = blocks
395393
offset = 0
396394

nimcrypto/sha2/sha2_avx2.nim

+11-11
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ when defined(amd64):
163163
w[3] = w[2]; w[2] = w[1]; w[1] = w[0]; w[0] = tmp
164164

165165
template sha256UpdateAvx2(x, k256i, loMask, hiMask: untyped): m256i =
166-
var t {.align(32).}: array[4, m256i]
166+
var t {.align(32), noinit.}: array[4, m256i]
167167

168168
t[0] = mm256_alignr_epi8(x[1], x[0], 4)
169169
t[3] = mm256_alignr_epi8(x[3], x[2], 4)
@@ -202,7 +202,7 @@ when defined(amd64):
202202
mm256_add_epi32(x[3], m256i.load(K0x2, k256i))
203203

204204
template sha512UpdateAvx2(x, k512i: untyped): m256i =
205-
var t {.align(32).}: array[4, m256i]
205+
var t {.align(32), noinit.}: array[4, m256i]
206206

207207
t[0] = mm256_alignr_epi8(x[1], x[0], 8)
208208
t[3] = mm256_alignr_epi8(x[5], x[4], 8)
@@ -240,7 +240,7 @@ when defined(amd64):
240240

241241
template loadData32(x, ms, t2: untyped,
242242
data: openArray[byte]) =
243-
let shuffleMask =
243+
let shuffleMask {.align(32).} =
244244
mm256_setr_epi32(0x00010203'u32, 0x04050607'u32,
245245
0x08090a0b'u32, 0x0c0d0e0f'u32,
246246
0x00010203'u32, 0x04050607'u32,
@@ -328,10 +328,10 @@ when defined(amd64):
328328
data: openArray[byte],
329329
blocks: int) {.inline, noinit.} =
330330
var
331-
x {.align(32).}: array[4, m256i]
332-
ms {.align(32).}: array[16, uint32]
333-
t2 {.align(32).}: array[64, uint32]
334-
cs {.align(32).}: array[8, uint32]
331+
x {.align(32), noinit.}: array[4, m256i]
332+
ms {.align(32), noinit.}: array[16, uint32]
333+
t2 {.align(32), noinit.}: array[64, uint32]
334+
cs {.align(32), noinit.}: array[8, uint32]
335335
blocksCount = blocks
336336
offset = 0
337337

@@ -546,10 +546,10 @@ when defined(amd64):
546546
data: openArray[byte],
547547
blocks: int) {.inline, noinit.} =
548548
var
549-
x {.align(32).}: array[8, m256i]
550-
ms {.align(32).}: array[16, uint64]
551-
cs {.align(32).}: array[8, uint64]
552-
t2 {.align(32).}: array[80, uint64]
549+
x {.align(32), noinit.}: array[8, m256i]
550+
ms {.align(32), noinit.}: array[16, uint64]
551+
cs {.align(32), noinit.}: array[8, uint64]
552+
t2 {.align(32), noinit.}: array[80, uint64]
553553
blocksCount = blocks
554554
offset = 0
555555

nimcrypto/sha2/sha2_neon.nim

+3-3
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ when defined(arm64):
8383
data: openArray[byte],
8484
blocks: int) {.noinit, inline.} =
8585
var
86-
ms: array[4, uint32x4]
87-
temp: array[3, uint32x4]
88-
cs = uint32x4x2.load(state, 0)
86+
ms {.align(32), noinit.}: array[4, uint32x4]
87+
temp {.align(32), noinit.}: array[3, uint32x4]
88+
cs {.align(32).} = uint32x4x2.load(state, 0)
8989
offset = 0
9090

9191
for j in 0 ..< blocks:

nimcrypto/sha2/sha2_sha.nim

+6-6
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,15 @@ when defined(amd64):
8181
proc sha256Compress*(state: var array[8, uint32],
8282
data: openArray[byte],
8383
blocks: int) {.noinit, inline.} =
84-
let shufMask =
84+
let shufMask {.align(32).} =
8585
mm_set_epi64x(0x0c0d0e0f08090a0b'u64, 0x0405060700010203'u64)
8686

8787
var
88-
msgtmp: array[4, m128i]
89-
msg: m128i
90-
tmp = mm_shuffle_epi32(m128i.load(state, 0), 0xB1'u32)
91-
state1 = mm_shuffle_epi32(m128i.load(state, 4), 0x1B'u32)
92-
state0 = mm_alignr_epi8(tmp, state1, 8)
88+
msgtmp {.noinit.} : array[4, m128i]
89+
msg {.align(32), noinit.} : m128i
90+
tmp {.align(32).} = mm_shuffle_epi32(m128i.load(state, 0), 0xB1'u32)
91+
state1 {.align(32).} = mm_shuffle_epi32(m128i.load(state, 4), 0x1B'u32)
92+
state0 {.align(32).} = mm_alignr_epi8(tmp, state1, 8)
9393
blocksCount = blocks
9494
offset = 0
9595

0 commit comments

Comments
 (0)