Skip to content

Commit 6e7be06

Browse files
committed
Refs JuliaLang/julia#5728. Fix gemv performance bug on Haswell Mac OS X.
On Mac OS X, the assembler interprets the `.align` argument as a power-of-two exponent, so the directive should be `.align 4` (equivalent to `.align 16` on Linux). I did not observe any performance benefit from `.align`, so I removed it.
1 parent a04d055 commit 6e7be06

12 files changed

+106
-31
lines changed

kernel/x86_64/cgemv_n_microk_haswell-4.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5151
"cmpq $0 , %1 \n\t"
5252
"je 2f \n\t"
5353

54-
".align 16 \n\t"
54+
// ".align 16 \n\t"
5555
"1: \n\t"
5656
"prefetcht0 320(%4,%0,4) \n\t"
5757
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@@ -202,7 +202,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
202202
"cmpq $0 , %1 \n\t"
203203
"je 2f \n\t"
204204

205-
".align 16 \n\t"
205+
// ".align 16 \n\t"
206206
"1: \n\t"
207207
"prefetcht0 320(%4,%0,4) \n\t"
208208
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@@ -322,7 +322,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
322322
"cmpq $0 , %1 \n\t"
323323
"je 2f \n\t"
324324

325-
".align 16 \n\t"
325+
// ".align 16 \n\t"
326326
"1: \n\t"
327327
"prefetcht0 320(%4,%0,4) \n\t"
328328
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@@ -454,7 +454,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
454454
"cmpq $0 , %1 \n\t"
455455
"je 2f \n\t"
456456

457-
".align 16 \n\t"
457+
// ".align 16 \n\t"
458458
"1: \n\t"
459459
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
460460
"vmovups 32(%2,%0,4), %%ymm9 \n\t"

kernel/x86_64/cgemv_t_microk_haswell-4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
7676
"cmpq $0, %1 \n\t"
7777
"je 3f \n\t"
7878

79-
".align 16 \n\t"
79+
// ".align 16 \n\t"
8080
"1: \n\t"
8181
"prefetcht0 192(%4,%0,4) \n\t"
8282
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@@ -292,7 +292,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
292292
"cmpq $0, %1 \n\t"
293293
"je 3f \n\t"
294294

295-
".align 16 \n\t"
295+
// ".align 16 \n\t"
296296
"1: \n\t"
297297
"prefetcht0 192(%4,%0,4) \n\t"
298298
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@@ -446,7 +446,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
446446
"cmpq $0, %1 \n\t"
447447
"je 3f \n\t"
448448

449-
".align 16 \n\t"
449+
// ".align 16 \n\t"
450450
"1: \n\t"
451451
"prefetcht0 192(%4,%0,4) \n\t"
452452
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0

kernel/x86_64/dgemv_n_4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
8282
"shufpd $0, %%xmm12, %%xmm12 \n\t"
8383
"shufpd $0, %%xmm13, %%xmm13 \n\t"
8484

85-
".align 16 \n\t"
85+
// ".align 16 \n\t"
8686
"1: \n\t"
8787
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
8888
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
@@ -129,7 +129,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
129129

130130
#endif
131131

132-
#ifndef HAVE_KERNEL_4x2
132+
#ifndef HAVE_KERNEL_4x1
133133

134134
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
135135

@@ -144,7 +144,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
144144
"mulsd (%5), %%xmm12 \n\t" // alpha
145145
"shufpd $0, %%xmm12, %%xmm12 \n\t"
146146

147-
".align 16 \n\t"
147+
// ".align 16 \n\t"
148148
"1: \n\t"
149149
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
150150
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a

kernel/x86_64/dgemv_n_microk_haswell-4.c

+76-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
5252
"subq $4 , %1 \n\t"
5353
"jz 2f \n\t"
5454

55-
".align 16 \n\t"
55+
// ".align 16 \n\t"
5656
"1: \n\t"
5757

5858
"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
@@ -114,3 +114,78 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
114114
}
115115

116116

117+
#define HAVE_KERNEL_4x2
118+
119+
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
120+
121+
/*
 * dgemv_kernel_4x2 - AVX2/FMA micro-kernel for two matrix columns.
 *
 * For every index k in [0, n) this computes
 *
 *     y[k] += alpha[0] * (ap[0][k] * x[0] + ap[1][k] * x[1])
 *
 * processing four consecutive 64-bit elements per step (the addressing
 * uses a scale of 8 and the arithmetic uses packed-double instructions).
 *
 * n     - number of elements to process.  The code subtracts 4 per step
 *         and stops when the counter reaches zero, so it only terminates
 *         cleanly when n is a positive multiple of 4.
 * ap    - ap[0] and ap[1] point to the two columns that are read.
 * x     - x[0] and x[1] are the two multipliers, broadcast to all lanes.
 * y     - accumulated into in place.
 * alpha - pointer to the scalar applied to the column sum.
 *
 * The loop is software-pipelined: the column loads for the next step are
 * issued before the current step's result is stored, which is why the y
 * accesses use a -32 byte displacement and why the final step is peeled
 * off after label 2.
 *
 * The index and the counter are updated inside the asm body, so they are
 * declared as read-write ("+r") operands; operand numbers %0..%6 are the
 * same as with the former all-input operand list.
 */
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vbroadcastsd (%2), %%ymm12 \n\t" // x0
	"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1

	// preload the first four elements of each column
	"vmovups (%4,%0,8), %%ymm0 \n\t"
	"vmovups (%5,%0,8), %%ymm1 \n\t"

	"vbroadcastsd (%6), %%ymm6 \n\t" // alpha

	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"
	"jz 2f \n\t" // only one step requested: go straight to the tail

	"1: \n\t"

	"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
	"vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
	// load the next step's column data while this step finishes
	"vmovups (%4,%0,8), %%ymm0 \n\t"
	"vmovups (%5,%0,8), %%ymm1 \n\t"

	"vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
	"vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
	"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" // y += alpha * (a0*x0 + a1*x1)

	"vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y

	"addq $4 , %0 \n\t"
	"subq $4 , %1 \n\t"
	"jnz 1b \n\t"


	"2: \n\t"

	// tail: finish the step whose column data is already in ymm0/ymm1
	"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
	"vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"


	"vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
	"vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
	"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"

	"vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y


	"vzeroupper \n\t"


	:
	  "+r" (i), // 0  (advanced by addq inside the asm)
	  "+r" (n)  // 1  (counted down by subq inside the asm)
	:
	  "r" (x), // 2
	  "r" (y), // 3
	  "r" (ap[0]), // 4
	  "r" (ap[1]), // 5
	  "r" (alpha) // 6
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5",
	  "%xmm6",
	  "%xmm8",
	  "%xmm12", "%xmm13",
	  "memory"
	);
}

kernel/x86_64/dgemv_t_4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
9595
"cmpq $0, %1 \n\t"
9696
"je 3f \n\t"
9797

98-
".align 16 \n\t"
98+
// ".align 16 \n\t"
9999
"1: \n\t"
100100

101101
"movups (%5,%0,8) , %%xmm14 \n\t" // x
@@ -171,7 +171,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
171171
"cmpq $0, %1 \n\t"
172172
"je 3f \n\t"
173173

174-
".align 16 \n\t"
174+
// ".align 16 \n\t"
175175
"1: \n\t"
176176

177177
"movups (%3,%0,8) , %%xmm12 \n\t"
@@ -245,7 +245,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
245245
"movsd (%2) , %%xmm10 \n\t"
246246
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
247247

248-
".align 16 \n\t"
248+
// ".align 16 \n\t"
249249
"1: \n\t"
250250

251251
"movups (%3,%0,8) , %%xmm12 \n\t"

kernel/x86_64/dgemv_t_microk_haswell-4.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5959
"cmpq $0, %1 \n\t"
6060
"je 3f \n\t"
6161

62-
".align 16 \n\t"
62+
// ".align 16 \n\t"
6363
"1: \n\t"
6464
// "prefetcht0 384(%2,%0,8) \n\t"
6565
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x

kernel/x86_64/sgemv_n_4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
131131
"shufps $0, %%xmm12, %%xmm12 \n\t"
132132
"shufps $0, %%xmm13, %%xmm13 \n\t"
133133

134-
".align 16 \n\t"
134+
// ".align 16 \n\t"
135135
"1: \n\t"
136136
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
137137

@@ -189,7 +189,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
189189
"cmpq $0, %1 \n\t"
190190
"je 2f \n\t"
191191

192-
".align 16 \n\t"
192+
// ".align 16 \n\t"
193193
"1: \n\t"
194194
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
195195
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
@@ -264,7 +264,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
264264
__asm__ __volatile__
265265
(
266266

267-
".align 16 \n\t"
267+
// ".align 16 \n\t"
268268
"1: \n\t"
269269

270270
"movups (%2,%0,4) , %%xmm12 \n\t"

kernel/x86_64/sgemv_n_microk_haswell-4.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
112112
"je 4f \n\t"
113113

114114

115-
".align 16 \n\t"
115+
// ".align 16 \n\t"
116116
"1: \n\t"
117117

118118
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -246,7 +246,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
246246
"je 4f \n\t"
247247

248248

249-
".align 16 \n\t"
249+
// ".align 16 \n\t"
250250
"1: \n\t"
251251
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
252252
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"

kernel/x86_64/sgemv_t_4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
105105
"cmpq $0, %1 \n\t"
106106
"je 3f \n\t"
107107

108-
".align 16 \n\t"
108+
// ".align 16 \n\t"
109109
"1: \n\t"
110110

111111
"movups (%5,%0,4) , %%xmm14 \n\t" // x
@@ -183,7 +183,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
183183
"cmpq $0, %1 \n\t"
184184
"je 3f \n\t"
185185

186-
".align 16 \n\t"
186+
// ".align 16 \n\t"
187187
"1: \n\t"
188188

189189
"movups (%3,%0,4) , %%xmm12 \n\t"
@@ -258,7 +258,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
258258
"movss (%2) , %%xmm10 \n\t"
259259
"shufps $0 , %%xmm10 , %%xmm10 \n\t"
260260

261-
".align 16 \n\t"
261+
// ".align 16 \n\t"
262262
"1: \n\t"
263263

264264
"movups (%3,%0,4) , %%xmm12 \n\t"

kernel/x86_64/sgemv_t_microk_haswell-4.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
7575
"je 4f \n\t"
7676

7777

78-
".align 16 \n\t"
78+
// ".align 16 \n\t"
7979
"1: \n\t"
8080
"prefetcht0 384(%2,%0,4) \n\t"
8181
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

kernel/x86_64/zgemv_n_microk_haswell-4.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
4747
"vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
4848

4949

50-
".align 16 \n\t"
50+
// ".align 16 \n\t"
5151
"1: \n\t"
5252
"prefetcht0 192(%4,%0,8) \n\t"
5353
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -152,7 +152,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
152152
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
153153

154154

155-
".align 16 \n\t"
155+
// ".align 16 \n\t"
156156
"1: \n\t"
157157
"prefetcht0 192(%4,%0,8) \n\t"
158158
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -236,7 +236,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
236236
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
237237
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
238238

239-
".align 16 \n\t"
239+
// ".align 16 \n\t"
240240
"1: \n\t"
241241
"prefetcht0 192(%4,%0,8) \n\t"
242242
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -338,7 +338,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
338338
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r
339339
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
340340

341-
".align 16 \n\t"
341+
// ".align 16 \n\t"
342342
"1: \n\t"
343343
"prefetcht0 192(%2,%0,8) \n\t"
344344
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src

kernel/x86_64/zgemv_t_microk_haswell-4.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
4646
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
4747
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
4848

49-
".align 16 \n\t"
49+
// ".align 16 \n\t"
5050
"1: \n\t"
5151

5252
"prefetcht0 192(%2,%0,8) \n\t"
@@ -219,7 +219,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
219219
"vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
220220
"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
221221

222-
".align 16 \n\t"
222+
// ".align 16 \n\t"
223223
"1: \n\t"
224224

225225
"prefetcht0 192(%2,%0,8) \n\t"
@@ -341,7 +341,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
341341
"vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
342342
"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
343343

344-
".align 16 \n\t"
344+
// ".align 16 \n\t"
345345
"1: \n\t"
346346

347347
"prefetcht0 192(%2,%0,8) \n\t"

0 commit comments

Comments
 (0)