@@ -52,7 +52,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
52
52
"subq $4 , %1 \n\t"
53
53
"jz 2f \n\t"
54
54
55
- ".align 16 \n\t"
55
+ // ".align 16 \n\t"
56
56
"1: \n\t"
57
57
58
58
"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
@@ -114,3 +114,78 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
114
114
}
115
115
116
116
117
+ #define HAVE_KERNEL_4x2
118
+
119
+ static void dgemv_kernel_4x2 ( BLASLONG n , FLOAT * * ap , FLOAT * x , FLOAT * y , FLOAT * alpha ) __attribute__ ((noinline ));
120
+
121
+ static void dgemv_kernel_4x2 ( BLASLONG n , FLOAT * * ap , FLOAT * x , FLOAT * y , FLOAT * alpha )
122
+ {
123
+
124
+ BLASLONG register i = 0 ;
125
+
126
+ __asm__ __volatile__
127
+ (
128
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
129
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
130
+
131
+ "vmovups (%4,%0,8), %%ymm0 \n\t"
132
+ "vmovups (%5,%0,8), %%ymm1 \n\t"
133
+
134
+ "vbroadcastsd (%6), %%ymm6 \n\t" // alpha
135
+
136
+ "addq $4 , %0 \n\t"
137
+ "subq $4 , %1 \n\t"
138
+ "jz 2f \n\t"
139
+
140
+ "1: \n\t"
141
+
142
+ "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
143
+ "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
144
+ "vmovups (%4,%0,8), %%ymm0 \n\t"
145
+ "vmovups (%5,%0,8), %%ymm1 \n\t"
146
+
147
+ "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
148
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
149
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
150
+
151
+ "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
152
+
153
+ "addq $4 , %0 \n\t"
154
+ "subq $4 , %1 \n\t"
155
+ "jnz 1b \n\t"
156
+
157
+
158
+ "2: \n\t"
159
+
160
+ "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
161
+ "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
162
+
163
+
164
+ "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
165
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
166
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
167
+
168
+ "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
169
+
170
+
171
+ "vzeroupper \n\t"
172
+
173
+
174
+ :
175
+ :
176
+ "r" (i ), // 0
177
+ "r" (n ), // 1
178
+ "r" (x ), // 2
179
+ "r" (y ), // 3
180
+ "r" (ap [0 ]), // 4
181
+ "r" (ap [1 ]), // 5
182
+ "r" (alpha ) // 6
183
+ : "cc" ,
184
+ "%xmm0" , "%xmm1" ,
185
+ "%xmm4" , "%xmm5" ,
186
+ "%xmm6" ,
187
+ "%xmm8" ,
188
+ "%xmm12" , "%xmm13" ,
189
+ "memory"
190
+ );
191
+ }
0 commit comments