Skip to content

Commit e23366e

Browse files
authored
Merge pull request #1921 from fenrus75/haswelldgemm
Replicate some of the SKYLAKEX dgemm improvements also to HASWELL
2 parents (e8ca5a5 + b28f75c), commit e23366e

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

kernel/x86_64/KERNEL.HASWELL

+2 -1
Original file line number | Diff line number | Diff line change
@@ -45,9 +45,10 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
4545

4646
DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
4747
DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
48+
DGEMM_BETA = dgemm_beta_skylakex.c
4849
DGEMMINCOPY = ../generic/gemm_ncopy_4.c
4950
DGEMMITCOPY = ../generic/gemm_tcopy_4.c
50-
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
51+
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
5152
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
5253
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
5354
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)

kernel/x86_64/dgemm_beta_skylakex.c

+12 -4
Original file line number | Diff line number | Diff line change
@@ -61,30 +61,38 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6161
c_offset = c;
6262

6363
if (beta == ZERO){
64-
__m512d z_zero;
6564

66-
z_zero = _mm512_setzero_pd();
6765
j = n;
6866
do {
6967
c_offset1 = c_offset;
7068
c_offset += ldc;
7169

7270
i = m;
73-
71+
#ifdef __AVX2__
72+
#ifdef __AVX512CD__
7473
while (i >= 32) {
74+
__m512d z_zero = _mm512_setzero_pd();
7575
_mm512_storeu_pd(c_offset1, z_zero);
7676
_mm512_storeu_pd(c_offset1 + 8, z_zero);
7777
_mm512_storeu_pd(c_offset1 + 16, z_zero);
7878
_mm512_storeu_pd(c_offset1 + 24 , z_zero);
7979
c_offset1 += 32;
8080
i -= 32;
8181
}
82+
#endif
8283
while (i >= 8) {
84+
#ifdef __AVX512CD__
85+
__m512d z_zero = _mm512_setzero_pd();
8386
_mm512_storeu_pd(c_offset1, z_zero);
87+
#else
88+
__m256d y_zero = _mm256_setzero_pd();
89+
_mm256_storeu_pd(c_offset1, y_zero);
90+
_mm256_storeu_pd(c_offset1 + 4, y_zero);
91+
#endif
8492
c_offset1 += 8;
8593
i -= 8;
8694
}
87-
95+
#endif
8896
while (i > 0) {
8997
*c_offset1 = ZERO;
9098
c_offset1 ++;

param.h

+1
Original file line number | Diff line number | Diff line change
@@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15081508
#define SYMV_P 8
15091509

15101510
#define SWITCH_RATIO 32
1511+
#define GEMM_PREFERED_SIZE 16
15111512

15121513
#ifdef ARCH_X86
15131514

0 commit comments

Comments
 (0)