Skip to content

Commit a5a1118

Browse files
authored
Merge pull request #1 from xianyi/develop
rebase
2 parents 6ba30e2 + e23366e commit a5a1118

15 files changed

+596
-51
lines changed

Makefile.x86_64

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ endif
99
endif
1010

1111
ifeq ($(CORE), SKYLAKEX)
12+
ifndef DYNAMIC_ARCH
1213
ifndef NO_AVX512
1314
CCOMMON_OPT += -march=skylake-avx512
1415
FCOMMON_OPT += -march=skylake-avx512
@@ -22,6 +23,7 @@ endif
2223
endif
2324
endif
2425
endif
26+
endif
2527

2628
ifeq ($(OSNAME), Interix)
2729
ARFLAGS = -m x64

cmake/arch.cmake

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ endif ()
4444

4545

4646
if (DYNAMIC_ARCH)
47+
if (ARM64)
48+
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
49+
endif ()
50+
4751
if (X86)
4852
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
4953
endif ()

common_level3.h

+8
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
4747
extern "C" {
4848
#endif
4949

50+
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
51+
float * A, BLASLONG strideA,
52+
float * B, BLASLONG strideB,
53+
float * R, BLASLONG strideR);
54+
55+
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
56+
57+
5058
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
5159
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
5260
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,

cpuid_power.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ int detect(void){
136136
char buffer[512], *p;
137137

138138
p = (char *)NULL;
139-
infile = popen("prtconf|grep 'Processor Type'");
139+
infile = popen("prtconf|grep 'Processor Type'", "r");
140140
while (fgets(buffer, sizeof(buffer), infile)){
141141
if (!strncmp("Pro", buffer, 3)){
142142
p = strchr(buffer, ':') + 2;

driver/others/CMakeLists.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
4747
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
4848

4949
if (DYNAMIC_ARCH)
50-
list(APPEND COMMON_SOURCES dynamic.c)
50+
if (ARM64)
51+
list(APPEND COMMON_SOURCES dynamic_arm64.c)
52+
else ()
53+
list(APPEND COMMON_SOURCES dynamic.c)
54+
endif ()
5155
else ()
5256
list(APPEND COMMON_SOURCES parameter.c)
5357
endif ()

interface/gemm.c

+8
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
271271

272272
PRINT_DEBUG_CNAME;
273273

274+
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
275+
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) {
276+
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc);
277+
return;
278+
}
279+
280+
#endif
281+
274282
#ifndef COMPLEX
275283
args.alpha = (void *)α
276284
args.beta = (void *)β

kernel/CMakeLists.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -125,10 +125,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
125125
set(USE_TRMM true)
126126
endif ()
127127

128-
foreach (float_type ${FLOAT_TYPES})
128+
foreach (float_type SINGLE DOUBLE)
129129
string(SUBSTRING ${float_type} 0 1 float_char)
130130
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
131+
endforeach()
131132

133+
foreach (float_type ${FLOAT_TYPES})
134+
string(SUBSTRING ${float_type} 0 1 float_char)
132135
if (${float_char}GEMMINCOPY)
133136
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
134137
endif ()

kernel/Makefile

+36-1
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,43 @@ endif
55
TOPDIR = ..
66
include $(TOPDIR)/Makefile.system
77

8+
AVX2OPT =
9+
ifeq ($(C_COMPILER), GCC)
10+
# AVX2 support was added in 4.7.0
11+
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
12+
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
13+
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
14+
AVX2OPT = -mavx2
15+
endif
16+
endif
17+
ifeq ($(C_COMPILER), CLANG)
18+
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
19+
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
20+
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
21+
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
22+
AVX2OPT = -mavx2
23+
endif
24+
endif
25+
ifdef NO_AVX2
26+
AVX2OPT=
27+
endif
28+
829
ifdef TARGET_CORE
9-
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
30+
ifeq ($(TARGET_CORE), SKYLAKEX)
31+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
32+
ifeq ($(OSNAME), CYGWIN_NT)
33+
override CFLAGS += -fno-asynchronous-unwind-tables
34+
endif
35+
ifeq ($(OSNAME), WINNT)
36+
ifeq ($(C_COMPILER), GCC)
37+
override CFLAGS += -fno-asynchronous-unwind-tables
38+
endif
39+
endif
40+
else ifeq ($(TARGET_CORE), HASWELL)
41+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
42+
else
43+
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
44+
endif
1045
BUILD_KERNEL = 1
1146
KDIR =
1247
TSUFFIX = _$(TARGET_CORE)

kernel/arm64/KERNEL.ARMV8

+31-32
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,38 @@ CDOTKERNEL = zdot.S
104104
ZDOTKERNEL = zdot.S
105105
DSDOTKERNEL = dot.S
106106

107-
ifneq ($(OS_DARWIN)$(CROSS),11)
107+
ifeq ($(OS_DARWIN)$(CROSS),11)
108+
109+
STRMMKERNEL = ../generic/trmmkernel_2x2.c
110+
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
111+
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
112+
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
113+
114+
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
115+
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
116+
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
117+
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
118+
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
108119

120+
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
121+
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
122+
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
123+
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
124+
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
125+
126+
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
127+
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
128+
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
129+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
130+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
131+
132+
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
133+
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
134+
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
135+
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
136+
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
137+
138+
else
109139
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
110140
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
111141
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@@ -173,35 +203,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
173203
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
174204
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
175205

176-
else
177-
178-
STRMMKERNEL = ../generic/trmmkernel_2x2.c
179-
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
180-
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
181-
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
182-
183-
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
184-
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
185-
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
186-
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
187-
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
188-
189-
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
190-
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
191-
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
192-
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
193-
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
194-
195-
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
196-
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
197-
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
198-
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
199-
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
200-
201-
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
202-
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
203-
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
204-
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
205-
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
206-
207206
endif

kernel/x86_64/KERNEL.HASWELL

+4-2
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,10 @@ ZAXPYKERNEL = zaxpy.c
3333

3434
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
3535
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
36+
SGEMM_BETA = sgemm_beta_skylakex.c
3637
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
3738
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
38-
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
39+
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
3940
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
4041
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
4142
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@@ -44,9 +45,10 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
4445

4546
DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
4647
DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
48+
DGEMM_BETA = dgemm_beta_skylakex.c
4749
DGEMMINCOPY = ../generic/gemm_ncopy_4.c
4850
DGEMMITCOPY = ../generic/gemm_tcopy_4.c
49-
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
51+
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
5052
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
5153
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
5254
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)

kernel/x86_64/dgemm_beta_skylakex.c

+12-4
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,38 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6161
c_offset = c;
6262

6363
if (beta == ZERO){
64-
__m512d z_zero;
6564

66-
z_zero = _mm512_setzero_pd();
6765
j = n;
6866
do {
6967
c_offset1 = c_offset;
7068
c_offset += ldc;
7169

7270
i = m;
73-
71+
#ifdef __AVX2__
72+
#ifdef __AVX512CD__
7473
while (i >= 32) {
74+
__m512d z_zero = _mm512_setzero_pd();
7575
_mm512_storeu_pd(c_offset1, z_zero);
7676
_mm512_storeu_pd(c_offset1 + 8, z_zero);
7777
_mm512_storeu_pd(c_offset1 + 16, z_zero);
7878
_mm512_storeu_pd(c_offset1 + 24 , z_zero);
7979
c_offset1 += 32;
8080
i -= 32;
8181
}
82+
#endif
8283
while (i >= 8) {
84+
#ifdef __AVX512CD__
85+
__m512d z_zero = _mm512_setzero_pd();
8386
_mm512_storeu_pd(c_offset1, z_zero);
87+
#else
88+
__m256d y_zero = _mm256_setzero_pd();
89+
_mm256_storeu_pd(c_offset1, y_zero);
90+
_mm256_storeu_pd(c_offset1 + 4, y_zero);
91+
#endif
8492
c_offset1 += 8;
8593
i -= 8;
8694
}
87-
95+
#endif
8896
while (i > 0) {
8997
*c_offset1 = ZERO;
9098
c_offset1 ++;

kernel/x86_64/sgemm_beta_skylakex.c

+12-6
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
6161
c_offset = c;
6262

6363
if (beta == ZERO){
64-
__m512 z_zero;
65-
__m256 y_zero;
6664

67-
z_zero = _mm512_setzero_ps();
68-
y_zero = _mm256_setzero_ps();
6965
j = n;
7066
do {
7167
c_offset1 = c_offset;
7268
c_offset += ldc;
7369

7470
i = m;
75-
71+
#ifdef __AVX2__
7672
while (i >= 32) {
73+
#ifdef __AVX512CD__
74+
__m512 z_zero = _mm512_setzero_ps();
7775
_mm512_storeu_ps(c_offset1, z_zero);
7876
_mm512_storeu_ps(c_offset1 + 16, z_zero);
77+
#else
78+
__m256 y_zero = _mm256_setzero_ps();
79+
_mm256_storeu_ps(c_offset1, y_zero);
80+
_mm256_storeu_ps(c_offset1 + 8, y_zero);
81+
_mm256_storeu_ps(c_offset1 + 16, y_zero);
82+
_mm256_storeu_ps(c_offset1 + 24, y_zero);
83+
#endif
7984
c_offset1 += 32;
8085
i -= 32;
8186
}
8287
while (i >= 8) {
88+
__m256 y_zero = _mm256_setzero_ps();
8389
_mm256_storeu_ps(c_offset1, y_zero);
8490
c_offset1 += 8;
8591
i -= 8;
8692
}
87-
93+
#endif
8894
while (i > 0) {
8995
*c_offset1 = ZERO;
9096
c_offset1 ++;

0 commit comments

Comments
 (0)