From 99c7bba8e404fcf697f00bc986e106892eff47ad Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 07:24:29 +0000 Subject: [PATCH 01/16] Initial support for SkylakeX / AVX512 This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server) target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set, which brings 2 basic things: 1) 512 bit wide SIMD (2x width of AVX2) 2) 32 SIMD registers (2x the number on AVX2) This initial patch only contains a trivial transformation of the Haswell SGEMM kernel to AVX512VL; more will follow later but this patch aims to get the infrastructure in place for this "later". Full performance tuning has not been done yet; with more registers and wider SIMD it's in theory possible to retune the kernels but even without that there's an interesting enough performance increase (30-40% range) with just this change. --- Makefile.system | 8 +- TargetList.txt | 1 + cmake/arch.cmake | 3 + cmake/system.cmake | 2 +- cpuid.h | 3 + cpuid_x86.c | 2 + driver/others/dynamic.c | 2 + driver/others/parameter.c | 4 +- getarch.c | 15 + kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 16 + kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SKYLAKEX | 4 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- 
kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 6812 ++++++++++++++++++++ kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 119 + 57 files changed, 7034 insertions(+), 47 deletions(-) create mode 100644 kernel/x86_64/KERNEL.SKYLAKEX create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.S diff --git a/Makefile.system b/Makefile.system index 7bfac1fa80..b005b80c9f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -62,6 +62,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -95,6 +98,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET_CORE), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -467,7 +473,7 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN +DYNAMIC_CORE += HASWELL ZEN SKYLAKEX endif endif diff --git a/TargetList.txt b/TargetList.txt index aeeaa9ede3..31e4881c4e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -20,6 +20,7 @@ DUNNINGTON NEHALEM 
SANDYBRIDGE HASWELL +SKYLAKEX ATOM b)AMD CPU: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 798a9ef824..527d2bec6e 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -56,6 +56,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX2) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () + if (NOT NO_AVX512) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 6458956710..c21fe7c142 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") diff --git a/cpuid.h b/cpuid.h index 1dacc49bae..a6bc211f3e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -115,6 +115,7 @@ #define CORE_STEAMROLLER 25 #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 +#define CORE_SKYLAKEX 28 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,6 +138,7 @@ #define HAVE_AVX (1 << 18) #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -211,5 +213,6 @@ typedef struct { #define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 +#define CPUTYPE_SKYLAKEX 52 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 342c565252..5f49e77157 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -50,6 +50,8 @@ #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CORE_HASWELL CORE_NEHALEM +#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM +#define CORE_SKYLAKEX CORE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM #define 
CPUTYPE_BULLDOZER CPUTYPE_BARCELONA diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index fbf7cd40e4..a0c9794b1c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -74,6 +74,7 @@ extern gotoblas_t gotoblas_STEAMROLLER; extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; @@ -83,6 +84,7 @@ extern gotoblas_t gotoblas_ZEN; //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 31a48644ff..e7332c0c42 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || 
defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 992fc2b953..fcffe63e22 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "HASWELL" #endif +#ifdef FORCE_SKYLAKEX +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SKYLAKEX" +#define ARCHCONFIG "-DSKYLAKEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" +#define LIBNAME "skylakex" +#define CORENAME "SKYLAKEX" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c06d1eae88..947114ebef 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") + if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0664263967..b37e536efa 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), 
SKYLAKEX) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b6c5b54deb..9030d7c6dd 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -871,6 +871,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SKYLAKEX + +#ifdef DEBUG + fprintf(stderr, "SkylakeX\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 0b475afa21..34653d400a 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index e98854f34b..492f343447 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 086852cfce..6840c54adf 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || 
defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 2dd8ad08b2..361ccf6030 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 154276f6ac..11825429ef 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index acdcd6e22b..4c054f3992 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index da561b5833..e674967365 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || 
defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index a11b0286ac..498057697b 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 787ab59822..f3072983d0 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 9a3b0cbd7d..879ae9c383 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index bd7a78b5ae..6c308197b7 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || 
defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX new file mode 100644 index 0000000000..744831d678 --- /dev/null +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -0,0 +1,4 @@ +include $(KERNELDIR)/KERNEL.HASWELL + +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index b1ec19bd3d..586d05ac2d 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 5f01f7eebf..93fca0a0d9 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 770c955b2a..d81766cd40 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index d75e58fdd9..6bdea67871 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 9b9179da04..72af998092 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 4bde62824f..b4acdccd21 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 8162a5d833..0595490288 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 1b9ca7a605..309fbe7677 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 6b99d6fdd0..a7478e3a8b 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 428558617a..2c7b3b17c2 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3e8db3fa3f..73099462c1 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 61cb77a64c..431e4bb3fc 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89fe408a6..d89c4070d7 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index b6f3c21afe..c3ab2ffe6b 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S new file mode 100644 index 0000000000..1fab892ca7 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -0,0 +1,6812 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 
1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm2 + vbroadcastss -1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm2 + vbroadcastss 1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm12,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm3,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro KERNEL16x6_SUB4 + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm7 + vbroadcastss -1 * SIZE(BO), %zmm9 + VFMADD231PS_( %zmm8,%zmm7,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm9,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm11 + vbroadcastss 1 * SIZE(BO), %zmm13 + VFMADD231PS_( %zmm12,%zmm11,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm13,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * 
SIZE(BO), %zmm16
+	vbroadcastss -3 * SIZE(BO), %zmm17
+
+	VFMADD231PS_( %zmm4,%zmm16,%zmm0 )
+	VFMADD231PS_( %zmm6,%zmm17,%zmm0 )
+
+	vbroadcastss -2 * SIZE(BO), %zmm18
+	vbroadcastss -1 * SIZE(BO), %zmm19
+	VFMADD231PS_( %zmm8,%zmm18,%zmm0 )
+	VFMADD231PS_( %zmm10,%zmm19,%zmm0 )
+
+	vbroadcastss 0 * SIZE(BO), %zmm20
+	vbroadcastss 1 * SIZE(BO), %zmm21
+	VFMADD231PS_( %zmm12,%zmm20,%zmm0 )
+	VFMADD231PS_( %zmm14,%zmm21,%zmm0 )
+
+	addq $ 6*SIZE, BO
+	addq $ 16*SIZE, AO
+	decq %rax
+
+	vmovups -16 * SIZE(AO), %zmm0
+	vbroadcastss -4 * SIZE(BO), %zmm22
+	vbroadcastss -3 * SIZE(BO), %zmm23
+
+	VFMADD231PS_( %zmm4,%zmm22,%zmm0 )
+	VFMADD231PS_( %zmm6,%zmm23,%zmm0 )
+
+	vbroadcastss -2 * SIZE(BO), %zmm24
+	vbroadcastss -1 * SIZE(BO), %zmm25
+	VFMADD231PS_( %zmm8,%zmm24,%zmm0 )
+	VFMADD231PS_( %zmm10,%zmm25,%zmm0 )
+
+	vbroadcastss 0 * SIZE(BO), %zmm26
+	vbroadcastss 1 * SIZE(BO), %zmm27
+	VFMADD231PS_( %zmm12,%zmm26,%zmm0 )
+	VFMADD231PS_( %zmm14,%zmm27,%zmm0 )
+
+	addq $ 6*SIZE, BO
+	addq $ 16*SIZE, AO
+	decq %rax
+	vmovups -16 * SIZE(AO), %zmm0
+	vbroadcastss -4 * SIZE(BO), %zmm28
+	vbroadcastss -3 * SIZE(BO), %zmm29
+
+	VFMADD231PS_( %zmm4,%zmm28,%zmm0 )
+	VFMADD231PS_( %zmm6,%zmm29,%zmm0 )
+
+	vbroadcastss -2 * SIZE(BO), %zmm30
+	vbroadcastss -1 * SIZE(BO), %zmm31
+	VFMADD231PS_( %zmm8,%zmm30,%zmm0 )
+	VFMADD231PS_( %zmm10,%zmm31,%zmm0 )
+
+	vbroadcastss 0 * SIZE(BO), %zmm1
+	vbroadcastss 1 * SIZE(BO), %zmm5
+	VFMADD231PS_( %zmm12,%zmm1,%zmm0 )
+	VFMADD231PS_( %zmm14,%zmm5,%zmm0 )
+
+	addq $ 6*SIZE, BO
+	addq $ 16*SIZE, AO
+	decq %rax
+.endm
+/* SAVE16x6: scale the six zmm accumulators (zmm4/6/8/10/12/14) by ALPHA and write a 16x6 tile of C. */
+.macro SAVE16x6
+
+	vbroadcastss ALPHA, %zmm0  // alpha replicated to every single-precision lane; zmm0 is free once the k-loop is done
+
+	vmulps %zmm0 , %zmm4 , %zmm4
+	vmulps %zmm0 , %zmm6 , %zmm6
+	vmulps %zmm0 , %zmm8 , %zmm8
+	vmulps %zmm0 , %zmm10, %zmm10
+	vmulps %zmm0 , %zmm12, %zmm12
+	vmulps %zmm0 , %zmm14, %zmm14
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %zmm4,%zmm4
+
+	vaddps (CO1, LDC), %zmm6,%zmm6
+
+	vaddps (CO1, LDC,2), %zmm8,%zmm8
+
+	vaddps (CO2), %zmm10,%zmm10  // the 6-line callers set CO2 = CO1 + 3*LDC
+
+	vaddps (CO2, LDC), %zmm12,%zmm12
+
+	vaddps (CO2, LDC,2), %zmm14,%zmm14
+
+#endif
+
+	vmovups %zmm4 , (CO1)  // line 0
+
+	vmovups %zmm6 , (CO1, LDC)  // line 1
+
+	vmovups %zmm8 , (CO1, LDC,2)  // line 2
+
+	vmovups %zmm10, (CO2)  // line 3
+
+	vmovups %zmm12, (CO2, LDC)  // line 4
+
+	vmovups %zmm14, (CO2, LDC,2)  // line 5
+
+.endm
+
+
+
+
+/*******************************************************************************************/
+/* KERNEL8x6_SUB: one k-step of the 8x6 tile - 8 floats of A times 6 broadcast B scalars, summed into ymm4/6/8/10/12/14. */
+.macro KERNEL8x6_SUB
+	vmovups -16 * SIZE(AO), %ymm0  // callers pre-bias AO by 16*SIZE, so this is the current A vector
+	vbroadcastss -4 * SIZE(BO), %ymm2  // callers pre-bias BO by 4*SIZE; offsets -4..1 cover the 6 B values of this k-step
+	vbroadcastss -3 * SIZE(BO), %ymm3
+
+	VFMADD231PS_( %ymm4,%ymm2,%ymm0 )  // wrapper defined outside this chunk; presumably ymm4 += ymm2 * ymm0
+	VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+
+	vbroadcastss -2 * SIZE(BO), %ymm2
+	vbroadcastss -1 * SIZE(BO), %ymm3
+	VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+	VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
+
+	vbroadcastss 0 * SIZE(BO), %ymm2
+	vbroadcastss 1 * SIZE(BO), %ymm3
+	VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
+	VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
+
+	addq $ 6*SIZE, BO  // the packed buffer holds 6 B values per k-step
+	addq $ 8*SIZE, AO
+	decq %rax  // must stay last: callers branch (je/jnz) on the flags it sets
+.endm
+/* SAVE8x6: scale ymm4/6/8/10/12/14 by ALPHA and write an 8x6 tile of C. */
+.macro SAVE8x6
+
+	vbroadcastss ALPHA, %ymm0
+
+	vmulps %ymm0 , %ymm4 , %ymm4
+	vmulps %ymm0 , %ymm6 , %ymm6
+	vmulps %ymm0 , %ymm8 , %ymm8
+	vmulps %ymm0 , %ymm10, %ymm10
+	vmulps %ymm0 , %ymm12, %ymm12
+	vmulps %ymm0 , %ymm14, %ymm14
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %ymm4,%ymm4
+	vaddps (CO1, LDC), %ymm6,%ymm6
+	vaddps (CO1, LDC,2), %ymm8,%ymm8
+	vaddps (CO2), %ymm10,%ymm10  // the 6-line callers set CO2 = CO1 + 3*LDC
+	vaddps (CO2, LDC), %ymm12,%ymm12
+	vaddps (CO2, LDC,2), %ymm14,%ymm14
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+	vmovups %ymm6 , (CO1, LDC)
+	vmovups %ymm8 , (CO1, LDC,2)
+	vmovups %ymm10, (CO2)
+	vmovups %ymm12, (CO2, LDC)
+	vmovups %ymm14, (CO2, LDC,2)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL4x6_SUB: one k-step of the 4x6 tile - same scheme as KERNEL8x6_SUB on 128-bit xmm registers. */
+.macro KERNEL4x6_SUB
+	vmovups -16 * SIZE(AO), %xmm0  // 4 floats of A
+	vbroadcastss -4 * SIZE(BO), %xmm2
+	vbroadcastss -3 * SIZE(BO), %xmm3
+
+	VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
+
+	vbroadcastss -2 * SIZE(BO), %xmm2
+	vbroadcastss -1 * SIZE(BO), %xmm3
+	VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
+
+	vbroadcastss 0 * SIZE(BO), %xmm2
+	vbroadcastss 1 * SIZE(BO), %xmm3
+	VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
+
+	addq $ 6*SIZE, BO
+	addq $ 4*SIZE, AO
+	decq %rax  // must stay last: callers branch (je/jnz) on the flags it sets
+.endm
+/* SAVE4x6: scale xmm4/6/8/10/12/14 by ALPHA and write a 4x6 tile of C. */
+.macro SAVE4x6
+
+	vbroadcastss ALPHA, %xmm0
+
+	vmulps %xmm0 , %xmm4 , %xmm4
+	vmulps %xmm0 , %xmm6 , %xmm6
+	vmulps %xmm0 , %xmm8 , %xmm8
+	vmulps %xmm0 , %xmm10, %xmm10
+	vmulps %xmm0 , %xmm12, %xmm12
+	vmulps %xmm0 , %xmm14, %xmm14
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %xmm4,%xmm4
+	vaddps (CO1, LDC), %xmm6,%xmm6
+	vaddps (CO1, LDC,2), %xmm8,%xmm8
+	vaddps (CO2), %xmm10,%xmm10
+	vaddps (CO2, LDC), %xmm12,%xmm12
+	vaddps (CO2, LDC,2), %xmm14,%xmm14
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm6 , (CO1, LDC)
+	vmovups %xmm8 , (CO1, LDC,2)
+	vmovups %xmm10, (CO2)
+	vmovups %xmm12, (CO2, LDC)
+	vmovups %xmm14, (CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL2x6_SUB: scalar k-step of the 2x6 tile; A elements 0/1 in xmm0/xmm1, sums in xmm4..xmm15 (even = element 0, odd = element 1). */
+.macro KERNEL2x6_SUB
+	vmovss -16 * SIZE(AO), %xmm0  // A element 0
+	vmovss -15 * SIZE(AO), %xmm1  // A element 1
+	vmovss -4 * SIZE(BO), %xmm2
+	vmovss -3 * SIZE(BO), %xmm3
+
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )  // wrapper defined outside this chunk; presumably xmm4 += xmm2 * xmm0 (scalar)
+	VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
+
+	vmovss -2 * SIZE(BO), %xmm2
+	vmovss -1 * SIZE(BO), %xmm3
+	VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
+
+	vmovss 0 * SIZE(BO), %xmm2
+	vmovss 1 * SIZE(BO), %xmm3
+	VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm13,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
+
+	addq $ 6*SIZE, BO
+	addq $ 2*SIZE, AO
+	decq %rax  // must stay last: callers branch (je/jnz) on the flags it sets
+.endm
+/* SAVE2x6: scale the twelve scalar sums xmm4..xmm15 by ALPHA and write a 2x6 tile of C. */
+.macro SAVE2x6
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm5 , %xmm5
+	vmulss %xmm0 , %xmm6 , %xmm6
+	vmulss %xmm0 , %xmm7 , %xmm7
+	vmulss %xmm0 , %xmm8 , %xmm8
+	vmulss %xmm0 , %xmm9 , %xmm9
+	vmulss %xmm0 , %xmm10, %xmm10
+	vmulss %xmm0 , %xmm11, %xmm11
+	vmulss %xmm0 , %xmm12, %xmm12
+	vmulss %xmm0 , %xmm13, %xmm13
+	vmulss %xmm0 , %xmm14, %xmm14
+	vmulss %xmm0 , %xmm15, %xmm15
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss 1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss (CO1, LDC), %xmm6,%xmm6
+	vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+	vaddss (CO1, LDC,2), %xmm8,%xmm8
+	vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9
+
+	vaddss (CO2), %xmm10,%xmm10
+	vaddss 1 * SIZE(CO2), %xmm11,%xmm11
+
+	vaddss (CO2, LDC), %xmm12,%xmm12
+	vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13
+
+	vaddss (CO2, LDC,2), %xmm14,%xmm14
+	vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm5 , 1 * SIZE(CO1)
+
+	vmovss %xmm6 , (CO1, LDC)
+	vmovss %xmm7 , 1 * SIZE(CO1, LDC)
+
+	vmovss %xmm8 , (CO1, LDC,2)
+	vmovss %xmm9 , 1 * SIZE(CO1, LDC,2)
+
+	vmovss %xmm10, (CO2)
+	vmovss %xmm11, 1 * SIZE(CO2)
+
+	vmovss %xmm12, (CO2, LDC)
+	vmovss %xmm13, 1 * SIZE(CO2, LDC)
+
+	vmovss %xmm14, (CO2, LDC,2)
+	vmovss %xmm15, 1 * SIZE(CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL1x6_SUB: scalar k-step of the 1x6 tile; one A element in xmm0, sums in xmm4/6/8/10/12/14. */
+.macro KERNEL1x6_SUB
+	vmovss -16 * SIZE(AO), %xmm0
+	vmovss -4 * SIZE(BO), %xmm2
+	vmovss -3 * SIZE(BO), %xmm3
+
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+
+	vmovss -2 * SIZE(BO), %xmm2
+	vmovss -1 * SIZE(BO), %xmm3
+	VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+
+	vmovss 0 * SIZE(BO), %xmm2
+	vmovss 1 * SIZE(BO), %xmm3
+	VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
+
+	addq $ 6*SIZE, BO
+	addq $ 1*SIZE, AO
+	decq %rax  // must stay last: callers branch (je/jnz) on the flags it sets
+.endm
+/* SAVE1x6: scale the six scalar sums by ALPHA and write a 1x6 tile of C. */
+.macro SAVE1x6
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm6 , %xmm6
+	vmulss %xmm0 , %xmm8 , %xmm8
+	vmulss %xmm0 , %xmm10, %xmm10
+	vmulss %xmm0 , %xmm12, %xmm12
+	vmulss %xmm0 , %xmm14, %xmm14
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss (CO1, LDC), %xmm6,%xmm6
+	vaddss (CO1, LDC,2), %xmm8,%xmm8
+	vaddss (CO2), %xmm10,%xmm10
+	vaddss (CO2, LDC), %xmm12,%xmm12
+	vaddss (CO2, LDC,2), %xmm14,%xmm14
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm6 , (CO1, LDC)
+	vmovss %xmm8 , (CO1, LDC,2)
+	vmovss %xmm10, (CO2)
+	vmovss %xmm12, (CO2, LDC)
+	vmovss %xmm14, (CO2, LDC,2)
+
+.endm
+
+
+/*******************************************************************************************/
+
+
+/*******************************************************************************************
+* 4 lines of N
+*******************************************************************************************/
+/* KERNEL16x4_SUB: one k-step of the 16x4 tile; A and B are read through the running indices %rax and BI, sums in zmm4/6/8/10. */
+.macro KERNEL16x4_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0  // 16 floats of A; callers pre-bias AO by 16*SIZE
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2  // callers pre-bias BO by 4*SIZE
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_( %zmm4,%zmm2,%zmm0 )  // wrapper defined outside this chunk; presumably zmm4 += zmm2 * zmm0
+	VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
+	vbroadcastss -2 * SIZE(BO, BI, SIZE), %zmm2
+	vbroadcastss -1 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_( %zmm8,%zmm2,%zmm0 )
+	VFMADD231PS_( %zmm10,%zmm3,%zmm0 )
+	addq $ 4 , BI  // 4 B values consumed
+	addq $ 16, %rax  // must stay last: callers start %rax negative and branch (je/jl) on these flags
+.endm
+/* SAVE16x4: scale zmm4/6/8/10 by ALPHA, write a 16x4 tile of C, then prefetch just past each stored vector. */
+.macro SAVE16x4
+
+	vbroadcastss ALPHA, %zmm0
+
+	vmulps %zmm0 , %zmm4 , %zmm4
+	vmulps %zmm0 , %zmm6 , %zmm6
+	vmulps %zmm0 , %zmm8 , %zmm8
+	vmulps %zmm0 , %zmm10, %zmm10
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %zmm4,%zmm4
+
+	vaddps (CO1, LDC), %zmm6,%zmm6
+
+	vaddps (CO2), %zmm8,%zmm8  // the 4-line callers set CO2 = CO1 + 2*LDC
+
+	vaddps (CO2, LDC), %zmm10,%zmm10
+
+#endif
+
+	vmovups %zmm4 , (CO1)  // line 0
+
+	vmovups %zmm6 , (CO1, LDC)  // line 1
+
+	vmovups %zmm8 , (CO2)  // line 2
+
+	vmovups %zmm10, (CO2, LDC)  // line 3
+	// 64 bytes is the size of the zmm vector just stored; presumably this warms the C data of the next 16-wide tile
+	prefetcht0 64(CO1)
+	prefetcht0 64(CO1, LDC)
+	prefetcht0 64(CO2)
+	prefetcht0 64(CO2, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL8x4_SUB: one k-step of the 8x4 tile; indexed by %rax / BI, sums in ymm4/6/8/10. */
+.macro KERNEL8x4_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0  // 8 floats of A
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+	VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+	vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
+	VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
+	addq $ 4 , BI
+	addq $ 8 , %rax  // must stay last: callers start %rax negative and branch (je/jl) on these flags
+.endm
+/* SAVE8x4: scale ymm4/6/8/10 by ALPHA and write an 8x4 tile of C. */
+.macro SAVE8x4
+
+	vbroadcastss ALPHA, %ymm0
+
+	vmulps %ymm0 , %ymm4 , %ymm4
+	vmulps %ymm0 , %ymm6 , %ymm6
+	vmulps %ymm0 , %ymm8 , %ymm8
+	vmulps %ymm0 , %ymm10, %ymm10
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %ymm4,%ymm4
+	vaddps (CO1, LDC), %ymm6,%ymm6
+	vaddps (CO2), %ymm8,%ymm8
+	vaddps (CO2, LDC), %ymm10,%ymm10
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+	vmovups %ymm6 , (CO1, LDC)
+	vmovups %ymm8 , (CO2)
+	vmovups %ymm10, (CO2, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL4x4_SUB: one k-step of the 4x4 tile; indexed by %rax / BI, sums in xmm4/6/8/10. */
+.macro KERNEL4x4_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0  // 4 floats of A
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
+	vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
+	addq $ 4 , BI
+	addq $ 4 , %rax  // last flag-setting instruction; presumably tested by the caller as in the 16x4 / 8x4 loops
+.endm
+/* SAVE4x4: scale xmm4/6/8/10 by ALPHA and write a 4x4 tile of C. */
+.macro SAVE4x4
+
+	vbroadcastss ALPHA, %xmm0
+
+	vmulps %xmm0 , %xmm4 , %xmm4
+	vmulps %xmm0 , %xmm6 , %xmm6
+	vmulps %xmm0 , %xmm8 , %xmm8
+	vmulps %xmm0 , %xmm10, %xmm10
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %xmm4,%xmm4
+	vaddps (CO1, LDC), %xmm6,%xmm6
+	vaddps (CO2), %xmm8,%xmm8
+	vaddps (CO2, LDC), %xmm10,%xmm10
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm6 , (CO1, LDC)
+	vmovups %xmm8 , (CO2)
+	vmovups %xmm10, (CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL2x4_SUB: scalar k-step of the 2x4 tile; A elements 0/1 in xmm0/xmm1, sums in xmm4..xmm11 (even = element 0, odd = element 1). */
+.macro KERNEL2x4_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
+	vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
+	addq $ 4 , BI
+	addq $ 2, %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE2x4: scale the eight scalar sums xmm4..xmm11 by ALPHA and write a 2x4 tile of C. */
+.macro SAVE2x4
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm5 , %xmm5
+	vmulss %xmm0 , %xmm6 , %xmm6
+	vmulss %xmm0 , %xmm7 , %xmm7
+	vmulss %xmm0 , %xmm8 , %xmm8
+	vmulss %xmm0 , %xmm9 , %xmm9
+	vmulss %xmm0 , %xmm10, %xmm10
+	vmulss %xmm0 , %xmm11, %xmm11
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss 1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss (CO1, LDC), %xmm6,%xmm6
+	vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+	vaddss (CO2), %xmm8,%xmm8
+	vaddss 1 * SIZE(CO2), %xmm9,%xmm9
+
+	vaddss (CO2, LDC), %xmm10,%xmm10
+	vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm5 , 1 * SIZE(CO1)
+
+	vmovss %xmm6 , (CO1, LDC)
+	vmovss %xmm7 , 1 * SIZE(CO1, LDC)
+
+	vmovss %xmm8 , (CO2)
+	vmovss %xmm9 , 1 * SIZE(CO2)
+
+	vmovss %xmm10, (CO2, LDC)
+	vmovss %xmm11, 1 * SIZE(CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL1x4_SUB: scalar k-step of the 1x4 tile; one A element in xmm0, sums in xmm4/6/8/10. */
+.macro KERNEL1x4_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+	vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
+	addq $ 4 , BI
+	addq $ 1, %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE1x4: scale the four scalar sums by ALPHA and write a 1x4 tile of C. */
+.macro SAVE1x4
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm6 , %xmm6
+	vmulss %xmm0 , %xmm8 , %xmm8
+	vmulss %xmm0 , %xmm10, %xmm10
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss (CO1, LDC), %xmm6,%xmm6
+	vaddss (CO2), %xmm8,%xmm8
+	vaddss (CO2, LDC), %xmm10,%xmm10
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm6 , (CO1, LDC)
+	vmovss %xmm8 , (CO2)
+	vmovss %xmm10, (CO2, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 2 lines of N
+*******************************************************************************************/
+/* KERNEL16x2_SUB: one k-step of the 16x2 tile; indexed by %rax / BI, sums in zmm4/zmm6. */
+.macro KERNEL16x2_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0  // 16 floats of A
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3
+	VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
+	VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
+	addq $ 2 , BI  // 2 B values consumed
+	addq $ 16, %rax  // last flag-setting instruction; presumably tested by the caller (its loop is outside this chunk)
+.endm
+/* SAVE16x2: scale zmm4/zmm6 by ALPHA and write a 16x2 tile of C (only CO1 and CO1+LDC are touched). */
+.macro SAVE16x2
+
+	vbroadcastss ALPHA, %zmm0
+
+	vmulps %zmm0 , %zmm4 , %zmm4
+	vmulps %zmm0 , %zmm6 , %zmm6
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %zmm4,%zmm4
+
+	vaddps (CO1, LDC), %zmm6,%zmm6
+
+#endif
+
+	vmovups %zmm4 , (CO1)
+
+	vmovups %zmm6 , (CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL8x2_SUB: one k-step of the 8x2 tile; indexed by %rax / BI, sums in ymm4/ymm6. */
+.macro KERNEL8x2_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
+	VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+	VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
+	addq $ 2 , BI
+	addq $ 8 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE8x2: scale ymm4/ymm6 by ALPHA and write an 8x2 tile of C. */
+.macro SAVE8x2
+
+	vbroadcastss ALPHA, %ymm0
+
+	vmulps %ymm0 , %ymm4 , %ymm4
+	vmulps %ymm0 , %ymm6 , %ymm6
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %ymm4,%ymm4
+	vaddps (CO1, LDC), %ymm6,%ymm6
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+	vmovups %ymm6 , (CO1, LDC)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL4x2_SUB: one k-step of the 4x2 tile; indexed by %rax / BI, sums in xmm4/xmm6. */
+.macro KERNEL4x2_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
+	addq $ 2 , BI
+	addq $ 4 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE4x2: scale xmm4/xmm6 by ALPHA and write a 4x2 tile of C. */
+.macro SAVE4x2
+
+	vbroadcastss ALPHA, %xmm0
+
+	vmulps %xmm0 , %xmm4 , %xmm4
+	vmulps %xmm0 , %xmm6 , %xmm6
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %xmm4,%xmm4
+	vaddps (CO1, LDC), %xmm6,%xmm6
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm6 , (CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL2x2_SUB: scalar k-step of the 2x2 tile; A elements 0/1 in xmm0/xmm1, sums in xmm4..xmm7. */
+.macro KERNEL2x2_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+	VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
+	addq $ 2 , BI
+	addq $ 2, %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE2x2: scale the four scalar sums xmm4..xmm7 by ALPHA and write a 2x2 tile of C. */
+.macro SAVE2x2
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm5 , %xmm5
+	vmulss %xmm0 , %xmm6 , %xmm6
+	vmulss %xmm0 , %xmm7 , %xmm7
+
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss 1 * SIZE(CO1), %xmm5,%xmm5
+
+	vaddss (CO1, LDC), %xmm6,%xmm6
+	vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm5 , 1 * SIZE(CO1)
+
+	vmovss %xmm6 , (CO1, LDC)
+	vmovss %xmm7 , 1 * SIZE(CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL1x2_SUB: scalar k-step of the 1x2 tile; one A element in xmm0, sums in xmm4/xmm6. */
+.macro KERNEL1x2_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
+	addq $ 2 , BI
+	addq $ 1, %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE1x2: scale the two scalar sums by ALPHA and write a 1x2 tile of C. */
+.macro SAVE1x2
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm6 , %xmm6
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss (CO1, LDC), %xmm6,%xmm6
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm6 , (CO1, LDC)
+
+.endm
+
+
+/*******************************************************************************************/
+
+/*******************************************************************************************
+* 1 line of N
+*******************************************************************************************/
+/* KERNEL16x1_SUB: one k-step of the 16x1 tile - 16 floats of A times one broadcast B scalar, summed into zmm4. */
+.macro KERNEL16x1_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0  // A vector, addressed through the running index %rax
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2  // B scalar, addressed through the running index BI
+	VFMADD231PS_( %zmm4,%zmm2,%zmm0 )  // wrapper defined outside this chunk; presumably zmm4 += zmm2 * zmm0
+	addq $ 1 , BI  // 1 B value consumed
+	addq $ 16, %rax  // last flag-setting instruction; NOTE(review): callers are outside this chunk, presumably they branch on these flags
+.endm
+/* SAVE16x1: scale zmm4 by ALPHA and write a 16x1 tile of C at CO1. */
+.macro SAVE16x1
+
+	vbroadcastss ALPHA, %zmm0  // alpha replicated to every single-precision lane
+
+	vmulps %zmm0 , %zmm4 , %zmm4
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %zmm4,%zmm4
+
+#endif
+
+	vmovups %zmm4 , (CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL8x1_SUB: one k-step of the 8x1 tile; same scheme on ymm registers, sum in ymm4. */
+.macro KERNEL8x1_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
+	VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
+	addq $ 1 , BI
+	addq $ 8 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE8x1: scale ymm4 by ALPHA and write an 8x1 tile of C at CO1. */
+.macro SAVE8x1
+
+	vbroadcastss ALPHA, %ymm0
+
+	vmulps %ymm0 , %ymm4 , %ymm4
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %ymm4,%ymm4
+
+#endif
+
+	vmovups %ymm4 , (CO1)
+
+.endm
+
+
+
+/*******************************************************************************************/
+/* KERNEL4x1_SUB: one k-step of the 4x1 tile; same scheme on xmm registers, sum in xmm4. */
+.macro KERNEL4x1_SUB
+	vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
+	addq $ 1 , BI
+	addq $ 4 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE4x1: scale xmm4 by ALPHA and write a 4x1 tile of C at CO1. */
+.macro SAVE4x1
+
+	vbroadcastss ALPHA, %xmm0
+
+	vmulps %xmm0 , %xmm4 , %xmm4
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddps (CO1), %xmm4,%xmm4
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL2x1_SUB: scalar k-step of the 2x1 tile; A elements 0/1 in xmm0/xmm1, sums in xmm4/xmm5. */
+.macro KERNEL2x1_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0  // A element 0
+	vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1  // A element 1
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )  // wrapper defined outside this chunk; presumably xmm4 += xmm2 * xmm0 (scalar)
+	VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
+	addq $ 1 , BI
+	addq $ 2 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE2x1: scale the two scalar sums xmm4/xmm5 by ALPHA and write a 2x1 tile of C at CO1. */
+.macro SAVE2x1
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+	vmulss %xmm0 , %xmm5 , %xmm5
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the values already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+	vaddss 1 * SIZE(CO1), %xmm5,%xmm5
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+	vmovss %xmm5 , 1 * SIZE(CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+/* KERNEL1x1_SUB: scalar k-step of the 1x1 tile; one A element times one B element, sum in xmm4. */
+.macro KERNEL1x1_SUB
+	vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
+	vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
+	VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
+	addq $ 1 , BI
+	addq $ 1 , %rax  // last flag-setting instruction; presumably tested by the caller
+.endm
+/* SAVE1x1: scale xmm4 by ALPHA and write a single element of C at CO1. */
+.macro SAVE1x1
+
+	vmovss ALPHA, %xmm0
+
+	vmulss %xmm0 , %xmm4 , %xmm4
+
+#if !defined(TRMMKERNEL)
+	// GEMM build only: add the value already stored in C
+	vaddss (CO1), %xmm4,%xmm4
+
+#endif
+
+	vmovss %xmm4 , (CO1)
+
+.endm
+
+
+/*******************************************************************************************/
+
+#if !defined(TRMMKERNEL)
+
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+
+	subq $STACKSIZE, %rsp
+	movq %rbx, (%rsp)
+	movq %rbp, 8(%rsp)
+	movq %r12, 16(%rsp)
+	movq %r13, 24(%rsp)
+	movq %r14, 32(%rsp)
+	movq %r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq %rdi, 48(%rsp)
+	movq %rsi, 56(%rsp)
+	movups %xmm6, 64(%rsp)
+	movups %xmm7, 80(%rsp)
+	movups %xmm8, 96(%rsp)
+	movups %xmm9, 112(%rsp)
+	movups %xmm10, 128(%rsp)
+	movups %xmm11, 144(%rsp)
+	movups %xmm12, 160(%rsp)
+	movups %xmm13, 176(%rsp)
+	movups %xmm14, 192(%rsp)
+	movups %xmm15, 208(%rsp)
+
+	movq ARG1, OLD_M
+	movq ARG2, OLD_N
+	movq ARG3, OLD_K
+	movq OLD_A, A
+	movq OLD_B, B
+	movq OLD_C, C
+	movq OLD_LDC, LDC
+#ifdef TRMMKERNEL
+	vmovsd OLD_OFFSET, %xmm12
+#endif
+	vmovaps %xmm3, %xmm0
+
+#else
+	movq STACKSIZE + 8(%rsp), LDC
+#ifdef TRMMKERNEL
+	movsd STACKSIZE + 16(%rsp), %xmm12
+#endif
+
+#endif
+
+	movq %rsp, SP # save old stack
+	subq $128 + L_BUFFER_SIZE, %rsp
+	andq $-4096, %rsp # align stack
+
+
STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz 
.L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, 
CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 
2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + 
KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + 
KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq 
C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) 
+ KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax 
= rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of 
values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + 
KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + 
addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 
2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + 
KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + 
+/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + 
SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: 
+#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + 
KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + 
movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef 
TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + 
KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, 
%rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // 
number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + 
PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + 
decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, 
SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 4 lines of N + + testq
$8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + 
SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB 
+ + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq 
%rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq 
(AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq 
Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j & 2 : is a 2-column panel left? + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO
+ leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq
(BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + 
KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in 
BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b 
+ +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: 
+ + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + 
KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq 
$-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax 
+ movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef 
TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq 
$STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index fd028964be..65305ac59f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index f04d461f77..065e5b3852 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 199d8a5176..73ae001ea8 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 691a071f73..f37c251a18 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8cae3fc1b8..8a5c44c9ba 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index d7091624d5..0c40a3435e 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 3549b98637..7a2eeace59 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 882b035a90..0408b577c7 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 
+57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 8cb1d532f1..53866cf954 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index d11c76647c..ef12569c89 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index f6f88155c6..0fedc496b9 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 3e4b7d5dfc..2ab7a671bb 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index aa5d8fac00..2a6d0e4c79 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index dd95eea174..e44bd75506 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 75124cf3ed..e9f330c365 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index db1a4ff5f8..9f0dead180 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 599765a6de..b6106a37d7 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 4227d548e8..49a5e85e89 100644 --- a/param.h +++ b/param.h @@ -1613,6 +1613,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#endif + +#ifdef SKYLAKEX + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#ifdef WINDOWS_ABI +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 128 +#else +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 
+#endif +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif + + #endif From 00235157339dc5fba2b4194bd660c45257e539e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:22:59 +0200 Subject: [PATCH 02/16] Typo fix (misplaced parenthesis) --- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 361ccf6030..e2f731fca8 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX)) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif From f1fb9a474571846ffc140313dbe5b8ba21925b74 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:48:27 +0200 Subject: [PATCH 03/16] Propagate NO_AVX512 if needed --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index b005b80c9f..cec4b44e55 100644 --- a/Makefile.system +++ 
b/Makefile.system @@ -147,6 +147,10 @@ ifeq ($(NO_AVX2), 1) GETARCH_FLAGS += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +GETARCH_FLAGS += -DNO_AVX512 +endif + ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif From a7d0f49cec68dc3f116feed0320708ae004af4c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:13:25 +0200 Subject: [PATCH 04/16] Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available --- Makefile.system | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index cec4b44e55..82e38a6d28 100644 --- a/Makefile.system +++ b/Makefile.system @@ -477,7 +477,12 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN SKYLAKEX +DYNAMIC_CORE += HASWELL ZEN +endif +ifneq ($(NO_AVX512), 1) +ifneq ($(NO_AVX2), 1) +DYNAMIC_CORE += SKYLAKEX +endif endif endif From 5a92b311e05fb938e1fd85dcaf6fbeebc77bd4fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:29:07 +0200 Subject: [PATCH 05/16] Separate Skylake X from Skylake --- cpuid_x86.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 5f49e77157..d0dbe1d24e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1301,6 +1301,19 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) @@ -1558,6 +1571,7 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + "SKYLAKEX" }; static char *lowercpuname[] = { @@ -1612,6 +1626,7 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", + "skylakex" }; static char *corename[] = { @@ -1643,6 +1658,7 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + 
"SKYLAKEX" }; static char *corename_lower[] = { @@ -1674,6 +1690,7 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", + "skylakex" }; @@ -1862,6 +1879,19 @@ int get_coretype(void){ else return CORE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if/support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) From 5a51cf4576df2e065e5517b04369ff10a2a83f58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:41:33 +0200 Subject: [PATCH 06/16] Separate Skylake X from Skylake --- driver/others/dynamic.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a0c9794b1c..5e9a24b8b5 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -79,6 +79,11 @@ extern gotoblas_t gotoblas_EXCAVATOR; #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#endif #endif #else //Use NEHALEM kernels for sandy bridge @@ -286,8 +291,21 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } + if (model == 5) { + // Intel Skylake X +#ifndef NO_AVX512 + return $gotoblas_SKYLAKEX; +#else + if(support_avx()) + return &gotoblas_HASWELL; + else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } +#endif //Intel Skylake - if (model == 14 || model == 5) { + if (model == 14) { if(support_avx()) return &gotoblas_HASWELL; else{ @@ -447,7 +465,8 @@ static char *corename[] = { "Haswell", "Steamroller", "Excavator", - "Zen" + "Zen", + "SkylakeX" }; char *gotoblas_corename(void) { @@ -475,7 +494,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; - + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; return corename[0]; } @@ -505,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); From ef626c6824c26415bc074d11325245e72f9e3284 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:13:19 +0200 Subject: [PATCH 07/16] typo fix --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5e9a24b8b5..2c902d1083 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -294,7 +294,7 @@ static gotoblas_t *get_coretype(void){ if (model == 5) { // Intel Skylake X #ifndef NO_AVX512 - return $gotoblas_SKYLAKEX; + return &gotoblas_SKYLAKEX; #else if(support_avx()) return &gotoblas_HASWELL; From 89372e0993b7d9fe9061797625713519392fa42b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 22:15:09 +0000 Subject: [PATCH 08/16] Use AVX512 also for DGEMM this required switching to the generic gemm_beta code (which is faster anyway on SKX) for both DGEMM and SGEMM 
Performance gain for the not-retuned version is in the 30% range --- kernel/x86_64/KERNEL.SKYLAKEX | 15 + kernel/x86_64/dgemm_kernel_16x2_skylakex.S | 5138 ++++++++++++++++++++ kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 3 +- 3 files changed, 5154 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.S diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 744831d678..c273ff8cd1 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + +DTRMMKERNEL = ../generic/trmmkernel_16x2.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.S b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S new file mode 100644 index 0000000000..91ac512805 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S @@ -0,0 +1,5138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2.
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define 
STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + /* FMA wrappers. The first macro argument is both the addend and the destination, so VFMADD231PD_ a,b,c yields a = b*c + a (VFMADD231SD_ is the scalar form). The BULLDOZER branch spells this with the four-operand vfmaddpd/vfmaddsd instructions, the other branch with vfmadd231pd/vfmadd231sd. */ +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + /* Displacements used by the prefetcht0 instructions in the kernels: A_PR1 with (AO, %rax, SIZE) addressing, B_PR1 with (BO). */ +#define A_PR1 1024 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + /* One k-step on 512-bit registers: broadcasts three B values (BO offsets -12..-10) into zmm1-zmm3, loads two A vectors (AO offsets -16 and -8, into zmm0 and zmm9) and accumulates the six products into zmm4-zmm6 and zmm10-zmm12, then advances BO by 3*SIZE and AO by 16*SIZE. NOTE(review): vmovaps with zmm operands requires 64-byte aligned addresses - confirm AO satisfies that on this path. */ +.macro KERNEL16x3_SUBN + vbroadcastsd -12 * SIZE(BO), %zmm1 + vbroadcastsd -11 * SIZE(BO), %zmm2 + vbroadcastsd -10 * SIZE(BO), %zmm3 + + vmovaps -16 * SIZE(AO), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + + vmovaps -8 * SIZE(AO), %zmm9 + VFMADD231PD_ %zmm10,%zmm1,%zmm9 + VFMADD231PD_ %zmm11,%zmm2,%zmm9 + VFMADD231PD_ %zmm12,%zmm3,%zmm9 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + /* Same step on 256-bit registers: A vectors at AO offsets -16 and -12 feed ymm4-ymm6 and ymm7-ymm9; also issues prefetcht0 B_PR1(BO); advances BO by 3*SIZE and AO by 8*SIZE. */ +.macro KERNEL8x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + /* One ymm load of A against three broadcast B values; accumulators ymm4-ymm6; advances BO by 3*SIZE and AO by 4*SIZE. */ +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + /* Scalar variant (vmovsd / VFMADD231SD_) for two A elements at AO offsets -16 and -15; accumulators xmm4-xmm6 and xmm8, xmm10, xmm12; advances BO by 3*SIZE and AO by 2*SIZE. */ +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 +
VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + /* Scalar step for a single A element (AO offset -16) against three B values; accumulators xmm4-xmm6; advances BO by 3*SIZE and AO by 1*SIZE. */ +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + /* KERNEL16x3_1 .. KERNEL16x3_4 are four consecutive k-steps addressed through the running indices BI (for BO) and %rax (for AO). Each step loads two zmm vectors of A into zmm0 and accumulates into zmm4-zmm6 and zmm10-zmm12. Steps 1-3 end by broadcasting the first two B values of the following step into zmm1/zmm2, so steps 2-4 must run directly after their predecessor. Only _4 advances BI and %rax. */ +.macro KERNEL16x3_1 + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %zmm2 +.endm + + + + /* Second unrolled step: relies on zmm1/zmm2 already holding B offsets -3 and -2; A offsets -16 and -8. */ +.macro KERNEL16x3_2 + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %zmm2 +.endm + /* Third unrolled step: relies on zmm1/zmm2 already holding B offsets 0 and 1; A offsets 0 and 8. */ +.macro KERNEL16x3_3 + vmovups 0 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 8 * SIZE(AO, %rax, SIZE),
%zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %zmm2 +.endm + /* Fourth unrolled step: relies on zmm1/zmm2 already holding B offsets 3 and 4. It is the only one of the four that advances the indices: BI by 12 and %rax by 64. */ +.macro KERNEL16x3_4 + vmovups 16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + addq $12, BI + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $64, %rax +.endm + /* Stand-alone single k-step: same loads and accumulators as KERNEL16x3_1 but without the look-ahead broadcasts; advances BI by 3 and %rax by 16. */ +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $3 , BI + addq $16, %rax +.endm + /* Write-back for the zmm accumulators: multiplies zmm4-zmm6 and zmm10-zmm12 by the broadcast ALPHA, adds the values currently stored at (CO1), (CO1, LDC) and (CO1, LDC, 2) (offsets 0 and 8*SIZE) unless TRMMKERNEL is defined, then stores the six vectors to those same addresses. zmm0 is overwritten with ALPHA. */ +.macro SAVE16x3 + + vbroadcastsd ALPHA, %zmm0 + + vmulpd %zmm0 , %zmm4 , %zmm4 + vmulpd %zmm0 , %zmm10, %zmm10 + + vmulpd %zmm0 , %zmm5 , %zmm5 + vmulpd %zmm0 , %zmm11, %zmm11 + + vmulpd %zmm0 , %zmm6 , %zmm6 + vmulpd %zmm0 , %zmm12, %zmm12 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %zmm4,%zmm4 + vaddpd 8 * SIZE(CO1), %zmm10,%zmm10 + + vaddpd (CO1, LDC), %zmm5,%zmm5 + vaddpd 8 * SIZE(CO1, LDC), %zmm11,%zmm11 + + vaddpd (CO1, LDC, 2), %zmm6,%zmm6 + vaddpd 8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12 + +#endif + + vmovups %zmm4 , (CO1) + vmovups %zmm10, 8 * SIZE(CO1) + + vmovups %zmm5 , (CO1, LDC) + vmovups %zmm11, 8 * SIZE(CO1, LDC) + + vmovups %zmm6 , (CO1, LDC, 2) + vmovups %zmm12, 8 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL8x3_1 .. KERNEL8x3_4: four unrolled k-steps on ymm registers. Each step prefetches A at A_PR1 plus 0, 64, 128 or 192, broadcasts three B values into ymm1-ymm3, loads two ymm vectors of A and accumulates into ymm4-ymm6 and ymm7-ymm9. Only _4 advances BI and %rax. */ +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO,
BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + /* Second unrolled 8x3 step: B offsets -3..-1, A offsets -24 and -20, prefetch at 64+A_PR1. */ +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + /* Third unrolled 8x3 step: B offsets 0..2, A offsets -16 and -12, prefetch at 128+A_PR1. */ +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + /* Fourth unrolled 8x3 step: B offsets 3..5, A offsets -8 and -4, prefetch at 192+A_PR1; advances BI by 12 and %rax by 32. */ +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + /* Stand-alone single 8x3 k-step without prefetch; advances BI by 3 and %rax by 8. */ +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO,
%rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + /* Write-back for ymm4-ymm9: scales by the broadcast ALPHA, adds the values currently stored at (CO1), (CO1, LDC) and (CO1, LDC, 2) (offsets 0 and 4*SIZE) unless TRMMKERNEL is defined, then stores the six vectors to those addresses. ymm0 is overwritten with ALPHA. */ +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL4x3_1 .. KERNEL4x3_4: four unrolled k-steps, each with one ymm load of A and three broadcast B values accumulated into ymm4-ymm6. Steps 1 and 3 issue a prefetch; only _4 advances BI and %rax. */ +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + /* Second unrolled 4x3 step: B offsets -3..-1, A offset -28. */ +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + /* Third unrolled 4x3 step. NOTE(review): this prefetch uses the same A_PR1 displacement as KERNEL4x3_1, whereas KERNEL4x2_3 uses 64+A_PR1 - confirm which is intended. */ +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE),
%ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + /* Fourth unrolled 4x3 step: B offsets 3..5, A offset -20; advances BI by 12 and %rax by 16. */ +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + /* Stand-alone single 4x3 k-step; advances BI by 3 and %rax by 4. */ +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + /* Write-back for ymm4-ymm6: scales by the broadcast ALPHA, adds the values at (CO1), (CO1, LDC) and (CO1, LDC, 2) unless TRMMKERNEL is defined, then stores the three vectors there. ymm0 is overwritten with ALPHA. */ +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + /* KERNEL2x3_1 .. KERNEL2x3_4: four unrolled scalar k-steps (vmovsd / VFMADD231SD_). Each step loads three B scalars into xmm1-xmm3 and two consecutive A scalars into xmm0, accumulating into xmm4-xmm6 for the first A element and xmm8, xmm10, xmm12 for the second. Only _1 prefetches and only _4 advances BI and %rax. */ +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + /* Second unrolled 2x3 step: B offsets -3..-1, A offsets -30 and -29. */ +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI,
SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + /* Third unrolled 2x3 step: B offsets 0..2, A offsets -28 and -27. */ +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + /* Fourth unrolled 2x3 step: B offsets 3..5, A offsets -26 and -25; advances BI by 12 and %rax by 8. */ +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + /* Stand-alone single 2x3 scalar k-step without prefetch; advances BI by 3 and %rax by 2. */ +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + /* Scalar write-back: scales xmm4, xmm8, xmm5, xmm10, xmm6 and xmm12 by ALPHA, adds the scalars at offsets 0 and 1*SIZE from (CO1), (CO1, LDC) and (CO1, LDC, 2) unless TRMMKERNEL is defined, then stores the six scalars there. xmm0 is overwritten with ALPHA. */ +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 +
vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + /* KERNEL1x3_1 .. KERNEL1x3_4: four unrolled scalar k-steps, each multiplying one A scalar (offsets -32, -31, -30, -29) by three B scalars and accumulating into xmm4-xmm6. Only _4 advances BI and %rax. */ +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + /* Second unrolled 1x3 step: B offsets -3..-1, A offset -31. */ +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + /* Third unrolled 1x3 step: B offsets 0..2, A offset -30. */ +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + /* Fourth unrolled 1x3 step: B offsets 3..5, A offset -29; advances BI by 12 and %rax by 4. */ +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + /* Stand-alone single 1x3 scalar k-step; advances BI by 3 and %rax by 1. */ +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + /* Scalar write-back of xmm4-xmm6 scaled by ALPHA to (CO1), (CO1, LDC) and (CO1, LDC, 2); the existing values are added first unless TRMMKERNEL is defined. */ +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd
%xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + /* KERNEL16x2_1 .. KERNEL16x2_4: four unrolled k-steps that, unlike the zmm-based KERNEL16x3 macros, still use 256-bit ymm registers with four A loads per step. Accumulators are ymm4, ymm7, ymm10, ymm13 for the first broadcast B value and ymm5, ymm8, ymm11, ymm14 for the second. Each step issues two prefetches; only _4 advances BI and %rax. */ +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + /* Second unrolled 16x2 step: B offsets -2 and -1, A offsets -16..-4, prefetches at 128+A_PR1 and 192+A_PR1. */ +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + /* Third unrolled 16x2 step: B offsets 0 and 1, A offsets 0..12, prefetches at 256+A_PR1 and 320+A_PR1. */ +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI,
SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + /* Fourth unrolled 16x2 step: B offsets 2 and 3, A offsets 16..28, prefetches at 384+A_PR1 and 448+A_PR1; advances BI by 8 and %rax by 64. */ +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + /* Stand-alone single 16x2 k-step without prefetch; advances BI by 2 and %rax by 16. */ +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + /* Write-back for the eight ymm accumulators: scales by the broadcast ALPHA, adds the values at offsets 0, 4, 8 and 12*SIZE from (CO1) and (CO1, LDC) unless TRMMKERNEL is defined, then stores them there. ymm0 is overwritten with ALPHA. */ +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd
%ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL8x2_1 .. KERNEL8x2_4: four unrolled k-steps on ymm registers, each with one prefetch (A_PR1 plus 0, 64, 128 or 192), two broadcast B values and two A loads; accumulators ymm4, ymm7 (first B value) and ymm5, ymm8 (second). Only _4 advances BI and %rax. */ +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + /* Second unrolled 8x2 step: B offsets -2 and -1, A offsets -24 and -20. */ +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + /* Third unrolled 8x2 step: B offsets 0 and 1, A offsets -16 and -12. */ +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + /* Fourth unrolled 8x2 step: B offsets 2 and 3, A offsets -8 and -4; advances BI by 8 and %rax by 32. */ +.macro KERNEL8x2_4 +
prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + /* Stand-alone single 8x2 k-step without prefetch; advances BI by 2 and %rax by 8. */ +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + /* Write-back for ymm4, ymm7, ymm5 and ymm8: scales by the broadcast ALPHA, adds the values at offsets 0 and 4*SIZE from (CO1) and (CO1, LDC) unless TRMMKERNEL is defined, then stores them there. ymm0 is overwritten with ALPHA. */ +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL4x2_1 .. KERNEL4x2_4: four unrolled k-steps, each with one ymm load of A and two broadcast B values accumulated into ymm4 and ymm5. Steps 1 and 3 prefetch (A_PR1 and 64+A_PR1); only _4 advances BI and %rax. */ +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + /* Second unrolled 4x2 step: B offsets -2 and -1, A offset -28. */ +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + /* Third unrolled 4x2 step: B offsets 0 and 1, A offset -24, prefetch at 64+A_PR1. */ +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 +
VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + /* Fourth unrolled 4x2 step: B offsets 2 and 3, A offset -20; advances BI by 8 and %rax by 16. */ +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + /* Stand-alone single 4x2 k-step; advances BI by 2 and %rax by 4. */ +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + /* Write-back for ymm4 and ymm5: scales by the broadcast ALPHA, adds the values at (CO1) and (CO1, LDC) unless TRMMKERNEL is defined, then stores them there. ymm0 is overwritten with ALPHA. */ +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + /* KERNEL2x2_1 .. KERNEL2x2_4: four unrolled scalar k-steps, each with two B scalars (xmm1, xmm2) and two consecutive A scalars; accumulators xmm4, xmm5 for the first A element and xmm8, xmm10 for the second. Only _1 prefetches and only _4 advances BI and %rax. */ +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + /* Second unrolled 2x2 step: B offsets -2 and -1, A offsets -30 and -29. */ +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + /* Third unrolled 2x2 step: B offsets 0 and 1, A offsets -28 and -27. */ +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +
VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + /* Fourth unrolled 2x2 step: B offsets 2 and 3, A offsets -26 and -25; advances BI by 8 and %rax by 8. */ +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + /* Stand-alone single 2x2 scalar k-step without prefetch; advances BI by 2 and %rax by 2. */ +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + /* Scalar write-back: scales xmm4, xmm8, xmm5 and xmm10 by ALPHA, adds the scalars at offsets 0 and 1*SIZE from (CO1) and (CO1, LDC) unless TRMMKERNEL is defined, then stores them there. xmm0 is overwritten with ALPHA. */ +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + /* KERNEL1x2_1 .. KERNEL1x2_4: four unrolled scalar k-steps, each multiplying one A scalar (offsets -32, -31, -30, -29) by two B scalars and accumulating into xmm4 and xmm5. Only _4 advances BI and %rax. */ +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + /* Second unrolled 1x2 step: B offsets -2 and -1, A offset -31. */ +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + /* Third unrolled 1x2 step: B offsets 0 and 1, A offset -30. */ +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro
KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + /* Stand-alone single 1x2 scalar k-step; advances BI by 2 and %rax by 1. */ +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + /* Scalar write-back of xmm4 and xmm5 scaled by ALPHA to (CO1) and (CO1, LDC); the existing values are added first unless TRMMKERNEL is defined. xmm0 is overwritten with ALPHA. */ +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + /* KERNEL16x1_1 .. KERNEL16x1_4: four unrolled k-steps on ymm registers, each broadcasting one B value into ymm1 and multiplying it with four A loads; accumulators ymm4, ymm7, ymm10, ymm13. No prefetches are issued; only _4 advances BI and %rax. */ +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + /* Second unrolled 16x1 step: B offset -1, A offsets -16..-4. */ +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + /* Third unrolled 16x1 step: B offset 0, A offsets 0..12. */ +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO,
%rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + /* Fourth unrolled 16x1 step: B offset 1, A offsets 16..28; advances BI by 4 and %rax by 64. */ +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + /* Stand-alone single 16x1 k-step; advances BI by 1 and %rax by 16. */ +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + /* Write-back for ymm4, ymm7, ymm10 and ymm13: scales by the broadcast ALPHA, adds the values at offsets 0, 4, 8 and 12*SIZE from (CO1) unless TRMMKERNEL is defined, then stores them there. ymm0 is overwritten with ALPHA. */ +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL8x1_1 .. KERNEL8x1_4: four unrolled k-steps, each broadcasting one B value into ymm1 and multiplying it with two ymm loads of A; accumulators ymm4 and ymm7. Only _4 advances BI and %rax. */ +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + /* Second unrolled 8x1 step: B offset -1, A offsets -24 and -20. */ +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_
%ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + /* Third unrolled 8x1 step: B offset 0, A offsets -16 and -12. */ +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + /* Fourth unrolled 8x1 step: B offset 1, A offsets -8 and -4; advances BI by 4 and %rax by 32. */ +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + /* Stand-alone single 8x1 k-step; advances BI by 1 and %rax by 8. */ +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + /* Write-back for ymm4 and ymm7: scales by the broadcast ALPHA, adds the values at offsets 0 and 4*SIZE from (CO1) unless TRMMKERNEL is defined, then stores them there. ymm0 is overwritten with ALPHA. */ +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + /* KERNEL4x1_1 .. KERNEL4x1_4: four unrolled k-steps, each multiplying one broadcast B value by one ymm load of A (offsets -32, -28, -24, -20) and accumulating into ymm4. Only _4 advances BI and %rax. */ +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + /* Second unrolled 4x1 step: B offset -1, A offset -28. */ +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + /* Third unrolled 4x1 step: B offset 0, A offset -24. */ +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + /* Fourth unrolled 4x1 step: B offset 1, A offset -20; advances BI by 4 and %rax by 16. */ +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + /* Stand-alone single 4x1 k-step; advances BI by 1 and %rax by 4. */ +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_
%ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ 
%xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq 
$BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), 
BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 
(CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 2 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + 
KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + 
prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + 
KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // 
copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq 
%rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, 
%rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; 
number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + 
movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + 
KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index 
for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 7) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 
+ movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 (slot name kept from the GEMM path) + movq %rdx, Nmod6 // N % 2 (slot name kept from the GEMM path) + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO 
+ movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), 
BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax 
+#endif + + andq $7, %rax # if (k & 7) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of 
values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + 
vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number 
of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
+.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax 
// number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + 
+#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq 
%rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 7) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), 
%xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S index 1fab892ca7..ac4421252d 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) +# prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) @@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) From ac7b6e3e9aeffe111a0ef23ba74ac2b181b87e30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 08:23:40 +0200 Subject: [PATCH 09/16] Fix misplaced endif --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2c902d1083..ac1186c8ff 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -302,8 +302,8 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } +#endif } -#endif //Intel Skylake if (model == 14) { if(support_avx()) From 8be027e4c62460f373980e883c487a30a15b5a5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 14:36:39 +0200 Subject: [PATCH 10/16] Update dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ac1186c8ff..96612cc52f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -82,7 
+82,7 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; #else -#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#define gotoblas_SKYLAKEX gotoblas_HASWELL #endif #endif #else From dc9fe05ab5845452d684746bb7b7b7ad400c0c31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 17:10:19 +0200 Subject: [PATCH 11/16] Update cpuid_x86.c --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index d0dbe1d24e..fc937865cb 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1883,7 +1883,7 @@ int get_coretype(void){ #ifndef NO_AVX512 return CORE_SKYLAKEX; #else - if/support_avx()) + if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else From b7feded85acaf95d68ed4cfd573e60c83fdbca5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:24:05 +0200 Subject: [PATCH 12/16] Propagate NO_AVX512 via CCOMMON_OPT --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 82e38a6d28..8c875d6f78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -939,6 +939,10 @@ ifeq ($(NO_AVX2), 1) CCOMMON_OPT += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +CCOMMON_OPT += -DNO_AVX512 +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER From 38ad05bd0484ea723a42415f986cf0db24e01ca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:26:49 +0200 Subject: [PATCH 13/16] Extend loop range to find SkylakeX in force_coretype --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 96612cc52f..acb2d8b8c2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -506,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 23; i++) + for ( i=1 ; i <= 24; i++) { if (!strncasecmp(coretype,corename[i],20)) { From 15a78d6b662569a464de9a00517897b036fe7886 Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 15:58:34 +0200 Subject: [PATCH 14/16] export NO_AVX512 setting --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 8c875d6f78..eaf3e98891 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1249,6 +1249,7 @@ export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE +export NO_AVX512 export SGEMM_UNROLL_M export SGEMM_UNROLL_N From e8002536ec90b74148abce1c3de9bca0061dbe32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 18:23:01 +0200 Subject: [PATCH 15/16] disable quiet_make for the moment --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index eaf3e98891..5c16e2bee7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -ifeq ($(QUIET_MAKE), 1) -MAKE += -s -endif +#ifeq ($(QUIET_MAKE), 1) +#MAKE += -s +#endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 From f6021c798dea23685af3eedcb63c4a388c78f226 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 19:09:38 +0200 Subject: [PATCH 16/16] Re-enable QUIET_MAKE --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 5c16e2bee7..eaf3e98891 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -#ifeq ($(QUIET_MAKE), 1) -#MAKE += -s -#endif +ifeq ($(QUIET_MAKE), 1) +MAKE += -s +endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0