Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ref #103: enhancement for small matrix dimensions #390

Merged
merged 16 commits into from
Jun 27, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,13 @@ ifndef BINARY64
else
@echo " BINARY ... 64bit "
endif

ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
endif

@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
Expand Down
3 changes: 2 additions & 1 deletion Makefile.rule
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ NO_AFFINITY = 1
# COMMON_OPT = -O2

# gfortran option for LAPACK
FCOMMON_OPT = -frecursive
# Enable this flag only on 64-bit Linux, and only if you need a thread-safe LAPACK library
# FCOMMON_OPT = -frecursive

# Profiling flags
COMMON_PROF = -pg
Expand Down
68 changes: 65 additions & 3 deletions Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,55 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
endif

# Force fallback targets for 32-bit builds (TARGET)

ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif


#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif

# Force fallback targets for 32-bit builds (TARGET_CORE)

ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif




ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT
endif
endif

ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
Expand All @@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
Expand Down Expand Up @@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif

ifeq ($(ARCH), x86_64)
Expand Down Expand Up @@ -503,8 +544,10 @@ else
ifdef BINARY64
FCOMMON_OPT += -m64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
endif
endif
else
FCOMMON_OPT += -m32
endif
Expand All @@ -517,8 +560,10 @@ endif
ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifdef USE_OPENMP
FCOMMON_OPT += -openmp
endif
Expand All @@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64
FCOMMON_OPT += -q64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8
endif
endif
else
FCOMMON_OPT += -q32
endif
Expand All @@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
Expand All @@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif

ifneq ($(ARCH), mips64)
ifndef BINARY64
Expand All @@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif

ifeq ($(ARCH), mips64)
ifndef BINARY64
Expand Down Expand Up @@ -682,10 +735,12 @@ endif

ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
endif

ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM)
Expand Down Expand Up @@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif

ifeq ($(BINARY), 32)
CCOMMON_OPT += -DNO_AVX
endif

ifdef SMP
CCOMMON_OPT += -DSMP_SERVER

Expand Down Expand Up @@ -872,8 +931,11 @@ endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
endif

ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif
Expand Down
2 changes: 2 additions & 0 deletions cpuid_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <string.h>
#include "cpuid.h"

/*
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
Expand All @@ -50,6 +51,7 @@
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif
*/

#ifndef CPUIDEMU

Expand Down
2 changes: 1 addition & 1 deletion driver/others/divtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
#include "common.h"

#ifdef SMP
#ifndef USE64BITINT
#if !defined(USE64BITINT) || defined(ARCH_X86)
unsigned int blas_quick_divide_table[] = {
0x00000000, 0x00000001, 0x80000001, 0x55555556,
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,
Expand Down
63 changes: 56 additions & 7 deletions interface/gemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
#endif

#ifndef GEMM_MULTITHREAD_THRESHOLD
# define GEMM_MULTITHREAD_THRESHOLD 4
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif

static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
Expand Down Expand Up @@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);

args.common = NULL;
int nthreads_max = num_cpu_avail(3);
int nthreads_avail = nthreads_max;

if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
args.nthreads = 1;
}else{
args.nthreads = num_cpu_avail(3);
#ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#else
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}

#endif
args.common = NULL;

if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else
args.nthreads = nthreads_max;


if (args.nthreads == 1) {
#endif

Expand Down
9 changes: 5 additions & 4 deletions interface/ger.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

Expand Down Expand Up @@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,

FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

Expand Down Expand Up @@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,

buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);


if (nthreads == 1) {
#endif

GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);

#ifdef SMP
#ifdef SMPBUG
} else {

GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);
Expand Down
2 changes: 1 addition & 1 deletion interface/rotmg.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){

#endif

FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;

if(*dd1 < ZERO)
{
Expand Down
8 changes: 4 additions & 4 deletions interface/zger.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

Expand Down Expand Up @@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,

FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

Expand Down Expand Up @@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,

buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);

if (nthreads == 1) {
Expand All @@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif

#ifdef SMP
#ifdef SMPBUG

} else {

Expand Down
3 changes: 3 additions & 0 deletions kernel/x86_64/KERNEL.BARCELONA
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

Expand Down
Loading