Pairwise BLAS-based sumabs and sumabs2

simonster · simonster · commit d33608ca297d · 2014-05-29T14:34:48.000+02:00
diff --git a/base/linalg/dense.jl b/base/linalg/dense.jl
@@ -27,22 +27,18 @@ isposdef{T}(A::AbstractMatrix{T}, UL::Symbol) = (S = typeof(sqrt(one(T))); ispos
 isposdef{T}(A::AbstractMatrix{T}) = (S = typeof(sqrt(one(T))); isposdef!(S == T ? copy(A) : convert(AbstractMatrix{S}, A)))
 isposdef(x::Number) = imag(x)==0 && real(x) > 0
 
-
-Base.sumabs{T<:BlasFloat}(x::Union(Array{T},StridedVector{T})) = 
-    length(x) > 32 ? BLAS.asum(x) : Base._sumabs(x)
-
 stride1(x::Array) = 1
 stride1(x::StridedVector) = stride(x, 1)::Int
 
-function Base.sumabs2{T<:BlasFloat}(x::Union(Array{T},StridedVector{T}))
-    n = length(x)
-    if n < DOT_CUTOFF
-        return Base._sumabs2(x)
-    else
-        px = pointer(x)
-        incx = stride1(x)
-        return BLAS.dot(n, px, incx, px, incx)        
-    end
+Base.sum_seq{T<:BlasFloat}(::Base.AbsFun, a::Array{T}, ifirst::Int, ilast::Int) =
+    BLAS.asum(ilast-ifirst+1, pointer(a, ifirst), stride1(a))
+
+# The BLAS.dot kernel below appears to benefit from a block size larger than the default
+Base.sum_pairwise_blocksize(::Base.Abs2Fun) = 4096
+function Base.sum_seq{T<:BlasFloat}(::Base.Abs2Fun, a::Array{T}, ifirst::Int, ilast::Int)
+    px = pointer(a, ifirst)
+    incx = stride1(a)
+    BLAS.dot(ilast-ifirst+1, px, incx, px, incx)
 end
 
 function norm{T<:BlasFloat, TI<:Integer}(x::StridedVector{T}, rx::Union(UnitRange{TI},Range{TI}))
diff --git a/base/reduce.jl b/base/reduce.jl
@@ -250,7 +250,7 @@ sumabs(itr) = sum(AbsFun(), itr)
 sumabs2(itr) = sum(Abs2Fun(), itr)
 
 # Note: sum_seq uses four accumulators, so each accumulator gets at most 256 numbers
-const PAIRWISE_SUM_BLOCKSIZE = 1024
+sum_pairwise_blocksize(f) = 1024
 
 # a fast implementation of sum in sequential order (from left to right).
 # to allow type-stable loops, requires length > 1
@@ -311,7 +311,7 @@ end
 function sum_pairwise(f, a::AbstractArray, ifirst::Int, ilast::Int)
     # bsiz: maximum block size
 
-    if ifirst + PAIRWISE_SUM_BLOCKSIZE >= ilast
+    if ifirst + sum_pairwise_blocksize(f) >= ilast
         sum_seq(f, a, ifirst, ilast)
     else
         imid = (ifirst + ilast) >>> 1