move cor, cov, std, stdm, var, varm and linreg to StatsBase (#27152)

fredrikekre · ViralBShah · commit 85dce1b8edee · 2018-05-28T08:38:37.000-04:00
fix JuliaLang/julia#25571 (comment) (included in JuliaStats/StatsBase.jl#379) fix #23769 (included in JuliaStats/StatsBase.jl#379) fix #27140
diff --git a/src/linalg.jl b/src/linalg.jl
@@ -1010,27 +1010,3 @@ end
 
 chol(A::SparseMatrixCSC) = error("Use cholesky() instead of chol() for sparse matrices.")
 eigen(A::SparseMatrixCSC) = error("Use IterativeEigensolvers.eigs() instead of eigen() for sparse matrices.")
-
-function Base.cov(X::SparseMatrixCSC; dims::Int=1, corrected::Bool=true)
-    vardim = dims
-    a, b = size(X)
-    n, p = vardim == 1 ? (a, b) : (b, a)
-
-    # The covariance can be decomposed into two terms
-    # 1/(n - 1) ∑ (x_i - x̄)*(x_i - x̄)' = 1/(n - 1) (∑ x_i*x_i' - n*x̄*x̄')
-    # which can be evaluated via a sparse matrix-matrix product
-
-    # Compute ∑ x_i*x_i' = X'X using sparse matrix-matrix product
-    out = Matrix(Base.unscaled_covzm(X, vardim))
-
-    # Compute x̄
-    x̄ᵀ = mean(X, dims=vardim)
-
-    # Subtract n*x̄*x̄' from X'X
-    @inbounds for j in 1:p, i in 1:p
-        out[i,j] -= x̄ᵀ[i] * x̄ᵀ[j]' * n
-    end
-
-    # scale with the sample size n or the corrected sample size n - 1
-    return rmul!(out, inv(n - corrected))
-end
diff --git a/src/sparsematrix.jl b/src/sparsematrix.jl
@@ -3489,67 +3489,6 @@ function hash(A::SparseMatrixCSC{T}, h::UInt) where T
     hashrun(0, length(A)-lastidx, h)  # Hash zeros at end
 end
 
-## Statistics
-
-# This is the function that does the reduction underlying var/std
-function Base.centralize_sumabs2!(R::AbstractArray{S}, A::SparseMatrixCSC{Tv,Ti}, means::AbstractArray) where {S,Tv,Ti}
-    lsiz = Base.check_reducedims(R,A)
-    size(means) == size(R) || error("size of means must match size of R")
-    isempty(R) || fill!(R, zero(S))
-    isempty(A) && return R
-
-    colptr = A.colptr
-    rowval = A.rowval
-    nzval = A.nzval
-    m = size(A, 1)
-    n = size(A, 2)
-
-    if size(R, 1) == size(R, 2) == 1
-        # Reduction along both columns and rows
-        R[1, 1] = Base.centralize_sumabs2(A, means[1])
-    elseif size(R, 1) == 1
-        # Reduction along rows
-        @inbounds for col = 1:n
-            mu = means[col]
-            r = convert(S, (m-colptr[col+1]+colptr[col])*abs2(mu))
-            @simd for j = colptr[col]:colptr[col+1]-1
-                r += abs2(nzval[j] - mu)
-            end
-            R[1, col] = r
-        end
-    elseif size(R, 2) == 1
-        # Reduction along columns
-        rownz = fill(convert(Ti, n), m)
-        @inbounds for col = 1:n
-            @simd for j = colptr[col]:colptr[col+1]-1
-                row = rowval[j]
-                R[row, 1] += abs2(nzval[j] - means[row])
-                rownz[row] -= 1
-            end
-        end
-        for i = 1:m
-            R[i, 1] += rownz[i]*abs2(means[i])
-        end
-    else
-        # Reduction along a dimension > 2
-        @inbounds for col = 1:n
-            lastrow = 0
-            @simd for j = colptr[col]:colptr[col+1]-1
-                row = rowval[j]
-                for i = lastrow+1:row-1
-                    R[i, col] = abs2(means[i, col])
-                end
-                R[row, col] = abs2(nzval[j] - means[row, col])
-                lastrow = row
-            end
-            for i = lastrow+1:m
-                R[i, col] = abs2(means[i, col])
-            end
-        end
-    end
-    return R
-end
-
 ## Uniform matrix arithmetic
 
 (+)(A::SparseMatrixCSC, J::UniformScaling) = A + sparse(J, size(A)...)
diff --git a/test/sparse.jl b/test/sparse.jl
@@ -489,7 +489,7 @@ end
     pA = sparse(rand(3, 7))
 
     for arr in (se33, sA, pA)
-        for f in (sum, prod, minimum, maximum, var)
+        for f in (sum, prod, minimum, maximum)
             farr = Array(arr)
             @test f(arr) ≈ f(farr)
             @test f(arr, dims=1) ≈ f(farr, dims=1)
@@ -518,9 +518,8 @@ end
         @test prod(sparse(Int[])) === 1
         @test_throws ArgumentError minimum(sparse(Int[]))
         @test_throws ArgumentError maximum(sparse(Int[]))
-        @test var(sparse(Int[])) === NaN
 
-        for f in (sum, prod, var)
+        for f in (sum, prod)
             @test isequal(f(spzeros(0, 1), dims=1), f(Matrix{Int}(I, 0, 1), dims=1))
             @test isequal(f(spzeros(0, 1), dims=2), f(Matrix{Int}(I, 0, 1), dims=2))
             @test isequal(f(spzeros(0, 1), dims=(1, 2)), f(Matrix{Int}(I, 0, 1), dims=(1, 2)))
@@ -2033,63 +2032,6 @@ end
     @test issymmetric(B)
 end
 
-# Faster covariance function for sparse matrices
-# Prevents densifying the input matrix when subtracting the mean
-# Test against dense implementation
-# PR https://github.com/JuliaLang/julia/pull/22735
-# Part of this test needed to be hacked due to the treatment
-# of Inf in sparse matrix algebra
-# https://github.com/JuliaLang/julia/issues/22921
-# The issue will be resolved in
-# https://github.com/JuliaLang/julia/issues/22733
-@testset "optimizing sparse $elty covariance" for elty in (Float64, Complex{Float64})
-    n = 10
-    p = 5
-    np2 = div(n*p, 2)
-    nzvals, x_sparse = guardsrand(1) do
-        if elty <: Real
-            nzvals = randn(np2)
-        else
-            nzvals = complex.(randn(np2), randn(np2))
-        end
-        nzvals, sparse(rand(1:n, np2), rand(1:p, np2), nzvals, n, p)
-    end
-    x_dense  = convert(Matrix{elty}, x_sparse)
-    @testset "Test with no Infs and NaNs, vardim=$vardim, corrected=$corrected" for vardim in (1, 2),
-                                                                                 corrected in (true, false)
-        @test cov(x_sparse, dims=vardim, corrected=corrected) ≈
-              cov(x_dense , dims=vardim, corrected=corrected)
-    end
-
-    @testset "Test with $x11, vardim=$vardim, corrected=$corrected" for x11 in (NaN, Inf),
-                                                                     vardim in (1, 2),
-                                                                  corrected in (true, false)
-        x_sparse[1,1] = x11
-        x_dense[1 ,1] = x11
-
-        cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected)
-        cov_dense  = cov(x_dense , dims=vardim, corrected=corrected)
-        @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
-        @test isfinite.(cov_sparse) == isfinite.(cov_dense)
-        @test isfinite.(cov_sparse) == isfinite.(cov_dense)
-    end
-
-    @testset "Test with NaN and Inf, vardim=$vardim, corrected=$corrected" for vardim in (1, 2),
-                                                                            corrected in (true, false)
-        x_sparse[1,1] = Inf
-        x_dense[1 ,1] = Inf
-        x_sparse[2,1] = NaN
-        x_dense[2 ,1] = NaN
-
-        cov_sparse = cov(x_sparse, dims=vardim, corrected=corrected)
-        cov_dense  = cov(x_dense , dims=vardim, corrected=corrected)
-        @test cov_sparse[(1 + vardim):end, (1 + vardim):end] ≈
-              cov_dense[ (1 + vardim):end, (1 + vardim):end]
-        @test isfinite.(cov_sparse) == isfinite.(cov_dense)
-        @test isfinite.(cov_sparse) == isfinite.(cov_dense)
-    end
-end
-
 @testset "similar should not alias the input sparse array" begin
     a = sparse(rand(3,3) .+ 0.1)
     b = similar(a, Float32, Int32)