diff --git a/base/exports.jl b/base/exports.jl index 26aa1b078f858..5f22845e22aa1 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -798,7 +798,9 @@ export median, quantile, std, + stdm, var, + varm, # signal processing bfft, diff --git a/base/statistics.jl b/base/statistics.jl index 7e0cff598c29e..290f954a32fdd 100644 --- a/base/statistics.jl +++ b/base/statistics.jl @@ -12,6 +12,7 @@ function mean(iterable) end return total/count end +mean(v::AbstractArray, region) = sum(v, region) / prod(size(v)[region]) function median!{T<:Real}(v::AbstractVector{T}) isempty(v) && error("median of an empty array is undefined") @@ -19,57 +20,41 @@ function median!{T<:Real}(v::AbstractVector{T}) isnan(v[end]) && error("median is undefined in presence of NaNs") isodd(length(v)) ? float(v[div(end+1,2)]) : (v[div(end,2)]+v[div(end,2)+1])/2 end -median{T<:Real}(v::AbstractArray{T}) = median!(copy(reshape(v, length(v)))) +median{T<:Real}(v::AbstractArray{T}) = median!(copy(vec(v))) ## variance with known mean -function var(v::AbstractVector, m::Number, corrected::Bool) +function varm(v::AbstractVector, m::Number) n = length(v) - if n == 0 || (n == 1 && corrected) + if n == 0 || n == 1 return NaN end x = v - m - return dot(x, x) / (n - (corrected ? 1 : 0)) + return dot(x, x) / (n - 1) end -var(v::AbstractVector, m::Number) = var(v, m, true) -var(v::AbstractArray, m::Number, corrected::Bool) = var(reshape(v, length(v)), m, corrected) -var(v::AbstractArray, m::Number) = var(v, m, true) -function var(v::Ranges, m::Number, corrected::Bool) - f = first(v) - m - s = step(v) - l = length(v) - if l == 0 || (l == 1 && corrected) - return NaN - end - if corrected - return f^2 * l / (l - 1) + f * s * l + s^2 * l * (2 * l - 1) / 6 - else - return f^2 + f * s * (l - 1) + s^2 * (l - 1) * (2 * l - 1) / 6 - end -end -var(v::Ranges, m::Number) = var(v, m, true) +varm(v::AbstractArray, m::Number) = varm(vec(v), m) +varm(v::Ranges, m::Number) = var(v) ## variance -function var(v::Ranges, corrected::Bool) +function var(v::Ranges) s = step(v) l = length(v) - if l == 0 || (l == 1 && corrected) + if l == 0 || l == 1 return NaN end - return abs2(s) * (l + 1) * (corrected ? l : (l - 1)) / 12 + return abs2(s) * (l + 1) * l / 12 +end +var(v::AbstractArray) = varm(v, mean(v)) +function var(v::AbstractArray, region) + x = bsxfun(-, v, mean(v, region)) + return sum(x.^2, region) / (prod(size(v)[region]) - 1) end -var(v::AbstractVector, corrected::Bool) = var(v, mean(v), corrected) -var(v::AbstractArray, corrected::Bool) = var(reshape(v, length(v)), corrected) -var(v::AbstractArray) = var(v, true) ## standard deviation with known mean -std(v::AbstractArray, m::Number, corrected::Bool) = sqrt(var(v, m, corrected)) -std(v::AbstractArray, m::Number) = std(v, m, true) +stdm(v, m::Number) = sqrt(varm(v, m)) ## standard deviation -std(v::AbstractArray, corrected::Bool) = std(v, mean(v), corrected) -std(v::AbstractArray) = std(v, true) -std(v::Ranges, corrected::Bool) = sqrt(var(v, corrected)) -std(v::Ranges) = std(v, true) +std(v) = sqrt(var(v)) +std(v, region) = sqrt(var(v, region)) ## hist ## @@ -155,35 +140,35 @@ function center(x::AbstractVector) res end -function cov(x::AbstractVecOrMat, y::AbstractVecOrMat, corrected::Bool) +function cov(x::AbstractVecOrMat, y::AbstractVecOrMat) if size(x, 1) != size(y, 1) error("incompatible matrices") end n = size(x, 1) xc = center(x) yc = center(y) - conj(xc' * yc / (n - (corrected ? 1 : 0))) + conj(xc' * yc / (n - 1)) end -cov(x::AbstractVector, y::AbstractVector, corrected::Bool) = cov(x'', y, corrected)[1] +cov(x::AbstractVector, y::AbstractVector) = cov(x'', y)[1] -function cov(x::AbstractVecOrMat, corrected::Bool) +function cov(x::AbstractVecOrMat) n = size(x, 1) xc = center(x) - conj(xc' * xc / (n - (corrected ? 1 : 0))) + conj(xc' * xc / (n - 1)) end -cov(x::AbstractVector, corrected::Bool) = cov(x'', corrected)[1] +cov(x::AbstractVector) = cov(x'')[1] -function cor(x::AbstractVecOrMat, y::AbstractVecOrMat, corrected::Bool) - z = cov(x, y, corrected) +function cor(x::AbstractVecOrMat, y::AbstractVecOrMat) + z = cov(x, y) scale = Base.amap(std, x, 2) * Base.amap(std, y, 2)' z ./ scale end -cor(x::AbstractVector, y::AbstractVector, corrected::Bool) = - cov(x, y, corrected) / std(x) / std(y) +cor(x::AbstractVector, y::AbstractVector) = + cov(x, y) / std(x) / std(y) -function cor(x::AbstractVecOrMat, corrected::Bool) - res = cov(x, corrected) +function cor(x::AbstractVecOrMat) + res = cov(x) n = size(res, 1) scale = 1 / sqrt(diag(res)) for j in 1:n @@ -195,13 +180,7 @@ function cor(x::AbstractVecOrMat, corrected::Bool) end res end -cor(x::AbstractVector, corrected::Bool) = cor(x'', corrected)[1] - -cov(x::AbstractVecOrMat) = cov(x, true) -cov(x::AbstractVecOrMat, y::AbstractVecOrMat) = cov(x, y, true) -cor(x::AbstractVecOrMat) = cor(x, true) -cor(x::AbstractVecOrMat, y::AbstractVecOrMat) = cor(x, y, true) - +cor(x::AbstractVector) = cor(x'')[1] ## quantiles ## diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 5dc0ed13fa5a9..e91f822cdd0b2 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -2672,37 +2672,37 @@ Combinatorics Statistics ---------- -.. function:: mean(v, [dim]) +.. function:: mean(v[, region]) - Compute the mean of whole array ``v``, or optionally along dimension ``dim`` + Compute the mean of whole array ``v``, or optionally along the dimensions in ``region``. -.. function:: std(v, [corrected]) +.. function:: std(v[, region]) - Compute the sample standard deviation of a vector ``v``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v .- mean(v)).^2) / (length(v) - 1))`` and involves an implicit correction term sometimes called the Bessel correction which insures that the estimator of the variance is unbiased. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sqrt(sum((v .- mean(v)).^2) / length(v))``, which is the empirical standard deviation of the sample. + Compute the sample standard deviation of a vector or array``v``, optionally along dimensions in ``region``. The algorithm returns an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v - mean(v)).^2) / (length(v) - 1))``. -.. function:: std(v, m, [corrected]) +.. function:: stdm(v, m) - Compute the sample standard deviation of a vector ``v`` with known mean ``m``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v .- m).^2) / (length(v) - 1))`` and involves an implicit correction term sometimes called the Bessel correction which insures that the estimator of the variance is unbiased. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sqrt(sum((v .- m).^2) / length(v))``, which is the empirical standard deviation of the sample. + Compute the sample standard deviation of a vector ``v`` with known mean ``m``. -.. function:: var(v, [corrected]) +.. function:: var(v[, region]) - Compute the sample variance of a vector ``v``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an unbiased estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v .- mean(v)).^2) / (length(v) - 1)`` and involves an implicit correction term sometimes called the Bessel correction. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sum((v .- mean(v)).^2) / length(v)``, which is the empirical variance of the sample. + Compute the sample variance of a vector or array``v``, optionally along dimensions in ``region``. The algorithm will return an estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v - mean(v)).^2) / (length(v) - 1)``. -.. function:: var(v, m, [corrected]) +.. function:: varm(v, m) - Compute the sample variance of a vector ``v`` with known mean ``m``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an unbiased estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v .- m)).^2) / (length(v) - 1)`` and involves an implicit correction term sometimes called the Bessel correction. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sum((v .- m)).^2) / length(v)``, which is the empirical variance of the sample. + Compute the sample variance of a vector ``v`` with known mean ``m``. .. function:: median(v) - Compute the median of a vector ``v`` + Compute the median of a vector ``v``. -.. function:: hist(v, [n]) +.. function:: hist(v[, n]) - Compute the histogram of ``v``, optionally using ``n`` bins + Compute the histogram of ``v``, optionally using ``n`` bins. .. function:: hist(v, e) - Compute the histogram of ``v`` using a vector ``e`` as the edges for the bins + Compute the histogram of ``v`` using a vector ``e`` as the edges for the bins. .. function:: quantile(v, p) @@ -2712,13 +2712,17 @@ Statistics Compute the quantiles of a vector ``v`` at the probability values ``[.0, .2, .4, .6, .8, 1.0]``. -.. function:: cov(v) +.. function:: cov(v1[, v2]) - Compute the Pearson covariance between two vectors ``v1`` and ``v2``. + Compute the Pearson covariance between two vectors ``v1`` and ``v2``. If + called with a single element ``v``, then computes covariance of columns of + ``v``. -.. function:: cor(v) +.. function:: cor(v1[, v2]) - Compute the Pearson correlation between two vectors ``v1`` and ``v2``. + Compute the Pearson correlation between two vectors ``v1`` and ``v2``. If + called with a single element ``v``, then computes correlation of columns of + ``v``. Signal Processing ----------------- diff --git a/test/statistics.jl b/test/statistics.jl index e6ab7eb293d59..e73a4e924cbda 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -17,8 +17,13 @@ @test_fails median([NaN,0.0]) @test mean([1,2,3]) == 2. +@test mean([0 1 2; 4 5 6], 1) == [2. 3. 4.] @test var([1,2,3]) == 1. +@test var(1:8) == 6. +@test var([1 2 3 4 5; 6 7 8 9 10], 2) == [2.5 2.5]' +@test varm([1,2,3], 2) == 1. @test std([1,2,3]) == 1. +@test stdm([1,2,3], 2) == 1. @test hist([1,2,3],10) == [1,0,0,0,0,1,0,0,0,1] @test hist([1,2,3],[0,2,4]) == [1,2,0]