Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

varm/stdm and stats dimension arguments #2561

Merged
merged 4 commits into from
Mar 14, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,9 @@ export
median,
quantile,
std,
stdm,
var,
varm,

# signal processing
bfft,
Expand Down
83 changes: 31 additions & 52 deletions base/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,64 +12,49 @@ function mean(iterable)
end
return total/count
end
mean(v::AbstractArray, region) = sum(v, region) / prod(size(v)[region])

function median!{T<:Real}(v::AbstractVector{T})
isempty(v) && error("median of an empty array is undefined")
sort!(v) # TODO: do something more efficient, e.g. select but detect NaNs
isnan(v[end]) && error("median is undefined in presence of NaNs")
isodd(length(v)) ? float(v[div(end+1,2)]) : (v[div(end,2)]+v[div(end,2)+1])/2
end
median{T<:Real}(v::AbstractArray{T}) = median!(copy(reshape(v, length(v))))
median{T<:Real}(v::AbstractArray{T}) = median!(copy(vec(v)))

## variance with known mean
function var(v::AbstractVector, m::Number, corrected::Bool)
function varm(v::AbstractVector, m::Number)
n = length(v)
if n == 0 || (n == 1 && corrected)
if n == 0 || n == 1
return NaN
end
x = v - m
return dot(x, x) / (n - (corrected ? 1 : 0))
return dot(x, x) / (n - 1)
end
var(v::AbstractVector, m::Number) = var(v, m, true)
var(v::AbstractArray, m::Number, corrected::Bool) = var(reshape(v, length(v)), m, corrected)
var(v::AbstractArray, m::Number) = var(v, m, true)
function var(v::Ranges, m::Number, corrected::Bool)
f = first(v) - m
s = step(v)
l = length(v)
if l == 0 || (l == 1 && corrected)
return NaN
end
if corrected
return f^2 * l / (l - 1) + f * s * l + s^2 * l * (2 * l - 1) / 6
else
return f^2 + f * s * (l - 1) + s^2 * (l - 1) * (2 * l - 1) / 6
end
end
var(v::Ranges, m::Number) = var(v, m, true)
varm(v::AbstractArray, m::Number) = varm(vec(v), m)
varm(v::Ranges, m::Number) = var(v)

## variance
function var(v::Ranges, corrected::Bool)
function var(v::Ranges)
s = step(v)
l = length(v)
if l == 0 || (l == 1 && corrected)
if l == 0 || l == 1
return NaN
end
return abs2(s) * (l + 1) * (corrected ? l : (l - 1)) / 12
return abs2(s) * (l + 1) * l / 12
end
var(v::AbstractArray) = varm(v, mean(v))
function var(v::AbstractArray, region)
x = bsxfun(-, v, mean(v, region))
return sum(x.^2, region) / (prod(size(v)[region]) - 1)
end
var(v::AbstractVector, corrected::Bool) = var(v, mean(v), corrected)
var(v::AbstractArray, corrected::Bool) = var(reshape(v, length(v)), corrected)
var(v::AbstractArray) = var(v, true)

## standard deviation with known mean
std(v::AbstractArray, m::Number, corrected::Bool) = sqrt(var(v, m, corrected))
std(v::AbstractArray, m::Number) = std(v, m, true)
stdm(v, m::Number) = sqrt(varm(v, m))

## standard deviation
std(v::AbstractArray, corrected::Bool) = std(v, mean(v), corrected)
std(v::AbstractArray) = std(v, true)
std(v::Ranges, corrected::Bool) = sqrt(var(v, corrected))
std(v::Ranges) = std(v, true)
std(v) = sqrt(var(v))
std(v, region) = sqrt(var(v, region))

## hist ##

Expand Down Expand Up @@ -155,35 +140,35 @@ function center(x::AbstractVector)
res
end

function cov(x::AbstractVecOrMat, y::AbstractVecOrMat, corrected::Bool)
function cov(x::AbstractVecOrMat, y::AbstractVecOrMat)
if size(x, 1) != size(y, 1)
error("incompatible matrices")
end
n = size(x, 1)
xc = center(x)
yc = center(y)
conj(xc' * yc / (n - (corrected ? 1 : 0)))
conj(xc' * yc / (n - 1))
end
cov(x::AbstractVector, y::AbstractVector, corrected::Bool) = cov(x'', y, corrected)[1]
cov(x::AbstractVector, y::AbstractVector) = cov(x'', y)[1]

function cov(x::AbstractVecOrMat, corrected::Bool)
function cov(x::AbstractVecOrMat)
n = size(x, 1)
xc = center(x)
conj(xc' * xc / (n - (corrected ? 1 : 0)))
conj(xc' * xc / (n - 1))
end
cov(x::AbstractVector, corrected::Bool) = cov(x'', corrected)[1]
cov(x::AbstractVector) = cov(x'')[1]

function cor(x::AbstractVecOrMat, y::AbstractVecOrMat, corrected::Bool)
z = cov(x, y, corrected)
function cor(x::AbstractVecOrMat, y::AbstractVecOrMat)
z = cov(x, y)
scale = Base.amap(std, x, 2) * Base.amap(std, y, 2)'
z ./ scale
end
cor(x::AbstractVector, y::AbstractVector, corrected::Bool) =
cov(x, y, corrected) / std(x) / std(y)
cor(x::AbstractVector, y::AbstractVector) =
cov(x, y) / std(x) / std(y)


function cor(x::AbstractVecOrMat, corrected::Bool)
res = cov(x, corrected)
function cor(x::AbstractVecOrMat)
res = cov(x)
n = size(res, 1)
scale = 1 / sqrt(diag(res))
for j in 1:n
Expand All @@ -195,13 +180,7 @@ function cor(x::AbstractVecOrMat, corrected::Bool)
end
res
end
cor(x::AbstractVector, corrected::Bool) = cor(x'', corrected)[1]

cov(x::AbstractVecOrMat) = cov(x, true)
cov(x::AbstractVecOrMat, y::AbstractVecOrMat) = cov(x, y, true)
cor(x::AbstractVecOrMat) = cor(x, true)
cor(x::AbstractVecOrMat, y::AbstractVecOrMat) = cor(x, y, true)

cor(x::AbstractVector) = cor(x'')[1]

## quantiles ##

Expand Down
40 changes: 22 additions & 18 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2672,37 +2672,37 @@ Combinatorics
Statistics
----------

.. function:: mean(v, [dim])
.. function:: mean(v[, region])

Compute the mean of whole array ``v``, or optionally along dimension ``dim``
Compute the mean of whole array ``v``, or optionally along the dimensions in ``region``.

.. function:: std(v, [corrected])
.. function:: std(v[, region])

Compute the sample standard deviation of a vector ``v``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v .- mean(v)).^2) / (length(v) - 1))`` and involves an implicit correction term sometimes called the Bessel correction which insures that the estimator of the variance is unbiased. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sqrt(sum((v .- mean(v)).^2) / length(v))``, which is the empirical standard deviation of the sample.
Compute the sample standard deviation of a vector or array``v``, optionally along dimensions in ``region``. The algorithm returns an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v - mean(v)).^2) / (length(v) - 1))``.

.. function:: std(v, m, [corrected])
.. function:: stdm(v, m)

Compute the sample standard deviation of a vector ``v`` with known mean ``m``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an estimator of the generative distribution's standard deviation under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sqrt(sum((v .- m).^2) / (length(v) - 1))`` and involves an implicit correction term sometimes called the Bessel correction which insures that the estimator of the variance is unbiased. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sqrt(sum((v .- m).^2) / length(v))``, which is the empirical standard deviation of the sample.
Compute the sample standard deviation of a vector ``v`` with known mean ``m``.

.. function:: var(v, [corrected])
.. function:: var(v[, region])

Compute the sample variance of a vector ``v``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an unbiased estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v .- mean(v)).^2) / (length(v) - 1)`` and involves an implicit correction term sometimes called the Bessel correction. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sum((v .- mean(v)).^2) / length(v)``, which is the empirical variance of the sample.
Compute the sample variance of a vector or array``v``, optionally along dimensions in ``region``. The algorithm will return an estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v - mean(v)).^2) / (length(v) - 1)``.

.. function:: var(v, m, [corrected])
.. function:: varm(v, m)

Compute the sample variance of a vector ``v`` with known mean ``m``. If the optional argument ``corrected`` is either left unspecified or is explicitly set to the default value of ``true``, then the algorithm will return an unbiased estimator of the generative distribution's variance under the assumption that each entry of ``v`` is an IID draw from that generative distribution. This computation is equivalent to calculating ``sum((v .- m)).^2) / (length(v) - 1)`` and involves an implicit correction term sometimes called the Bessel correction. If, instead, the optional argument ``corrected`` is set to ``false``, then the algorithm will produce the equivalent of ``sum((v .- m)).^2) / length(v)``, which is the empirical variance of the sample.
Compute the sample variance of a vector ``v`` with known mean ``m``.

.. function:: median(v)

Compute the median of a vector ``v``
Compute the median of a vector ``v``.

.. function:: hist(v, [n])
.. function:: hist(v[, n])

Compute the histogram of ``v``, optionally using ``n`` bins
Compute the histogram of ``v``, optionally using ``n`` bins.

.. function:: hist(v, e)

Compute the histogram of ``v`` using a vector ``e`` as the edges for the bins
Compute the histogram of ``v`` using a vector ``e`` as the edges for the bins.

.. function:: quantile(v, p)

Expand All @@ -2712,13 +2712,17 @@ Statistics

Compute the quantiles of a vector ``v`` at the probability values ``[.0, .2, .4, .6, .8, 1.0]``.

.. function:: cov(v)
.. function:: cov(v1[, v2])

Compute the Pearson covariance between two vectors ``v1`` and ``v2``.
Compute the Pearson covariance between two vectors ``v1`` and ``v2``. If
called with a single element ``v``, then computes covariance of columns of
``v``.

.. function:: cor(v)
.. function:: cor(v1[, v2])

Compute the Pearson correlation between two vectors ``v1`` and ``v2``.
Compute the Pearson correlation between two vectors ``v1`` and ``v2``. If
called with a single element ``v``, then computes correlation of columns of
``v``.

Signal Processing
-----------------
Expand Down
5 changes: 5 additions & 0 deletions test/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@
@test_fails median([NaN,0.0])

@test mean([1,2,3]) == 2.
@test mean([0 1 2; 4 5 6], 1) == [2. 3. 4.]
@test var([1,2,3]) == 1.
@test var(1:8) == 6.
@test var([1 2 3 4 5; 6 7 8 9 10], 2) == [2.5 2.5]'
@test varm([1,2,3], 2) == 1.
@test std([1,2,3]) == 1.
@test stdm([1,2,3], 2) == 1.
@test hist([1,2,3],10) == [1,0,0,0,0,1,0,0,0,1]
@test hist([1,2,3],[0,2,4]) == [1,2,0]

Expand Down