Skip to content

Commit

Permalink
Merge pull request #213 from Shimuuar/covariance-alloc
Browse files Browse the repository at this point in the history
Make covariance and correlation non-allocating
  • Loading branch information
Shimuuar authored Jan 21, 2025
2 parents 36d1cc0 + 4a8c8da commit e629285
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 35 deletions.
36 changes: 32 additions & 4 deletions Statistics/Correlation.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
module Statistics.Correlation
( -- * Pearson correlation
pearson
, pearson2
, pearsonMatByRow
-- * Spearman correlation
, spearman
, spearman2
, spearmanMatByRow
) where

Expand All @@ -25,11 +27,18 @@ import Statistics.Test.Internal (rankUnsorted)

-- | Pearson correlation for sample of pairs. Exactly same as
-- 'Statistics.Sample.correlation'
pearson :: (G.Vector v (Double, Double), G.Vector v Double)
pearson :: (G.Vector v (Double, Double))
=> v (Double, Double) -> Double
pearson = correlation
{-# INLINE pearson #-}

-- | Pearson correlation for sample of pairs. Exactly same as
-- 'Statistics.Sample.correlation'
pearson2 :: (G.Vector v Double)
=> v Double -> v Double -> Double
pearson2 = correlation2
{-# INLINE pearson2 #-}

-- | Compute pairwise Pearson correlation between rows of a matrix
pearsonMatByRow :: Matrix -> Matrix
pearsonMatByRow m
Expand All @@ -43,15 +52,13 @@ pearsonMatByRow m
-- Spearman
----------------------------------------------------------------

-- | compute Spearman correlation between two samples
-- | Compute Spearman correlation between two samples
spearman :: ( Ord a
, Ord b
, G.Vector v a
, G.Vector v b
, G.Vector v (a, b)
, G.Vector v Int
, G.Vector v Double
, G.Vector v (Double, Double)
, G.Vector v (Int, a)
, G.Vector v (Int, b)
)
Expand All @@ -64,6 +71,27 @@ spearman xy
(x, y) = G.unzip xy
{-# INLINE spearman #-}

-- | Compute Spearman correlation between two samples. Samples must
-- have same length.
spearman2 :: ( Ord a
, Ord b
, G.Vector v a
, G.Vector v b
, G.Vector v Int
, G.Vector v (Int, a)
, G.Vector v (Int, b)
)
=> v a
-> v b
-> Double
spearman2 xs ys
| nx /= ny = error "Statistics.Correlation.spearman2: samples must have same length"
| otherwise = pearson $ G.zip (rankUnsorted xs) (rankUnsorted ys)
where
nx = G.length xs
ny = G.length ys
{-# INLINE spearman2 #-}

-- | compute pairwise Spearman correlation between rows of a matrix
spearmanMatByRow :: Matrix -> Matrix
spearmanMatByRow
Expand Down
4 changes: 2 additions & 2 deletions Statistics/Function.hs
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ indices a = G.enumFromTo 0 (G.length a - 1)
{-# INLINE indices #-}

-- | Zip a vector with its indices.
indexed :: (G.Vector v e, G.Vector v Int, G.Vector v (Int,e)) => v e -> v (Int,e)
indexed a = G.zip (indices a) a
indexed :: (G.Vector v e, G.Vector v (Int,e)) => v e -> v (Int,e)
indexed xs = G.imap (,) xs
{-# INLINE indexed #-}

data MM = MM {-# UNPACK #-} !Double {-# UNPACK #-} !Double
Expand Down
89 changes: 69 additions & 20 deletions Statistics/Sample.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ module Statistics.Sample
, range

-- * Statistics of location
, expectation
, mean
, welfordMean
, meanWeighted
Expand Down Expand Up @@ -54,17 +55,20 @@ module Statistics.Sample
-- * Joint distributions
, covariance
, correlation
, covariance2
, correlation2
, pair
-- * References
-- $references
) where

import Statistics.Function (minMax)
import Statistics.Function (minMax,square)
import Statistics.Sample.Internal (robustSumVar, sum)
import Statistics.Types.Internal (Sample,WeightedSample)
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import Numeric.Sum (kbn, Summation(zero,add))

-- Operator ^ will be overridden
import Prelude hiding ((^), sum)
Expand All @@ -76,9 +80,17 @@ range s = hi - lo
where (lo , hi) = minMax s
{-# INLINE range #-}

-- | /O(n)/ Compute expectation of function over for sample. This is
-- simply @mean . map f@ but won't create intermediate vector.
expectation :: (G.Vector v a) => (a -> Double) -> v a -> Double
expectation f xs = kbn (G.foldl' (\s -> add s . f) zero xs)
/ fromIntegral (G.length xs)
{-# INLINE expectation #-}

-- | /O(n)/ Arithmetic mean. This uses Kahan-Babuška-Neumaier
-- summation, so is more accurate than 'welfordMean' unless the input
-- values are very large.
-- values are very large. This function is not subject to stream
-- fusion.
mean :: (G.Vector v Double) => v Double -> Double
mean xs = sum xs / fromIntegral (G.length xs)
{-# SPECIALIZE mean :: U.Vector Double -> Double #-}
Expand Down Expand Up @@ -122,7 +134,7 @@ harmonicMean = fini . G.foldl' go (T 0 0)

-- | /O(n)/ Geometric mean of a sample containing no negative values.
geometricMean :: (G.Vector v Double) => v Double -> Double
geometricMean = exp . mean . G.map log
geometricMean = exp . expectation log
{-# INLINE geometricMean #-}

-- | Compute the /k/th central moment of a sample. The central moment
Expand All @@ -138,7 +150,7 @@ centralMoment a xs
| a < 0 = error "Statistics.Sample.centralMoment: negative input"
| a == 0 = 1
| a == 1 = 0
| otherwise = sum (G.map go xs) / fromIntegral (G.length xs)
| otherwise = expectation go xs
where
go x = (x-m) ^ a
m = mean xs
Expand Down Expand Up @@ -354,42 +366,79 @@ fastStdDev = sqrt . fastVariance

-- | Covariance of sample of pairs. For empty sample it's set to
-- zero
covariance :: (G.Vector v (Double,Double), G.Vector v Double)
covariance :: (G.Vector v (Double,Double))
=> v (Double,Double)
-> Double
covariance xy
| n == 0 = 0
| otherwise = mean $ G.zipWith (*)
(G.map (\x -> x - muX) xs)
(G.map (\y -> y - muY) ys)
| otherwise = expectation (\(x,y) -> (x - muX)*(y - muY)) xy
where
n = G.length xy
(xs,ys) = G.unzip xy
muX = mean xs
muY = mean ys
n = G.length xy
muX = expectation fst xy
muY = expectation snd xy
{-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-}
{-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-}

-- | Correlation coefficient for sample of pairs. Also known as
-- Pearson's correlation. For empty sample it's set to zero.
correlation :: (G.Vector v (Double,Double), G.Vector v Double)
correlation :: (G.Vector v (Double,Double))
=> v (Double,Double)
-> Double
correlation xy
| n == 0 = 0
| otherwise = cov / sqrt (varX * varY)
where
n = G.length xy
(xs,ys) = G.unzip xy
(muX,varX) = meanVariance xs
(muY,varY) = meanVariance ys
cov = mean $ G.zipWith (*)
(G.map (\x -> x - muX) xs)
(G.map (\y -> y - muY) ys)
n = G.length xy
muX = expectation (\(x,_) -> x) xy
muY = expectation (\(_,y) -> y) xy
varX = expectation (\(x,_) -> square (x - muX)) xy
varY = expectation (\(_,y) -> square (y - muY)) xy
cov = expectation (\(x,y) -> (x - muX)*(y - muY)) xy
{-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-}
{-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-}


-- | Covariance of two samples. Both vectors must be of the same
-- length. If both are empty it's set to zero
covariance2 :: (G.Vector v Double)
=> v Double
-> v Double
-> Double
covariance2 xs ys
| nx /= ny = error $ "Statistics.Sample.covariance2: both samples must have same length"
| nx == 0 = 0
| otherwise = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys)
/ fromIntegral nx
where
nx = G.length xs
ny = G.length ys
muX = mean xs
muY = mean ys
{-# SPECIALIZE covariance2 :: U.Vector Double -> U.Vector Double -> Double #-}
{-# SPECIALIZE covariance2 :: V.Vector Double -> V.Vector Double -> Double #-}

-- | Correlation coefficient for two samples. Both vector must have
-- same length Also known as Pearson's correlation. For empty sample
-- it's set to zero.
correlation2 :: (G.Vector v Double)
=> v Double
-> v Double
-> Double
correlation2 xs ys
| nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length"
| nx == 0 = 0
| otherwise = cov / sqrt (varX * varY)
where
nx = G.length xs
ny = G.length ys
(muX,varX) = meanVariance xs
(muY,varY) = meanVariance ys
cov = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys)
/ fromIntegral nx
{-# SPECIALIZE correlation2 :: U.Vector Double -> U.Vector Double -> Double #-}
{-# SPECIALIZE correlation2 :: V.Vector Double -> V.Vector Double -> Double #-}


-- | Pair two samples. It's like 'G.zip' but requires that both
-- samples have equal size.
pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b)
Expand Down
8 changes: 4 additions & 4 deletions Statistics/Test/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module Statistics.Test.Internal (
import Data.Ord
import Data.Vector.Generic ((!))
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Generic.Mutable as M
import Statistics.Function

Expand All @@ -33,10 +34,10 @@ data Rank v a = Rank {
--
-- >>> rank (==) (VU.fromList [10,10,10,30::Int])
-- [2.0,2.0,2.0,4.0]
rank :: (G.Vector v a, G.Vector v Double)
rank :: (G.Vector v a)
=> (a -> a -> Bool) -- ^ Equivalence relation
-> v a -- ^ Vector to rank
-> v Double
-> U.Vector Double
rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
where
go (Rank 0 _ r v)
Expand All @@ -59,11 +60,10 @@ rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
rankUnsorted :: ( Ord a
, G.Vector v a
, G.Vector v Int
, G.Vector v Double
, G.Vector v (Int, a)
)
=> v a
-> v Double
-> U.Vector Double
rankUnsorted xs = G.create $ do
-- Put ranks into their original positions
-- NOTE: backpermute will do wrong thing
Expand Down
2 changes: 1 addition & 1 deletion Statistics/Test/StudentT.hs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ welchTTest test sample1 sample2
-- | Paired two-sample t-test. Two samples are paired in a
-- within-subject design. Returns @Nothing@ if sample size is not
-- sufficient.
pairedTTest :: forall v. (G.Vector v (Double, Double), G.Vector v Double)
pairedTTest :: forall v. (G.Vector v (Double, Double))
=> PositionTest -- ^ one- or two-tailed test
-> v (Double, Double) -- ^ paired samples
-> Maybe (Test StudentT)
Expand Down
6 changes: 5 additions & 1 deletion benchmark/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ main =
, bench "varianceUnbiased" $ nf (\x -> varianceUnbiased x) sample
, bench "varianceWeighted" $ nf (\x -> varianceWeighted x) sampleW
-- Correlation
, bench "pearson" $ nf pearson sampleW
, bench "pearson" $ nf pearson sampleW
, bench "covariance" $ nf covariance sampleW
, bench "correlation" $ nf correlation sampleW
, bench "covariance2" $ nf (covariance2 sample) sample
, bench "correlation2" $ nf (correlation2 sample) sample
-- Other
, bench "stdDev" $ nf (\x -> stdDev x) sample
, bench "skewness" $ nf (\x -> skewness x) sample
Expand Down
15 changes: 13 additions & 2 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
## Changes in NEXT_VERSIONS
## Changes in 0.16.3.0

* Computation of `rSquare` has special case for case when data variation is 0.
* `S.Sample.correlation`, `S.Sample.covariance`,
`S.Correlation.pearson` do not allocate temporary arrays.

* Variants of correlation which take two vectors as input are added:
`S.Sample.correlation2`, `S.Sample.covariance2`, `S.Correlation.pearson2`,
`S.Correlation.spearman2`.

* Contexts for `S.Function.indexed`, `S.Correlation.spearman`, `S.pairedTTest`,
`S.Sample.correlation`, `S.Sample.covariance`, reduced.

* Computation of `rSquare` in linear regression has special case for case when
data variation is 0.

* Doctests added.

Expand Down
2 changes: 1 addition & 1 deletion statistics.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cabal-version: 3.0
build-type: Simple

name: statistics
version: 0.16.2.1
version: 0.16.3.0
synopsis: A library of statistical types, data, and functions
description:
This library provides a number of common functions and types useful
Expand Down

0 comments on commit e629285

Please sign in to comment.