Merge pull request #213 from Shimuuar/covariance-alloc

Make covariance and correlation non-allocating
haskell · Jan 21, 2025 · e629285 · e629285
2 parents 36d1cc0 + 4a8c8da
commit e629285
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 35 deletions.
diff --git a/Statistics/Correlation.hs b/Statistics/Correlation.hs
@@ -6,9 +6,11 @@
 module Statistics.Correlation
     ( -- * Pearson correlation
       pearson
+    , pearson2
     , pearsonMatByRow
       -- * Spearman correlation
     , spearman
+    , spearman2
     , spearmanMatByRow
     ) where
 
@@ -25,11 +27,18 @@ import Statistics.Test.Internal (rankUnsorted)
 
 -- | Pearson correlation for sample of pairs. Exactly same as
 -- 'Statistics.Sample.correlation'
-pearson :: (G.Vector v (Double, Double), G.Vector v Double)
+pearson :: (G.Vector v (Double, Double))
         => v (Double, Double) -> Double
 pearson = correlation
 {-# INLINE pearson #-}
 
+-- | Pearson correlation for sample of pairs. Exactly same as
+-- 'Statistics.Sample.correlation'
+pearson2 :: (G.Vector v Double)
+         => v Double -> v Double -> Double
+pearson2 = correlation2
+{-# INLINE pearson2 #-}
+
 -- | Compute pairwise Pearson correlation between rows of a matrix
 pearsonMatByRow :: Matrix -> Matrix
 pearsonMatByRow m
@@ -43,15 +52,13 @@ pearsonMatByRow m
 -- Spearman
 ----------------------------------------------------------------
 
--- | compute Spearman correlation between two samples
+-- | Compute Spearman correlation between two samples
 spearman :: ( Ord a
             , Ord b
             , G.Vector v a
             , G.Vector v b
             , G.Vector v (a, b)
             , G.Vector v Int
-            , G.Vector v Double
-            , G.Vector v (Double, Double)
             , G.Vector v (Int, a)
             , G.Vector v (Int, b)
             )
@@ -64,6 +71,27 @@ spearman xy
     (x, y) = G.unzip xy
 {-# INLINE spearman #-}
 
+-- | Compute Spearman correlation between two samples. Samples must
+--   have same length.
+spearman2 :: ( Ord a
+            , Ord b
+            , G.Vector v a
+            , G.Vector v b
+            , G.Vector v Int
+            , G.Vector v (Int, a)
+            , G.Vector v (Int, b)
+            )
+         => v a
+         -> v b
+         -> Double
+spearman2 xs ys
+  | nx /= ny  = error "Statistics.Correlation.spearman2: samples must have same length"
+  | otherwise = pearson $ G.zip (rankUnsorted xs) (rankUnsorted ys)
+  where
+    nx = G.length xs
+    ny = G.length ys
+{-# INLINE spearman2 #-}
+
 -- | compute pairwise Spearman correlation between rows of a matrix
 spearmanMatByRow :: Matrix -> Matrix
 spearmanMatByRow

diff --git a/Statistics/Function.hs b/Statistics/Function.hs
@@ -76,8 +76,8 @@ indices a = G.enumFromTo 0 (G.length a - 1)
 {-# INLINE indices #-}
 
 -- | Zip a vector with its indices.
-indexed :: (G.Vector v e, G.Vector v Int, G.Vector v (Int,e)) => v e -> v (Int,e)
-indexed a = G.zip (indices a) a
+indexed :: (G.Vector v e, G.Vector v (Int,e)) => v e -> v (Int,e)
+indexed xs = G.imap (,) xs
 {-# INLINE indexed #-}
 
 data MM = MM {-# UNPACK #-} !Double {-# UNPACK #-} !Double

diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs
@@ -20,6 +20,7 @@ module Statistics.Sample
     , range
 
     -- * Statistics of location
+    , expectation
     , mean
     , welfordMean
     , meanWeighted
@@ -54,17 +55,20 @@ module Statistics.Sample
     -- * Joint distributions
     , covariance
     , correlation
+    , covariance2
+    , correlation2
     , pair
     -- * References
     -- $references
     ) where
 
-import Statistics.Function (minMax)
+import Statistics.Function (minMax,square)
 import Statistics.Sample.Internal (robustSumVar, sum)
 import Statistics.Types.Internal  (Sample,WeightedSample)
 import qualified Data.Vector as V
 import qualified Data.Vector.Generic as G
 import qualified Data.Vector.Unboxed as U
+import Numeric.Sum (kbn, Summation(zero,add))
 
 -- Operator ^ will be overridden
 import Prelude hiding ((^), sum)
@@ -76,9 +80,17 @@ range s = hi - lo
     where (lo , hi) = minMax s
 {-# INLINE range #-}
 
+-- | /O(n)/ Compute expectation of function over for sample. This is
+--   simply @mean . map f@ but won't create intermediate vector.
+expectation :: (G.Vector v a) => (a -> Double) -> v a -> Double
+expectation f xs = kbn (G.foldl' (\s -> add s . f) zero xs)
+                 / fromIntegral (G.length xs)
+{-# INLINE expectation #-}
+
 -- | /O(n)/ Arithmetic mean.  This uses Kahan-Babuška-Neumaier
 -- summation, so is more accurate than 'welfordMean' unless the input
--- values are very large.
+-- values are very large. This function is not subject to stream
+-- fusion.
 mean :: (G.Vector v Double) => v Double -> Double
 mean xs = sum xs / fromIntegral (G.length xs)
 {-# SPECIALIZE mean :: U.Vector Double -> Double #-}
@@ -122,7 +134,7 @@ harmonicMean = fini . G.foldl' go (T 0 0)
 
 -- | /O(n)/ Geometric mean of a sample containing no negative values.
 geometricMean :: (G.Vector v Double) => v Double -> Double
-geometricMean = exp . mean . G.map log
+geometricMean = exp . expectation log
 {-# INLINE geometricMean #-}
 
 -- | Compute the /k/th central moment of a sample.  The central moment
@@ -138,7 +150,7 @@ centralMoment a xs
     | a < 0  = error "Statistics.Sample.centralMoment: negative input"
     | a == 0 = 1
     | a == 1 = 0
-    | otherwise = sum (G.map go xs) / fromIntegral (G.length xs)
+    | otherwise = expectation go xs
   where
     go x = (x-m) ^ a
     m    = mean xs
@@ -354,42 +366,79 @@ fastStdDev = sqrt . fastVariance
 
 -- | Covariance of sample of pairs. For empty sample it's set to
 --   zero
-covariance :: (G.Vector v (Double,Double), G.Vector v Double)
+covariance :: (G.Vector v (Double,Double))
            => v (Double,Double)
            -> Double
 covariance xy
   | n == 0    = 0
-  | otherwise = mean $ G.zipWith (*)
-                         (G.map (\x -> x - muX) xs)
-                         (G.map (\y -> y - muY) ys)
+  | otherwise = expectation (\(x,y) -> (x - muX)*(y - muY)) xy
   where
-    n       = G.length xy
-    (xs,ys) = G.unzip xy
-    muX     = mean xs
-    muY     = mean ys
+    n   = G.length xy
+    muX = expectation fst xy
+    muY = expectation snd xy
 {-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-}
 {-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-}
 
 -- | Correlation coefficient for sample of pairs. Also known as
 --   Pearson's correlation. For empty sample it's set to zero.
-correlation :: (G.Vector v (Double,Double), G.Vector v Double)
+correlation :: (G.Vector v (Double,Double))
            => v (Double,Double)
            -> Double
 correlation xy
   | n == 0    = 0
   | otherwise = cov / sqrt (varX * varY)
   where
-    n       = G.length xy
-    (xs,ys) = G.unzip xy
-    (muX,varX) = meanVariance xs
-    (muY,varY) = meanVariance ys
-    cov = mean $ G.zipWith (*)
-            (G.map (\x -> x - muX) xs)
-            (G.map (\y -> y - muY) ys)
+    n    = G.length xy
+    muX  = expectation (\(x,_) -> x) xy
+    muY  = expectation (\(_,y) -> y) xy
+    varX = expectation (\(x,_) -> square (x - muX))    xy
+    varY = expectation (\(_,y) -> square (y - muY))    xy
+    cov  = expectation (\(x,y) -> (x - muX)*(y - muY)) xy
 {-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-}
 {-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-}
 
 
+-- | Covariance of two samples. Both vectors must be of the same
+--   length. If both are empty it's set to zero
+covariance2 :: (G.Vector v Double)
+           => v Double
+           -> v Double
+           -> Double
+covariance2 xs ys
+  | nx /= ny  = error $ "Statistics.Sample.covariance2: both samples must have same length"
+  | nx == 0   = 0
+  | otherwise = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys)
+              / fromIntegral nx
+  where
+    nx  = G.length xs
+    ny  = G.length ys
+    muX = mean xs
+    muY = mean ys
+{-# SPECIALIZE covariance2 :: U.Vector Double -> U.Vector Double -> Double #-}
+{-# SPECIALIZE covariance2 :: V.Vector Double -> V.Vector Double -> Double #-}
+
+-- | Correlation coefficient for two samples. Both vector must have
+--   same length Also known as Pearson's correlation. For empty sample
+--   it's set to zero.
+correlation2 :: (G.Vector v Double)
+             => v Double
+             -> v Double
+             -> Double
+correlation2 xs ys
+  | nx /= ny  = error $ "Statistics.Sample.correlation2: both samples must have same length"
+  | nx == 0   = 0
+  | otherwise = cov / sqrt (varX * varY)
+  where
+    nx         = G.length xs
+    ny         = G.length ys
+    (muX,varX) = meanVariance xs
+    (muY,varY) = meanVariance ys
+    cov = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys)
+        / fromIntegral nx
+{-# SPECIALIZE correlation2 :: U.Vector Double -> U.Vector Double -> Double #-}
+{-# SPECIALIZE correlation2 :: V.Vector Double -> V.Vector Double -> Double #-}
+
+
 -- | Pair two samples. It's like 'G.zip' but requires that both
 --   samples have equal size.
 pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b)

diff --git a/Statistics/Test/Internal.hs b/Statistics/Test/Internal.hs
@@ -8,6 +8,7 @@ module Statistics.Test.Internal (
 import Data.Ord
 import           Data.Vector.Generic           ((!))
 import qualified Data.Vector.Generic         as G
+import qualified Data.Vector.Unboxed         as U
 import qualified Data.Vector.Generic.Mutable as M
 import Statistics.Function
 
@@ -33,10 +34,10 @@ data Rank v a = Rank {
 --
 -- >>> rank (==) (VU.fromList [10,10,10,30::Int])
 -- [2.0,2.0,2.0,4.0]
-rank :: (G.Vector v a, G.Vector v Double)
+rank :: (G.Vector v a)
      => (a -> a -> Bool)        -- ^ Equivalence relation
      -> v a                     -- ^ Vector to rank
-     -> v Double
+     -> U.Vector Double
 rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
   where
     go (Rank 0 _ r v)
@@ -59,11 +60,10 @@ rank eq vec = G.unfoldr go (Rank 0 (-1) 1 vec)
 rankUnsorted :: ( Ord a
                 , G.Vector v a
                 , G.Vector v Int
-                , G.Vector v Double
                 , G.Vector v (Int, a)
                 )
              => v a
-             -> v Double
+             -> U.Vector Double
 rankUnsorted xs = G.create $ do
     -- Put ranks into their original positions
     -- NOTE: backpermute will do wrong thing

diff --git a/Statistics/Test/StudentT.hs b/Statistics/Test/StudentT.hs
@@ -71,7 +71,7 @@ welchTTest test sample1 sample2
 -- | Paired two-sample t-test. Two samples are paired in a
 -- within-subject design. Returns @Nothing@ if sample size is not
 -- sufficient.
-pairedTTest :: forall v. (G.Vector v (Double, Double), G.Vector v Double)
+pairedTTest :: forall v. (G.Vector v (Double, Double))
             => PositionTest          -- ^ one- or two-tailed test
             -> v (Double, Double)    -- ^ paired samples
             -> Maybe (Test StudentT)

diff --git a/benchmark/bench.hs b/benchmark/bench.hs
@@ -39,7 +39,11 @@ main =
     , bench "varianceUnbiased" $ nf (\x -> varianceUnbiased x) sample
     , bench "varianceWeighted" $ nf (\x -> varianceWeighted x) sampleW
       -- Correlation
-    , bench "pearson"          $ nf pearson sampleW
+    , bench "pearson"          $ nf pearson     sampleW
+    , bench "covariance"       $ nf covariance  sampleW
+    , bench "correlation"      $ nf correlation sampleW
+    , bench "covariance2"      $ nf (covariance2  sample) sample
+    , bench "correlation2"     $ nf (correlation2 sample) sample
       -- Other
     , bench "stdDev"           $ nf (\x -> stdDev x)           sample
     , bench "skewness"         $ nf (\x -> skewness x)         sample

diff --git a/changelog.md b/changelog.md
@@ -1,6 +1,17 @@
-## Changes in NEXT_VERSIONS
+## Changes in 0.16.3.0
 
- * Computation of `rSquare` has special case for case when data variation is 0.
+ * `S.Sample.correlation`, `S.Sample.covariance`,
+   `S.Correlation.pearson` do not allocate temporary arrays.
+
+ * Variants of correlation which take two vectors as input are added:
+   `S.Sample.correlation2`, `S.Sample.covariance2`, `S.Correlation.pearson2`,
+   `S.Correlation.spearman2`.
+
+ * Contexts for `S.Function.indexed`, `S.Correlation.spearman`, `S.pairedTTest`,
+   `S.Sample.correlation`, `S.Sample.covariance`, reduced.
+
+ * Computation of `rSquare` in linear regression has special case for case when
+   data variation is 0.
 
  * Doctests added.
 

diff --git a/statistics.cabal b/statistics.cabal
@@ -2,7 +2,7 @@ cabal-version:  3.0
 build-type:     Simple
 
 name:           statistics
-version:        0.16.2.1
+version:        0.16.3.0
 synopsis:       A library of statistical types, data, and functions
 description:
   This library provides a number of common functions and types useful