From c4ba123b563607a432303144fbf6cd2cf5d65643 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Fri, 26 Mar 2021 12:42:05 +0100 Subject: [PATCH 01/27] Cooks Distance tests --- src/GLM.jl | 3 ++- src/lm.jl | 21 +++++++++++++++++++++ test/runtests.jl | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/GLM.jl b/src/GLM.jl index 54e841a9..940673de 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -17,7 +17,8 @@ module GLM import SpecialFunctions: erfc, erfcinv, digamma, trigamma export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, - fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr² + fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², + cooksdistance export # types diff --git a/src/lm.jl b/src/lm.jl index b2e94332..e2cf64a0 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -243,3 +243,24 @@ function confint(obj::LinearModel; level::Real=0.95) hcat(coef(obj),coef(obj)) + stderror(obj) * quantile(TDist(dof_residual(obj)), (1. - level)/2.) * [1. -1.] end + +""" + cooksdistance(obj::LinearModel) +Compute Cook's distance for each observation, an estimate of the influence of each data point. +Credit to Tyler Beason https://github.com/tbeason +""" +function cooksdistance(obj::LinearModel) + u = residuals(obj) + mse = dispersion(obj,true) + k = dof(obj)-1 + X = modelmatrix(obj) + wts = obj.rr.wts + if isempty(wts) + hii = diag(X * inv(X' * X) * X') + else + W = Diagonal(wts) + hii = diag(X * inv(X' * W * X) * X' * W) + end + D = u.^2 .* (hii ./ (1 .- hii).^2) ./ (k*mse) + return D +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 699fc3eb..6fcaefbe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -49,6 +49,23 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y @test coef(lm3) == coef(lm4) ≈ [11, 1, 2, 3, 4] end +@testset "Linear Model Cooks Distance - refers to PR #368 and issue #414" begin + st_df = DataFrame( + Y=[1.3, 2.3, 5.3, 10., 7.2 , 6.3], + XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], + XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + W=[0.1, 0.2, 0.2, 0.1, 0.2, 0.2], + CooksD=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], + CooksDW=[0.7371619575, 30.979165951, 0.0112480511, 0.5168548491, 0.0600217986, 0.1270980264] ) + + t_lm = lm(@formula(Y ~ XA + XB), st_df) + @show(r2(t_lm)) + @test isapprox(st_df.CooksD, cooksdistance(t_lm)) + t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) + @show(r2(t_lm_w)) + @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) +end + @testset "linear model with weights" begin df = dataset("quantreg", "engel") N = nrow(df) From f66d1d7802384ceb34fb33b2923a4f01931c7e67 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Fri, 26 Mar 2021 17:51:34 +0100 Subject: [PATCH 02/27] updated to couple with StatsModels --- src/lm.jl | 8 +++++++- test/runtests.jl | 4 +--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index e2cf64a0..d07ccb4b 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -263,4 +263,10 @@ function cooksdistance(obj::LinearModel) end D = u.^2 .* (hii ./ (1 .- hii).^2) ./ (k*mse) return D -end \ No newline at end of file +end + +using StatsModels + +function cooksdistance(obj :: StatsModels.TableRegressionModel) + cooksdistance(obj.model) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 6fcaefbe..a536e54a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -57,12 +57,10 @@ end W=[0.1, 0.2, 0.2, 0.1, 0.2, 0.2], CooksD=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], CooksDW=[0.7371619575, 30.979165951, 0.0112480511, 0.5168548491, 0.0600217986, 0.1270980264] ) - + t_lm = lm(@formula(Y ~ XA + XB), st_df) - @show(r2(t_lm)) @test isapprox(st_df.CooksD, cooksdistance(t_lm)) t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) - @show(r2(t_lm_w)) @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) end From b680e2a60f97c4bb5861e428ec0f818608ab753e Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Sat, 27 Mar 2021 00:49:47 +0100 Subject: [PATCH 03/27] using StatsModels etc. --- .gitignore | 1 + src/lm.jl | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cf473afd..c29577fb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ docs/build docs/site docs/Manifest.toml +.vscode/settings.json diff --git a/src/lm.jl b/src/lm.jl index e2cf64a0..d07ccb4b 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -263,4 +263,10 @@ function cooksdistance(obj::LinearModel) end D = u.^2 .* (hii ./ (1 .- hii).^2) ./ (k*mse) return D -end \ No newline at end of file +end + +using StatsModels + +function cooksdistance(obj :: StatsModels.TableRegressionModel) + cooksdistance(obj.model) +end \ No newline at end of file From deb4f3852b2cc2f5bec09ec9bc96b66ef4c19667 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Sat, 27 Mar 2021 00:57:29 +0100 Subject: [PATCH 04/27] no weights --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 6fcaefbe..f3d28b55 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -61,9 +61,9 @@ end t_lm = lm(@formula(Y ~ XA + XB), st_df) @show(r2(t_lm)) @test isapprox(st_df.CooksD, cooksdistance(t_lm)) - t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) - @show(r2(t_lm_w)) - @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) + # t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) + # @show(r2(t_lm_w)) + # @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) end @testset "linear model with weights" begin From 6d13db3c0a880e0b5dee5167761a645cf6e8e21f Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Sat, 27 Mar 2021 00:57:56 +0100 Subject: [PATCH 05/27] no weights --- test/runtests.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index f3d28b55..a4cfddec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -59,10 +59,8 @@ end CooksDW=[0.7371619575, 30.979165951, 0.0112480511, 0.5168548491, 0.0600217986, 0.1270980264] ) t_lm = lm(@formula(Y ~ XA + XB), st_df) - @show(r2(t_lm)) @test isapprox(st_df.CooksD, cooksdistance(t_lm)) # t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) - # @show(r2(t_lm_w)) # @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) end From 1ec1d71a1f7b3058ece08e0094e6c01e6697726d Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Mon, 29 Mar 2021 17:25:47 +0200 Subject: [PATCH 06/27] added normalised weights support --- src/lm.jl | 19 +++++++++++++++---- test/runtests.jl | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index d07ccb4b..9522087d 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -247,26 +247,37 @@ end """ cooksdistance(obj::LinearModel) Compute Cook's distance for each observation, an estimate of the influence of each data point. +Currently only model without weights are supported Credit to Tyler Beason https://github.com/tbeason """ function cooksdistance(obj::LinearModel) u = residuals(obj) mse = dispersion(obj,true) k = dof(obj)-1 + d_res = dof_residual(obj) X = modelmatrix(obj) wts = obj.rr.wts if isempty(wts) hii = diag(X * inv(X' * X) * X') else - W = Diagonal(wts) - hii = diag(X * inv(X' * W * X) * X' * W) + if mean(wts) == 1. + u = @. u * sqrt(wts) + W = Diagonal(wts) + hii = diag(X * inv(X' * W * X) * X' * W) + else + error("Currently only support LinearModel without weight or with normalised weights `wts= wts ./ mean(wts)`.") + end end - D = u.^2 .* (hii ./ (1 .- hii).^2) ./ (k*mse) + D = @. u^2 * (hii / (1 - hii)^2) / (k*mse) return D end using StatsModels function cooksdistance(obj :: StatsModels.TableRegressionModel) - cooksdistance(obj.model) + if isa(obj.model , LinearModel) + cooksdistance(obj.model) + else + error("cooksdistance not implemented for model type: ", typeof(obj.model)) + end end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index a536e54a..5c8040cf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -54,7 +54,7 @@ end Y=[1.3, 2.3, 5.3, 10., 7.2 , 6.3], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], - W=[0.1, 0.2, 0.2, 0.1, 0.2, 0.2], + W=[0.6, 1.2, 1.2, 0.6, 1.2, 1.2], CooksD=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], CooksDW=[0.7371619575, 30.979165951, 0.0112480511, 0.5168548491, 0.0600217986, 0.1270980264] ) From fec1415d8b09dd7a010dd21c6a25038872e93c30 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Mon, 29 Mar 2021 17:36:58 +0200 Subject: [PATCH 07/27] added reference --- src/lm.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lm.jl b/src/lm.jl index 9522087d..f137b611 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -247,7 +247,8 @@ end """ cooksdistance(obj::LinearModel) Compute Cook's distance for each observation, an estimate of the influence of each data point. -Currently only model without weights are supported +Currently only model without weights or with normalised weights are supported. +reference: https://en.wikipedia.org/wiki/Cook%27s_distance Credit to Tyler Beason https://github.com/tbeason """ function cooksdistance(obj::LinearModel) From 0c3c50f212a40bd330ebd42acc45c808f4798353 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Mon, 29 Mar 2021 17:52:19 +0200 Subject: [PATCH 08/27] add docstring to wrapper --- src/lm.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index f137b611..10d275de 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -261,7 +261,7 @@ function cooksdistance(obj::LinearModel) if isempty(wts) hii = diag(X * inv(X' * X) * X') else - if mean(wts) == 1. + if mean(wts) == 1. # assume that the weights are normalised if the mean is 1 u = @. u * sqrt(wts) W = Diagonal(wts) hii = diag(X * inv(X' * W * X) * X' * W) @@ -275,10 +275,14 @@ end using StatsModels +""" + cooksdistance(obj :: StatsModels.TableRegressionModel) +wrapper from TableRegressionModel to LinearModel. Will produce error when use with something else than LinearModel. +""" function cooksdistance(obj :: StatsModels.TableRegressionModel) if isa(obj.model , LinearModel) cooksdistance(obj.model) else error("cooksdistance not implemented for model type: ", typeof(obj.model)) end -end \ No newline at end of file +end \ No newline at end of file From 463aeb122463486069380f655ea966d475cc1dde Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 29 Mar 2021 22:04:03 +0200 Subject: [PATCH 09/27] Update src/lm.jl Co-authored-by: Milan Bouchet-Valat --- src/lm.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index 10d275de..63c33bf3 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -246,10 +246,11 @@ end """ cooksdistance(obj::LinearModel) -Compute Cook's distance for each observation, an estimate of the influence of each data point. -Currently only model without weights or with normalised weights are supported. -reference: https://en.wikipedia.org/wiki/Cook%27s_distance -Credit to Tyler Beason https://github.com/tbeason + +Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) +for each observation in linear model `obj`, giving an estimate of the influence +of each data point. +Currently only models without weights or with normalised weights are supported. """ function cooksdistance(obj::LinearModel) u = residuals(obj) @@ -285,4 +286,4 @@ function cooksdistance(obj :: StatsModels.TableRegressionModel) else error("cooksdistance not implemented for model type: ", typeof(obj.model)) end -end \ No newline at end of file +end From 253c503c59524cf9554474143eaf81ad62e26ce0 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Sun, 4 Apr 2021 00:15:22 +0200 Subject: [PATCH 10/27] base, no intercept and collinear test amended --- test/runtests.jl | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5c8040cf..041a85ad 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,17 +51,26 @@ end @testset "Linear Model Cooks Distance - refers to PR #368 and issue #414" begin st_df = DataFrame( - Y=[1.3, 2.3, 5.3, 10., 7.2 , 6.3], + Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], W=[0.6, 1.2, 1.2, 0.6, 1.2, 1.2], - CooksD=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], - CooksDW=[0.7371619575, 30.979165951, 0.0112480511, 0.5168548491, 0.0600217986, 0.1270980264] ) + CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], + CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], + CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ] ) + + # linear regression + t_lm_base = lm(@formula(Y ~ XA), st_df) + @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_base)) + + # linear regression, no intercept + t_lm_noint = lm(@formula(Y ~ XA +0), st_df) + @test isapprox(st_df.CooksD_noint, cooksdistance(t_lm_noint)) + + # linear regression, two collinear variables (Variance inflation factor ≊ 250) + t_lm_multi = lm(@formula(Y ~ XA + XB), st_df) + @test isapprox(st_df.CooksD_multi, cooksdistance(t_lm_multi)) - t_lm = lm(@formula(Y ~ XA + XB), st_df) - @test isapprox(st_df.CooksD, cooksdistance(t_lm)) - t_lm_w = lm(@formula(Y ~ XA + XB), st_df, wts = st_df.W) - @test isapprox(st_df.CooksDW, cooksdistance(t_lm_w)) end @testset "linear model with weights" begin From 66d1b1ec4b8e498a5531f9405eff4efbf994da63 Mon Sep 17 00:00:00 2001 From: ericqu Date: Sun, 4 Apr 2021 00:55:06 +0200 Subject: [PATCH 11/27] Delete .gitignore only on my local machine --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c29577fb..00000000 --- a/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -docs/build -docs/site -docs/Manifest.toml -.vscode/settings.json From d5b8c0dddaf7da1c5203629ca1353f76789a6876 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Mon, 5 Apr 2021 01:00:05 +0200 Subject: [PATCH 12/27] collinear v2 --- test/runtests.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 041a85ad..ba090066 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -54,10 +54,12 @@ end Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], + XC=[3., 13., 23., 39.8, 34., 31.], W=[0.6, 1.2, 1.2, 0.6, 1.2, 1.2], CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], - CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ] ) + CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ], + ) # linear regression t_lm_base = lm(@formula(Y ~ XA), st_df) @@ -71,6 +73,10 @@ end t_lm_multi = lm(@formula(Y ~ XA + XB), st_df) @test isapprox(st_df.CooksD_multi, cooksdistance(t_lm_multi)) + # linear regression, two full collinear variables (XA = 2 XC) hence should get the same results as base + t_lm_colli = lm(@formula(Y ~ XA + XC), st_df) + @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) + end @testset "linear model with weights" begin From e7af4560b4afbc8dac782632964195962089de7c Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Mon, 5 Apr 2021 01:06:13 +0200 Subject: [PATCH 13/27] no cook's distance when there are some weights. --- .vscode/settings.json | 3 +++ src/lm.jl | 8 +------- 2 files changed, 4 insertions(+), 7 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..1d39f5dd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "julia.environmentPath": "/home/eric/Repos/GLM.jl" +} \ No newline at end of file diff --git a/src/lm.jl b/src/lm.jl index 63c33bf3..5ac2e69b 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -262,13 +262,7 @@ function cooksdistance(obj::LinearModel) if isempty(wts) hii = diag(X * inv(X' * X) * X') else - if mean(wts) == 1. # assume that the weights are normalised if the mean is 1 - u = @. u * sqrt(wts) - W = Diagonal(wts) - hii = diag(X * inv(X' * W * X) * X' * W) - else - error("Currently only support LinearModel without weight or with normalised weights `wts= wts ./ mean(wts)`.") - end + error("Currently only support LinearModel without weight/frequency.") end D = @. u^2 * (hii / (1 - hii)^2) / (k*mse) return D From c8dac94d41d84e91d3acefb4adc7522a3844f9bc Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 6 Apr 2021 00:06:21 +0200 Subject: [PATCH 14/27] Delete settings.json --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 1d39f5dd..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "julia.environmentPath": "/home/eric/Repos/GLM.jl" -} \ No newline at end of file From e283b0ebdb160eebb10ea1a29ffac1859cb930c8 Mon Sep 17 00:00:00 2001 From: Eric QUERE Date: Tue, 6 Apr 2021 00:35:58 +0200 Subject: [PATCH 15/27] cooksdistance definition --- src/lm.jl | 16 +--------------- test/runtests.jl | 10 +++++----- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index 5ac2e69b..455eabc0 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -250,7 +250,7 @@ end Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation in linear model `obj`, giving an estimate of the influence of each data point. -Currently only models without weights or with normalised weights are supported. +Currently only implemented for LinearModel models without weights. """ function cooksdistance(obj::LinearModel) u = residuals(obj) @@ -267,17 +267,3 @@ function cooksdistance(obj::LinearModel) D = @. u^2 * (hii / (1 - hii)^2) / (k*mse) return D end - -using StatsModels - -""" - cooksdistance(obj :: StatsModels.TableRegressionModel) -wrapper from TableRegressionModel to LinearModel. Will produce error when use with something else than LinearModel. -""" -function cooksdistance(obj :: StatsModels.TableRegressionModel) - if isa(obj.model , LinearModel) - cooksdistance(obj.model) - else - error("cooksdistance not implemented for model type: ", typeof(obj.model)) - end -end diff --git a/test/runtests.jl b/test/runtests.jl index ba090066..9449d5d5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -55,11 +55,10 @@ end XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], - W=[0.6, 1.2, 1.2, 0.6, 1.2, 1.2], CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ], - ) + ) # linear regression t_lm_base = lm(@formula(Y ~ XA), st_df) @@ -73,9 +72,10 @@ end t_lm_multi = lm(@formula(Y ~ XA + XB), st_df) @test isapprox(st_df.CooksD_multi, cooksdistance(t_lm_multi)) - # linear regression, two full collinear variables (XA = 2 XC) hence should get the same results as base - t_lm_colli = lm(@formula(Y ~ XA + XC), st_df) - @test isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) + # linear regression, two full collinear variables (XC = 2 XA) hence should get the same results as base + t_lm_colli = lm(@formula(Y ~ XA + XC), st_df, dropcollinear=true) + ## Currently the test fails as the collinear variable is not droped from a ```modelmatrix(obj)``` call. + @test_throws SingularException isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) end From 406f976a8460186168bb5790c95dc74606c3bf5c Mon Sep 17 00:00:00 2001 From: ericqu Date: Fri, 23 Apr 2021 19:02:16 +0200 Subject: [PATCH 16/27] Update src/lm.jl Co-authored-by: Phillip Alday --- src/lm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm.jl b/src/lm.jl index 455eabc0..caa90a6b 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -252,7 +252,7 @@ for each observation in linear model `obj`, giving an estimate of the influence of each data point. Currently only implemented for LinearModel models without weights. """ -function cooksdistance(obj::LinearModel) +function StatsBase.cooksdistance(obj::LinearModel) u = residuals(obj) mse = dispersion(obj,true) k = dof(obj)-1 From 6a57a341b5fb93042f6816446cdb09945d103e79 Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 26 Apr 2021 11:07:46 +0200 Subject: [PATCH 17/27] Create .gitignore reverting removal --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..cf473afd --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +docs/build +docs/site +docs/Manifest.toml From 58f9f38875fbea980b87c5790e6439c63736bd50 Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 26 Apr 2021 11:08:47 +0200 Subject: [PATCH 18/27] Update src/lm.jl Co-authored-by: Milan Bouchet-Valat --- src/lm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm.jl b/src/lm.jl index caa90a6b..081c36e0 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -262,7 +262,7 @@ function StatsBase.cooksdistance(obj::LinearModel) if isempty(wts) hii = diag(X * inv(X' * X) * X') else - error("Currently only support LinearModel without weight/frequency.") + throw(ArgumentError("Weighted models are not currently supported.")) end D = @. u^2 * (hii / (1 - hii)^2) / (k*mse) return D From d503aa4b2308d013950d76cc0e2c4b4abc65051e Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 26 Apr 2021 11:09:58 +0200 Subject: [PATCH 19/27] Update Project.toml reflect dependency with StatsBase --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 45269bc9..695a0a0c 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,7 @@ Distributions = "0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24" RDatasets = "0.5, 0.6, 0.7" Reexport = "0.1, 0.2, 1.0" SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1" -StatsBase = "0.30, 0.31, 0.32, 0.33" +StatsBase = "0.33.5" StatsFuns = "0.6, 0.7, 0.8, 0.9" StatsModels = "0.6" julia = "1" From 6d0fa1896f560d224109c85bc0a25a7e8419625a Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 26 Apr 2021 11:13:56 +0200 Subject: [PATCH 20/27] Update Project.toml add dependency on latest version of StatsModel --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 695a0a0c..08293086 100644 --- a/Project.toml +++ b/Project.toml @@ -25,7 +25,7 @@ Reexport = "0.1, 0.2, 1.0" SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1" StatsBase = "0.33.5" StatsFuns = "0.6, 0.7, 0.8, 0.9" -StatsModels = "0.6" +StatsModels = "0.6.22" julia = "1" [extras] From b1ab4d397badfc707a658dad8bf234a8959ca40a Mon Sep 17 00:00:00 2001 From: ericqu Date: Mon, 26 Apr 2021 17:20:56 +0200 Subject: [PATCH 21/27] Update lm.jl implementing @palday suggestion --- src/lm.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lm.jl b/src/lm.jl index 081c36e0..77b2633d 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -259,8 +259,9 @@ function StatsBase.cooksdistance(obj::LinearModel) d_res = dof_residual(obj) X = modelmatrix(obj) wts = obj.rr.wts + cmm = crossmodelmatrix(obj) if isempty(wts) - hii = diag(X * inv(X' * X) * X') + hii = diag(X * inv(cmm) * X') else throw(ArgumentError("Weighted models are not currently supported.")) end From 2c3ac21a0b2b8c417fee7445de3438cfbb2d52a9 Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 27 Apr 2021 10:50:18 +0200 Subject: [PATCH 22/27] Update test/runtests.jl Co-authored-by: Phillip Alday --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 9449d5d5..25cfdaab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -72,7 +72,8 @@ end t_lm_multi = lm(@formula(Y ~ XA + XB), st_df) @test isapprox(st_df.CooksD_multi, cooksdistance(t_lm_multi)) - # linear regression, two full collinear variables (XC = 2 XA) hence should get the same results as base + # linear regression, two full collinear variables (XC = 2 XA) hence should get the same results as the original + # after pivoting t_lm_colli = lm(@formula(Y ~ XA + XC), st_df, dropcollinear=true) ## Currently the test fails as the collinear variable is not droped from a ```modelmatrix(obj)``` call. @test_throws SingularException isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) From e49f88f265368a0fdd2a6cdce94b3914a94f2105 Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 27 Apr 2021 10:51:06 +0200 Subject: [PATCH 23/27] Update src/lm.jl Co-authored-by: Phillip Alday --- src/lm.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index 77b2633d..2413d416 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -257,11 +257,11 @@ function StatsBase.cooksdistance(obj::LinearModel) mse = dispersion(obj,true) k = dof(obj)-1 d_res = dof_residual(obj) - X = modelmatrix(obj) + X, XtX = crossmodelmatrix(obj) + k == size(X,2) || throw(ArgumentError("Models with collinear terms are not currently supported.")) wts = obj.rr.wts - cmm = crossmodelmatrix(obj) if isempty(wts) - hii = diag(X * inv(cmm) * X') + hii = diag(X * inv(XtX) * X') else throw(ArgumentError("Weighted models are not currently supported.")) end From b0fb18bad9c917a31f4e43a0fc39764f344102d7 Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 27 Apr 2021 10:51:34 +0200 Subject: [PATCH 24/27] Update test/runtests.jl Co-authored-by: Phillip Alday --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 25cfdaab..23dbd605 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -76,7 +76,7 @@ end # after pivoting t_lm_colli = lm(@formula(Y ~ XA + XC), st_df, dropcollinear=true) ## Currently the test fails as the collinear variable is not droped from a ```modelmatrix(obj)``` call. - @test_throws SingularException isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) + @test_throws ArgumentError isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) end From 0a4581fa0c0f246448d20c7d336fbcf538ddfc7d Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 27 Apr 2021 12:37:31 +0200 Subject: [PATCH 25/27] add test values provenance. added sources of values used for testing. --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index 23dbd605..700e6a10 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -55,6 +55,7 @@ end XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], XB=[1.8, 7.8, 11.8, 20.5, 17.3, 15.8], XC=[3., 13., 23., 39.8, 34., 31.], + # values from SAS proc reg CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ], From 86b6a35fba01f6914fa0f833f350b44b6b5ffc9c Mon Sep 17 00:00:00 2001 From: ericqu Date: Tue, 27 Apr 2021 13:25:08 +0200 Subject: [PATCH 26/27] Update src/lm.jl Co-authored-by: Phillip Alday --- src/lm.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lm.jl b/src/lm.jl index 2413d416..c7686655 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -257,7 +257,8 @@ function StatsBase.cooksdistance(obj::LinearModel) mse = dispersion(obj,true) k = dof(obj)-1 d_res = dof_residual(obj) - X, XtX = crossmodelmatrix(obj) + X = modelmatrix(obj) + XtX = crossmodelmatrix(obj) k == size(X,2) || throw(ArgumentError("Models with collinear terms are not currently supported.")) wts = obj.rr.wts if isempty(wts) From fc70f790efc6bddefcb89d795a47be7148928755 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 28 Apr 2021 17:38:55 +0200 Subject: [PATCH 27/27] Apply suggestions from code review --- src/lm.jl | 2 +- test/runtests.jl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lm.jl b/src/lm.jl index c7686655..cf6dd691 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -250,7 +250,7 @@ end Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation in linear model `obj`, giving an estimate of the influence of each data point. -Currently only implemented for LinearModel models without weights. +Currently only implemented for linear models without weights. """ function StatsBase.cooksdistance(obj::LinearModel) u = residuals(obj) diff --git a/test/runtests.jl b/test/runtests.jl index 700e6a10..cc14aa86 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -49,7 +49,7 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y @test coef(lm3) == coef(lm4) ≈ [11, 1, 2, 3, 4] end -@testset "Linear Model Cooks Distance - refers to PR #368 and issue #414" begin +@testset "Linear Model Cook's Distance" begin st_df = DataFrame( Y=[6.4, 7.4, 10.4, 15.1, 12.3 , 11.4], XA=[1.5, 6.5, 11.5, 19.9, 17.0, 15.5], @@ -58,7 +58,7 @@ end # values from SAS proc reg CooksD_base=[1.4068501943, 0.176809102, 0.0026655177, 1.0704009915, 0.0875726457, 0.1331183932], CooksD_noint=[0.0076891801, 0.0302993877, 0.0410262965, 0.0294348488, 0.0691589296, 0.0273045538], - CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157 ], + CooksD_multi=[1.7122291956, 18.983407026, 0.000118078, 0.8470797843, 0.0715921999, 0.1105843157], ) # linear regression @@ -76,7 +76,7 @@ end # linear regression, two full collinear variables (XC = 2 XA) hence should get the same results as the original # after pivoting t_lm_colli = lm(@formula(Y ~ XA + XC), st_df, dropcollinear=true) - ## Currently the test fails as the collinear variable is not droped from a ```modelmatrix(obj)``` call. + # Currently fails as the collinear variable is not dropped from `modelmatrix(obj)` @test_throws ArgumentError isapprox(st_df.CooksD_base, cooksdistance(t_lm_colli)) end