-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproportionTest.plg
154 lines (136 loc) · 6.15 KB
/
proportionTest.plg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
% Copyright (c) 2011 Aubrey Barnard. This is free software. See
% LICENSE for details.
% Statistical tests of two proportions.
% Binomial distribution PDF.
%
% Assumes the arguments are valid (NumberSuccesses >= 0, NumberTrials >=
% NumberSuccesses, 0 =< SuccessProb =< 1).
%
% Here is the math for the following code.
% B(k; n, p) = n! / (k! * (n - k)!) * p^k * (1 - p)^(n - k) [1]
%
% log(B(k; n, p)) = log(n!) - log(k! * (n - k)!) + log(p^k * (1 - p)^(n - k)) [2]
% = log(n!) - (log(k!) + log((n - k)!)) + log(p^k) + log((1 - p)^(n - k))
% = log(n!) - log(k!) - log((n - k)!) + k * log(p) + (n - k) * log(1 - p)
% = log(gamma(n + 1)) - log(gamma(k + 1)) - log(gamma(n - k + 1)) + k * log(p) + (n - k) * log(1 - p)
% = lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1) + k * log(p) + (n - k) * log(1 - p)
%
% B(k; n, p) = exp(lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1) + k * log(p) + (n - k) * log(1 - p))
%
% [1] http://en.wikipedia.org/wiki/Binomial_distribution
% [2] Credit for the log approach goes to SciPy.
binomial(NumberSuccesses, NumberTrials, SuccessProb, Probability) :-
Probability is exp(
lgamma(NumberTrials + 1) -
lgamma(NumberSuccesses + 1) -
lgamma(NumberTrials - NumberSuccesses + 1) +
NumberSuccesses * log(SuccessProb) +
(NumberTrials - NumberSuccesses) * log(1.0 - SuccessProb)
).
% Binomial distribution integral (like CDF, but for arbitrary start and
% end points). Uses sums, not incomplete beta function. End points
% included. Assumes arguments are valid (Start >= 0, End >= Start,
% NumberTrials > 0, 0 =< SuccessProb =< 1).
% This clause exists so that binomial_test_twotailed doesn't have to be
% more careful about bounds.
binomial_integral(Start, End, _, _, 0.0) :-
integer(Start),
integer(End),
Start > End,
!.
% All the mass is on zero successes
binomial_integral(_, _, _, 0.0, 1.0) :-
!.
% All the mass is on all successes
binomial_integral(_, End, End, 1.0, 1.0) :-
!.
binomial_integral(_, _, _, 1.0, 0.0) :-
!.
% Calculate
binomial_integral(Start, End, NumberTrials, SuccessProb, ProbabilitySum) :-
integer(Start),
integer(End),
Start =< End,
NewEnd is End + 1,
binomial_integral_loop(Start, NewEnd, NumberTrials, SuccessProb, 0.0, ProbabilitySum),
!.
binomial_integral_loop(X, X, _, _, P, P).
binomial_integral_loop(Start, End, NumberTrials, SuccessProb, InitialProbabilitySum, ProbabilitySum) :-
binomial(Start, NumberTrials, SuccessProb, PointProbability),
NewStart is Start + 1,
NewProbabilitySum is InitialProbabilitySum + PointProbability,
!,
binomial_integral_loop(NewStart, End, NumberTrials, SuccessProb, NewProbabilitySum, ProbabilitySum).
% Calculates the p-value for a two-tailed binomial test with the given
% arguments.
binomial_test_twotailed(NumberSuccesses, NumberTrials, SuccessProb, PValue) :-
% Calculate the effect size and the other limit
Mean is SuccessProb * NumberTrials,
OppositeSuccesses is 2.0 * Mean - NumberSuccesses,
% Sort out which are the left and right limits
binomial_test_twotailed_sort_limits(NumberSuccesses, OppositeSuccesses, LeftLimit, RightLimit),
% Calculate the values in the tails
binomial_integral(0, LeftLimit, NumberTrials, SuccessProb, LeftTail),
binomial_integral(RightLimit, NumberTrials, NumberTrials, SuccessProb, RightTail),
PValue is LeftTail + RightTail,
!.
binomial_test_twotailed_sort_limits(NumberSuccesses, OppositeSuccesses, LeftLimit, NumberSuccesses) :-
OppositeSuccesses < NumberSuccesses,
LeftLimit is integer(floor(OppositeSuccesses)).
binomial_test_twotailed_sort_limits(NumberSuccesses, OppositeSuccesses, NumberSuccesses, RightLimit) :-
RightLimit is integer(ceiling(OppositeSuccesses)).
% Calculates the p-value for a two-tailed binomial test for the given
% proportions versus the null hypothesis of equal proportions.
binomial_test(0, 0, 1.0).
binomial_test(Proportion1, Proportion2, PValue) :-
Total is Proportion1 + Proportion2,
binomial_test_twotailed(Proportion1, Total, 0.5, PValue),
!.
% Calculates the CDF for the chi-square distribution with one degree of
% freedom.
%
% Here is the math for the following code.
% gammai is the lower incomplete gamma function
% chisq_cdf(x; k) = gammai(k / 2, x / 2) / gamma(k / 2) [1]
% chisq_cdf(x; 1) = gammai(1 / 2, x / 2) / gamma(1 / 2)
% gammai(1 / 2, x) = sqrt(pi) * erf(sqrt(x)) [2]
% gamma(1 / 2) = sqrt(pi) [3]
% chisq_cdf(x; 1) = (sqrt(pi) * erf(sqrt(x / 2))) / sqrt(pi)
% = erf(sqrt(x / 2))
%
% [1] http://en.wikipedia.org/wiki/Chi-square_distribution
% [2] http://en.wikipedia.org/wiki/Incomplete_Gamma_function
% [3] http://en.wikipedia.org/wiki/Gamma_function
chisquare_cdf_1df(X, Area) :-
Area is erf(sqrt(X / 2.0)).
% Calculates the complement of the CDF (1 - CDF(x)) for the chi-square
% distribution with one degree of freedom.
chisquare_cdfc_1df(X, Area) :-
Area is erfc(sqrt(X / 2.0)).
% Calculates the p-value for a chi square test for the given proportions
% versus the null hypothesis of equal proportions.
chisquare_test(0, 0, 1.0).
chisquare_test(Proportion1, Proportion2, PValue) :-
% Calculate the statistic. Write it out because it's just 2 proportions.
% Use 0.5 frequency because that's the null hypothesis.
Expected is (Proportion1 + Proportion2) / 2.0,
Diff1 is Proportion1 - Expected,
Diff2 is Proportion2 - Expected,
Statistic is Diff1 * Diff1 / Expected + Diff2 * Diff2 / Expected,
% Calculate the p-value
chisquare_cdfc_1df(Statistic, PValue),
!.
% Calculates the p-value for a statistical test of the given proportions
% versus the null hypothesis of equal proportions. If the sum of the
% proportions is greater than 200 a chi-square test is used, otherwise a
% two-tailed binomial test is used.
proportion_test(Proportion1, Proportion2, PValue) :-
Total is Proportion1 + Proportion2,
proportion_test_choose_test(Total, Proportion1, Proportion2, PValue),
!.
proportion_test_choose_test(Total, Proportion1, Proportion2, PValue) :-
Total > 200,
!,
chisquare_test(Proportion1, Proportion2, PValue).
proportion_test_choose_test(_, Proportion1, Proportion2, PValue) :-
binomial_test(Proportion1, Proportion2, PValue).