-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdg_chi2test3.m
174 lines (168 loc) · 7.2 KB
/
dg_chi2test3.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
function [significance, colsNotMrgd, finalCols, colsMrgdA, colsMrgdB] = ...
dg_chi2test3(N, userules)
%[significance, colsMrgdA, colsMrgdB] = dg_chi2test3(N, userules)
%[significance, colsMrgdA, colsMrgdB] = dg_chi2test3(N)
% <N> is a 2-D array each row of which is a histogram. Returns
% the statistical significance level for concluding that there is a
% difference between the various rows (i.e. the probability that the
% differences from the mean histogram for each bin would be greater than
% or equal to the given values if the row didn't matter).
%
% If <userules> is 1, then the following rules of thumb are applied and
% cause NaN to be returned if they are violated:
% (a) no expected value for a category should be less than 1 (it does not
% matter what the observed values are)
% (b) no more than one-fifth of expected values should be less than 5
% <userules> defaults to false.
% Note that NaN is also returned if any of the elements of <N> is NaN.
%
% If <userules> is 2, then categories (i.e. columns) are automatically
% merged s.t. the rules are met, or if that is impossible, NaN is
% returned. If merge was successful, <colsMrgdA> contains the indices of
% the original input columns that were merged to satisfy rule (a), and
% <colsMrgdB> is a 2 column matrix containing the pair of columns of the
% working array merged at each iteration to satisfy rule (b). In either
% case, the newly merged column is placed at the end of the array.
% <colsNotMrgd> contains the original column indices of any columns in the
% input <N> that did NOT participate in a merge. <finalCols> is a cell row
% vector with one one element for each bin; each element contains a row
% vector of the original column numbers that went into the corresponding
% bin.
%
% Also note that this does not reduce to a 1-D chi^2 test if you give it a
% single column; use dg_chi2test2 for that.
%
% Based on material of The Really Easy Statistics Site (TRESS) at:
% "Chi-squared test for categories of data".
% http://archive.bio.ed.ac.uk/jdeacon/statistics/tress9.html#Chi-squared%20test
%$Rev: 255 $
%$Date: 2016-12-19 16:48:11 -0500 (Mon, 19 Dec 2016) $
%$Author: dgibson $
colsMrgdA = [];
colsMrgdB = [];
colsNotMrgd = 1:size(N,2); % These get deleted in parallel with cols of N
if any(any(isnan(N)))
significance = NaN;
return
end
if nargin < 2
userules = 0;
end
% Ditto, but merged bin lists are also appended in parallel with N:
finalCols = mat2cell(colsNotMrgd, 1, repmat(1,size(colsNotMrgd)));
% Calculate marginal probabilities & expected values:
Ntot = sum(sum(N));
columnP = sum(N,1)/Ntot;
rowP = sum(N,2)/Ntot;
P = repmat(columnP, size(N,1), 1) .* repmat(rowP, 1, size(N,2));
expected = P * Ntot;
% Check for rules violations:
if userules == 1
% Exit with NaN if violation:
if any(any(expected < 1))
[row col] = find(expected < 1);
warning('dg_chi2test3:lt1', ...
'The expected count per bin is less than 1 at [row col] = %s', ...
mat2str([row col]) );
significance = NaN;
return
elseif sum(sum(expected < 5)) > numel(N)/5
[row col] = find(expected < 5);
warning('dg_chi2test3:lt5', ...
'Too many expected counts < 5 at [row col] = %s', ...
mat2str([row col]) );
significance = NaN;
return
end
elseif userules == 2
% Merge columns to satisfy rules. First check rule (a):
toosmall = expected < 1;
if any(any(toosmall))
toosmallcols = any(toosmall); % row vector
if all(toosmallcols)
warning('dg_chi2test3:alltoosmall', ...
'At least one row contains too few counts in all columns.' );
significance = NaN;
return
end
% Try merging just the "toosmallcols":
mergedCol = sum(N(:,toosmallcols),2); % column vector
if any((sum(mergedCol,1) * rowP) < 1)
% still too small, must merge with next larger column:
bigcolidx = find(~toosmallcols); % row vector
if length(bigcolidx) < 2
warning('dg_chi2test3:alltoosmall2', ...
'Only one column contains enough counts, cannot merge.' );
significance = NaN;
return
end
[C, I] = min(columnP(bigcolidx));
colsMrgdA = [find(toosmallcols) bigcolidx(I)]; % row vector
mergedCol = sum(N(:,colsMrgdA),2);
N(:,colsMrgdA) = [];
N(:,end+1) = mergedCol;
colsNotMrgd(colsMrgdA(colsMrgdA<=length(colsNotMrgd))) = [];
else
% simply merge the "toosmall" columns:
N(:,toosmallcols) = [];
N(:,end+1) = mergedCol;
colsNotMrgd(toosmallcols(1:length(colsNotMrgd))) = [];
colsMrgdA = find(toosmallcols); % row vector
end
finalCols(colsMrgdA) = [];
finalCols(end+1) = {colsMrgdA};
end
% Re-calculate <expected> and check rule (b):
columnP = sum(N,1)/Ntot;
P = repmat(columnP, size(N,1), 1) .* repmat(rowP, 1, size(N,2));
expected = P * Ntot;
toosmall = expected < 5;
% If we have violated rule (b), iteratively merge the two smallest columns
% until rule (b) is satisfied:
while sum(sum(toosmall)) > numel(N)/5
if size(N,2) < 3
warning('dg_chi2test3:alltoosmall3', ...
'Too few columns, cannot merge.' );
significance = NaN;
return
end
[B,IX] = sort(columnP);
ix = IX(1:2);
colsMrgdB(end+1,:) = ix;
mergedCol = sum(N(:,ix), 2);
N(:,ix) = [];
N(:,end+1) = mergedCol;
colsNotMrgd(ix(ix<=length(colsNotMrgd))) = [];
mergedColNums = {[finalCols{ix(1)} finalCols{ix(2)}]};
finalCols(ix) = [];
finalCols(end+1) = mergedColNums;
% Re-calculate:
columnP = sum(N,1)/Ntot;
P = repmat(columnP, size(N,1), 1) .* repmat(rowP, 1, size(N,2));
expected = P * Ntot;
toosmall = expected < 5;
end
end
% TRESS gives a lousy description of Degrees of Freedom; a different angle
% is given at http://davidmlane.com/hyperstat/A42408.html as "the number of
% independent scores that go into the estimate minus the number of
% parameters estimated as intermediate steps". In our case, the parameters
% are the marginal probabilities <rowP> and <columnP>; since the marginals
% along any given axis must always total to 1, the last one is not
% independent, so the number of independent marginals is the number of rows
% minus one plus the number of columns minus one. To calculate our
% expected counts we also need one further independent parameter calculated
% from the data, which is the grand total number of observations,
% sum(sum(N)). So our degrees of freedom are:
df = numel(N) - (size(N,1) + size(N,2) - 2) - 1;
% Calculate the chi^2 estimate:
if df == 1
% Use Yates correction
diffs = abs(N - expected) - 0.5;
else
% No Yates correction
diffs = N - expected;
end
addends = (diffs .^ 2) ./ expected;
chi2 = sum(sum(addends));
significance = chi2cdf(chi2,df,'upper');