Commit
(DHSC) Annabel Westermann authored and committed on May 17, 2024
Merge commit 8e9f68b (2 parents: 7f8d154 + 86dba99)
Showing 19 changed files with 65 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -26,7 +26,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f ph_statistical_methods/requirements.txt ]; then pip install -r ph_statistical_methods/requirements.txt; fi
python -m pip install -r requirements.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
23 changes: 10 additions & 13 deletions ph_statistical_methods/DSR.py
@@ -69,26 +69,20 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =

confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_denom_col)
validate_data(df, num_col, group_cols, metadata, denom_col, ref_df = ref_df)
df = validate_data(df, num_col, group_cols, metadata, denom_col, ref_df = ref_df)

if ref_df is not None and euro_standard_pops == False:
df = df.merge(ref_df, how = 'left', left_on = ref_join_left, right_on = ref_join_right).drop(ref_join_right, axis=1)

df['wt_rate'] = df[num_col].fillna(0) * df[ref_denom_col] / df[denom_col]
df['sq_rate'] = df[num_col].fillna(0) * (df[ref_denom_col] / df[denom_col])**2

if group_cols is not None:
df = df.groupby(group_cols).agg({num_col: 'sum',
denom_col: lambda x: x.sum(skipna=False),
'wt_rate': lambda x: x.sum(skipna=False),
ref_denom_col: lambda x: x.sum(skipna=False),
'sq_rate': lambda x: x.sum(skipna=False)}).reset_index()
else:
df[num_col] = df[num_col].sum()
for col in [denom_col, 'wt_rate', ref_denom_col, 'sq_rate']:
df[col] = df[col].sum(skipna=False)
df = df[[num_col, denom_col, 'wt_rate', ref_denom_col, 'sq_rate']].drop_duplicates()

df = df.groupby(group_cols).agg({num_col: 'sum',
denom_col: lambda x: x.sum(skipna=False),
'wt_rate': lambda x: x.sum(skipna=False),
ref_denom_col: lambda x: x.sum(skipna=False),
'sq_rate': lambda x: x.sum(skipna=False)}).reset_index()

df['Value'] = df['wt_rate'] / df[ref_denom_col] * multiplier
df['vardsr'] = 1 / df[ref_denom_col]**2 * df['sq_rate']

@@ -103,6 +97,9 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =

if metadata:
df = metadata_cols(df, f'DSR per {multiplier}', confidence, 'Dobson')

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df

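A recurring change in this commit (here and in ISRatio.py, means.py, proportions.py, quantiles.py and rates.py below) is that the `if group_cols is not None` branches are replaced by a single groupby followed by dropping a 'ph_pkg_group' column. The sketch below illustrates the apparent mechanism, assuming, as the comment added to quantiles.py further down suggests, that format_args/validate_data substitute a constant 'ph_pkg_group' placeholder when no grouping columns are supplied; the helper shown is illustrative, not the package's actual code.

import pandas as pd

def _with_placeholder_group(df, group_cols):
    # Illustrative stand-in for the assumed format_args/validate_data behaviour:
    # when no grouping columns are given, add a constant 'ph_pkg_group' column
    # so every statistic can use a single groupby code path.
    if group_cols is None:
        group_cols = ['ph_pkg_group']
        df = df.assign(ph_pkg_group='ph_pkg_group')
    return df, group_cols

df = pd.DataFrame({'num': [5, 7, 9], 'den': [100, 120, 90]})
df, group_cols = _with_placeholder_group(df, None)

# One aggregation path, mirroring the rewritten ph_dsr / ph_proportion / ph_rate
out = df.groupby(group_cols)[['num', 'den']].apply(lambda x: x.sum(skipna=False)).reset_index()

# The placeholder is removed again before returning, matching the new
# "if group_cols == ['ph_pkg_group']: df = df.drop(columns='ph_pkg_group')" blocks.
if group_cols == ['ph_pkg_group']:
    out = out.drop(columns='ph_pkg_group')

print(out)  # a single aggregated row: num 21, den 310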
2 changes: 0 additions & 2 deletions ph_statistical_methods/ISRate.py
@@ -48,8 +48,6 @@ def ph_ISRate(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = N

df['exp_x'] = df[ref_num_col].fillna(0) / df[ref_denom_col] * df[denom_col].fillna(0)

## TODO: add ref rate groupby

if obs_df is not None:
df = df.groupby(group_cols).agg({'exp_x': lambda x: x.sum(skipna=False),
ref_num_col: 'sum',
7 changes: 5 additions & 2 deletions ph_statistical_methods/ISRatio.py
@@ -12,7 +12,7 @@
from .validation import metadata_cols, ci_col, validate_data, format_args, check_kwargs


def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = None,
metadata = True, confidence = 0.95, refvalue = 1, **kwargs):

"""Calculates standard mortality ratios (or indirectly standardised ratios) with
@@ -57,7 +57,7 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_num_col, ref_denom_col)
obs_df, obs_join_left, obs_join_right = check_kwargs(df, kwargs, 'obs', num_col)
validate_data(df, denom_col, group_cols, metadata, ref_df = ref_df)
df = validate_data(df, denom_col, group_cols, metadata, ref_df = ref_df)

if ref_df is not None:
df = df.merge(ref_df, how = 'left', left_on = ref_join_left, right_on = ref_join_right).drop(ref_join_right, axis=1)
@@ -82,6 +82,9 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
if metadata:
method = np.where(df['Observed'] < 10, 'Exact', 'Byars')
df = metadata_cols(df, f'indirectly standardised ratio x {refvalue}', confidence, method)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df

13 changes: 1 addition & 12 deletions ph_statistical_methods/__init__.py
@@ -1,12 +1 @@
# External Functions
from .confidence_intervals import *
from .utils_funnel import sigma_adjustment, poisson_funnel, funnel_ratio_significance
from .utils import euro_standard_pop, join_euro_standard_pops, get_calc_variables
from .funnels import calculate_funnel_limits, assign_funnel_significance, calculate_funnel_points
from .DSR import ph_dsr
from .ISRatio import ph_ISRatio
from .ISRate import ph_ISRate
from .means import ph_mean
from .proportions import ph_proportion
from .rates import ph_rate
from .quantiles import ph_quantile

2 changes: 1 addition & 1 deletion ph_statistical_methods/confidence_intervals.py
@@ -9,9 +9,9 @@
import warnings
from math import sqrt
from scipy.stats import chi2, norm
from .utils import get_calc_variables
from scipy import stats

from .utils import get_calc_variables

def wilson_lower(count, denominator, confidence=0.95):

File renamed without changes.
7 changes: 5 additions & 2 deletions ph_statistical_methods/means.py
@@ -11,7 +11,7 @@
from .confidence_intervals import student_t_dist
from .validation import metadata_cols, ci_col, validate_data, format_args

def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):
def ph_mean(df, num_col, group_cols = None, metadata = True, confidence = 0.95):

"""Calculates means with confidence limits using Student-t distribution.
@@ -31,7 +31,7 @@ def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata)
df = validate_data(df, num_col, group_cols, metadata)

if group_cols is None:
raise TypeError('group_cols cannot be None for a mean statistic')
@@ -52,5 +52,8 @@ def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):

if metadata:
df = metadata_cols(df, 'Mean', confidence, "Student's t-distribution")

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
15 changes: 5 additions & 10 deletions ph_statistical_methods/proportions.py
@@ -10,13 +10,6 @@
from .confidence_intervals import wilson_lower, wilson_upper
from .validation import metadata_cols, ci_col, format_args, validate_data

#df = pd.read_excel('unit_tests/test_data/testdata_Proportion.xlsx')

df = pd.DataFrame({'area': [1, 2]*6,
'area2': ['Area7', 'Area2','Area1']*4,
'num': [None, 82, 9, 48, 65, 8200, 10000, 10000, 8, 7, 750, 900],
'den': [100, 10000, 10000, 10000] * 3})


def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, confidence = 0.95, multiplier = 1):
"""Calculates proportions with confidence limits using Wilson Score method.
@@ -44,7 +37,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata, denom_col)
df = validate_data(df, num_col, group_cols, metadata, denom_col)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")
@@ -53,8 +46,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
raise ValueError('Numerators must be less than or equal to the denominator for a proportion statistic')

# Sum Numerator and Denominator columns, ensure NAs are included.
if group_cols is not None:
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()

### Calculate statistic
df['Value'] = (df[num_col] / df[denom_col]) * multiplier
@@ -70,4 +62,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
statistic = 'Percentage' if multiplier == 100 else f'Proportion of {multiplier}'
df = metadata_cols(df, statistic, confidence, 'Wilson')

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
19 changes: 11 additions & 8 deletions ph_statistical_methods/quantiles.py
@@ -51,19 +51,19 @@ def ph_quantile(df, values, group_cols = None, nquantiles = 10, invert = True, t

if not isinstance(invert, bool):
raise TypeError("Pass 'invert' as a boolean")

# Allows us to group data when group_cols is None in format args.
if group_cols == ['ph_pkg_group']:
df['ph_pkg_group'] = 'ph_pkg_group'

check_arguments(df, [values] if group_cols is None else [values] + group_cols)

# Additional columns in output
df["nquantiles"] = nquantiles
df['nquantiles'] = nquantiles

# Calculate Quantiles
if group_cols is None:
df['num_rows'] = df[values].count() # Number of rows in total
df['rank'] = df[values].rank(ascending = not invert, method='min') # Rank each value
else:
df['num_rows'] = df.groupby(group_cols)[values].transform(lambda x: x.count()) # Number of rows in each group
df['rank'] = df.groupby(group_cols)[values].rank(ascending = not invert, method='min') # Rank each value in each group
# Calculate Quantiles
df['num_rows'] = df.groupby(group_cols)[values].transform(lambda x: x.count()) # Number of rows in each group
df['rank'] = df.groupby(group_cols)[values].rank(ascending = not invert, method='min') # Rank each value in each group


# Assign a quantile based on rank and number of rows in each group
@@ -87,5 +87,8 @@ def ph_quantile(df, values, group_cols = None, nquantiles = 10, invert = True, t
# Drop columns if required
if type == "standard":
df = df.drop(['num_rows', 'rank', 'nquantiles', 'qinverted'], axis=1)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
8 changes: 5 additions & 3 deletions ph_statistical_methods/rates.py
@@ -33,13 +33,12 @@ def ph_rate(df, num_col, denom_col, group_cols = None, metadata = True, confiden

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata, denom_col)
df = validate_data(df, num_col, group_cols, metadata, denom_col)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")

if group_cols is not None:
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()

#calculate value column
df['Value'] = df[num_col] / df[denom_col] * multiplier
@@ -54,5 +53,8 @@ def ph_rate(df, num_col, denom_col, group_cols = None, metadata = True, confiden
if metadata:
method = np.where(df[num_col] < 10, 'Exact', 'Byars')
df = metadata_cols(df, f'Rate per {multiplier}', confidence, method)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
2 changes: 1 addition & 1 deletion ph_statistical_methods/tests/test_ISRate.py
@@ -15,7 +15,7 @@

class TestISRate:

path = Path(__file__).parent / 'tests/test_data/testdata_DSR_ISR.xlsx'
path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

data = pd.read_excel(path, sheet_name = 'testdata_multiarea_isr')
results = pd.read_excel(path, sheet_name = 'testresults_ISR')
2 changes: 1 addition & 1 deletion ph_statistical_methods/tests/test_ISRatio.py
@@ -13,7 +13,7 @@

class TestISRatio:

path = Path(__file__).parent / 'tests/test_data/testdata_DSR_ISR.xlsx'
path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

# Import data - remove last Multiplier column as not a function output - just used for Excel calculation
data = pd.read_excel(path, sheet_name = 'testdata_multiarea_isr')
12 changes: 6 additions & 6 deletions ph_statistical_methods/tests/test_proportions.py
@@ -33,21 +33,21 @@ def test_multiplier_error(self, multiplier):
ph_proportion(self.data, 'Numerator', 'Denominator', multiplier = multiplier)

def test_default(self):
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[:8, self.cols_95])

def test_2ci(self):
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', confidence = [0.95, 0.998])
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area', confidence = [0.95, 0.998])
assert_frame_equal(df, self.data.iloc[:8, :])

def test_percentage(self):
df = ph_proportion(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', multiplier = 100)\
df = ph_proportion(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area', multiplier = 100)\
.drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95])
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95].reset_index(drop=True))

def test_NAs(self):
df = ph_proportion(self.data.iloc[16:, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95])
df = ph_proportion(self.data.iloc[16:, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].reset_index(drop=True))

def test_group(self):
df = ph_proportion(self.data, 'Numerator', 'Denominator', group_cols = 'Area')
4 changes: 2 additions & 2 deletions ph_statistical_methods/tests/test_quantiles.py
@@ -16,7 +16,7 @@
class TestQuantiles:

# Import data - remove last Multiplier column as not a function output - just used for Excel calculation
path = Path(__file__).parent / 'tests/test_data/testdata_Quantiles.xlsx'
path = Path(__file__).parent / 'test_data/testdata_Quantiles.xlsx'

data = pd.read_excel(path)

@@ -81,4 +81,4 @@ def test_invert_er(self):
with pytest.raises(TypeError, match = "Pass 'invert' as a boolean"):
ph_quantile(self.df, 'val', invert = "True")

# Other error handling tests are covered in test_validation.py
# Other error handling tests are covered in test_validation.py
14 changes: 7 additions & 7 deletions ph_statistical_methods/tests/test_rates.py
@@ -22,11 +22,11 @@ class Test_rates:
cols_95 = [0,1,2,3,4,5,8,9]

def test_default(self):
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95])
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95].reset_index(drop=True))

def test_multiplier(self):
df = ph_rate(self.data.iloc[:8, :3], 'Numerator', 'Denominator', multiplier=100).drop(['Confidence'], axis=1)
df = ph_rate(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area', multiplier=100).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[:8, self.cols_95])

@pytest.mark.parametrize('multiplier', [(-10), (1.5)])
@@ -35,12 +35,12 @@ def test_multiplier_error(self, multiplier):
ph_rate(self.data, 'Numerator', 'Denominator', multiplier = multiplier)

def test_2ci(self):
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', confidence = [0.95, 0.998]).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, :])
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area', confidence = [0.95, 0.998]).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, :].reset_index(drop=True))

def test_NAs(self):
df = ph_rate(self.data.iloc[16:, :3], 'Numerator', 'Denominator').drop(['Confidence','Method'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].drop('Method', axis=1))
df = ph_rate(self.data.iloc[16:, :3], 'Numerator', 'Denominator','Area').drop(['Confidence','Method'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].drop('Method', axis=1).reset_index(drop=True))

# dropping 'method' column for now while figure out method column
def test_group(self):
2 changes: 1 addition & 1 deletion readme.md
@@ -4,7 +4,7 @@ methods approved for use in the production of Public Health indicators such as
those presented via [Fingertips](https://fingertips.phe.org.uk/). It
provides functions for the generation of Proportions, Rates, DSRs, ISRs,
and Means including confidence intervals for these statistics,
and a function for assigning data to quantiles.
and a function for assigning data to quantiles.

Any feedback would be appreciated and can be provided using the Issues
section of the [PH_statistical_methods GitHub
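To illustrate the readme's description above, a minimal usage sketch with made-up data; the imports are taken from the module paths in this commit rather than the trimmed package __init__, and the column names and outputs are assumptions rather than documented usage.

import pandas as pd

from ph_statistical_methods.proportions import ph_proportion
from ph_statistical_methods.rates import ph_rate
from ph_statistical_methods.means import ph_mean

df = pd.DataFrame({'Area': ['A', 'A', 'B', 'B'],
                   'Numerator': [10, 20, 15, 5],
                   'Denominator': [100, 200, 150, 50]})

# Proportions with Wilson confidence limits, one row per Area
props = ph_proportion(df, 'Numerator', 'Denominator', group_cols='Area')

# Crude rates per 100,000 with Byars/Exact confidence limits
rates = ph_rate(df, 'Numerator', 'Denominator', group_cols='Area', multiplier=100000)

# Means with Student's t confidence limits (group_cols is required here)
means = ph_mean(df, 'Numerator', group_cols='Area')

print(props[['Area', 'Value']])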
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
.
3 changes: 2 additions & 1 deletion setup.py
@@ -13,6 +13,7 @@
install_requires=['numpy >= 1.25.0',
'pandas >= 2.0.0',
'pytest >= 8.0.0',
'scipy >= 1.8.0']
'scipy >= 1.8.0',
'openpyxl >= 3.1.0']
)
