Skip to content

Commit

Permalink
Merge pull request #91 from DataS-DHSC/Dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
annabelwestermann96 authored May 17, 2024
2 parents b5376cf + c4eca75 commit 3e9061a
Show file tree
Hide file tree
Showing 19 changed files with 259 additions and 1,435 deletions.
8 changes: 2 additions & 6 deletions PHStatsMethods/DSR.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =
**kwargs:
ref_df: DataFrame of reference data to join.
ref_join_left: A string or list of column name(s) in `df` to join on to.
ref_join_right: A string or list of column name(s) in `ref_df` to join on to.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
Returns:
DataFrame of calculated rates and confidence intervals
Expand All @@ -56,8 +54,6 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =
(2) Dobson A et al. Confidence intervals for weighted sums of Poisson parameters. Stat Med 1991;10:457-62.
"""

df = df.copy().reset_index(drop=True)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")
Expand Down
17 changes: 10 additions & 7 deletions PHStatsMethods/ISRate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@ def ph_ISRate(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = N
**kwargs:
ref_df
ref_join_left
ref_join_right
obs_df
obs_join_left
obs_join_right
ref_df: DataFrame of reference data to join.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
obs_df: DataFrame of total observed events for each group.
obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
Returns:
df: Dataframe containing calculated IS Rates.
"""

df = df.copy()
confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_num_col, ref_denom_col)
obs_df, obs_join_left, obs_join_right = check_kwargs(df, kwargs, 'obs', num_col)
Expand Down
12 changes: 6 additions & 6 deletions PHStatsMethods/ISRatio.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols =
refvalue (int): the standardised reference ratio, default = 1
**kwargs:
ref_df
ref_join_left
ref_join_right
obs_df
obs_join_left
obs_join_right
ref_df: DataFrame of reference data to join.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
obs_df: DataFrame of total observed events for each group.
obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
Returns:
df: Dataframe containing calculated IS Ratios.
Expand Down
129 changes: 124 additions & 5 deletions PHStatsMethods/funnels.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,56 @@
import numpy as np
from math import floor, ceil

from .validation import metadata_cols
from .validation import metadata_cols, validate_data
from .utils_funnel import signif_floor, signif_ceiling, sigma_adjustment, poisson_funnel, funnel_ratio_significance


def calculate_funnel_limits(df, num_col, statistic, multiplier, denom_col = None, metadata = True,
rate = None, ratio_type = None, rate_type = None, years_of_data = None):
rate = None, rate_type = None, ratio_type = None, years_of_data = None):
"""Calculates control limits adopting a consistent method as per the Fingertips Technical Guidance
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
multiplier (int): multiplier used to express the final values (e.g. 100 = percentage)
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
rate (str): column name containing the 'rate'.
rate_type (str): if statistic is 'rate', specify either 'dsr' or 'crude'.
ratio_type (str): if statistic is 'ratio', specify either 'count' or 'isr' (indirectly standardised ratio).
years_of_data (int): number of years the data represents; this is required if statistic is 'ratio'
Returns:
DataFrame of calculated confidence limits.
"""

df = validate_data(df, num_col, denom_col = denom_col, metadata = metadata)

if (df[num_col].isna()).any():
raise ValueError('Numerators must be provided for all records, even when their values are 0')

if denom_col is not None:
if (df[denom_col].isna()).any():
raise ValueError('Denominators must be provided for all records, even when their values are 0')

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

if statistic == 'rate':
if rate is None or rate_type is None or years_of_data is None or multiplier is None:
raise TypeError("'rate', 'rate_type', 'years_of_data' and 'multiplier' are required for rate statistics")
elif rate_type not in ['dsr', 'crude']:
if rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

if denom_col is None and (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0 or a denominator must be provided")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

elif statistic in ['proportion', 'ratio']:
if denom_col is None:
Expand Down Expand Up @@ -136,9 +171,63 @@ def calculate_funnel_limits(df, num_col, statistic, multiplier, denom_col = None



def assign_funnel_significance(df, num_col, denom_col, statistic, rate = None, rate_type = None, multiplier = None):
def assign_funnel_significance(df, num_col, statistic, denom_col = None, rate = None, rate_type = None, multiplier = None):
"""Identifies whether each value in a dataset falls outside of 95 and/or 99.8 percent control limits based on the
aggregated average value across the whole dataset as an indicator of statistically significant difference.
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
rate (str): column name containing the 'rate'.
rate_type (str): if statistic is 'rate', specify either 'dsr' or 'crude'.
multiplier (int): multiplier the rate is normalised with (i.e. per 100000) only required when statistic is 'rate'.
Returns:
DataFrame of calculated significance levels.
"""

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

df = validate_data(df, num_col, denom_col = denom_col)

if (df[num_col].isna()).any():
raise ValueError('Numerators must be provided for all records, even when their values are 0')

if denom_col is not None:
if (df[denom_col].isna()).any():
raise ValueError('Denominators must be provided for all records, even when their values are 0')

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

if statistic == 'rate':
if rate is None or rate_type is None or multiplier is None:
raise TypeError("'rate', 'rate_type', and 'multiplier' are required for rate statistics")

elif rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

elif denom_col is None and (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0 or a denominator must be provided")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

elif statistic in ['proportion', 'ratio']:
if denom_col is None:
raise TypeError("'denom_col' must be given for 'proportion' and 'ratio' statistics")

if statistic == 'proportion':
if (df[num_col] > df[denom_col]).any():
raise ValueError('Numerators must be less than or equal to the denominator for a proportion statistic')

av = df[num_col].sum() / df[denom_col].sum() # don't need skipna here as validation ensures no nulls

df['significance'] = np.where(df[num_col] / df[denom_col] < df[denom_col].apply(lambda x: sigma_adjustment(0.999, x, av, 'low', 1)), 'Low (0.001)',
Expand Down Expand Up @@ -180,7 +269,37 @@ def assign_funnel_significance(df, num_col, denom_col, statistic, rate = None, r


def calculate_funnel_points(df, num_col, rate, rate_type, denom_col = None,
multiplier = None, years_of_data = None):
multiplier = 100000, years_of_data = 1):
"""For rate-based funnels: Derive rate and annual population values for charting based. Process removes rates where the
rate type is dsr and the number of observed events are below 10.
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
years_of_data (int): number of years the data represents
multiplier (int): multiplier the rate is normalised with (i.e. per 100000).
Returns:
DataFrame of calculated funnel points. First will have the same name as the rate field,
with the suffix '_chart', the second will be called denominator_derived.
"""

df = validate_data(df, num_col, denom_col = denom_col)

if rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

if (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

if rate_type == 'dsr':
df[f'{rate}_chart'] = np.where(df[num_col] < 10, np.nan,
Expand Down
3 changes: 0 additions & 3 deletions PHStatsMethods/proportions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
"""

# Ensure original df remains unchanged
df = df.copy()

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
df = validate_data(df, num_col, group_cols, metadata, denom_col)
Expand Down
53 changes: 28 additions & 25 deletions PHStatsMethods/tests/test_DSR.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,42 @@

import pytest
import pandas as pd
from pathlib import Path
from pandas.testing import assert_frame_equal

from ..DSR import ph_dsr

# class Test_DSR:
class Test_DSR:

path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

# data = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testdata_multiarea')
# results = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testresults_DSR')\
# .drop('statistic', axis=1).astype({'Total Count':'float64'})
# ref_data = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testdata_1976').astype({'count':'float64'})
data = pd.read_excel(path, sheet_name='testdata_multiarea')
results = pd.read_excel(path, sheet_name='testresults_DSR')\
.drop('statistic', axis=1).astype({'Total Count':'float64'})
ref_data = pd.read_excel(path, sheet_name='testdata_1976').astype({'count':'float64'})

# cols_95 = [0,1,2,3,4,5,8]
cols_95 = [0,1,2,3,4,5,8]

# def test_esp_and_NAs(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area').drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[4:7, self.cols_95].reset_index(drop=True))
def test_esp_and_NAs(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area').drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[4:7, self.cols_95].reset_index(drop=True))

# def test_2cis(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area', confidence = [0.95, 0.998]).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[4:7, :].reset_index(drop=True))
def test_2cis(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area', confidence = [0.95, 0.998]).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[4:7, :].reset_index(drop=True))

# def test_ref_denom_col(self):
# df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops = False).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df.astype({'Total Count':'float64'}),
# self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))
def test_ref_denom_col(self):
df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops = False).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df.astype({'Total Count':'float64'}),
self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))

# def test_ref_df(self):
# df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops=False,
# ref_df = self.ref_data, ref_join_left = 'Age Band', ref_join_right = 'Age Band')\
# .drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df.astype({'Total Count':'float64'}),
# self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))
def test_ref_df(self):
df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops=False,
ref_df = self.ref_data, ref_join_left = 'Age Band', ref_join_right = 'Age Band')\
.drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df.astype({'Total Count':'float64'}),
self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))

# def test_multiplier(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols = 'area', multiplier = 10000).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[:3, self.cols_95].reset_index(drop=True))
def test_multiplier(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols = 'area', multiplier = 10000).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[:3, self.cols_95].reset_index(drop=True))
Loading

0 comments on commit 3e9061a

Please sign in to comment.