Skip to content

Commit

Permalink
Merge pull request #91 from DataS-DHSC/Dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
annabelwestermann96 authored May 17, 2024
2 parents b5376cf + c4eca75 commit 3e9061a
Show file tree
Hide file tree
Showing 19 changed files with 259 additions and 1,435 deletions.
8 changes: 2 additions & 6 deletions PHStatsMethods/DSR.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =
**kwargs:
ref_df: DataFrame of reference data to join.
ref_join_left: A string or list of column name(s) in `df` to join on to.
ref_join_right: A string or list of column name(s) in `ref_df` to join on to.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
Returns:
DataFrame of calculated rates and confidence intervals
Expand All @@ -56,8 +54,6 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =
(2) Dobson A et al. Confidence intervals for weighted sums of Poisson parameters. Stat Med 1991;10:457-62.
"""

df = df.copy().reset_index(drop=True)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")
Expand Down
17 changes: 10 additions & 7 deletions PHStatsMethods/ISRate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@ def ph_ISRate(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = N
**kwargs:
ref_df
ref_join_left
ref_join_right
obs_df
obs_join_left
obs_join_right
ref_df: DataFrame of reference data to join.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
obs_df: DataFrame of total observed events for each group.
obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
Returns:
df: Dataframe containing calculated IS Rates.
"""

df = df.copy()
confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_num_col, ref_denom_col)
obs_df, obs_join_left, obs_join_right = check_kwargs(df, kwargs, 'obs', num_col)
Expand Down
12 changes: 6 additions & 6 deletions PHStatsMethods/ISRatio.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols =
refvalue (int): the standardised reference ratio, default = 1
**kwargs:
ref_df
ref_join_left
ref_join_right
obs_df
obs_join_left
obs_join_right
ref_df: DataFrame of reference data to join.
ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
obs_df: DataFrame of total observed events for each group.
obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
Returns:
df: Dataframe containing calculated IS Ratios.
Expand Down
129 changes: 124 additions & 5 deletions PHStatsMethods/funnels.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,56 @@
import numpy as np
from math import floor, ceil

from .validation import metadata_cols
from .validation import metadata_cols, validate_data
from .utils_funnel import signif_floor, signif_ceiling, sigma_adjustment, poisson_funnel, funnel_ratio_significance


def calculate_funnel_limits(df, num_col, statistic, multiplier, denom_col = None, metadata = True,
rate = None, ratio_type = None, rate_type = None, years_of_data = None):
rate = None, rate_type = None, ratio_type = None, years_of_data = None):
"""Calculates control limits adopting a consistent method as per the Fingertips Technical Guidance
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
multiplier (int): multiplier used to express the final values (e.g. 100 = percentage)
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
rate (str): column name containing the 'rate'.
rate_type (str): if statistic is 'rate', specify either 'dsr' or 'crude'.
ratio_type (str): if statistic is 'ratio', specify either 'count' or 'isr' (indirectly standardised ratio).
years_of_data (int): number of years the data represents; this is required if statistic is 'ratio'
Returns:
DataFrame of calculated confidence limits.
"""

df = validate_data(df, num_col, denom_col = denom_col, metadata = metadata)

if (df[num_col].isna()).any():
raise ValueError('Numerators must be provided for all records, even when their values are 0')

if denom_col is not None:
if (df[denom_col].isna()).any():
raise ValueError('Denominators must be provided for all records, even when their values are 0')

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

if statistic == 'rate':
if rate is None or rate_type is None or years_of_data is None or multiplier is None:
raise TypeError("'rate', 'rate_type', 'years_of_data' and 'multiplier' are required for rate statistics")
elif rate_type not in ['dsr', 'crude']:
if rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

if denom_col is None and (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0 or a denominator must be provided")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

elif statistic in ['proportion', 'ratio']:
if denom_col is None:
Expand Down Expand Up @@ -136,9 +171,63 @@ def calculate_funnel_limits(df, num_col, statistic, multiplier, denom_col = None



def assign_funnel_significance(df, num_col, denom_col, statistic, rate = None, rate_type = None, multiplier = None):
def assign_funnel_significance(df, num_col, statistic, denom_col = None, rate = None, rate_type = None, multiplier = None):
"""Identifies whether each value in a dataset falls outside of 95 and/or 99.8 percent control limits based on the
aggregated average value across the whole dataset as an indicator of statistically significant difference.
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
rate (str): column name containing the 'rate'.
rate_type (str): if statistic is 'rate', specify either 'dsr' or 'crude'.
multiplier (int): multiplier the rate is normalised with (i.e. per 100000) only required when statistic is 'rate'.
Returns:
DataFrame of calculated significance levels.
"""

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

df = validate_data(df, num_col, denom_col = denom_col)

if (df[num_col].isna()).any():
raise ValueError('Numerators must be provided for all records, even when their values are 0')

if denom_col is not None:
if (df[denom_col].isna()).any():
raise ValueError('Denominators must be provided for all records, even when their values are 0')

if statistic not in ['rate', 'proportion', 'ratio']:
raise ValueError("'statistic' must be either 'proportion', 'ratio' or 'rate'")

if statistic == 'rate':
if rate is None or rate_type is None or multiplier is None:
raise TypeError("'rate', 'rate_type', and 'multiplier' are required for rate statistics")

elif rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

elif denom_col is None and (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0 or a denominator must be provided")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

elif statistic in ['proportion', 'ratio']:
if denom_col is None:
raise TypeError("'denom_col' must be given for 'proportion' and 'ratio' statistics")

if statistic == 'proportion':
if (df[num_col] > df[denom_col]).any():
raise ValueError('Numerators must be less than or equal to the denominator for a proportion statistic')

av = df[num_col].sum() / df[denom_col].sum() # don't need skipna here as validation ensures no nulls

df['significance'] = np.where(df[num_col] / df[denom_col] < df[denom_col].apply(lambda x: sigma_adjustment(0.999, x, av, 'low', 1)), 'Low (0.001)',
Expand Down Expand Up @@ -180,7 +269,37 @@ def assign_funnel_significance(df, num_col, denom_col, statistic, rate = None, r


def calculate_funnel_points(df, num_col, rate, rate_type, denom_col = None,
multiplier = None, years_of_data = None):
multiplier = 100000, years_of_data = 1):
"""For rate-based funnels: Derive rate and annual population values for charting based. Process removes rates where the
rate type is dsr and the number of observed events are below 10.
Args:
df: DataFrame containing the data to calculate control limits for.
num_col (str): Name of column containing observed number of cases in the sample
(the numerator of the population).
statistic (str): type of statistic to inform funnel calculations: 'proportion', 'rate', or 'ratio'
denom_col (str): Name of column containing number of cases in sample
(the denominator of the population).
metadata (bool): Whether to include information on the statistic and confidence interval methods.
years_of_data (int): number of years the data represents
multiplier (int): multiplier the rate is normalised with (i.e. per 100000).
Returns:
DataFrame of calculated funnel points. First will have the same name as the rate field,
with the suffix '_chart', the second will be called denominator_derived.
"""

df = validate_data(df, num_col, denom_col = denom_col)

if rate_type not in ['dsr', 'crude']:
raise ValueError("only 'dsr' and 'crude' are valid rate_types")

if (df[rate].isna()).any():
raise ValueError("For rates, 'rate' must be provided for all records even if the rate is 0")

if denom_col is None and (df[num_col] == 0).any():
raise ValueError("For rates, where there are 0 events for a record, 'denom_col' must be provided")

if rate_type == 'dsr':
df[f'{rate}_chart'] = np.where(df[num_col] < 10, np.nan,
Expand Down
3 changes: 0 additions & 3 deletions PHStatsMethods/proportions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
"""

# Ensure original df remains unchanged
df = df.copy()

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
df = validate_data(df, num_col, group_cols, metadata, denom_col)
Expand Down
53 changes: 28 additions & 25 deletions PHStatsMethods/tests/test_DSR.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,42 @@

import pytest
import pandas as pd
from pathlib import Path
from pandas.testing import assert_frame_equal

from ..DSR import ph_dsr

# class Test_DSR:
class Test_DSR:

path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

# data = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testdata_multiarea')
# results = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testresults_DSR')\
# .drop('statistic', axis=1).astype({'Total Count':'float64'})
# ref_data = pd.read_excel('tests/test_data/testdata_DSR_ISR.xlsx', sheet_name='testdata_1976').astype({'count':'float64'})
data = pd.read_excel(path, sheet_name='testdata_multiarea')
results = pd.read_excel(path, sheet_name='testresults_DSR')\
.drop('statistic', axis=1).astype({'Total Count':'float64'})
ref_data = pd.read_excel(path, sheet_name='testdata_1976').astype({'count':'float64'})

# cols_95 = [0,1,2,3,4,5,8]
cols_95 = [0,1,2,3,4,5,8]

# def test_esp_and_NAs(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area').drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[4:7, self.cols_95].reset_index(drop=True))
def test_esp_and_NAs(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area').drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[4:7, self.cols_95].reset_index(drop=True))

# def test_2cis(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area', confidence = [0.95, 0.998]).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[4:7, :].reset_index(drop=True))
def test_2cis(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols='area', confidence = [0.95, 0.998]).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[4:7, :].reset_index(drop=True))

# def test_ref_denom_col(self):
# df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops = False).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df.astype({'Total Count':'float64'}),
# self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))
def test_ref_denom_col(self):
df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops = False).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df.astype({'Total Count':'float64'}),
self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))

# def test_ref_df(self):
# df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops=False,
# ref_df = self.ref_data, ref_join_left = 'Age Band', ref_join_right = 'Age Band')\
# .drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df.astype({'Total Count':'float64'}),
# self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))
def test_ref_df(self):
df = ph_dsr(self.ref_data, 'count', 'pop', 'esp1976', euro_standard_pops=False,
ref_df = self.ref_data, ref_join_left = 'Age Band', ref_join_right = 'Age Band')\
.drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df.astype({'Total Count':'float64'}),
self.results.iloc[7:8, self.cols_95].drop('area', axis=1).reset_index(drop=True))

# def test_multiplier(self):
# df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols = 'area', multiplier = 10000).drop(['Confidence', 'Statistic'], axis=1)
# assert_frame_equal(df, self.results.iloc[:3, self.cols_95].reset_index(drop=True))
def test_multiplier(self):
df = ph_dsr(self.data, 'count', 'pop', 'ageband', group_cols = 'area', multiplier = 10000).drop(['Confidence', 'Statistic'], axis=1)
assert_frame_equal(df, self.results.iloc[:3, self.cols_95].reset_index(drop=True))
Loading

0 comments on commit 3e9061a

Please sign in to comment.