Commit
(DHSC) Annabel Westermann authored and committed on May 17, 2024
Merge commit 8e9f68b (2 parents: 7f8d154 + 86dba99)
Showing 19 changed files with 65 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -26,7 +26,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f ph_statistical_methods/requirements.txt ]; then pip install -r ph_statistical_methods/requirements.txt; fi
python -m pip install -r requirements.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
23 changes: 10 additions & 13 deletions ph_statistical_methods/DSR.py
@@ -69,26 +69,20 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =

confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_denom_col)
validate_data(df, num_col, group_cols, metadata, denom_col, ref_df = ref_df)
df = validate_data(df, num_col, group_cols, metadata, denom_col, ref_df = ref_df)

if ref_df is not None and euro_standard_pops == False:
df = df.merge(ref_df, how = 'left', left_on = ref_join_left, right_on = ref_join_right).drop(ref_join_right, axis=1)

df['wt_rate'] = df[num_col].fillna(0) * df[ref_denom_col] / df[denom_col]
df['sq_rate'] = df[num_col].fillna(0) * (df[ref_denom_col] / df[denom_col])**2

if group_cols is not None:
df = df.groupby(group_cols).agg({num_col: 'sum',
denom_col: lambda x: x.sum(skipna=False),
'wt_rate': lambda x: x.sum(skipna=False),
ref_denom_col: lambda x: x.sum(skipna=False),
'sq_rate': lambda x: x.sum(skipna=False)}).reset_index()
else:
df[num_col] = df[num_col].sum()
for col in [denom_col, 'wt_rate', ref_denom_col, 'sq_rate']:
df[col] = df[col].sum(skipna=False)
df = df[[num_col, denom_col, 'wt_rate', ref_denom_col, 'sq_rate']].drop_duplicates()

df = df.groupby(group_cols).agg({num_col: 'sum',
denom_col: lambda x: x.sum(skipna=False),
'wt_rate': lambda x: x.sum(skipna=False),
ref_denom_col: lambda x: x.sum(skipna=False),
'sq_rate': lambda x: x.sum(skipna=False)}).reset_index()

df['Value'] = df['wt_rate'] / df[ref_denom_col] * multiplier
df['vardsr'] = 1 / df[ref_denom_col]**2 * df['sq_rate']

@@ -103,6 +97,9 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =

if metadata:
df = metadata_cols(df, f'DSR per {multiplier}', confidence, 'Dobson')

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df

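A recurring change in this commit (here and in ISRatio.py, means.py, proportions.py, quantiles.py and rates.py below) is that the `if group_cols is not None` branches are replaced by a single groupby followed by dropping a 'ph_pkg_group' column. The sketch below illustrates the apparent mechanism, assuming, as the comment added to quantiles.py further down suggests, that format_args/validate_data substitute a constant 'ph_pkg_group' placeholder when no grouping columns are supplied; the helper shown is illustrative, not the package's actual code.

import pandas as pd

def _with_placeholder_group(df, group_cols):
    # Illustrative stand-in for the assumed format_args/validate_data behaviour:
    # when no grouping columns are given, add a constant 'ph_pkg_group' column
    # so every statistic can use a single groupby code path.
    if group_cols is None:
        group_cols = ['ph_pkg_group']
        df = df.assign(ph_pkg_group='ph_pkg_group')
    return df, group_cols

df = pd.DataFrame({'num': [5, 7, 9], 'den': [100, 120, 90]})
df, group_cols = _with_placeholder_group(df, None)

# One aggregation path, mirroring the rewritten ph_dsr / ph_proportion / ph_rate
out = df.groupby(group_cols)[['num', 'den']].apply(lambda x: x.sum(skipna=False)).reset_index()

# The placeholder is removed again before returning, matching the new
# "if group_cols == ['ph_pkg_group']: df = df.drop(columns='ph_pkg_group')" blocks.
if group_cols == ['ph_pkg_group']:
    out = out.drop(columns='ph_pkg_group')

print(out)  # a single aggregated row: num 21, den 310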
2 changes: 0 additions & 2 deletions ph_statistical_methods/ISRate.py
@@ -48,8 +48,6 @@ def ph_ISRate(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = N

df['exp_x'] = df[ref_num_col].fillna(0) / df[ref_denom_col] * df[denom_col].fillna(0)

## TODO: add ref rate groupby

if obs_df is not None:
df = df.groupby(group_cols).agg({'exp_x': lambda x: x.sum(skipna=False),
ref_num_col: 'sum',
7 changes: 5 additions & 2 deletions ph_statistical_methods/ISRatio.py
@@ -12,7 +12,7 @@
from .validation import metadata_cols, ci_col, validate_data, format_args, check_kwargs


def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = None,
metadata = True, confidence = 0.95, refvalue = 1, **kwargs):

"""Calculates standard mortality ratios (or indirectly standardised ratios) with
@@ -57,7 +57,7 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
confidence, group_cols = format_args(confidence, group_cols)
ref_df, ref_join_left, ref_join_right = check_kwargs(df, kwargs, 'ref', ref_num_col, ref_denom_col)
obs_df, obs_join_left, obs_join_right = check_kwargs(df, kwargs, 'obs', num_col)
validate_data(df, denom_col, group_cols, metadata, ref_df = ref_df)
df = validate_data(df, denom_col, group_cols, metadata, ref_df = ref_df)

if ref_df is not None:
df = df.merge(ref_df, how = 'left', left_on = ref_join_left, right_on = ref_join_right).drop(ref_join_right, axis=1)
@@ -82,6 +82,9 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols,
if metadata:
method = np.where(df['Observed'] < 10, 'Exact', 'Byars')
df = metadata_cols(df, f'indirectly standardised ratio x {refvalue}', confidence, method)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df

13 changes: 1 addition & 12 deletions ph_statistical_methods/__init__.py
@@ -1,12 +1 @@
# External Functions
from .confidence_intervals import *
from .utils_funnel import sigma_adjustment, poisson_funnel, funnel_ratio_significance
from .utils import euro_standard_pop, join_euro_standard_pops, get_calc_variables
from .funnels import calculate_funnel_limits, assign_funnel_significance, calculate_funnel_points
from .DSR import ph_dsr
from .ISRatio import ph_ISRatio
from .ISRate import ph_ISRate
from .means import ph_mean
from .proportions import ph_proportion
from .rates import ph_rate
from .quantiles import ph_quantile

2 changes: 1 addition & 1 deletion ph_statistical_methods/confidence_intervals.py
@@ -9,9 +9,9 @@
import warnings
from math import sqrt
from scipy.stats import chi2, norm
from .utils import get_calc_variables
from scipy import stats

from .utils import get_calc_variables

def wilson_lower(count, denominator, confidence=0.95):

File renamed without changes.
7 changes: 5 additions & 2 deletions ph_statistical_methods/means.py
@@ -11,7 +11,7 @@
from .confidence_intervals import student_t_dist
from .validation import metadata_cols, ci_col, validate_data, format_args

def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):
def ph_mean(df, num_col, group_cols = None, metadata = True, confidence = 0.95):

"""Calculates means with confidence limits using Student-t distribution.
@@ -31,7 +31,7 @@ def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata)
df = validate_data(df, num_col, group_cols, metadata)

if group_cols is None:
raise TypeError('group_cols cannot be None for a mean statistic')
@@ -52,5 +52,8 @@ def ph_mean(df, num_col, group_cols, metadata = True, confidence = 0.95):

if metadata:
df = metadata_cols(df, 'Mean', confidence, "Student's t-distribution")

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
15 changes: 5 additions & 10 deletions ph_statistical_methods/proportions.py
@@ -10,13 +10,6 @@
from .confidence_intervals import wilson_lower, wilson_upper
from .validation import metadata_cols, ci_col, format_args, validate_data

#df = pd.read_excel('unit_tests/test_data/testdata_Proportion.xlsx')

df = pd.DataFrame({'area': [1, 2]*6,
'area2': ['Area7', 'Area2','Area1']*4,
'num': [None, 82, 9, 48, 65, 8200, 10000, 10000, 8, 7, 750, 900],
'den': [100, 10000, 10000, 10000] * 3})


def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, confidence = 0.95, multiplier = 1):
"""Calculates proportions with confidence limits using Wilson Score method.
@@ -44,7 +37,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata, denom_col)
df = validate_data(df, num_col, group_cols, metadata, denom_col)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")
@@ -53,8 +46,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
raise ValueError('Numerators must be less than or equal to the denominator for a proportion statistic')

# Sum Numerator and Denominator columns, ensure NAs are included.
if group_cols is not None:
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()

### Calculate statistic
df['Value'] = (df[num_col] / df[denom_col]) * multiplier
@@ -70,4 +62,7 @@ def ph_proportion(df, num_col, denom_col, group_cols = None, metadata = True, co
statistic = 'Percentage' if multiplier == 100 else f'Proportion of {multiplier}'
df = metadata_cols(df, statistic, confidence, 'Wilson')

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
19 changes: 11 additions & 8 deletions ph_statistical_methods/quantiles.py
@@ -51,19 +51,19 @@ def ph_quantile(df, values, group_cols = None, nquantiles = 10, invert = True, t

if not isinstance(invert, bool):
raise TypeError("Pass 'invert' as a boolean")

# Allows us to group data when group_cols is None in format args.
if group_cols == ['ph_pkg_group']:
df['ph_pkg_group'] = 'ph_pkg_group'

check_arguments(df, [values] if group_cols is None else [values] + group_cols)

# Additional columns in output
df["nquantiles"] = nquantiles
df['nquantiles'] = nquantiles

# Calculate Quantiles
if group_cols is None:
df['num_rows'] = df[values].count() # Number of rows in total
df['rank'] = df[values].rank(ascending = not invert, method='min') # Rank each value
else:
df['num_rows'] = df.groupby(group_cols)[values].transform(lambda x: x.count()) # Number of rows in each group
df['rank'] = df.groupby(group_cols)[values].rank(ascending = not invert, method='min') # Rank each value in each group
# Calculate Quantiles
df['num_rows'] = df.groupby(group_cols)[values].transform(lambda x: x.count()) # Number of rows in each group
df['rank'] = df.groupby(group_cols)[values].rank(ascending = not invert, method='min') # Rank each value in each group


# Assign a quantile based on rank and number of rows in each group
@@ -87,5 +87,8 @@ def ph_quantile(df, values, group_cols = None, nquantiles = 10, invert = True, t
# Drop columns if required
if type == "standard":
df = df.drop(['num_rows', 'rank', 'nquantiles', 'qinverted'], axis=1)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
8 changes: 5 additions & 3 deletions ph_statistical_methods/rates.py
@@ -33,13 +33,12 @@ def ph_rate(df, num_col, denom_col, group_cols = None, metadata = True, confiden

# Check data and arguments
confidence, group_cols = format_args(confidence, group_cols)
validate_data(df, num_col, group_cols, metadata, denom_col)
df = validate_data(df, num_col, group_cols, metadata, denom_col)

if not isinstance(multiplier, int) or multiplier <= 0:
raise ValueError("'Multiplier' must be a positive integer")

if group_cols is not None:
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()
df = df.groupby(group_cols)[[num_col, denom_col]].apply(lambda x: x.sum(skipna=False)).reset_index()

#calculate value column
df['Value'] = df[num_col] / df[denom_col] * multiplier
@@ -54,5 +53,8 @@ def ph_rate(df, num_col, denom_col, group_cols = None, metadata = True, confiden
if metadata:
method = np.where(df[num_col] < 10, 'Exact', 'Byars')
df = metadata_cols(df, f'Rate per {multiplier}', confidence, method)

if group_cols == ['ph_pkg_group']:
df = df.drop(columns='ph_pkg_group')

return df
2 changes: 1 addition & 1 deletion ph_statistical_methods/tests/test_ISRate.py
@@ -15,7 +15,7 @@

class TestISRate:

path = Path(__file__).parent / 'tests/test_data/testdata_DSR_ISR.xlsx'
path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

data = pd.read_excel(path, sheet_name = 'testdata_multiarea_isr')
results = pd.read_excel(path, sheet_name = 'testresults_ISR')
2 changes: 1 addition & 1 deletion ph_statistical_methods/tests/test_ISRatio.py
@@ -13,7 +13,7 @@

class TestISRatio:

path = Path(__file__).parent / 'tests/test_data/testdata_DSR_ISR.xlsx'
path = Path(__file__).parent / 'test_data/testdata_DSR_ISR.xlsx'

# Import data - remove last Multiplier column as not a function output - just used for Excel calculation
data = pd.read_excel(path, sheet_name = 'testdata_multiarea_isr')
12 changes: 6 additions & 6 deletions ph_statistical_methods/tests/test_proportions.py
@@ -33,21 +33,21 @@ def test_multiplier_error(self, multiplier):
ph_proportion(self.data, 'Numerator', 'Denominator', multiplier = multiplier)

def test_default(self):
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[:8, self.cols_95])

def test_2ci(self):
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', confidence = [0.95, 0.998])
df = ph_proportion(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area', confidence = [0.95, 0.998])
assert_frame_equal(df, self.data.iloc[:8, :])

def test_percentage(self):
df = ph_proportion(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', multiplier = 100)\
df = ph_proportion(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area', multiplier = 100)\
.drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95])
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95].reset_index(drop=True))

def test_NAs(self):
df = ph_proportion(self.data.iloc[16:, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95])
df = ph_proportion(self.data.iloc[16:, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].reset_index(drop=True))

def test_group(self):
df = ph_proportion(self.data, 'Numerator', 'Denominator', group_cols = 'Area')
4 changes: 2 additions & 2 deletions ph_statistical_methods/tests/test_quantiles.py
@@ -16,7 +16,7 @@
class TestQuantiles:

# Import data - remove last Multiplier column as not a function output - just used for Excel calculation
path = Path(__file__).parent / 'tests/test_data/testdata_Quantiles.xlsx'
path = Path(__file__).parent / 'test_data/testdata_Quantiles.xlsx'

data = pd.read_excel(path)

@@ -81,4 +81,4 @@ def test_invert_er(self):
with pytest.raises(TypeError, match = "Pass 'invert' as a boolean"):
ph_quantile(self.df, 'val', invert = "True")

# Other error handling tests are covered in test_validation.py
# Other error handling tests are covered in test_validation.py
14 changes: 7 additions & 7 deletions ph_statistical_methods/tests/test_rates.py
@@ -22,11 +22,11 @@ class Test_rates:
cols_95 = [0,1,2,3,4,5,8,9]

def test_default(self):
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95])
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area').drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, self.cols_95].reset_index(drop=True))

def test_multiplier(self):
df = ph_rate(self.data.iloc[:8, :3], 'Numerator', 'Denominator', multiplier=100).drop(['Confidence'], axis=1)
df = ph_rate(self.data.iloc[:8, :3], 'Numerator', 'Denominator', 'Area', multiplier=100).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[:8, self.cols_95])

@pytest.mark.parametrize('multiplier', [(-10), (1.5)])
@@ -35,12 +35,12 @@ def test_multiplier_error(self, multiplier):
ph_rate(self.data, 'Numerator', 'Denominator', multiplier = multiplier)

def test_2ci(self):
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', confidence = [0.95, 0.998]).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, :])
df = ph_rate(self.data.iloc[8:16, :3], 'Numerator', 'Denominator', 'Area', confidence = [0.95, 0.998]).drop(['Confidence'], axis=1)
assert_frame_equal(df, self.data.iloc[8:16, :].reset_index(drop=True))

def test_NAs(self):
df = ph_rate(self.data.iloc[16:, :3], 'Numerator', 'Denominator').drop(['Confidence','Method'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].drop('Method', axis=1))
df = ph_rate(self.data.iloc[16:, :3], 'Numerator', 'Denominator','Area').drop(['Confidence','Method'], axis=1)
assert_frame_equal(df, self.data.iloc[16:, self.cols_95].drop('Method', axis=1).reset_index(drop=True))

# dropping 'method' column for now while figure out method column
def test_group(self):
2 changes: 1 addition & 1 deletion readme.md
@@ -4,7 +4,7 @@ methods approved for use in the production of Public Health indicators such as
those presented via [Fingertips](https://fingertips.phe.org.uk/). It
provides functions for the generation of Proportions, Rates, DSRs, ISRs,
and Means including confidence intervals for these statistics,
and a function for assigning data to quantiles.
and a function for assigning data to quantiles.

Any feedback would be appreciated and can be provided using the Issues
section of the [PH_statistical_methods GitHub
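To illustrate the readme's description above, a minimal usage sketch with made-up data; the imports are taken from the module paths in this commit rather than the trimmed package __init__, and the column names and outputs are assumptions rather than documented usage.

import pandas as pd

from ph_statistical_methods.proportions import ph_proportion
from ph_statistical_methods.rates import ph_rate
from ph_statistical_methods.means import ph_mean

df = pd.DataFrame({'Area': ['A', 'A', 'B', 'B'],
                   'Numerator': [10, 20, 15, 5],
                   'Denominator': [100, 200, 150, 50]})

# Proportions with Wilson confidence limits, one row per Area
props = ph_proportion(df, 'Numerator', 'Denominator', group_cols='Area')

# Crude rates per 100,000 with Byars/Exact confidence limits
rates = ph_rate(df, 'Numerator', 'Denominator', group_cols='Area', multiplier=100000)

# Means with Student's t confidence limits (group_cols is required here)
means = ph_mean(df, 'Numerator', group_cols='Area')

print(props[['Area', 'Value']])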
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
.
3 changes: 2 additions & 1 deletion setup.py
@@ -13,6 +13,7 @@
install_requires=['numpy >= 1.25.0',
'pandas >= 2.0.0',
'pytest >= 8.0.0',
'scipy >= 1.8.0']
'scipy >= 1.8.0',
'openpyxl >= 3.1.0']
)
