Merge pull request #115 from DataS-DHSC/Dev

Dev
DataS-DHSC · Jun 7, 2024 · 9c8203c · 9c8203c
2 parents 9220d57 + 44cc2f6
commit 9c8203c
Show file tree

Hide file tree

Showing 16 changed files with 615 additions and 346 deletions.
diff --git a/PHStatsMethods/DSR.py b/PHStatsMethods/DSR.py
@@ -15,43 +15,57 @@ def ph_dsr(df, num_col, denom_col, ref_denom_col, group_cols = None, metadata =
     """Calculates directly standardised rates with confidence limits using Byar's
     method (1) with Dobson method adjustment (2).
     
-    Args:
-        df: DataFrame containing the data to be standardised.
-        
-        num_col (str): column name from data containing the observed number of events for
+    Parameters
+    ----------
+    df
+        DataFrame containing the data to be standardised.
+    num_col : str 
+        Column name from data containing the observed number of events for
         each standardisation category (e.g. ageband) within each grouping set (e.g. area).
-        
-        denom_col (str): column name from data containing the population for each standardisation 
+    denom_col : str
+        Column name from data containing the population for each standardisation 
         category (e.g. age band).
-        
-        ref_denom_col (str): the standard populations for each standardisation category (e.g. age band).
+    ref_denom_col : str
+        The standard populations for each standardisation category (e.g. age band).
         This is either the column name in the main dataframe, the reference data if given, or the column
         name of the agebands to join to if `euro_standard_pops` is set to True. 
-        
-        group_cols: A string or list of column name(s) to group the data by. Default to None.
-
-        metadata (bool): Whether to include information on the statistic and confidence interval methods.
-        
-        euro_standard_pops (bool): Whether to use the european standard populations.
+    group_cols : str | list
+        A string or list of column name(s) to group the data by. Defaults to None.
+    metadata : bool 
+        Whether to include information on the statistic and confidence interval methods.
+    euro_standard_pops : bool 
+        Whether to use the european standard populations.
         You can see what these populations are with `euro_standard_pop()`.
-        
-        multiplier (int): the multiplier used to express the final values. Default 100,000.
-        
-        confidence (float): Confidence interval(s) to use, either as a float, list of float values or None.
+    multiplier : int
+        The multiplier used to express the final values. Default 100,000.
+    confidence : float | list | None
+        Confidence interval(s) to use, either as a float, list of float values or None.
         Confidence intervals must be between 0.9 and 1. Defaults to 0.95 (2 std from mean).
         
-    **kwargs:
-        ref_df: DataFrame of reference data to join.
-        ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
-        ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
-        
-    Returns:
-        DataFrame of calculated rates and confidence intervals
+    Other Parameters
+    ----------------
+    ref_df
+        DataFrame of reference data to join.
+    ref_join_left : str | list
+        A string or list of column name(s) in `df` to join on to.
+    ref_join_right : str | list
+        A string or list of column name(s) in `ref_df` to join on to.
         
-    References:
-        (1) Breslow NE, Day NE. Statistical methods in cancer research, volume II: The design and analysis of cohort studies. 
+    Returns
+    -------
+    Pandas DataFrame
+        DataFrame of calculated directly standardised rates and confidence intervals
+    
+    Notes
+    -----
+    For total counts >= 10 Byar's method (1) is applied using the internal byars_lower and byars_upper
+    functions. When the total count is < 10 DSRs are not reliable and will therefore not be calculated.
+
+    References
+    ----------
+    (1) Breslow NE, Day NE. Statistical methods in cancer research, volume II: The design and analysis of cohort studies. 
         Lyon: International Agency for Research on Cancer, World Health Organisation; 1987.
-        (2) Dobson A et al. Confidence intervals for weighted sums of Poisson parameters. Stat Med 1991;10:457-62.
+    (2) Dobson A et al. Confidence intervals for weighted sums of Poisson parameters. Stat Med 1991;10:457-62.
 
     """
 

diff --git a/PHStatsMethods/ISRate.py b/PHStatsMethods/ISRate.py
@@ -1,9 +1,4 @@
 # -*- coding: utf-8 -*-
-"""
-Created on Fri Apr 19 10:49:43 2024
-
-@author: T.Vikneswaran_DHSC
-"""
 
 import pandas as pd
 import numpy as np
@@ -15,30 +10,61 @@ def ph_ISRate(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols = N
                      metadata = True, confidence = 0.95, multiplier = 100000, **kwargs):
 
     """Calculates indirectly standardised rates with confidence limits using Byar's or exact CI method.
-    
-    Args:
-        df: DataFrame containing the data.
-        num_col (str): Field containing observed number of events.
-        denom_col (str): Field containing population at risk.
-        ref_num_col (str): Observed events in the reference population.
-        ref_denom_col (str): Population at risk in the reference population.
-        group_cols: Columns to group data by.
-        metadata (bool): Include metadata columns.
-        confidence (float or list): Confidence levels, default 0.95.
-        multiplier (int): The multiplier for the rate calculation, default 100000.
-        
+
+    Parameters
+    ----------
+    df
+        DataFrame containing the data.
+    num_col : str
+        Field containing observed number of events.
+    denom_col : str
+        Field containing population at risk.
+    ref_num_col : str
+        Observed events in the reference population.
+    ref_denom_col : str
+        Population at risk in the reference population.
+    group_cols : str | list
+        Columns to group data by.
+    metadata : bool 
+        Include metadata columns.
+    confidence : float | list 
+        Confidence levels, default 0.95.
+    multiplier : int 
+        The multiplier for the rate calculation, default 100000.
         
-    **kwargs:
-        ref_df: DataFrame of reference data to join.
-        ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
-        ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
-        obs_df: DataFrame of total observed events for each group.
-        obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
-        obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
         
-    Returns:
-        df: Dataframe containing calculated IS Rates.
+    Other Parameters
+    ----------------
+    ref_df
+        DataFrame of reference data to join.
+    ref_join_left : str | list 
+        A string or list of column name(s) in `df` to join on to.
+    ref_join_right : str | list 
+        A string or list of column name(s) in `ref_df` to join on to.
+    obs_df 
+        DataFrame of total observed events for each group.
+    obs_join_left : str | list 
+        A string or list of column name(s) in `df` to join on to.
+    obs_join_right : str | list 
+        A string or list of column name(s) in `obs_df` to join on to.
         
+    Returns
+    -------
+    Pandas DataFrame
+        Dataframe containing calculated IS Rates.
+    
+    Notes
+    -----
+    For numerators >= 10 Byar's method (1) is applied using the internal byars_lower and byars_upper functions. 
+    For small numerators Byar's method is less accurate and so an exact method (2) based on the 
+    Poisson distribution is used.
+
+    References
+    ----------
+    (1) Breslow NE, Day NE. Statistical methods in cancer research, volume II: The design and analysis
+        of cohort studies. Lyon: International Agency for Research on Cancer, World Health Organisation; 1987.
+    (2) Armitage P, Berry G. Statistical methods in medical research (4th edn). Oxford: Blackwell; 2002.
+
     """
 
     confidence, group_cols = format_args(confidence, group_cols)

diff --git a/PHStatsMethods/ISRatio.py b/PHStatsMethods/ISRatio.py
@@ -1,9 +1,4 @@
 # -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 13 13:58:18 2024
-
-@author: Karandeep.Kaur
-"""
 
 import pandas as pd
 import numpy as np
@@ -18,39 +13,61 @@ def ph_ISRatio(df, num_col, denom_col, ref_num_col, ref_denom_col, group_cols =
     """Calculates standardised mortality ratios (or indirectly standardised ratios) with
     confidence limits using Byar's (1) or exact (2) CI method.
     
-    Args:
-        df: DataFrame containing the data to calculate IS ratios for.
-        
-        num_col (str): field name from data containing the observed number of events for
+    Parameters
+    ----------
+    df 
+        DataFrame containing the data to calculate IS ratios for.
+    num_col : str
+        Field name from data containing the observed number of events for
         each standardisation category (e.g. ageband) within each grouping set (eg area). If observed_totals is not None,
         then num_col will contain the observations from the observed_totals dataframe.
-        
-        denom_col (str): field name from data containing the population for each standardisation 
+    denom_col : str 
+        Field name from data containing the population for each standardisation 
         category (e.g. age band).
-        
-        ref_num_col (str): the observed number of events in the reference population for
+    ref_num_col : str 
+        The observed number of events in the reference population for
         each standardisation category (eg age band); field name from df or ref_df.
-        
-        ref_denom_col (str): the reference population for each standardisation category (eg age band)
-        
-        group_cols: A string or list of column name(s) to group the data by.
-        
-        confidence (float): Confidence interval(s) to use, either as a float, list of float values or None.
+    ref_denom_col : str 
+        The reference population for each standardisation category (eg age band)
+    group_cols : str | list
+        A string or list of column name(s) to group the data by.
+    confidence : float | list | None
+        Confidence interval(s) to use, either as a float, list of float values or None.
         Confidence intervals must be between 0.9 and 1. Defaults to 0.95 (2 std from mean).
-
-        refvalue (int): the standardised reference ratio, default = 1
+    refvalue : int 
+        The standardised reference ratio, default = 1
         
-    **kwargs:
-        ref_df: DataFrame of reference data to join.
-        ref_join_left (str | list): A string or list of column name(s) in `df` to join on to.
-        ref_join_right (str | list): A string or list of column name(s) in `ref_df` to join on to.
-        obs_df: DataFrame of total observed events for each group.
-        obs_join_left (str | list): A string or list of column name(s) in `df` to join on to.
-        obs_join_right (str | list): A string or list of column name(s) in `obs_df` to join on to.
+    Other Parameters
+    ----------------
+    ref_df
+        DataFrame of reference data to join.
+    ref_join_left : str | list 
+        A string or list of column name(s) in `df` to join on to.
+    ref_join_right : str | list
+        A string or list of column name(s) in `ref_df` to join on to.
+    obs_df
+        DataFrame of total observed events for each group.
+    obs_join_left : str | list 
+        A string or list of column name(s) in `df` to join on to.
+    obs_join_right : str | list 
+        A string or list of column name(s) in `obs_df` to join on to.
         
-    Returns:
-        df: Dataframe containing calculated IS Ratios.
+    Returns
+    -------
+    Pandas DataFrame
+        Dataframe containing calculated IS Ratios.
+
+    Notes
+    -----
+    For numerators >= 10 Byar's method (1) is applied using the internal byars_lower and byars_upper functions. 
+    For small numerators Byar's method is less accurate and so an exact method (2) based on the 
+    Poisson distribution is used. 
 
+    References
+    ----------
+    (1) Breslow NE, Day NE. Statistical methods in cancer research, volume II: The design and analysis 
+        of cohort studies. Lyon: International Agency for Research on Cancer, World Health Organisation; 1987.
+    (2) Armitage P, Berry G. Statistical methods in medical research (4th edn). Oxford: Blackwell; 2002.
     """
 
     # validate data - TODO: check group by row lengths?

diff --git a/PHStatsMethods/__init__.py b/PHStatsMethods/__init__.py
@@ -6,18 +6,18 @@
 ------------
 This is a Python package to support analysts in the execution of statistical
 methods approved for use in the production of Public Health indicators such as
-those presented via `Fingertips <https://fingertips.phe.org.uk/>`__. It
+those presented via `Fingertips <https://fingertips.phe.org.uk/>`_. It
 provides functions for the generation of Proportions, Rates, DSRs, ISRs,
 Funnel plots and Means including confidence intervals for these statistics,
 and a function for assigning data to quantiles.
 
 Any feedback would be appreciated and can be provided using the Issues
 section of the `PHStatsMethods GitHub
-repository <https://github.com/DataS-DHSC/PHStatsMethods/issue>`__.
+repository <https://github.com/DataS-DHSC/PHStatsMethods/issues>`_.
 
 Licence
 -------
-This project is released under the `GPL-3 <https://opensource.org/licenses/GPL-3.0>`__
+This project is released under the `GPL-3 <https://opensource.org/licenses/GPL-3.0>`_
 licence.
 
 Examples