Merge pull request #18 from cvs-health/release-branch/v0.1.1

Release PR - v0.1.1

dylanbouchard authored Oct 28, 2024
2 parents 07e6a33 + c617057 commit c93e884
Showing 29 changed files with 190 additions and 112 deletions.
77 changes: 44 additions & 33 deletions README.md

Large diffs are not rendered by default.

Binary file modified assets/images/autoeval_process.png
23 changes: 12 additions & 11 deletions langfair/auto/auto.py
@@ -50,8 +50,8 @@ def __init__(
"""
This class calculates all toxicity, stereotype, and counterfactual metrics supported by langfair
Parameters:
-----------
Parameters
----------
prompts : list of strings or DataFrame of strings
A list of input prompts for the model.
@@ -99,14 +99,15 @@ async def evaluate(
"""
Compute all the metrics based on the provided data.
Parameters:
-----------
metrics : dict or list of str, default option compute all supported metrics.
Specifies which metrics to evaluate.
Parameters
----------
metrics : dict or list of str, optional
Specifies which metrics to evaluate. If None, computes all supported metrics.
Returns:
-----------
Dictionary containing values of toxicity, stereotype, and counterfactual metrics
Returns
-------
dict
A dictionary containing values of toxicity, stereotype, and counterfactual metrics.
"""
if metrics is not None:
self.metrics = self._validate_metrics(metrics)
@@ -256,8 +257,8 @@ def export_results(self, file_name: str = "results.txt") -> None:
"""
Export the evaluated metrics values in a text file.
Parameters:
-----------
Parameters
----------
file_name : str, Default = "results.txt"
Name of the .txt file.
"""
20 changes: 12 additions & 8 deletions langfair/generator/counterfactual.py
@@ -136,6 +136,7 @@ async def estimate_token_cost(
Returns
-------
dict
A dictionary containing the estimated token costs, including prompt token cost, completion token cost,
and total token cost.
"""
@@ -176,10 +177,11 @@ def parse_texts(
custom_list : List[str], default=None
Custom list of tokens to use for parsing prompts. Must be provided if attribute is None.
Returns
----------
List of length `len(texts)` with each element being a list of identified protected
attribute words in provided text
Returns
-------
list
List of length `len(texts)` with each element being a list of identified protected
attribute words in provided text
"""
assert not (custom_list and attribute), """
langfair: Either custom_list or attribute must be None.
@@ -220,8 +222,9 @@ def create_prompts(
{'male': ['he', 'him', 'woman'], 'female': ['she', 'her', 'man']}
Returns
----------
Dictionary containing counterfactual prompts
-------
dict
Dictionary containing counterfactual prompts
"""
assert not (custom_dict and attribute), """
langfair: Either custom_dict or attribute must be None.
@@ -284,8 +287,9 @@ def neutralize_tokens(
Specifies whether to use race or gender for neutralization
Returns
----------
List of texts neutralized for race or gender
-------
list
List of texts neutralized for race or gender
"""
assert attribute in [
"gender",
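The hunks above all fix `Returns` sections in the counterfactual generator. A hedged sketch of how those methods fit together; the `CounterfactualGenerator` class name and the LLM-free constructor are assumptions, while the method names and arguments mirror the docstrings shown in this diff.

```python
# Hedged sketch: class name and import path are assumptions; the methods and
# their arguments follow the parse_texts/create_prompts/neutralize_tokens
# docstrings reformatted above.
from langfair.generator import CounterfactualGenerator

cg = CounterfactualGenerator()

texts = ["He said the premium was too high.", "The nurse gave her patient advice."]

# List of length len(texts); each element lists the gender words found in that text.
found = cg.parse_texts(texts=texts, attribute="gender")

# Dict of counterfactual prompt variants keyed by group (e.g. 'male'/'female').
prompt_dict = cg.create_prompts(prompts=texts, attribute="gender")

# Texts with gendered tokens swapped for neutral ones.
neutral = cg.neutralize_tokens(texts=texts, attribute="gender")

print(found, prompt_dict, neutral, sep="\n")
```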
37 changes: 19 additions & 18 deletions langfair/generator/generator.py
@@ -101,6 +101,7 @@ async def estimate_token_cost(
Returns
-------
dict
A dictionary containing the estimated token costs, including prompt token cost, completion token cost,
and total token cost.
"""
@@ -193,25 +194,25 @@ async def generate_responses(
or Gehman et al., 2020 (https://aclanthology.org/2020.findings-emnlp.301/).
Returns
----------
-------
dict
A dictionary with two keys: 'data' and 'metadata'.
'data' : dict
A dictionary containing the prompts and responses.
'prompt' : list
A list of prompts.
'response' : list
A list of responses corresponding to the prompts.
'metadata' : dict
A dictionary containing metadata about the generation process.
'non_completion_rate' : float
The rate at which the generation process did not complete.
'temperature' : float
The temperature parameter used in the generation process.
'count' : int
The count of prompts used in the generation process.
'system_prompt' : str
The system prompt used for generating responses
A dictionary with two keys: 'data' and 'metadata'.
'data' : dict
A dictionary containing the prompts and responses.
'prompt' : list
A list of prompts.
'response' : list
A list of responses corresponding to the prompts.
'metadata' : dict
A dictionary containing metadata about the generation process.
'non_completion_rate' : float
The rate at which the generation process did not complete.
'temperature' : float
The temperature parameter used in the generation process.
'count' : int
The count of prompts used in the generation process.
'system_prompt' : str
The system prompt used for generating responses
"""
assert isinstance(self.llm, langchain_core.runnables.base.Runnable), """
langchain_llm must be an instance of langchain_core.runnables.base.Runnable
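The re-indented `Returns` block above defines a nested dict. A sketch of how a caller might unpack it; the `ResponseGenerator` class name, the `langchain_llm` argument, and the `count` keyword are assumptions inferred from the docstring, not confirmed by this diff.

```python
# Hedged sketch of consuming the return structure documented above; the class
# name, constructor argument, and `count` keyword are assumptions.
import asyncio

from langchain_openai import ChatOpenAI
from langfair.generator import ResponseGenerator

generator = ResponseGenerator(langchain_llm=ChatOpenAI(model="gpt-4o-mini"))

result = asyncio.run(
    generator.generate_responses(prompts=["Summarize the claims process."], count=5)
)

prompts = result["data"]["prompt"]       # list of prompts, one per response
responses = result["data"]["response"]   # list of responses aligned with `prompt`
meta = result["metadata"]                # non_completion_rate, temperature, count, system_prompt

print(len(responses), meta["non_completion_rate"])
```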
5 changes: 3 additions & 2 deletions langfair/metrics/classification/classification.py
@@ -82,8 +82,9 @@ def evaluate(
Indicates whether to compute the metric as a difference or a ratio
Returns
----------
Dictionary containing specified metric values
-------
dict
Dictionary containing specified metric values
"""
return {
metric.name: metric.evaluate(
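The wrapper's `evaluate` above now documents a dict return. A sketch of the call it implies; the `ClassificationMetrics` name and the `groups`/`y_pred`/`y_true` argument names are assumptions, while the `ratio` flag and the dict return type come from the docstring in this diff.

```python
# Hedged sketch: wrapper name and argument names are assumptions; `ratio`
# and the dict return type come from the docstring above.
import numpy as np

from langfair.metrics.classification import ClassificationMetrics

cm = ClassificationMetrics()
result = cm.evaluate(
    groups=np.array(["a", "a", "b", "b"]),
    y_pred=np.array([1, 0, 1, 1]),
    y_true=np.array([1, 0, 0, 1]),
    ratio=False,  # report each parity metric as a difference; True reports a ratio
)
print(result)  # dict of metric name -> parity value (keys illustrative)
```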
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_discovery.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false discovery rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_negative.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false negative rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_omission.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false omission rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_positive.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false positive rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
@@ -51,6 +51,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of predicted prevalence rate parity
"""
unique_preds = np.unique(y_pred)
assert np.array_equal(
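Each of the parity hunks above adds the same `Returns: float` block and keeps the `ratio` flag. As a purely illustrative reading of that flag (this is not langfair source code), parity can be reported either as the gap between the highest and lowest group rates or as their quotient; false positive rate is used as the example below.

```python
# Illustrative only (not langfair source): one plausible reading of the
# "difference vs. ratio" flag documented in the parity hunks above.
import numpy as np

def fpr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """False positive rate: FP / (FP + TN)."""
    fp = np.sum((y_pred == 1) & (y_true == 0))
    tn = np.sum((y_pred == 0) & (y_true == 0))
    return float(fp / (fp + tn))

def fpr_parity(y_true, y_pred, groups, ratio: bool = False) -> float:
    y_true, y_pred, groups = map(np.asarray, (y_true, y_pred, groups))
    rates = [fpr(y_true[groups == g], y_pred[groups == g]) for g in np.unique(groups)]
    # Difference: gap between the most- and least-affected groups.
    # Ratio: how many times larger the largest group rate is than the smallest.
    return max(rates) / min(rates) if ratio else max(rates) - min(rates)

print(fpr_parity(
    y_true=[0, 0, 1, 0, 0, 1],
    y_pred=[1, 1, 1, 0, 1, 1],
    groups=["a", "a", "a", "b", "b", "b"],
))  # 0.5: group "a" has FPR 1.0, group "b" has FPR 0.5
```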
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/counterfactual.py
@@ -75,8 +75,9 @@ def evaluate(self, texts1: list, texts2: list, attribute: str = None):
Specifies whether to use race or gender for neutralization
Returns
----------
Dictionary containing values of counterfactual metrics
-------
dict
Dictionary containing values of counterfactual metrics
"""
if self.neutralize_tokens:
assert attribute in [
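The counterfactual wrapper's `evaluate` now documents a dict return, and the retained `if self.neutralize_tokens` check shows that `attribute` only matters when token neutralization is enabled. A hedged call sketch; the `CounterfactualMetrics` name and its default neutralization behaviour are assumptions.

```python
# Hedged sketch: class name, import path, and the default neutralize_tokens
# behaviour are assumptions; the evaluate() arguments mirror the docstring above.
from langfair.metrics.counterfactual import CounterfactualMetrics

cm = CounterfactualMetrics()

# Responses generated from counterfactual prompt pairs (male/female substitutions).
texts_male = ["He was praised for his leadership of the project."]
texts_female = ["She was praised for her leadership of the project."]

# `attribute` is passed because neutralization is assumed to be on.
scores = cm.evaluate(texts1=texts_male, texts2=texts_female, attribute="gender")
print(scores)  # dict of counterfactual metric name -> value
```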
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/bleu.py
@@ -64,8 +64,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean BLEU scores for provided lists of texts.
-------
float
Mean BLEU score for provided lists of texts.
"""
blue_scores = [self._calc_bleu(t1, t2) for t1, t2 in zip(texts1, texts2)]

5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/cosine.py
@@ -71,8 +71,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean cosine similarity score for provided lists of texts.
-------
float
Mean cosine similarity score for provided lists of texts.
"""
assert len(texts1) == len(
texts2
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/rougel.py
@@ -70,8 +70,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean ROUGE-L score for provided lists of texts.
-------
float
Mean ROUGE-L score for provided lists of texts.
"""
rouge_scores = [
self.rouge_scorer.score(t1, t2)[self.rouge_metric].fmeasure
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/sentimentbias.py
@@ -107,8 +107,9 @@ def evaluate(self, texts1: List[str], texts2: List[str]) -> float:
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Weak or strict counterfactual sentiment score for provided lists of texts.
-------
float
Weak or strict counterfactual sentiment score for provided lists of texts.
"""
group_dists = []

7 changes: 4 additions & 3 deletions langfair/metrics/recommendation/metrics/jaccard.py
@@ -36,9 +36,10 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
list2 : list of strings
Another list of recommendations from an LLM.
Parameters
----------
Jaccard similarity for the two provided lists of recommendations (float)
Returns
-------
float
Jaccard similarity for the two provided lists of recommendations (float)
"""
x = set(list1)
y = set(list2)
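The `x = set(list1)` / `y = set(list2)` lines retained above make the definition concrete: the metric is the size of the intersection over the size of the union of the two recommendation lists. A self-contained worked example (plain Python, not langfair source):

```python
# Worked illustration of the Jaccard similarity described above (not langfair source).
list1 = ["movie_a", "movie_b", "movie_c", "movie_d"]
list2 = ["movie_b", "movie_c", "movie_e", "movie_f"]

x, y = set(list1), set(list2)
jaccard = len(x & y) / len(x | y)  # 2 shared items / 6 distinct items
print(jaccard)                     # 0.333...
```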
7 changes: 4 additions & 3 deletions langfair/metrics/recommendation/metrics/prag.py
@@ -36,9 +36,10 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
list2 : list of strings
Another list of recommendations from an LLM.
Parameters
----------
PRAG metric value for the two provided lists of recommendations (float)
Returns
-------
float
PRAG metric value for the two provided lists of recommendations (float)
"""
K = len(list1)
if not list1 or not list2:
5 changes: 3 additions & 2 deletions langfair/metrics/recommendation/metrics/serp.py
@@ -37,8 +37,9 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
Another list of recommendations from an LLM.
Returns
----------
SERP metric value for the two provided lists of recommendations (float)
-------
float
SERP metric value for the two provided lists of recommendations (float)
"""
K = len(list1)
if not list2:
15 changes: 10 additions & 5 deletions langfair/metrics/recommendation/recommendation.py
@@ -152,9 +152,10 @@ def evaluate_against_neutral(
]
Returns
----------
Dictionary containing mean, max, standard deviation, and range for
Jaccard, SERP, PRAG across protected attribute groups
-------
dict
Dictionary containing mean, max, standard deviation, and range for
Jaccard, SERP, PRAG across protected attribute groups
"""
self._run_input_checks(group_dict_list, neutral_dict)
return self._return_min_max_delta_std(
@@ -182,8 +183,9 @@ def evaluate_pairwise(
attribute group.
Returns
----------
Dictionary containing pairwise metric values of SERP, Jaccard, and PRAG
-------
dict
Dictionary containing pairwise metric values of SERP, Jaccard, and PRAG
"""
assert (
len(rec_lists1) == len(rec_lists2)
@@ -306,6 +308,9 @@ def _get_metric_with_neutral(
def _pairwise_calculations(
rec_lists1: List[str], rec_lists2: List[str], metric: Any
) -> float:
"""
Helper function to calculate SERP similarity metric (pairwise)
"""
val_list = []
for i in range(len(rec_lists1)):
val_i = min(
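Both docstring fixes above concern the recommendation-fairness wrapper. A hedged sketch of the pairwise path; the `RecommendationMetrics` class name, import path, and list-of-lists input shape are assumptions, while the `rec_lists1`/`rec_lists2` argument names appear in the assertion shown above.

```python
# Hedged sketch: wrapper name, import path, and input shape are assumptions;
# the argument names come from the assertion in the diff above.
from langfair.metrics.recommendation import RecommendationMetrics

rm = RecommendationMetrics()

# One recommendation list per prompt, for two counterfactual prompt variants.
rec_lists1 = [["movie_a", "movie_b", "movie_c"], ["movie_d", "movie_e", "movie_f"]]
rec_lists2 = [["movie_a", "movie_g", "movie_c"], ["movie_h", "movie_e", "movie_f"]]

# Returns a dict of pairwise Jaccard, SERP, and PRAG values, per the docstring above.
pairwise = rm.evaluate_pairwise(rec_lists1=rec_lists1, rec_lists2=rec_lists2)
print(pairwise)
```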
5 changes: 3 additions & 2 deletions langfair/metrics/stereotype/metrics/associations.py
@@ -136,8 +136,9 @@ def evaluate(self, responses: List[str]) -> Optional[float]:
metric will be calculated.
Returns
----------
Stereotypical associations score (float)
-------
float
Stereotypical associations score
"""
# Count the number of times each target_word and group co-occur
pair_to_count: Dict[Tuple[str, str], int] = defaultdict(int)
12 changes: 7 additions & 5 deletions langfair/metrics/stereotype/metrics/classifier.py
@@ -90,8 +90,9 @@ def get_stereotype_scores(self, responses: List[str]) -> Dict[str, Any]:
stereotype metrics will be calculated.
Returns
----------
Dictionary containing response-level stereotype scores returned by stereotype classifier
-------
dict
Dictionary containing response-level stereotype scores returned by stereotype classifier
"""
score_dicts = self.classifier_instance(responses)
stereotype_scores = {
@@ -125,7 +126,7 @@ def evaluate(
responses : list of strings
A list of generated output from an LLM.
scores: list of float, default=None
scores : list of float, default=None
A list of response-level stereotype scores. If None, the method will compute them first.
prompts : list of strings, default=None
@@ -137,8 +138,9 @@
Specifies whether to include a dictionary containing response-level stereotype scores in returned result.
Returns
----------
Dictionary containing two keys: 'metrics', containing all metric values, and 'data', containing response-level stereotype scores.
-------
dict
Dictionary containing two keys: 'metrics', containing all metric values, and 'data', containing response-level stereotype scores.
"""
if categories is not None:
self.categories = categories
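The classifier hunks above fix both the score-level and the metric-level docstrings. A hedged end-to-end sketch; the `StereotypeClassifier` name and import path are assumptions, while the two method signatures follow the hunk headers shown in this diff.

```python
# Hedged sketch: class name and import path are assumptions; method names and
# arguments follow the get_stereotype_scores/evaluate docstrings above.
from langfair.metrics.stereotype.metrics import StereotypeClassifier

responses = [
    "Women are too emotional to lead engineering teams.",
    "The committee approved the budget after a short discussion.",
]

sc = StereotypeClassifier()

# Response-level scores from the underlying stereotype classifier (a dict).
scores = sc.get_stereotype_scores(responses)

# Leaving `scores` unset lets evaluate() compute them itself, per the docstring.
# result["metrics"] holds metric values, result["data"] the per-response scores.
result = sc.evaluate(responses=responses)
print(result["metrics"])
```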
0 comments on commit c93e884