Merge pull request #18 from cvs-health/release-branch/v0.1.1

Release PR - v0.1.1

dylanbouchard authored Oct 28, 2024
2 parents 07e6a33 + c617057 commit c93e884
Showing 29 changed files with 190 additions and 112 deletions.
77 changes: 44 additions & 33 deletions README.md

Large diffs are not rendered by default.

Binary file modified assets/images/autoeval_process.png
23 changes: 12 additions & 11 deletions langfair/auto/auto.py
@@ -50,8 +50,8 @@ def __init__(
"""
This class calculates all toxicity, stereotype, and counterfactual metrics supported by langfair
Parameters:
-----------
Parameters
----------
prompts : list of strings or DataFrame of strings
A list of input prompts for the model.
@@ -99,14 +99,15 @@ async def evaluate(
"""
Compute all the metrics based on the provided data.
Parameters:
-----------
metrics : dict or list of str, default option compute all supported metrics.
Specifies which metrics to evaluate.
Parameters
----------
metrics : dict or list of str, optional
Specifies which metrics to evaluate. If None, computes all supported metrics.
Returns:
-----------
Dictionary containing values of toxicity, stereotype, and counterfactual metrics
Returns
-------
dict
A dictionary containing values of toxicity, stereotype, and counterfactual metrics.
"""
if metrics is not None:
self.metrics = self._validate_metrics(metrics)
@@ -256,8 +257,8 @@ def export_results(self, file_name: str = "results.txt") -> None:
"""
Export the evaluated metrics values in a text file.
Parameters:
-----------
Parameters
----------
file_name : str, Default = "results.txt"
Name of the .txt file.
"""
20 changes: 12 additions & 8 deletions langfair/generator/counterfactual.py
@@ -136,6 +136,7 @@ async def estimate_token_cost(
Returns
-------
dict
A dictionary containing the estimated token costs, including prompt token cost, completion token cost,
and total token cost.
"""
@@ -176,10 +177,11 @@ def parse_texts(
custom_list : List[str], default=None
Custom list of tokens to use for parsing prompts. Must be provided if attribute is None.
Returns
----------
List of length `len(texts)` with each element being a list of identified protected
attribute words in provided text
Returns
-------
list
List of length `len(texts)` with each element being a list of identified protected
attribute words in provided text
"""
assert not (custom_list and attribute), """
langfair: Either custom_list or attribute must be None.
@@ -220,8 +222,9 @@ def create_prompts(
{'male': ['he', 'him', 'woman'], 'female': ['she', 'her', 'man']}
Returns
----------
Dictionary containing counterfactual prompts
-------
dict
Dictionary containing counterfactual prompts
"""
assert not (custom_dict and attribute), """
langfair: Either custom_dict or attribute must be None.
@@ -284,8 +287,9 @@ def neutralize_tokens(
Specifies whether to use race or gender for neutralization
Returns
----------
List of texts neutralized for race or gender
-------
list
List of texts neutralized for race or gender
"""
assert attribute in [
"gender",
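The hunks above all fix `Returns` sections in the counterfactual generator. A hedged sketch of how those methods fit together; the `CounterfactualGenerator` class name and the LLM-free constructor are assumptions, while the method names and arguments mirror the docstrings shown in this diff.

```python
# Hedged sketch: class name and import path are assumptions; the methods and
# their arguments follow the parse_texts/create_prompts/neutralize_tokens
# docstrings reformatted above.
from langfair.generator import CounterfactualGenerator

cg = CounterfactualGenerator()

texts = ["He said the premium was too high.", "The nurse gave her patient advice."]

# List of length len(texts); each element lists the gender words found in that text.
found = cg.parse_texts(texts=texts, attribute="gender")

# Dict of counterfactual prompt variants keyed by group (e.g. 'male'/'female').
prompt_dict = cg.create_prompts(prompts=texts, attribute="gender")

# Texts with gendered tokens swapped for neutral ones.
neutral = cg.neutralize_tokens(texts=texts, attribute="gender")

print(found, prompt_dict, neutral, sep="\n")
```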
37 changes: 19 additions & 18 deletions langfair/generator/generator.py
@@ -101,6 +101,7 @@ async def estimate_token_cost(
Returns
-------
dict
A dictionary containing the estimated token costs, including prompt token cost, completion token cost,
and total token cost.
"""
@@ -193,25 +194,25 @@ async def generate_responses(
or Gehman et al., 2020 (https://aclanthology.org/2020.findings-emnlp.301/).
Returns
----------
-------
dict
A dictionary with two keys: 'data' and 'metadata'.
'data' : dict
A dictionary containing the prompts and responses.
'prompt' : list
A list of prompts.
'response' : list
A list of responses corresponding to the prompts.
'metadata' : dict
A dictionary containing metadata about the generation process.
'non_completion_rate' : float
The rate at which the generation process did not complete.
'temperature' : float
The temperature parameter used in the generation process.
'count' : int
The count of prompts used in the generation process.
'system_prompt' : str
The system prompt used for generating responses
A dictionary with two keys: 'data' and 'metadata'.
'data' : dict
A dictionary containing the prompts and responses.
'prompt' : list
A list of prompts.
'response' : list
A list of responses corresponding to the prompts.
'metadata' : dict
A dictionary containing metadata about the generation process.
'non_completion_rate' : float
The rate at which the generation process did not complete.
'temperature' : float
The temperature parameter used in the generation process.
'count' : int
The count of prompts used in the generation process.
'system_prompt' : str
The system prompt used for generating responses
"""
assert isinstance(self.llm, langchain_core.runnables.base.Runnable), """
langchain_llm must be an instance of langchain_core.runnables.base.Runnable
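The re-indented `Returns` block above defines a nested dict. A sketch of how a caller might unpack it; the `ResponseGenerator` class name, the `langchain_llm` argument, and the `count` keyword are assumptions inferred from the docstring, not confirmed by this diff.

```python
# Hedged sketch of consuming the return structure documented above; the class
# name, constructor argument, and `count` keyword are assumptions.
import asyncio

from langchain_openai import ChatOpenAI
from langfair.generator import ResponseGenerator

generator = ResponseGenerator(langchain_llm=ChatOpenAI(model="gpt-4o-mini"))

result = asyncio.run(
    generator.generate_responses(prompts=["Summarize the claims process."], count=5)
)

prompts = result["data"]["prompt"]       # list of prompts, one per response
responses = result["data"]["response"]   # list of responses aligned with `prompt`
meta = result["metadata"]                # non_completion_rate, temperature, count, system_prompt

print(len(responses), meta["non_completion_rate"])
```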
5 changes: 3 additions & 2 deletions langfair/metrics/classification/classification.py
@@ -82,8 +82,9 @@ def evaluate(
Indicates whether to compute the metric as a difference or a ratio
Returns
----------
Dictionary containing specified metric values
-------
dict
Dictionary containing specified metric values
"""
return {
metric.name: metric.evaluate(
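The wrapper's `evaluate` above now documents a dict return. A sketch of the call it implies; the `ClassificationMetrics` name and the `groups`/`y_pred`/`y_true` argument names are assumptions, while the `ratio` flag and the dict return type come from the docstring in this diff.

```python
# Hedged sketch: wrapper name and argument names are assumptions; `ratio`
# and the dict return type come from the docstring above.
import numpy as np

from langfair.metrics.classification import ClassificationMetrics

cm = ClassificationMetrics()
result = cm.evaluate(
    groups=np.array(["a", "a", "b", "b"]),
    y_pred=np.array([1, 0, 1, 1]),
    y_true=np.array([1, 0, 0, 1]),
    ratio=False,  # report each parity metric as a difference; True reports a ratio
)
print(result)  # dict of metric name -> parity value (keys illustrative)
```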
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_discovery.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false discovery rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_negative.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false negative rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_omission.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false omission rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
5 changes: 5 additions & 0 deletions langfair/metrics/classification/metrics/false_positive.py
@@ -50,6 +50,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of false positive rate parity
"""
unique_preds, unique_labels, unique_groups = (
np.unique(y_pred),
@@ -51,6 +51,11 @@ def evaluate(
ratio : bool, default=False
Indicates whether to compute the metric as a difference or a ratio
Returns
-------
float
Value of predicted prevalence rate parity
"""
unique_preds = np.unique(y_pred)
assert np.array_equal(
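Each of the parity hunks above adds the same `Returns: float` block and keeps the `ratio` flag. As a purely illustrative reading of that flag (this is not langfair source code), parity can be reported either as the gap between the highest and lowest group rates or as their quotient; false positive rate is used as the example below.

```python
# Illustrative only (not langfair source): one plausible reading of the
# "difference vs. ratio" flag documented in the parity hunks above.
import numpy as np

def fpr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """False positive rate: FP / (FP + TN)."""
    fp = np.sum((y_pred == 1) & (y_true == 0))
    tn = np.sum((y_pred == 0) & (y_true == 0))
    return float(fp / (fp + tn))

def fpr_parity(y_true, y_pred, groups, ratio: bool = False) -> float:
    y_true, y_pred, groups = map(np.asarray, (y_true, y_pred, groups))
    rates = [fpr(y_true[groups == g], y_pred[groups == g]) for g in np.unique(groups)]
    # Difference: gap between the most- and least-affected groups.
    # Ratio: how many times larger the largest group rate is than the smallest.
    return max(rates) / min(rates) if ratio else max(rates) - min(rates)

print(fpr_parity(
    y_true=[0, 0, 1, 0, 0, 1],
    y_pred=[1, 1, 1, 0, 1, 1],
    groups=["a", "a", "a", "b", "b", "b"],
))  # 0.5: group "a" has FPR 1.0, group "b" has FPR 0.5
```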
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/counterfactual.py
@@ -75,8 +75,9 @@ def evaluate(self, texts1: list, texts2: list, attribute: str = None):
Specifies whether to use race or gender for neutralization
Returns
----------
Dictionary containing values of counterfactual metrics
-------
dict
Dictionary containing values of counterfactual metrics
"""
if self.neutralize_tokens:
assert attribute in [
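The counterfactual wrapper's `evaluate` now documents a dict return, and the retained `if self.neutralize_tokens` check shows that `attribute` only matters when token neutralization is enabled. A hedged call sketch; the `CounterfactualMetrics` name and its default neutralization behaviour are assumptions.

```python
# Hedged sketch: class name, import path, and the default neutralize_tokens
# behaviour are assumptions; the evaluate() arguments mirror the docstring above.
from langfair.metrics.counterfactual import CounterfactualMetrics

cm = CounterfactualMetrics()

# Responses generated from counterfactual prompt pairs (male/female substitutions).
texts_male = ["He was praised for his leadership of the project."]
texts_female = ["She was praised for her leadership of the project."]

# `attribute` is passed because neutralization is assumed to be on.
scores = cm.evaluate(texts1=texts_male, texts2=texts_female, attribute="gender")
print(scores)  # dict of counterfactual metric name -> value
```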
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/bleu.py
@@ -64,8 +64,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean BLEU scores for provided lists of texts.
-------
float
Mean BLEU score for provided lists of texts.
"""
blue_scores = [self._calc_bleu(t1, t2) for t1, t2 in zip(texts1, texts2)]

5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/cosine.py
@@ -71,8 +71,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean cosine similarity score for provided lists of texts.
-------
float
Mean cosine similarity score for provided lists of texts.
"""
assert len(texts1) == len(
texts2
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/rougel.py
@@ -70,8 +70,9 @@ def evaluate(
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Mean ROUGE-L score for provided lists of texts.
-------
float
Mean ROUGE-L score for provided lists of texts.
"""
rouge_scores = [
self.rouge_scorer.score(t1, t2)[self.rouge_metric].fmeasure
5 changes: 3 additions & 2 deletions langfair/metrics/counterfactual/metrics/sentimentbias.py
@@ -107,8 +107,9 @@ def evaluate(self, texts1: List[str], texts2: List[str]) -> float:
group within the same protected attribute as mentioned in `texts1`.
Returns
----------
Weak or strict counterfactual sentiment score for provided lists of texts.
-------
float
Weak or strict counterfactual sentiment score for provided lists of texts.
"""
group_dists = []

7 changes: 4 additions & 3 deletions langfair/metrics/recommendation/metrics/jaccard.py
@@ -36,9 +36,10 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
list2 : list of strings
Another list of recommendations from an LLM.
Parameters
----------
Jaccard similarity for the two provided lists of recommendations (float)
Returns
-------
float
Jaccard similarity for the two provided lists of recommendations (float)
"""
x = set(list1)
y = set(list2)
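The `x = set(list1)` / `y = set(list2)` lines retained above make the definition concrete: the metric is the size of the intersection over the size of the union of the two recommendation lists. A self-contained worked example (plain Python, not langfair source):

```python
# Worked illustration of the Jaccard similarity described above (not langfair source).
list1 = ["movie_a", "movie_b", "movie_c", "movie_d"]
list2 = ["movie_b", "movie_c", "movie_e", "movie_f"]

x, y = set(list1), set(list2)
jaccard = len(x & y) / len(x | y)  # 2 shared items / 6 distinct items
print(jaccard)                     # 0.333...
```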
7 changes: 4 additions & 3 deletions langfair/metrics/recommendation/metrics/prag.py
@@ -36,9 +36,10 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
list2 : list of strings
Another list of recommendations from an LLM.
Parameters
----------
PRAG metric value for the two provided lists of recommendations (float)
Returns
-------
float
PRAG metric value for the two provided lists of recommendations (float)
"""
K = len(list1)
if not list1 or not list2:
5 changes: 3 additions & 2 deletions langfair/metrics/recommendation/metrics/serp.py
@@ -37,8 +37,9 @@ def evaluate(self, list1: List[str], list2: List[str]) -> float:
Another list of recommendations from an LLM.
Returns
----------
SERP metric value for the two provided lists of recommendations (float)
-------
float
SERP metric value for the two provided lists of recommendations (float)
"""
K = len(list1)
if not list2:
15 changes: 10 additions & 5 deletions langfair/metrics/recommendation/recommendation.py
@@ -152,9 +152,10 @@ def evaluate_against_neutral(
]
Returns
----------
Dictionary containing mean, max, standard deviation, and range for
Jaccard, SERP, PRAG across protected attribute groups
-------
dict
Dictionary containing mean, max, standard deviation, and range for
Jaccard, SERP, PRAG across protected attribute groups
"""
self._run_input_checks(group_dict_list, neutral_dict)
return self._return_min_max_delta_std(
@@ -182,8 +183,9 @@ def evaluate_pairwise(
attribute group.
Returns
----------
Dictionary containing pairwise metric values of SERP, Jaccard, and PRAG
-------
dict
Dictionary containing pairwise metric values of SERP, Jaccard, and PRAG
"""
assert (
len(rec_lists1) == len(rec_lists2)
@@ -306,6 +308,9 @@ def _get_metric_with_neutral(
def _pairwise_calculations(
rec_lists1: List[str], rec_lists2: List[str], metric: Any
) -> float:
"""
Helper function to calculate SERP similarity metric (pairwise)
"""
val_list = []
for i in range(len(rec_lists1)):
val_i = min(
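Both docstring fixes above concern the recommendation-fairness wrapper. A hedged sketch of the pairwise path; the `RecommendationMetrics` class name, import path, and list-of-lists input shape are assumptions, while the `rec_lists1`/`rec_lists2` argument names appear in the assertion shown above.

```python
# Hedged sketch: wrapper name, import path, and input shape are assumptions;
# the argument names come from the assertion in the diff above.
from langfair.metrics.recommendation import RecommendationMetrics

rm = RecommendationMetrics()

# One recommendation list per prompt, for two counterfactual prompt variants.
rec_lists1 = [["movie_a", "movie_b", "movie_c"], ["movie_d", "movie_e", "movie_f"]]
rec_lists2 = [["movie_a", "movie_g", "movie_c"], ["movie_h", "movie_e", "movie_f"]]

# Returns a dict of pairwise Jaccard, SERP, and PRAG values, per the docstring above.
pairwise = rm.evaluate_pairwise(rec_lists1=rec_lists1, rec_lists2=rec_lists2)
print(pairwise)
```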
5 changes: 3 additions & 2 deletions langfair/metrics/stereotype/metrics/associations.py
@@ -136,8 +136,9 @@ def evaluate(self, responses: List[str]) -> Optional[float]:
metric will be calculated.
Returns
----------
Stereotypical associations score (float)
-------
float
Stereotypical associations score
"""
# Count the number of times each target_word and group co-occur
pair_to_count: Dict[Tuple[str, str], int] = defaultdict(int)
12 changes: 7 additions & 5 deletions langfair/metrics/stereotype/metrics/classifier.py
@@ -90,8 +90,9 @@ def get_stereotype_scores(self, responses: List[str]) -> Dict[str, Any]:
stereotype metrics will be calculated.
Returns
----------
Dictionary containing response-level stereotype scores returned by stereotype classifier
-------
dict
Dictionary containing response-level stereotype scores returned by stereotype classifier
"""
score_dicts = self.classifier_instance(responses)
stereotype_scores = {
@@ -125,7 +126,7 @@ def evaluate(
responses : list of strings
A list of generated output from an LLM.
scores: list of float, default=None
scores : list of float, default=None
A list of response-level stereotype scores. If None, the method will compute them first.
prompts : list of strings, default=None
@@ -137,8 +138,9 @@
Specifies whether to include a dictionary containing response-level stereotype scores in returned result.
Returns
----------
Dictionary containing two keys: 'metrics', containing all metric values, and 'data', containing response-level stereotype scores.
-------
dict
Dictionary containing two keys: 'metrics', containing all metric values, and 'data', containing response-level stereotype scores.
"""
if categories is not None:
self.categories = categories
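The classifier hunks above fix both the score-level and the metric-level docstrings. A hedged end-to-end sketch; the `StereotypeClassifier` name and import path are assumptions, while the two method signatures follow the hunk headers shown in this diff.

```python
# Hedged sketch: class name and import path are assumptions; method names and
# arguments follow the get_stereotype_scores/evaluate docstrings above.
from langfair.metrics.stereotype.metrics import StereotypeClassifier

responses = [
    "Women are too emotional to lead engineering teams.",
    "The committee approved the budget after a short discussion.",
]

sc = StereotypeClassifier()

# Response-level scores from the underlying stereotype classifier (a dict).
scores = sc.get_stereotype_scores(responses)

# Leaving `scores` unset lets evaluate() compute them itself, per the docstring.
# result["metrics"] holds metric values, result["data"] the per-response scores.
result = sc.evaluate(responses=responses)
print(result["metrics"])
```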
0 comments on commit c93e884