Guided decoding integration for autolabel (#898)
* WIP guided decoding integration for autolabel

* Removing JSON mode

* Removing multilabel confidence (regression) and label selector (currently unusable)

* Cleaning up imports

* rm logit bias

* Latest reqs

* Anthropic and google reqs

* latest reqs

* rm query params from openai

* logprobs and top logprobs as primary keys

* Sending json schema directly

* Use provided schema

* fmt

* tests

* Passing tests

* fmt

* rm error log

* smaller test file

* Remove additionalProperties recursively (see the sketch after this list)

* fmt
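
The "Remove additionalProperties recursively" step above implies a small schema-scrubbing helper. A minimal sketch of what that could look like — the function name and traversal details are assumptions, not the repository's actual code:

# Hypothetical helper; the commit's real implementation may differ.
from typing import Any

def remove_additional_properties(schema: Any) -> Any:
    """Recursively strip `additionalProperties` keys from a JSON schema."""
    if isinstance(schema, dict):
        schema.pop("additionalProperties", None)
        for value in schema.values():
            remove_additional_properties(value)
    elif isinstance(schema, list):
        for item in schema:
            remove_additional_properties(item)
    return schema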
DhruvaBansal00 authored Sep 30, 2024
1 parent b5267f6 commit d12ccc0
Showing 47 changed files with 1,933 additions and 4,616 deletions.
2 changes: 1 addition & 1 deletion examples/figure_extraction/example_figure_extraction.ipynb
@@ -419,7 +419,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.16"
+  "version": "3.9.19"
  },
  "vscode": {
   "interpreter": {
46 changes: 23 additions & 23 deletions pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "numpy == 1.26.4",
     "requests >= 2.27.0",
     "datasets >= 2.7.0",
-    "langchain == 0.1.9",
+    "langchain == 0.2.16",
     "nervaluate >= 0.1.8",
     "pandas >= 1.3.0",
     "scikit-learn >= 1.0.0",
@@ -63,36 +63,36 @@ dev = [
     "pre-commit"
 ]
 openai = [
-    "openai == 1.10.0",
+    "openai == 1.45.0",
     "tiktoken >= 0.7.0"
 ]
 anthropic = [
-    "anthropic == 0.17.0"
+    "anthropic == 0.34.2"
 ]
 huggingface = [
     "transformers >= 4.25.0",
     "sentence-transformers==2.3.1"
 ]
 google = [
     "tiktoken >= 0.7.0",
-    "google-cloud-aiplatform>=1.25.0",
-    "langchain_google_vertexai==0.0.5",
+    "google-cloud-aiplatform==1.56.0",
+    "langchain-google-vertexai==1.0.10",
 ]
 cohere = [
     "cohere>=4.11.2"
 ]
 minimal = [
-    "openai == 1.10.0",
-    "langchain==0.1.9",
-    "langchain-anthropic==0.1.1",
-    "langchain-core==0.1.30",
-    "langchain-community==0.0.27",
-    "langchain-openai==0.0.7",
-    "langchain_google_vertexai==0.0.5",
+    "openai==1.45.0",
+    "langchain==0.2.16",
+    "langchain-anthropic==0.1.23",
+    "langchain-core==0.2.40",
+    "langchain-community==0.2.17",
+    "langchain-openai==0.1.24",
+    "langchain-google-vertexai==1.0.10",
     "tiktoken >= 0.7.0",
-    "anthropic == 0.17.0",
+    "anthropic == 0.34.2",
     "transformers >= 4.25.0",
-    "google-cloud-aiplatform>=1.25.0",
+    "google-cloud-aiplatform==1.56.0",
     "google-search-results>=2.4.2",
     "redis >= 3.5.3",
     "pdfplumber >= 0.10.2",
@@ -114,17 +114,17 @@ all = [
     "pytest-asyncio",
     "pytest-mock",
     "pre-commit",
-    "openai == 1.10.0",
-    "langchain==0.1.9",
-    "langchain-anthropic==0.1.1",
-    "langchain-core==0.1.30",
-    "langchain-community==0.0.27",
-    "langchain-openai==0.0.7",
-    "langchain_google_vertexai==0.0.5",
+    "openai==1.45.0",
+    "langchain==0.2.16",
+    "langchain-anthropic==0.1.23",
+    "langchain-core==0.2.40",
+    "langchain-community==0.2.17",
+    "langchain-openai==0.1.24",
+    "langchain-google-vertexai==1.0.10",
     "tiktoken >= 0.7.0",
-    "anthropic == 0.17.0",
+    "anthropic == 0.34.2",
     "transformers >= 4.25.0",
-    "google-cloud-aiplatform>=1.25.0",
+    "google-cloud-aiplatform==1.56.0",
     "google-search-results>=2.4.2",
     "cohere>=4.11.2",
     "redis >= 3.5.3",
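
The bump to openai == 1.45.0 above is what enables the "Sending json schema directly" commit: that release supports structured outputs, where a JSON schema is passed straight to the chat completions API. A hedged sketch of the request shape — model name and schema are placeholders, and this is not necessarily autolabel's exact call site:

# Illustrative only: the structured-outputs request shape in recent openai
# releases, not autolabel's actual integration code.
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

label_schema = {
    "type": "object",
    "properties": {
        "sentiment": {"type": "string", "enum": ["positive", "negative"]},
    },
    "required": ["sentiment"],
    "additionalProperties": False,  # strict mode requires this to be False
}

response = client.chat.completions.create(
    model="gpt-4o-2024-08-06",  # placeholder model name
    messages=[{"role": "user", "content": "Label: 'I loved this movie.'"}],
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "label", "schema": label_schema, "strict": True},
    },
)
print(response.choices[0].message.content)  # should conform to label_schema
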
10 changes: 0 additions & 10 deletions src/autolabel/configs/config.py
@@ -37,8 +37,6 @@ class AutolabelConfig(BaseConfig):
     MODEL_PARAMS_KEY = "params"
     MODEL_ENDPOINT_KEY = "endpoint"
     COMPUTE_CONFIDENCE_KEY = "compute_confidence"
-    LOGIT_BIAS_KEY = "logit_bias"
-    JSON_MODE = "json_mode"
 
     # Embedding config keys (config["embedding"][<key>])
     EMBEDDING_PROVIDER_KEY = "provider"
@@ -183,10 +181,6 @@ def confidence(self) -> bool:
         """Returns true if the model is able to return a confidence score along with its predictions"""
         return self._model_config.get(self.COMPUTE_CONFIDENCE_KEY, False)
 
-    def logit_bias(self) -> float:
-        """Returns the logit bias for the labels specified in the config"""
-        return self._model_config.get(self.LOGIT_BIAS_KEY, 0.0)
-
     # Embedding config
     def embedding_provider(self) -> str:
         """Returns the name of the entity that provides the model used for computing embeddings"""
@@ -297,7 +291,3 @@ def confidence_chunk_size(self) -> int:
     def confidence_merge_function(self) -> str:
         """Returns the function to use when merging confidence scores"""
         return self._chunking_config.get(self.CONFIDENCE_MERGE_FUNCTION_KEY, "max")
-
-    def json_mode(self) -> bool:
-        """Returns true if the model should be used in json mode. Currently only used for OpenAI models."""
-        return self._model_config.get(self.JSON_MODE, False)
1 change: 0 additions & 1 deletion src/autolabel/configs/schema.py
@@ -73,7 +73,6 @@ def populate_few_shot_selection() -> List[str]:
         },
         "name": {"type": "string"},
         "compute_confidence": {"type": ["boolean", "null"]},
-        "logit_bias": {"type": ["number", "null"]},
         "params": {"type": ["object", "null"]},
     },
     "required": ["provider", "name"],
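
With logit_bias dropped from the model schema, a model block in an autolabel config is down to the keys visible in this diff. A hypothetical minimal fragment — the values are placeholders:

# Hypothetical config fragment; uses only keys that appear in schema.py above.
model_config = {
    "provider": "openai",            # required
    "name": "gpt-4o-2024-08-06",     # required; placeholder model name
    "compute_confidence": True,      # optional: boolean or null
    "params": {"temperature": 0.0},  # optional: free-form object or null
    # "logit_bias" would now fail schema validation
}
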
65 changes: 26 additions & 39 deletions src/autolabel/dataset/dataset.py
@@ -7,7 +7,7 @@
 
 from autolabel.configs import AutolabelConfig
 from autolabel.dataset.validation import TaskDataValidation
-from autolabel.schema import LLMAnnotation, MetricResult, TaskType
+from autolabel.schema import LLMAnnotation, MetricResult
 from autolabel.tasks import BaseTask, TaskFactory
 from autolabel.utils import print_table
 
@@ -72,20 +72,11 @@ def __init__(
 
         inputs = df.to_dict(orient="records")
         label_column = self.config.label_column()
-        if not self.config.task_type() == TaskType.ATTRIBUTE_EXTRACTION:
-            gt_labels = (
-                None
-                if not label_column or not len(inputs) or label_column not in inputs[0]
-                else df[label_column].tolist()
-            )
-        else:
-            gt_labels = {}
-            for attr in self.config.attributes():
-                name = attr["name"]
-                column_name = attr["label_column"] if "label_column" in attr else name
-                gt_labels[name] = (
-                    df[column_name].tolist() if column_name in df.keys() else None
-                )
+        gt_labels = (
+            None
+            if not label_column or not len(inputs) or label_column not in inputs[0]
+            else df[label_column].tolist()
+        )
 
         self.df = df
         self.inputs = inputs
@@ -119,17 +110,14 @@ def process_labels(
         # Add the LLM labels to the dataframe
         self.df[self.generate_label_name("label")] = [x.label for x in llm_labels]
 
-        if self.config.task_type() == TaskType.ATTRIBUTE_EXTRACTION:
-            for attr in self.config.attributes():
-                attribute_labels = []
-                for x in llm_labels:
-                    if x.successfully_labeled:
-                        attribute_labels.append(x.label.get(attr["name"], ""))
-                    else:
-                        attribute_labels.append(BaseTask.NULL_LABEL_TOKEN)
-                self.df[
-                    self.generate_label_name("label", attr["name"])
-                ] = attribute_labels
+        for attr in self.config.attributes():
+            attribute_labels = []
+            for x in llm_labels:
+                if x.successfully_labeled:
+                    attribute_labels.append(x.label.get(attr["name"], ""))
+                else:
+                    attribute_labels.append(BaseTask.NULL_LABEL_TOKEN)
+            self.df[self.generate_label_name("label", attr["name"])] = attribute_labels
 
         # Add the LLM errors to the dataframe
         self.df[self.generate_label_name("error")] = [x.error for x in llm_labels]
@@ -159,19 +147,18 @@ def process_labels(
         self.df[self.generate_label_name("confidence")] = [
             x.confidence_score for x in llm_labels
         ]
-        if self.config.task_type() == TaskType.ATTRIBUTE_EXTRACTION:
-            for attr in self.config.attributes():
-                attr_confidence_scores = []
-                for x in llm_labels:
-                    if x.successfully_labeled:
-                        attr_confidence_scores.append(
-                            x.confidence_score.get(attr["name"], 0.0)
-                        )
-                    else:
-                        attr_confidence_scores.append(0.0)
-                self.df[
-                    self.generate_label_name("confidence", attr["name"])
-                ] = attr_confidence_scores
+        for attr in self.config.attributes():
+            attr_confidence_scores = []
+            for x in llm_labels:
+                if x.successfully_labeled:
+                    attr_confidence_scores.append(
+                        x.confidence_score.get(attr["name"], 0.0)
+                    )
+                else:
+                    attr_confidence_scores.append(0.0)
+            self.df[
+                self.generate_label_name("confidence", attr["name"])
+            ] = attr_confidence_scores
 
         # Add the LLM explanations to the dataframe if chain of thought is set in config
         if self.config.chain_of_thought():
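
The gt_labels change in __init__ above now applies the same flat-list extraction to every task type. As a standalone illustration with a toy dataframe — the column names are assumptions for the example:

import pandas as pd

# Toy data; "text" and "label" column names are made up for the example.
df = pd.DataFrame({"text": ["great film", "awful film"], "label": ["pos", "neg"]})
inputs = df.to_dict(orient="records")
label_column = "label"

# Mirrors the unified logic in the diff: one flat list for every task type,
# instead of a per-attribute dict built only for attribute extraction.
gt_labels = (
    None
    if not label_column or not len(inputs) or label_column not in inputs[0]
    else df[label_column].tolist()
)
assert gt_labels == ["pos", "neg"]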