Commit

release v0.3.3
aisi-inspect committed Apr 28, 2024
1 parent efa6978 commit 3c1a676
Showing 170 changed files with 7,505 additions and 2,141 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pypi.yml
@@ -35,11 +35,11 @@ jobs:
           --user
       - name: Build
         run: python -m build
-      - name: Publish package distributions to TestPyPI
+      - name: Publish package to TestPyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         if: ${{ ! inputs.publish-release }}
         with:
           repository-url: https://test.pypi.org/legacy/
-      - name: Publish package distributions to PyPI
+      - name: Publish package to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         if: ${{ inputs.publish-release }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -163,6 +163,10 @@ cython_debug/
 data/datasets/*/hidden
 logs/
 
+# thumbnails
+.DS_Store
+thumbs.db
+
 # JS
 node_modules/
 
26 changes: 26 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,26 @@
# Changelog

## v0.3.3 (28 April 2024)

- `inspect view` command for viewing eval log files.
- `Score` now has an optional `answer` field, which denotes the answer text extracted from model output.
- Accuracy metrics now take an optional `ValueToFloat` function for customizing how textual values are mapped to float.
- Made `model_graded_qa` more flexible with separate `instruction` template and `grade_pattern`, as well as providing `partial_credit` as an option.
- Modified the default templates for `chain_of_thought()` and `self_critique()` to instruct the model to reply with `ANSWER: $ANSWER` on its own line at the end.
- Improved numeric extraction for `match(numeric=True)` (better currency and decimal handling).
- Improved `answer()` patterns so that they detect letter and word answers both within and at the end of model output.
- `Plan` now has an optional `cleanup` function which can be used to free per-sample resources (e.g. Docker containers) even in the case of an evaluation error.
- Added `Dataset.filter` method for filtering samples using a predicate.
- `Dataset` slices (e.g. `dataset[0:100]`) now return a `Dataset` rather than `list[Sample]`.
- Relative path to `INSPECT_LOG_DIR` in `.env` file is now correctly resolved for execution within subdirectories.
- `inspect list tasks` and `list_tasks()` now only parse source files (rather than loading them), ensuring that it is fast even for task files that have non-trivial global initialisation.
- `inspect list logs` and `list_eval_logs()` now enumerate log files recursively by default, and only enumerate json files that match log file naming conventions.
- Provided `header_only` option for `read_eval_log()` and `inspect info log-file` for bypassing the potentially expensive reading of samples.
- Provided `filter` option for `list_eval_logs()` to filter based on log file header info (i.e. anything but samples); see the usage sketch after this list.
- Added `__main__.py` entry point for invocation via `python3 -m inspect_ai`.
- Removed prompt and callable from model `ToolDef` (renamed to `ToolInfo`).
- Fixed issue with accessing the `completion` property on `ModelOutput` with no choices.
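
For illustration, a minimal sketch of the new log enumeration and reading options described above (assuming the signatures match these descriptions; the status check and printed fields are illustrative):

```python
# Minimal sketch (not part of this release's code): combines the recursive log
# enumeration, header-based filtering, and header_only reading noted above.
from inspect_ai.log import list_eval_logs, read_eval_log

# enumerate log files recursively (now the default); the filter callback
# only sees log header info (no samples)
logs = list_eval_logs(
    log_dir="./logs",
    filter=lambda log: log.status == "success",
)

# read just the header of the first listed log, bypassing the potentially
# expensive reading of samples
if logs:
    header = read_eval_log(logs[0], header_only=True)
    print(header.eval.task, header.eval.model)
```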

## v0.3.2 (21 April 2024)

- Initial release.
1 change: 0 additions & 1 deletion DESCRIPTION.md

This file was deleted.

64 changes: 32 additions & 32 deletions benchmarks/arc.py
@@ -8,8 +8,8 @@
 inspect eval arc.py
 
 # run specific subsets
-inspect eval arc.py@easy
-inspect eval arc.py@challenge
+inspect eval arc.py@arc_easy
+inspect eval arc.py@arc_challenge
 """
 
 from inspect_ai import Task, task
@@ -19,39 +19,39 @@
 
 
 def record_to_sample(record):
-  # read the labels and text
-  choices = record["choices"]
-  choices = dict(zip(choices["label"], choices["text"]))
-
-  # determine the target then normalize to letter
-  answerKey = record["answerKey"]
-  target = list(choices.keys()).index(answerKey)
-  target = chr(ord("A") + int(target))
-
-  # return sample
-  return Sample(
-    input=record["question"],
-    choices=list(choices.values()),
-    target=target
-  )
+    # read the labels and text
+    choices = record["choices"]
+    choices = dict(zip(choices["label"], choices["text"]))
+
+    # determine the target then normalize to letter
+    answerKey = record["answerKey"]
+    target = list(choices.keys()).index(answerKey)
+    target = chr(ord("A") + int(target))
+
+    # return sample
+    return Sample(
+        input=record["question"], choices=list(choices.values()), target=target
+    )
 
 
 def arc_task(dataset_name):
-  return Task(
-    dataset=hf_dataset(
-      path="allenai/ai2_arc",
-      name=dataset_name,
-      split="test",
-      sample_fields=record_to_sample
-    ),
-    plan = multiple_choice(),
-    scorer = answer("letter")
-  )
+    return Task(
+        dataset=hf_dataset(
+            path="allenai/ai2_arc",
+            name=dataset_name,
+            split="test",
+            sample_fields=record_to_sample,
+        ),
+        plan=multiple_choice(),
+        scorer=answer("letter"),
+    )
 
+
 @task
-def easy():
-  return arc_task("ARC-Easy")
-
-@task
-def challenge():
-  return arc_task("ARC-Challenge")
+def arc_easy():
+    return arc_task("ARC-Easy")
+
+
+@task
+def arc_challenge():
+    return arc_task("ARC-Challenge")
2 changes: 1 addition & 1 deletion benchmarks/gpqa.py
@@ -45,7 +45,7 @@ def record_to_sample(record):
 
 
 @task
-def gpqa(cot=True):
+def gpqa_diamond(cot=True):
     return Task(
         dataset=csv_dataset(
             csv_file="https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv",
44 changes: 27 additions & 17 deletions benchmarks/gsm8k.py
@@ -15,7 +15,7 @@
 from inspect_ai import Task, task
 from inspect_ai.dataset import Sample, hf_dataset
 from inspect_ai.scorer import match
-from inspect_ai.solver import generate, system_message
+from inspect_ai.solver import generate, prompt_template, system_message
 
 
 def record_to_sample(record):
@@ -24,25 +24,33 @@ def record_to_sample(record):
     answer = record["answer"].split(DELIM)
     target = answer.pop().strip()
     reasoning = DELIM.join(answer)
-    return Sample(
-        input=input,
-        target=target,
-        metadata={"reasoning": reasoning.strip()}
-    )
+    return Sample(input=input, target=target, metadata={"reasoning": reasoning.strip()})
 
+
 def sample_to_fewshot(sample):
-    ANSWER_TRIGGER = "The answer is"
     return (
-        f"Question: {sample.input}\nAnswer: "
-        + f"{sample.metadata['reasoning']} "
-        + f"{ANSWER_TRIGGER} {sample.target}"
+        f"{sample.input}\n\nReasoning:\n"
+        + f"{sample.metadata['reasoning']}\n\n"
+        + f"ANSWER: {sample.target}"
     )
 
+
+# setup for problem + instructions for providing answer
+MATH_PROMPT_TEMPLATE = """
+Solve the following math problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{prompt}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+
+Reasoning:
+""".strip()
+
+
 @task
 def gsm8k(fewshot=10, fewshot_seed=42):
-
     # build plan dynamically (may or may not be doing fewshot)
-    plan = [generate()]
+    plan = [prompt_template(MATH_PROMPT_TEMPLATE), generate()]
     if fewshot:
         fewshots = hf_dataset(
             path="gsm8k",
@@ -53,9 +61,12 @@ def gsm8k(fewshot=10, fewshot_seed=42):
             seed=fewshot_seed,
             limit=fewshot,
         )
-        plan.insert(0, system_message("\n\n".join(
-            [sample_to_fewshot(sample) for sample in fewshots]
-        )))
+        plan.insert(
+            0,
+            system_message(
+                "\n\n".join([sample_to_fewshot(sample) for sample in fewshots])
+            ),
+        )
 
     # define task
     return Task(
@@ -66,6 +77,5 @@ def gsm8k(fewshot=10, fewshot_seed=42):
             sample_fields=record_to_sample,
         ),
         plan=plan,
-        scorer=match(numeric=True)
+        scorer=match(numeric=True),
     )
-
22 changes: 9 additions & 13 deletions benchmarks/hellaswag.py
@@ -14,34 +14,30 @@
 Choose the most plausible continuation for the story.
 """
 
+
 def record_to_sample(record):
     return Sample(
-        input = record["ctx"],
-        target = chr(ord("A") + int(record["label"])),
-        choices = record["endings"],
-        metadata = dict(
-            source_id = record["source_id"]
-        )
+        input=record["ctx"],
+        target=chr(ord("A") + int(record["label"])),
+        choices=record["endings"],
+        metadata=dict(source_id=record["source_id"]),
     )
 
+
 @task
 def hellaswag():
-
     # dataset
     dataset = hf_dataset(
         path="hellaswag",
         split="validation",
         sample_fields=record_to_sample,
-        trust=True
+        trust=True,
+        shuffle=True,
     )
 
     # define task
     return Task(
         dataset=dataset,
-        plan=[
-            system_message(SYSTEM_MESSAGE),
-            multiple_choice()
-        ],
+        plan=[system_message(SYSTEM_MESSAGE), multiple_choice()],
         scorer=answer("letter"),
     )
-