release v0.3.7
aisi-inspect committed May 7, 2024
1 parent 2a0b6b9 commit 8406a69
Showing 116 changed files with 15,485 additions and 712 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/vscode.yml
@@ -0,0 +1,34 @@
on:
  push:
    tags:
      - "v[0-9]*"
    branches:
      - "main"
  pull_request:
    branches:
      - "main"
  workflow_dispatch:

name: Deploy Extension
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "18.x"
      - run: |
          pushd tools/vscode
          yarn install --immutable --immutable-cache --check-cache
      - name: Build Extension
        run: |
          pushd tools/vscode
          yarn vsce package
      - name: Upload extension to Actions Artifact
        uses: actions/upload-artifact@v4
        with:
          name: inspect-vscode
          path: "tools/vscode/inspect*.vsix"
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## v0.3.7 (07 May 2024)

- Add support for logprobs to the HF provider, and create a uniform API for other providers that support logprobs (Together and OpenAI).
- Provide an option to merge assistant messages and use it for Anthropic models (as they don't allow consecutive assistant messages).
- Add supporting infrastructure in the Inspect CLI for the VS Code extension (additional list and info commands).

## v0.3.6 (06 May 2024)

- Show first log file immediately (don't wait for fetching metadata for other logs)
281 changes: 2 additions & 279 deletions benchmarks/mmlu.py
@@ -14,11 +14,9 @@
inspect eval mmlu.py@mmlu --limit 500 -T cot=true
# eval selected subjects
inspect eval mmlu.py@mmlu -T subjects=anatomy
inspect eval mmlu.py@mmlu -T subjects=astronomy
inspect eval mmlu.py@mmlu -T subjects=anatomy,astronomy
# eval single subjects
inspect eval mmlu.py@mmlu_anatomy
inspect eval mmlu.py@mmlu_astronomy
"""

from inspect_ai import Task, task
@@ -71,278 +69,3 @@ def mmlu(subjects=[], cot=False):
scorer=answer("letter"),
config=GenerateConfig(temperature=0.5),
)


@task
def mmlu_abstract_algebra(cot=False):
return mmlu("abstract_algebra", cot)


@task
def mmlu_anatomy(cot=False):
return mmlu("anatomy", cot)


@task
def mmlu_astronomy(cot=False):
return mmlu("astronomy", cot)


@task
def mmlu_business_ethics(cot=False):
return mmlu("business_ethics", cot)


@task
def mmlu_clinical_knowledge(cot=False):
return mmlu("clinical_knowledge", cot)


@task
def mmlu_college_biology(cot=False):
return mmlu("college_biology", cot)


@task
def mmlu_college_chemistry(cot=False):
return mmlu("college_chemistry", cot)


@task
def mmlu_college_computer_science(cot=False):
return mmlu("college_computer_science", cot)


@task
def mmlu_college_mathematics(cot=False):
return mmlu("college_mathematics", cot)


@task
def mmlu_college_medicine(cot=False):
return mmlu("college_medicine", cot)


@task
def mmlu_college_physics(cot=False):
return mmlu("college_physics", cot)


@task
def mmlu_computer_security(cot=False):
return mmlu("computer_security", cot)


@task
def mmlu_conceptual_physics(cot=False):
return mmlu("conceptual_physics", cot)


@task
def mmlu_electrical_engineering(cot=False):
return mmlu("electrical_engineering", cot)


@task
def mmlu_elementary_mathematics(cot=False):
return mmlu("elementary_mathematics", cot)


@task
def mmlu_formal_logic(cot=False):
return mmlu("formal_logic", cot)


@task
def mmlu_global_facts(cot=False):
return mmlu("global_facts", cot)


@task
def mmlu_high_school_biology(cot=False):
return mmlu("high_school_biology", cot)


@task
def mmlu_high_school_chemistry(cot=False):
return mmlu("high_school_chemistry", cot)


@task
def mmlu_high_school_computer_science(cot=False):
return mmlu("high_school_computer_science", cot)


@task
def mmlu_high_school_european_history(cot=False):
return mmlu("high_school_european_history", cot)


@task
def mmlu_high_school_geography(cot=False):
return mmlu("high_school_geography", cot)


@task
def mmlu_high_school_government_and_politics(cot=False):
return mmlu("high_school_government_and_politics", cot)


@task
def mmlu_high_school_macroeconomics(cot=False):
return mmlu("high_school_macroeconomics", cot)


@task
def mmlu_high_school_mathematics(cot=False):
return mmlu("high_school_mathematics", cot)


@task
def mmlu_high_school_microeconomics(cot=False):
return mmlu("high_school_microeconomics", cot)


@task
def mmlu_high_school_physics(cot=False):
return mmlu("high_school_physics", cot)


@task
def mmlu_high_school_psychology(cot=False):
return mmlu("high_school_psychology", cot)


@task
def mmlu_high_school_statistics(cot=False):
return mmlu("high_school_statistics", cot)


@task
def mmlu_high_school_us_history(cot=False):
return mmlu("high_school_us_history", cot)


@task
def mmlu_high_school_world_history(cot=False):
return mmlu("high_school_world_history", cot)


@task
def mmlu_human_aging(cot=False):
return mmlu("human_aging", cot)


@task
def mmlu_human_sexuality(cot=False):
return mmlu("human_sexuality", cot)


@task
def mmlu_international_law(cot=False):
return mmlu("international_law", cot)


@task
def mmlu_jurisprudence(cot=False):
return mmlu("jurisprudence", cot)


@task
def mmlu_logical_fallacies(cot=False):
return mmlu("logical_fallacies", cot)


@task
def mmlu_machine_learning(cot=False):
return mmlu("machine_learning", cot)


@task
def mmlu_management(cot=False):
return mmlu("management", cot)


@task
def mmlu_marketing(cot=False):
return mmlu("marketing", cot)


@task
def mmlu_miscellaneous(cot=False):
return mmlu("miscellaneous", cot)


@task
def mmlu_moral_disputes(cot=False):
return mmlu("moral_disputes", cot)


@task
def mmlu_moral_scenarios(cot=False):
return mmlu("moral_scenarios", cot)


@task
def mmlu_nutrition(cot=False):
return mmlu("nutrition", cot)


@task
def mmlu_philosophy(cot=False):
return mmlu("philosophy", cot)


@task
def mmlu_prehistory(cot=False):
return mmlu("prehistory", cot)


@task
def mmlu_professional_accounting(cot=False):
return mmlu("professional_accounting", cot)


@task
def mmlu_professional_law(cot=False):
return mmlu("professional_law", cot)


@task
def mmlu_professional_medicine(cot=False):
return mmlu("professional_medicine", cot)


@task
def mmlu_professional_psychology(cot=False):
return mmlu("professional_psychology", cot)


@task
def mmlu_public_relations(cot=False):
return mmlu("public_relations", cot)


@task
def mmlu_security_studies(cot=False):
return mmlu("security_studies", cot)


@task
def mmlu_sociology(cot=False):
return mmlu("sociology", cot)


@task
def mmlu_us_foreign_policy(cot=False):
return mmlu("us_foreign_policy", cot)


@task
def mmlu_virology(cot=False):
return mmlu("virology", cot)


@task
def mmlu_world_religions(cot=False):
return mmlu("world_religions", cot)
4 changes: 2 additions & 2 deletions src/inspect_ai/_cli/eval.py
@@ -156,12 +156,12 @@
"--logprobs",
type=bool,
is_flag=True,
help="Return log probabilities of the output tokens. OpenAI and TogetherAI only.",
help="Return log probabilities of the output tokens. OpenAI, TogetherAI, and Huggingface only.",
)
@click.option(
"--top-logprobs",
type=int,
help="Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI only.",
help="Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI and Huggingface only.",
)
@common_options
def eval_command(
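A rough sketch of how the updated flags could be combined on the command line; the model name is only an illustrative placeholder, and the --model syntax is assumed from the wider Inspect CLI rather than shown in this hunk:

# request log probabilities of output tokens from a supporting provider,
# plus the 5 most likely alternatives at each token position
inspect eval mmlu.py@mmlu --model hf/EleutherAI/pythia-1b --logprobs --top-logprobs 5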
11 changes: 10 additions & 1 deletion src/inspect_ai/_cli/info.py
@@ -1,10 +1,11 @@
from json import dumps

import click
from pydantic_core import to_jsonable_python

from inspect_ai import __version__
from inspect_ai._util.constants import PKG_PATH
from inspect_ai.log import eval_log_json, read_eval_log
from inspect_ai.log._file import eval_log_json, read_eval_log, read_eval_log_headers


@click.group("info")
@@ -44,6 +45,14 @@ def log(path: str, header_only: bool) -> None:
print(eval_log_json(log))


@info_command.command("log-file-headers")
@click.argument("files", nargs=-1)
def log_file_headers(files: tuple[str]) -> None:
"""Read and print a JSON list of log file headers."""
headers = read_eval_log_headers(list(files))
print(dumps(to_jsonable_python(headers), indent=2))


@info_command.command("log-schema")
def log_schema() -> None:
"""Print JSON schema for log files."""
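A possible invocation of the new log-file-headers command added above, with hypothetical log file paths:

# print a JSON list of headers for the given log files
inspect info log-file-headers ./logs/run1.json ./logs/run2.json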