Commit

release v0.3.3
aisi-inspect committed Apr 28, 2024
1 parent efa6978 commit 3c1a676
Showing 170 changed files with 7,505 additions and 2,141 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pypi.yml
@@ -35,11 +35,11 @@ jobs:
           --user
       - name: Build
         run: python -m build
-      - name: Publish package distributions to TestPyPI
+      - name: Publish package to TestPyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         if: ${{ ! inputs.publish-release }}
         with:
           repository-url: https://test.pypi.org/legacy/
-      - name: Publish package distributions to PyPI
+      - name: Publish package to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         if: ${{ inputs.publish-release }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -163,6 +163,10 @@ cython_debug/
 data/datasets/*/hidden
 logs/
 
+# thumbnails
+.DS_Store
+thumbs.db
+
 # JS
 node_modules/
 
26 changes: 26 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,26 @@
# Changelog

## v0.3.3 (28 April 2024)

- `inspect view` command for viewing eval log files.
- `Score` now has an optional `answer` field, which denotes the answer text extracted from model output.
- Accuracy metrics now take an optional `ValueToFloat` function for customizing how textual values are mapped to float.
- Made `model_graded_qa` more flexible with separate `instruction` template and `grade_pattern`, as well as providing `partial_credit` as an option.
- Modified the default templates for `chain_of_thought()` and `self_critique()` to instruct the model to reply with `ANSWER: $ANSWER` on its own line at the end.
- Improved numeric extraction for `match(numeric=True)` (better currency and decimal handling).
- Improved `answer()` patterns so that they detect letter and word answers both within and at the end of model output.
- `Plan` now has an optional `cleanup` function which can be used to free per-sample resources (e.g. Docker containers) even in the case of an evaluation error.
- Added `Dataset.filter` method for filtering samples using a predicate.
- `Dataset` slices (e.g. `dataset[0:100]`) now return a `Dataset` rather than `list[Sample]`.
- Relative path to `INSPECT_LOG_DIR` in `.env` file is now correctly resolved for execution within subdirectories.
- `inspect list tasks` and `list_tasks()` now only parse source files (rather than loading them), ensuring that it is fast even for task files that have non-trivial global initialisation.
- `inspect list logs` and `list_eval_logs()` now enumerate log files recursively by default, and only enumerate json files that match log file naming conventions.
- Provided `header_only` option for `read_eval_log()` and `inspect info log-file` for bypassing the potentially expensive reading of samples.
- Provided `filter` option for `list_eval_logs()` to filter based on log file header info (i.e. anything but samples); see the usage sketch after this list.
- Added `__main__.py` entry point for invocation via `python3 -m inspect_ai`.
- Removed prompt and callable from model `ToolDef` (renamed to `ToolInfo`).
- Fixed issue with accessing the `completion` property on `ModelOutput` with no choices.
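
For illustration, a minimal sketch of the new log enumeration and reading options described above (assuming the signatures match these descriptions; the status check and printed fields are illustrative):

```python
# Minimal sketch (not part of this release's code): combines the recursive log
# enumeration, header-based filtering, and header_only reading noted above.
from inspect_ai.log import list_eval_logs, read_eval_log

# enumerate log files recursively (now the default); the filter callback
# only sees log header info (no samples)
logs = list_eval_logs(
    log_dir="./logs",
    filter=lambda log: log.status == "success",
)

# read just the header of the first listed log, bypassing the potentially
# expensive reading of samples
if logs:
    header = read_eval_log(logs[0], header_only=True)
    print(header.eval.task, header.eval.model)
```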

## v0.3.2 (21 April 2024)

- Initial release.
1 change: 0 additions & 1 deletion DESCRIPTION.md

This file was deleted.

64 changes: 32 additions & 32 deletions benchmarks/arc.py
@@ -8,8 +8,8 @@
 inspect eval arc.py
 
 # run specific subsets
-inspect eval arc.py@easy
-inspect eval arc.py@challenge
+inspect eval arc.py@arc_easy
+inspect eval arc.py@arc_challenge
 """
 
 from inspect_ai import Task, task
@@ -19,39 +19,39 @@
 
 
 def record_to_sample(record):
-  # read the labels and text
-  choices = record["choices"]
-  choices = dict(zip(choices["label"], choices["text"]))
-
-  # determine the target then normalize to letter
-  answerKey = record["answerKey"]
-  target = list(choices.keys()).index(answerKey)
-  target = chr(ord("A") + int(target))
-
-  # return sample
-  return Sample(
-    input=record["question"],
-    choices=list(choices.values()),
-    target=target
-  )
+    # read the labels and text
+    choices = record["choices"]
+    choices = dict(zip(choices["label"], choices["text"]))
+
+    # determine the target then normalize to letter
+    answerKey = record["answerKey"]
+    target = list(choices.keys()).index(answerKey)
+    target = chr(ord("A") + int(target))
+
+    # return sample
+    return Sample(
+        input=record["question"], choices=list(choices.values()), target=target
+    )
 
 
 def arc_task(dataset_name):
-  return Task(
-    dataset=hf_dataset(
-      path="allenai/ai2_arc",
-      name=dataset_name,
-      split="test",
-      sample_fields=record_to_sample
-    ),
-    plan = multiple_choice(),
-    scorer = answer("letter")
-  )
+    return Task(
+        dataset=hf_dataset(
+            path="allenai/ai2_arc",
+            name=dataset_name,
+            split="test",
+            sample_fields=record_to_sample,
+        ),
+        plan=multiple_choice(),
+        scorer=answer("letter"),
+    )
 
+
 @task
-def easy():
-  return arc_task("ARC-Easy")
-
-@task
-def challenge():
-  return arc_task("ARC-Challenge")
+def arc_easy():
+    return arc_task("ARC-Easy")
+
+
+@task
+def arc_challenge():
+    return arc_task("ARC-Challenge")
2 changes: 1 addition & 1 deletion benchmarks/gpqa.py
@@ -45,7 +45,7 @@ def record_to_sample(record):
 
 
 @task
-def gpqa(cot=True):
+def gpqa_diamond(cot=True):
     return Task(
         dataset=csv_dataset(
             csv_file="https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv",
44 changes: 27 additions & 17 deletions benchmarks/gsm8k.py
@@ -15,7 +15,7 @@
 from inspect_ai import Task, task
 from inspect_ai.dataset import Sample, hf_dataset
 from inspect_ai.scorer import match
-from inspect_ai.solver import generate, system_message
+from inspect_ai.solver import generate, prompt_template, system_message
 
 
 def record_to_sample(record):
@@ -24,25 +24,33 @@ def record_to_sample(record):
     answer = record["answer"].split(DELIM)
     target = answer.pop().strip()
     reasoning = DELIM.join(answer)
-    return Sample(
-        input=input,
-        target=target,
-        metadata={"reasoning": reasoning.strip()}
-    )
+    return Sample(input=input, target=target, metadata={"reasoning": reasoning.strip()})
 
+
 def sample_to_fewshot(sample):
-    ANSWER_TRIGGER = "The answer is"
     return (
-        f"Question: {sample.input}\nAnswer: "
-        + f"{sample.metadata['reasoning']} "
-        + f"{ANSWER_TRIGGER} {sample.target}"
+        f"{sample.input}\n\nReasoning:\n"
+        + f"{sample.metadata['reasoning']}\n\n"
+        + f"ANSWER: {sample.target}"
     )
 
+
+# setup for problem + instructions for providing answer
+MATH_PROMPT_TEMPLATE = """
+Solve the following math problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+{prompt}
+
+Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+
+Reasoning:
+""".strip()
+
+
 @task
 def gsm8k(fewshot=10, fewshot_seed=42):
-
     # build plan dynamically (may or may not be doing fewshot)
-    plan = [generate()]
+    plan = [prompt_template(MATH_PROMPT_TEMPLATE), generate()]
     if fewshot:
         fewshots = hf_dataset(
             path="gsm8k",
@@ -53,9 +61,12 @@ def gsm8k(fewshot=10, fewshot_seed=42):
             seed=fewshot_seed,
             limit=fewshot,
         )
-        plan.insert(0, system_message("\n\n".join(
-            [sample_to_fewshot(sample) for sample in fewshots]
-        )))
+        plan.insert(
+            0,
+            system_message(
+                "\n\n".join([sample_to_fewshot(sample) for sample in fewshots])
+            ),
+        )
 
     # define task
     return Task(
@@ -66,6 +77,5 @@ def gsm8k(fewshot=10, fewshot_seed=42):
             sample_fields=record_to_sample,
         ),
         plan=plan,
-        scorer=match(numeric=True)
+        scorer=match(numeric=True),
     )
-
22 changes: 9 additions & 13 deletions benchmarks/hellaswag.py
@@ -14,34 +14,30 @@
 Choose the most plausible continuation for the story.
 """
 
+
 def record_to_sample(record):
     return Sample(
-        input = record["ctx"],
-        target = chr(ord("A") + int(record["label"])),
-        choices = record["endings"],
-        metadata = dict(
-            source_id = record["source_id"]
-        )
+        input=record["ctx"],
+        target=chr(ord("A") + int(record["label"])),
+        choices=record["endings"],
+        metadata=dict(source_id=record["source_id"]),
     )
 
+
 @task
 def hellaswag():
-
     # dataset
     dataset = hf_dataset(
         path="hellaswag",
         split="validation",
         sample_fields=record_to_sample,
-        trust=True
+        trust=True,
+        shuffle=True,
     )
 
     # define task
     return Task(
         dataset=dataset,
-        plan=[
-            system_message(SYSTEM_MESSAGE),
-            multiple_choice()
-        ],
+        plan=[system_message(SYSTEM_MESSAGE), multiple_choice()],
         scorer=answer("letter"),
     )
-