diff --git a/.github/workflows/vscode.yml b/.github/workflows/vscode.yml new file mode 100644 index 000000000..2ddbb4bfe --- /dev/null +++ b/.github/workflows/vscode.yml @@ -0,0 +1,34 @@ +on: + push: + tags: + - "v[0-9]*" + branches: + - "main" + pull_request: + branches: + - "main" + workflow_dispatch: + +name: Deploy Extension +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: "18.x" + - run: | + pushd tools/vscode + yarn install --immutable --immutable-cache --check-cache + + - name: Build Extension + run: | + pushd tools/vscode + yarn vsce package + + - name: Upload extension to Actions Artifact + uses: actions/upload-artifact@v4 + with: + name: inspect-vscode + path: "tools/vscode/inspect*.vsix" diff --git a/CHANGELOG.md b/CHANGELOG.md index 774b9f746..9e72d808e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v0.3.7 (07 May 2024) + +- Add support for logprobs to HF provider, and create uniform API for other providers that support logprobs (Together and OpenAI). +- Provide an option to merge assistant messages and use it for Anthropic models (as they don't allow consecutive assistant messages). +- Supporting infrastructure in Inspect CLI for VS Code extension (additional list and info commands). 
+ ## v0.3.6 (06 May 2024) - Show first log file immediately (don't wait for fetching metadata for other logs) diff --git a/benchmarks/mmlu.py b/benchmarks/mmlu.py index 1b9e0bcff..478305c0d 100644 --- a/benchmarks/mmlu.py +++ b/benchmarks/mmlu.py @@ -14,11 +14,9 @@ inspect eval mmlu.py@mmlu --limit 500 -T cot=true # eval selected subjects +inspect eval mmlu.py@mmlu -T subjects=anatomy +inspect eval mmlu.py@mmlu -T subjects=astronomy inspect eval mmlu.py@mmlu -T subjects=anatomy,astronomy - -# eval single subjects -inspect eval mmlu.py@mmlu_anatomy -inspect eval mmlu.py@mmlu_astronomy """ from inspect_ai import Task, task @@ -71,278 +69,3 @@ def mmlu(subjects=[], cot=False): scorer=answer("letter"), config=GenerateConfig(temperature=0.5), ) - - -@task -def mmlu_abstract_algebra(cot=False): - return mmlu("abstract_algebra", cot) - - -@task -def mmlu_anatomy(cot=False): - return mmlu("anatomy", cot) - - -@task -def mmlu_astronomy(cot=False): - return mmlu("astronomy", cot) - - -@task -def mmlu_business_ethics(cot=False): - return mmlu("business_ethics", cot) - - -@task -def mmlu_clinical_knowledge(cot=False): - return mmlu("clinical_knowledge", cot) - - -@task -def mmlu_college_biology(cot=False): - return mmlu("college_biology", cot) - - -@task -def mmlu_college_chemistry(cot=False): - return mmlu("college_chemistry", cot) - - -@task -def mmlu_college_computer_science(cot=False): - return mmlu("college_computer_science", cot) - - -@task -def mmlu_college_mathematics(cot=False): - return mmlu("college_mathematics", cot) - - -@task -def mmlu_college_medicine(cot=False): - return mmlu("college_medicine", cot) - - -@task -def mmlu_college_physics(cot=False): - return mmlu("college_physics", cot) - - -@task -def mmlu_computer_security(cot=False): - return mmlu("computer_security", cot) - - -@task -def mmlu_conceptual_physics(cot=False): - return mmlu("conceptual_physics", cot) - - -@task -def mmlu_electrical_engineering(cot=False): - return mmlu("electrical_engineering", 
cot) - - -@task -def mmlu_elementary_mathematics(cot=False): - return mmlu("elementary_mathematics", cot) - - -@task -def mmlu_formal_logic(cot=False): - return mmlu("formal_logic", cot) - - -@task -def mmlu_global_facts(cot=False): - return mmlu("global_facts", cot) - - -@task -def mmlu_high_school_biology(cot=False): - return mmlu("high_school_biology", cot) - - -@task -def mmlu_high_school_chemistry(cot=False): - return mmlu("high_school_chemistry", cot) - - -@task -def mmlu_high_school_computer_science(cot=False): - return mmlu("high_school_computer_science", cot) - - -@task -def mmlu_high_school_european_history(cot=False): - return mmlu("high_school_european_history", cot) - - -@task -def mmlu_high_school_geography(cot=False): - return mmlu("high_school_geography", cot) - - -@task -def mmlu_high_school_government_and_politics(cot=False): - return mmlu("high_school_government_and_politics", cot) - - -@task -def mmlu_high_school_macroeconomics(cot=False): - return mmlu("high_school_macroeconomics", cot) - - -@task -def mmlu_high_school_mathematics(cot=False): - return mmlu("high_school_mathematics", cot) - - -@task -def mmlu_high_school_microeconomics(cot=False): - return mmlu("high_school_microeconomics", cot) - - -@task -def mmlu_high_school_physics(cot=False): - return mmlu("high_school_physics", cot) - - -@task -def mmlu_high_school_psychology(cot=False): - return mmlu("high_school_psychology", cot) - - -@task -def mmlu_high_school_statistics(cot=False): - return mmlu("high_school_statistics", cot) - - -@task -def mmlu_high_school_us_history(cot=False): - return mmlu("high_school_us_history", cot) - - -@task -def mmlu_high_school_world_history(cot=False): - return mmlu("high_school_world_history", cot) - - -@task -def mmlu_human_aging(cot=False): - return mmlu("human_aging", cot) - - -@task -def mmlu_human_sexuality(cot=False): - return mmlu("human_sexuality", cot) - - -@task -def mmlu_international_law(cot=False): - return mmlu("international_law", cot) - 
- -@task -def mmlu_jurisprudence(cot=False): - return mmlu("jurisprudence", cot) - - -@task -def mmlu_logical_fallacies(cot=False): - return mmlu("logical_fallacies", cot) - - -@task -def mmlu_machine_learning(cot=False): - return mmlu("machine_learning", cot) - - -@task -def mmlu_management(cot=False): - return mmlu("management", cot) - - -@task -def mmlu_marketing(cot=False): - return mmlu("marketing", cot) - - -@task -def mmlu_miscellaneous(cot=False): - return mmlu("miscellaneous", cot) - - -@task -def mmlu_moral_disputes(cot=False): - return mmlu("moral_disputes", cot) - - -@task -def mmlu_moral_scenarios(cot=False): - return mmlu("moral_scenarios", cot) - - -@task -def mmlu_nutrition(cot=False): - return mmlu("nutrition", cot) - - -@task -def mmlu_philosophy(cot=False): - return mmlu("philosophy", cot) - - -@task -def mmlu_prehistory(cot=False): - return mmlu("prehistory", cot) - - -@task -def mmlu_professional_accounting(cot=False): - return mmlu("professional_accounting", cot) - - -@task -def mmlu_professional_law(cot=False): - return mmlu("professional_law", cot) - - -@task -def mmlu_professional_medicine(cot=False): - return mmlu("professional_medicine", cot) - - -@task -def mmlu_professional_psychology(cot=False): - return mmlu("professional_psychology", cot) - - -@task -def mmlu_public_relations(cot=False): - return mmlu("public_relations", cot) - - -@task -def mmlu_security_studies(cot=False): - return mmlu("security_studies", cot) - - -@task -def mmlu_sociology(cot=False): - return mmlu("sociology", cot) - - -@task -def mmlu_us_foreign_policy(cot=False): - return mmlu("us_foreign_policy", cot) - - -@task -def mmlu_virology(cot=False): - return mmlu("virology", cot) - - -@task -def mmlu_world_religions(cot=False): - return mmlu("world_religions", cot) diff --git a/src/inspect_ai/_cli/eval.py b/src/inspect_ai/_cli/eval.py index d7adda284..c554c8880 100644 --- a/src/inspect_ai/_cli/eval.py +++ b/src/inspect_ai/_cli/eval.py @@ -156,12 +156,12 @@ 
"--logprobs", type=bool, is_flag=True, - help="Return log probabilities of the output tokens. OpenAI and TogetherAI only.", + help="Return log probabilities of the output tokens. OpenAI, TogetherAI, and Huggingface only.", ) @click.option( "--top-logprobs", type=int, - help="Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI only.", + help="Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI and Huggingface only.", ) @common_options def eval_command( diff --git a/src/inspect_ai/_cli/info.py b/src/inspect_ai/_cli/info.py index 9f4ed10a9..862f81da0 100644 --- a/src/inspect_ai/_cli/info.py +++ b/src/inspect_ai/_cli/info.py @@ -1,10 +1,11 @@ from json import dumps import click +from pydantic_core import to_jsonable_python from inspect_ai import __version__ from inspect_ai._util.constants import PKG_PATH -from inspect_ai.log import eval_log_json, read_eval_log +from inspect_ai.log._file import eval_log_json, read_eval_log, read_eval_log_headers @click.group("info") @@ -44,6 +45,14 @@ def log(path: str, header_only: bool) -> None: print(eval_log_json(log)) +@info_command.command("log-file-headers") +@click.argument("files", nargs=-1) +def log_file_headers(files: tuple[str]) -> None: + """Read and print a JSON list of log file headers.""" + headers = read_eval_log_headers(list(files)) + print(dumps(to_jsonable_python(headers), indent=2)) + + @info_command.command("log-schema") def log_schema() -> None: """Print JSON schema for log files.""" diff --git a/src/inspect_ai/_view/view.py b/src/inspect_ai/_view/view.py index cd54eeace..808677484 100644 --- a/src/inspect_ai/_view/view.py +++ b/src/inspect_ai/_view/view.py @@ -12,6 +12,7 @@ from urllib.parse import parse_qs, urlparse, urlunparse import psutil +from pydantic_core import to_jsonable_python from inspect_ai._display import display from inspect_ai._display.logger import init_logger @@ 
-24,7 +25,12 @@ from inspect_ai._util.error import exception_message from inspect_ai._util.file import FileSystem, file, filesystem from inspect_ai._util.http import InspectHTTPRequestHandler -from inspect_ai.log._file import eval_log_json, list_eval_logs, read_eval_log +from inspect_ai.log._file import ( + eval_log_json, + list_eval_logs, + read_eval_log, + read_eval_log_headers, +) logger = logging.getLogger(__name__) @@ -34,6 +40,7 @@ LOGS_PATH = "/api/logs" LOGS_DIR = f"{LOGS_PATH}/" +LOG_HEADERS_PATH = "/api/log-headers" def view( @@ -93,6 +100,8 @@ def __init__( def do_GET(self) -> None: if self.path == LOGS_PATH: self.handle_logs() + elif self.path.startswith(LOG_HEADERS_PATH): + self.handle_log_headers() elif self.path.startswith(LOGS_DIR): self.handle_log() else: @@ -121,6 +130,16 @@ def handle_logs(self) -> None: ) self.send_json(json_files) + def handle_log_headers(self) -> None: + # check for query params + parsed = urlparse(self.path) + query_params = parse_qs(parsed.query) + files = query_params.get("file", []) + headers = read_eval_log_headers(files) + self.send_json( + json.dumps(to_jsonable_python(headers, exclude_none=True), indent=2) + ) + def handle_log(self) -> None: """Serve log files from /api/logs/* url.""" path = self.path.replace(LOGS_DIR, "", 1) # strip /api/logs/ diff --git a/src/inspect_ai/_view/www/App.mjs b/src/inspect_ai/_view/www/App.mjs index 58ead8e51..85a9248e8 100644 --- a/src/inspect_ai/_view/www/App.mjs +++ b/src/inspect_ai/_view/www/App.mjs @@ -1,17 +1,20 @@ import { html } from "htm/preact"; -import { useState, useEffect } from "preact/hooks"; +import { useCallback, useState, useEffect } from "preact/hooks"; import { formatPrettyDecimal } from "./src/utils/Format.mjs"; -import { client_events, eval_logs } from "api"; import "./src/Register.mjs"; import { icons } from "./src/Constants.mjs"; import { WorkSpace } from "./src/workspace/WorkSpace.mjs"; -import { eval_log } from "./api.mjs"; +import api from "./src/api/index.mjs"; 
import { CopyButton } from "./src/components/CopyButton.mjs"; +const logFileName = (path) => { + return path.replace("\\", "/").split('/').pop(); +}; + export function App() { const [selected, setSelected] = useState(0); const [logs, setLogs] = useState({ log_dir: "", files: [] }); @@ -28,22 +31,65 @@ export function App() { useEffect(async () => { // Read header information for the logs // and then update - const headerResults = await Promise.all(logs.files.map((file) => { - return eval_log(file.name, true).then((result) => { - return { file: file.name, result }; - }).catch(() => { return undefined}); - })); - - // Update the headers - const updatedHeaders = logHeaders; - for (const headerResult of headerResults) { - if (headerResult) { - updatedHeaders[headerResult.file] = headerResult.result; - } + + // Group into chunks + const chunkSize = 5; + const fileLists = []; + for (let i = 0; i < logs.files.length; i += chunkSize) { + let chunk = logs.files.slice(i, i + chunkSize).map((log) => { return log.name; }); + fileLists.push(chunk); + } + + for (const fileList of fileLists) { + const headers = await api.eval_log_headers(fileList); + const updatedHeaders = logHeaders; + headers.forEach((header, index) => { + const logFile = fileList[index]; + updatedHeaders[logFile] = header; + }) + setLogHeaders({ ...updatedHeaders }); } - setLogHeaders({ ...updatedHeaders }); }, [logs]); + const updateLogs = useCallback(async (log) => { + // Set the list of logs + const logresult = await api.eval_logs(); + if (logresult) { + setLogs(logresult); + if (log) { + const name = logFileName(log); + const index = logresult.files.findIndex((val) => { + return val.name.endsWith(name); + }) + setSelected(index); + } + } else { + setLogs({ log_dir: "", files: [] }); + } + }, [setLogs, setSelected]); + + // listen for updateState messages from vscode + useEffect(() => { + + const onMessage = (e) => { + switch (e.data.type || e.data.message) { + + case "updateState": { + if (e.data.url) { + 
updateLogs(e.data.url); + } + } + } + } + + window.addEventListener("message", onMessage); + + return () => { + window.removeEventListener("message", onMessage); + } + + }, [updateLogs]); + useEffect(async () => { const urlParams = new URLSearchParams(window.location.search); @@ -54,24 +100,19 @@ export function App() { const logPath = urlParams.get("task_file"); const loadLogs = logPath ? async () => { - setLogs({ - log_dir: "", - files: [{ name: logPath }], - }); - } - : async () => { - // Set the list of logs - const logresult = await eval_logs(); - setLogs(logresult); - - }; + setLogs({ + log_dir: "", + files: [{ name: logPath }], + }); + } + : updateLogs; // initial fetch of logs await loadLogs(); // poll every 1s for events setInterval(() => { - client_events().then((events) => { + api.client_events().then((events) => { if (events.includes("reload")) { window.location.reload(true); } @@ -96,15 +137,15 @@ export function App() { offcanvas=${offcanvas} selected=${selected} onSelected=${(index) => { - setSelected(index); + setSelected(index); - // hide the sidebar offcanvas - var myOffcanvas = document.getElementById("sidebarOffCanvas"); - var bsOffcanvas = bootstrap.Offcanvas.getInstance(myOffcanvas); - if (bsOffcanvas) { - bsOffcanvas.hide(); - } - }} + // hide the sidebar offcanvas + var myOffcanvas = document.getElementById("sidebarOffCanvas"); + var bsOffcanvas = bootstrap.Offcanvas.getInstance(myOffcanvas); + if (bsOffcanvas) { + bsOffcanvas.hide(); + } + }} /> `; return html` @@ -127,9 +168,7 @@ const Header = (props) => { const logFiles = props.logs.files || []; const logSelected = props.selected || 0; const logUri = logFiles.length > logSelected ? logFiles[logSelected].name : ""; - const logName =logUri.split('/').pop(); - - + const logName = logFileName(logUri); return html`