diff --git a/docs/src/components/BuiltinTools.mdx b/docs/src/components/BuiltinTools.mdx index 941a1efc6a..7caed9168d 100644 --- a/docs/src/components/BuiltinTools.mdx +++ b/docs/src/components/BuiltinTools.mdx @@ -48,7 +48,7 @@ import { LinkCard } from '@astrojs/starlight/components'; - + diff --git a/docs/src/content/docs/reference/scripts/system.mdx b/docs/src/content/docs/reference/scripts/system.mdx index 58d3dfd4ae..cad4f2d84b 100644 --- a/docs/src/content/docs/reference/scripts/system.mdx +++ b/docs/src/content/docs/reference/scripts/system.mdx @@ -3566,7 +3566,7 @@ defTool( Video manipulation tools - tool `video_probe`: Probe a video file and returns the metadata information -- tool `video_extract_audio`: Extract audio from a video file into a .wav file. Returns the audio filename. +- tool `video_extract_audio`: Extract audio from a video file into an audio file. Returns the audio filename. - tool `video_extract_frames`: Extract frames from a video file `````js wrap title="system.video" @@ -3582,14 +3582,16 @@ defTool( properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, }, required: ["filename"], }, async (args) => { const { context, filename } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`probing ${filename}`) const info = await ffmpeg.probe(filename) return YAML.stringify(info) @@ -3598,20 +3600,22 @@ defTool( defTool( "video_extract_audio", - "Extract audio from a video file into a .wav file. Returns the audio filename.", + "Extract audio from a video file into an audio file. Returns the audio filename.", { type: "object", properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, }, required: ["filename"], }, async (args) => { const { context, filename } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`extracting audio from ${filename}`) const audioFile = await ffmpeg.extractAudio(filename) return audioFile @@ -3626,7 +3630,7 @@ defTool( properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, keyframes: { type: "boolean", @@ -3655,7 +3659,9 @@ defTool( }, async (args) => { const { context, filename, transcription, ...options } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`extracting frames from ${filename}`) if (transcription) { diff --git a/docs/src/content/docs/reference/scripts/videos.mdx b/docs/src/content/docs/reference/scripts/videos.mdx index 8e3254f8b2..4bd07c6226 100644 --- a/docs/src/content/docs/reference/scripts/videos.mdx +++ b/docs/src/content/docs/reference/scripts/videos.mdx @@ -28,13 +28,13 @@ or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point t As mentionned above, multi-modal LLMs typically support images as a sequence of frames (or screenshots). -The `ffmpeg.extractFrames` will render frames from a video file or url +The `ffmpeg.extractFrames` will render frames from a video file and return them as an array of file paths. You can use the result with `defImages` directly. - by default, extract keyframes (intra-frames) ```js -const frames = await ffmpeg.extractFrames("path_url_to_video") +const frames = await ffmpeg.extractFrames("path_to_video") def("FRAMES", frames) ``` @@ -66,11 +66,11 @@ const transcript = await transcribe("...", { sceneThreshold: 0.3 }) ## Extracting audio -The `ffmpeg.extractAudio` will extract the audio from a video file or url +The `ffmpeg.extractAudio` will extract the audio from a video file as a `.wav` file. ```js -const audio = await ffmpeg.extractAudio("path_url_to_video") +const audio = await ffmpeg.extractAudio("path_to_video") ``` The conversion to audio happens automatically @@ -78,10 +78,10 @@ for videos when using [transcribe](/genaiscript/reference/scripts/transcription) ## Probing videos -You can extract metadata from a video file or url using `ffmpeg.probe`. +You can extract metadata from a video file using `ffmpeg.probe`. ```js -const info = await ffmpeg.probe("path_url_to_video") +const info = await ffmpeg.probe("path_to_video") const { duration } = info.streams[0] console.log(`video duration: ${duration} seconds`) ``` @@ -92,7 +92,7 @@ You can further customize the `ffmpeg` configuration by passing `outputOptions`. ```js 'outputOptions: "-b:a 16k",' -const audio = await ffmpeg.extractAudio("path_url_to_video", { +const audio = await ffmpeg.extractAudio("path_to_video", { outputOptions: "-b:a 16k", }) ``` diff --git a/packages/core/src/fetch.ts b/packages/core/src/fetch.ts index e90f6d1439..4ba2f5d88a 100644 --- a/packages/core/src/fetch.ts +++ b/packages/core/src/fetch.ts @@ -9,12 +9,15 @@ import { import { errorMessage } from "./error" import { logVerbose, toStringList } from "./util" import { CancellationOptions, CancellationToken } from "./cancellation" -import { readText } from "./fs" import { resolveHttpProxyAgent } from "./proxy" import { host } from "./host" import { renderWithPrecision } from "./precision" import crossFetch from "cross-fetch" import prettyBytes from "pretty-bytes" +import { fileTypeFromBuffer } from "file-type" +import { isBinaryMimeType } from "./binary" +import { toBase64 } from "./base64" +import { deleteUndefinedValues } from "./cleaners" export type FetchType = ( input: string | URL | globalThis.Request, @@ -137,7 +140,7 @@ export async function fetchText( const url = urlOrFile.filename let ok = false let status = 404 - let text: string + let bytes: Uint8Array if (/^https?:\/\//i.test(url)) { const f = await createFetch({ retries, @@ -149,25 +152,39 @@ export async function fetchText( const resp = await f(url, rest) ok = resp.ok status = resp.status - if (ok) text = await resp.text() + if (ok) bytes = new Uint8Array(await resp.arrayBuffer()) } else { try { - text = await readText("workspace://" + url) - ok = true + bytes = await host.readFile(url) } catch (e) { logVerbose(e) ok = false status = 404 } } - const file: WorkspaceFile = { - filename: urlOrFile.filename, - content: text, + + let content: string + let encoding: "base64" + let type: string + const mime = await fileTypeFromBuffer(bytes) + if (isBinaryMimeType(mime?.mime)) { + encoding = "base64" + content = toBase64(bytes) + } else { + content = host.createUTF8Decoder().decode(bytes) } + ok = true + const file: WorkspaceFile = deleteUndefinedValues({ + filename: urlOrFile.filename, + encoding, + type, + content, + }) return { ok, status, - text, + text: content, + bytes, file, } } diff --git a/packages/core/src/filesystem.ts b/packages/core/src/filesystem.ts index 3e7aa0223e..b4c5f22139 100644 --- a/packages/core/src/filesystem.ts +++ b/packages/core/src/filesystem.ts @@ -1,3 +1,4 @@ +import { stat } from "fs/promises" import { JSONLineCache } from "./cache" import { DOT_ENV_REGEX } from "./constants" import { CSVTryParse } from "./csv" @@ -111,6 +112,17 @@ export function createFileSystem(): Omit { const res = JSONLineCache.byName(name) return res }, + stat: async (filename: string) => { + try { + const res = await stat(filename) + return { + size: res.size, + mode: res.mode, + } + } catch { + return undefined + } + }, } satisfies Omit ;(fs as any).readFile = readText return Object.freeze(fs) diff --git a/packages/core/src/genaisrc/system.video.genai.js b/packages/core/src/genaisrc/system.video.genai.js index 6743b68562..1a7b02472d 100644 --- a/packages/core/src/genaisrc/system.video.genai.js +++ b/packages/core/src/genaisrc/system.video.genai.js @@ -10,14 +10,16 @@ defTool( properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, }, required: ["filename"], }, async (args) => { const { context, filename } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`probing ${filename}`) const info = await ffmpeg.probe(filename) return YAML.stringify(info) @@ -26,20 +28,22 @@ defTool( defTool( "video_extract_audio", - "Extract audio from a video file into a .wav file. Returns the audio filename.", + "Extract audio from a video file into an audio file. Returns the audio filename.", { type: "object", properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, }, required: ["filename"], }, async (args) => { const { context, filename } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`extracting audio from ${filename}`) const audioFile = await ffmpeg.extractAudio(filename) return audioFile @@ -54,7 +58,7 @@ defTool( properties: { filename: { type: "string", - description: "The video filename or URL to probe", + description: "The video filename to probe", }, keyframes: { type: "boolean", @@ -83,7 +87,9 @@ defTool( }, async (args) => { const { context, filename, transcription, ...options } = args - if (!filename) return "No filename or url provided" + if (!filename) return "No filename provided" + if (!(await workspace.stat(filename))) + return `File ${filename} does not exist.` context.log(`extracting frames from ${filename}`) if (transcription) { diff --git a/packages/core/src/promptcontext.ts b/packages/core/src/promptcontext.ts index b8fcbff173..24d187f776 100644 --- a/packages/core/src/promptcontext.ts +++ b/packages/core/src/promptcontext.ts @@ -74,6 +74,7 @@ export async function createPromptContext( }) return res }, + stat: (filename) => runtimeHost.workspace.stat(filename), grep: async ( query, grepOptions: string | WorkspaceGrepOptions, diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index beb5eb12d3..f4ffd5c3de 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -885,6 +885,14 @@ interface FindFilesOptions { readText?: boolean } +interface FileStats { + /** + * Size of the file in bytes + */ + size: number + mode: number +} + interface WorkspaceFileSystem { /** * Searches for files using the glob pattern and returns a list of files. @@ -911,6 +919,12 @@ interface WorkspaceFileSystem { options?: Omit ): Promise + /** + * Reads metadata information about the file. Returns undefined if the file does not exist. + * @param filename + */ + stat(filename: string): Promise + /** * Reads the content of a file as text * @param path