diff --git a/docs/src/content/docs/reference/scripts/videos.mdx b/docs/src/content/docs/reference/scripts/videos.mdx index 59e8e16eb2..0f484e2708 100644 --- a/docs/src/content/docs/reference/scripts/videos.mdx +++ b/docs/src/content/docs/reference/scripts/videos.mdx @@ -9,7 +9,7 @@ While most LLMs do not support videos natively, they can be integrated in script and adding them as images to the prompt. This can be tedious and GenAIScript provides efficient helpers to streamline this process. -## Configuration +## ffmpeg configuration The functionalities to render and analyze videos rely on [ffmpeg](https://ffmpeg.org/) and [ffprobe](https://ffmpeg.org/ffprobe.html). @@ -23,14 +23,6 @@ sudo apt-get update && sudo apt-get install ffmpeg Make sure these tools are installed locally and available in your PATH, or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point to the `ffmpeg`/`ffprobe` executable. -### ffmpeg output caching - -Since video processing can be slow, GenAIScript caches the results in subfolders under `.genaiscript/videos/...` -where the subfolder name is a hash from the video file content and the options used to render the video. -This way, you can re-run the script without having to re-render the video. - -You can review the `ffmpeg` console log in the `log.txt` file in the cache folder. - ## Extracting frames As mentionned above, multi-modal LLMs typically support images as a sequence @@ -85,3 +77,13 @@ const info = await ffmpeg.probe("path_url_to_video") const { duration } = info.streams[0] console.log(`video duration: ${duration} seconds`) ``` + +## Extra ffmpeg options + +You can further customize the `ffmpeg` configuration by passing `inputOptions` and/or `outputOptions`, which are forwarded to the `ffmpeg` command. 
+ +```js +const audio = await ffmpeg.extractAudio("path_url_to_video", { + outputOptions: "-b:a 16k", +}) +``` diff --git a/packages/core/src/ffmpeg.ts b/packages/core/src/ffmpeg.ts index a4f26f6136..91bca5f098 100644 --- a/packages/core/src/ffmpeg.ts +++ b/packages/core/src/ffmpeg.ts @@ -1,4 +1,4 @@ -import { dotGenaiscriptPath, logVerbose } from "./util" +import { arrayify, dotGenaiscriptPath, logVerbose } from "./util" import { TraceOptions } from "./trace" import { lookupMime } from "./mime" import pLimit from "p-limit" @@ -11,8 +11,6 @@ import { writeFile, readFile } from "fs/promises" import { errorMessage, serializeError } from "./error" import { fromBase64 } from "./base64" import { fileTypeFromBuffer } from "file-type" -import { log } from "node:console" -import { CORE_VERSION } from "./version" const ffmpegLimit = pLimit(1) @@ -24,9 +22,9 @@ async function ffmpegCommand(options?: { timeout?: number }) { async function computeHashFolder( filename: string | WorkspaceFile, - options: TraceOptions & { cache?: string } + options: TraceOptions & FFmpegCommandOptions ) { - const { trace, cache, ...rest } = options + const { trace, ...rest } = options const h = await hash( [typeof filename === "string" ? 
{ filename } : filename, rest], { @@ -54,19 +52,17 @@ async function resolveInput( } export class FFmepgClient implements Ffmpeg { - readonly options: any - constructor() { - this.options = {} - } + constructor() {} async run( input: string | WorkspaceFile, builder: ( cmd: FfmpegCommandBuilder, - options?: { input: string; dir: string } - ) => Promise<{ output?: string }> + options?: { input: string; dir: string } & FFmpegCommandOptions + ) => Promise<{ output?: string }>, + options?: FFmpegCommandOptions ): Promise { - const res = await runFfmpeg(input, builder, { ...this.options }) + const res = await runFfmpeg(input, builder, options || {}) return res.filenames } @@ -76,22 +72,25 @@ export class FFmepgClient implements Ffmpeg { ): Promise { if (!filename) throw new Error("filename is required") - const { transcript, builder, ...soptions } = options || {} + const { transcript, ...soptions } = options || {} if (transcript?.segments?.length) soptions.timestamps = transcript.segments.map((s) => s.start) if (!soptions.count && !soptions.timestamps) soptions.count = 5 - const res = await this.run(filename, async (cmd, fopts) => { - const { dir } = fopts - await builder?.(cmd) - const c = cmd as FfmpegCommand - c.screenshots({ - filename: "%b_%i.png", - ...soptions, - folder: dir, - }) - return undefined - }) + const res = await this.run( + filename, + async (cmd, fopts) => { + const { dir } = fopts + const c = cmd as FfmpegCommand + c.screenshots({ + ...soptions, + filename: "%b_%i.png", + folder: dir, + }) + return undefined + }, + { ...soptions, cache: "frames" } + ) logVerbose(`ffmpeg: extracted ${res.length} frames`) return res } @@ -102,17 +101,20 @@ export class FFmepgClient implements Ffmpeg { ): Promise { if (!filename) throw new Error("filename is required") - const { builder, forceConversion } = options + const { forceConversion, ...foptions } = options if (!forceConversion && typeof filename === "string") { const mime = lookupMime(filename) if 
(/^audio/.test(mime)) return filename } - const res = await this.run(filename, async (cmd, fopts) => { - const { input, dir } = fopts - await builder?.(cmd) - cmd.noVideo().toFormat("wav") - return { output: join(dir, basename(input) + ".wav") } - }) + const res = await this.run( + filename, + async (cmd, fopts) => { + const { input, dir } = fopts + cmd.noVideo().toFormat("wav") + return { output: join(dir, basename(input) + ".wav") } + }, + { ...foptions, cache: "audio" } + ) return res[0] } @@ -135,7 +137,7 @@ export class FFmepgClient implements Ffmpeg { const meta = await res return meta }, - this.options + { cache: "probe" } ) return res.data as VideoProbeResult } @@ -147,7 +149,7 @@ async function runFfmpeg( cmd: FfmpegCommand, options: { input: string; dir: string } ) => Awaitable<{ output?: string; data?: any }>, - options: {} + options: FFmpegCommandOptions ): Promise<{ filenames: string[]; data?: any }> { if (!filename) throw new Error("filename is required") return ffmpegLimit(async () => { @@ -196,6 +198,10 @@ async function runFfmpeg( const end = () => resolve(r) cmd.input(input) + if (options.inputOptions) + cmd.inputOptions(...arrayify(options.inputOptions)) + if (options.outputOptions) + cmd.outputOption(...arrayify(options.outputOptions)) cmd.addListener("filenames", (fns: string[]) => { r.filenames.push(...fns.map((f) => join(folder, f))) }) diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index cd27059218..55939e5ef4 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -2174,6 +2174,13 @@ interface FfmpegCommandBuilder { noVideo(): FfmpegCommandBuilder noAudio(): FfmpegCommandBuilder audioCodec(codec: string): FfmpegCommandBuilder + audioBitrate(bitrate: string | number): FfmpegCommandBuilder + audioChannels(channels: number): FfmpegCommandBuilder + audioFrequency(freq: number): FfmpegCommandBuilder + audioQuality(quality: number): 
FfmpegCommandBuilder + audioFilters( + filters: string | string[] | AudioVideoFilter[] + ): FfmpegCommandBuilder videoCodec(codec: string): FfmpegCommandBuilder toFormat(format: string): FfmpegCommandBuilder inputOptions(...options: string[]): FfmpegCommandBuilder @@ -2181,7 +2188,9 @@ interface FfmpegCommandBuilder { } interface FFmpegCommandOptions { - builder?: (cmd: FfmpegCommandBuilder) => Awaitable + inputOptions?: ElementOrArray + outputOptions?: ElementOrArray + cache?: string } interface VideoExtractAudioOptions extends FFmpegCommandOptions { @@ -2226,7 +2235,8 @@ interface Ffmpeg { builder: ( cmd: FfmpegCommandBuilder, options?: { input: string; dir: string } - ) => Promise<{ output?: string }> + ) => Promise<{ output?: string }>, + options?: FFmpegCommandOptions ): Promise } diff --git a/packages/sample/genaisrc/video.genai.mjs b/packages/sample/genaisrc/video.genai.mjs index 23dcfff382..baaf756f9b 100644 --- a/packages/sample/genaisrc/video.genai.mjs +++ b/packages/sample/genaisrc/video.genai.mjs @@ -9,6 +9,11 @@ const more = await ffmpeg.extractFrames( "https://github.com/microsoft/jacdac-docs/raw/refs/heads/main/static/videos/addbutton.webm" ) +const audio = await ffmpeg.extractAudio("src/audio/helloworld.mp4", { + outputOptions: "-ar 16000", +}) +console.log({ audio }) + defImages(frames) defImages(more) $`Describe the images.`