Skip to content

Commit

Permalink
feat: ✨ enhance ffmpeg options and add audio extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Jan 15, 2025
1 parent cf1d9c0 commit b5db18a
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 44 deletions.
20 changes: 11 additions & 9 deletions docs/src/content/docs/reference/scripts/videos.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ While most LLMs do not support videos natively, they can be integrated in script
and adding them as images to the prompt. This can be tedious and GenAIScript provides efficient helpers
to streamline this process.

## Configuration
## ffmpeg configuration

The functionalities to render and analyze videos rely on [ffmpeg](https://ffmpeg.org/)
and [ffprobe](https://ffmpeg.org/ffprobe.html).
Expand All @@ -23,14 +23,6 @@ sudo apt-get update && sudo apt-get install ffmpeg
Make sure these tools are installed locally and available in your PATH,
or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point to the `ffmpeg`/`ffprobe` executable.

### ffmpeg output caching

Since video processing can be slow, GenAIScript caches the results in subfolders under `.genaiscript/videos/...`
where the subfolder name is a hash from the video file content and the options used to render the video.
This way, you can re-run the script without having to re-render the video.

You can review the `ffmpeg` console log in the `log.txt` file in the cache folder.

## Extracting frames

As mentioned above, multi-modal LLMs typically support images as a sequence
Expand Down Expand Up @@ -85,3 +77,13 @@ const info = await ffmpeg.probe("path_url_to_video")
const { duration } = info.streams[0]
console.log(`video duration: ${duration} seconds`)
```

## Extra ffmpeg options

You can further customize the `ffmpeg` configuration by passing a `builder` function that manipulates the `ffmpeg` command object.

```js
const audio = await ffmpeg.extractAudio("path_url_to_video", {
builder: (cmd) => cmd.audioBitrate("16k"),
})
```
72 changes: 39 additions & 33 deletions packages/core/src/ffmpeg.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { dotGenaiscriptPath, logVerbose } from "./util"
import { arrayify, dotGenaiscriptPath, logVerbose } from "./util"
import { TraceOptions } from "./trace"
import { lookupMime } from "./mime"
import pLimit from "p-limit"
Expand All @@ -11,8 +11,6 @@ import { writeFile, readFile } from "fs/promises"
import { errorMessage, serializeError } from "./error"
import { fromBase64 } from "./base64"
import { fileTypeFromBuffer } from "file-type"
import { log } from "node:console"
import { CORE_VERSION } from "./version"

const ffmpegLimit = pLimit(1)

Expand All @@ -24,9 +22,9 @@ async function ffmpegCommand(options?: { timeout?: number }) {

async function computeHashFolder(
filename: string | WorkspaceFile,
options: TraceOptions & { cache?: string }
options: TraceOptions & FFmpegCommandOptions
) {
const { trace, cache, ...rest } = options
const { trace, ...rest } = options
const h = await hash(
[typeof filename === "string" ? { filename } : filename, rest],
{
Expand Down Expand Up @@ -54,19 +52,17 @@ async function resolveInput(
}

export class FFmepgClient implements Ffmpeg {
readonly options: any
constructor() {
this.options = {}
}
constructor() {}

async run(
input: string | WorkspaceFile,
builder: (
cmd: FfmpegCommandBuilder,
options?: { input: string; dir: string }
) => Promise<{ output?: string }>
options?: { input: string; dir: string } & FFmpegCommandOptions
) => Promise<{ output?: string }>,
options?: FFmpegCommandOptions
): Promise<string[]> {
const res = await runFfmpeg(input, builder, { ...this.options })
const res = await runFfmpeg(input, builder, options || {})
return res.filenames
}

Expand All @@ -76,22 +72,25 @@ export class FFmepgClient implements Ffmpeg {
): Promise<string[]> {
if (!filename) throw new Error("filename is required")

const { transcript, builder, ...soptions } = options || {}
const { transcript, ...soptions } = options || {}
if (transcript?.segments?.length)
soptions.timestamps = transcript.segments.map((s) => s.start)
if (!soptions.count && !soptions.timestamps) soptions.count = 5

const res = await this.run(filename, async (cmd, fopts) => {
const { dir } = fopts
await builder?.(cmd)
const c = cmd as FfmpegCommand
c.screenshots({
filename: "%b_%i.png",
...soptions,
folder: dir,
})
return undefined
})
const res = await this.run(
filename,
async (cmd, fopts) => {
const { dir } = fopts
const c = cmd as FfmpegCommand
c.screenshots({
...soptions,
filename: "%b_%i.png",
folder: dir,
})
return undefined
},
{ ...soptions, cache: "frames" }
)
logVerbose(`ffmpeg: extracted ${res.length} frames`)
return res
}
Expand All @@ -102,17 +101,20 @@ export class FFmepgClient implements Ffmpeg {
): Promise<string> {
if (!filename) throw new Error("filename is required")

const { builder, forceConversion } = options
const { forceConversion, ...foptions } = options
if (!forceConversion && typeof filename === "string") {
const mime = lookupMime(filename)
if (/^audio/.test(mime)) return filename
}
const res = await this.run(filename, async (cmd, fopts) => {
const { input, dir } = fopts
await builder?.(cmd)
cmd.noVideo().toFormat("wav")
return { output: join(dir, basename(input) + ".wav") }
})
const res = await this.run(
filename,
async (cmd, fopts) => {
const { input, dir } = fopts
cmd.noVideo().toFormat("wav")
return { output: join(dir, basename(input) + ".wav") }
},
{ ...foptions, cache: "audio" }
)
return res[0]
}

Expand All @@ -135,7 +137,7 @@ export class FFmepgClient implements Ffmpeg {
const meta = await res
return meta
},
this.options
{ cache: "probe" }
)
return res.data as VideoProbeResult
}
Expand All @@ -147,7 +149,7 @@ async function runFfmpeg(
cmd: FfmpegCommand,
options: { input: string; dir: string }
) => Awaitable<{ output?: string; data?: any }>,
options: {}
options: FFmpegCommandOptions
): Promise<{ filenames: string[]; data?: any }> {
if (!filename) throw new Error("filename is required")
return ffmpegLimit(async () => {
Expand Down Expand Up @@ -196,6 +198,10 @@ async function runFfmpeg(
const end = () => resolve(r)

cmd.input(input)
if (options.inputOptions)
cmd.inputOptions(...arrayify(options.inputOptions))
if (options.outputOptions)
cmd.outputOption(...arrayify(options.outputOptions))
cmd.addListener("filenames", (fns: string[]) => {
r.filenames.push(...fns.map((f) => join(folder, f)))
})
Expand Down
14 changes: 12 additions & 2 deletions packages/core/src/types/prompt_template.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2174,14 +2174,23 @@ interface FfmpegCommandBuilder {
noVideo(): FfmpegCommandBuilder
noAudio(): FfmpegCommandBuilder
audioCodec(codec: string): FfmpegCommandBuilder
audioBitrate(bitrate: string | number): FfmpegCommandBuilder
audioChannels(channels: number): FfmpegCommandBuilder
audioFrequency(freq: number): FfmpegCommandBuilder
audioQuality(quality: number): FfmpegCommandBuilder
audioFilters(
filters: string | string[] | AudioVideoFilter[]
): FfmpegCommandBuilder
videoCodec(codec: string): FfmpegCommandBuilder
toFormat(format: string): FfmpegCommandBuilder
inputOptions(...options: string[]): FfmpegCommandBuilder
outputOptions(...options: string[]): FfmpegCommandBuilder
}

/**
 * Options shared by the ffmpeg helpers (frame extraction, audio extraction,
 * and the generic `run` entry point).
 */
interface FFmpegCommandOptions {
/**
 * Optional callback that receives the ffmpeg command builder so the caller
 * can customize the command. May be synchronous or return a promise.
 */
builder?: (cmd: FfmpegCommandBuilder) => Awaitable<void>
/** A single ffmpeg input option string, or an array of them. */
inputOptions?: ElementOrArray<string>
/** A single ffmpeg output option string, or an array of them. */
outputOptions?: ElementOrArray<string>
/**
 * Cache label. The built-in helpers pass "frames", "audio" and "probe".
 * NOTE(review): presumably selects or namespaces the cached output folder;
 * confirm against the hashing logic.
 */
cache?: string
}

interface VideoExtractAudioOptions extends FFmpegCommandOptions {
Expand Down Expand Up @@ -2226,7 +2235,8 @@ interface Ffmpeg {
builder: (
cmd: FfmpegCommandBuilder,
options?: { input: string; dir: string }
) => Promise<{ output?: string }>
) => Promise<{ output?: string }>,
options?: FFmpegCommandOptions
): Promise<string[]>
}

Expand Down
5 changes: 5 additions & 0 deletions packages/sample/genaisrc/video.genai.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ const more = await ffmpeg.extractFrames(
"https://github.com/microsoft/jacdac-docs/raw/refs/heads/main/static/videos/addbutton.webm"
)

const audio = await ffmpeg.extractAudio("src/audio/helloworld.mp4", {
outputOptions: "-ar 16000",
})
console.log({ audio })

defImages(frames)
defImages(more)
$`Describe the images.`

0 comments on commit b5db18a

Please sign in to comment.