Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into feature/exif_to_prompt
Browse files Browse the repository at this point in the history
  • Loading branch information
faberf committed Oct 14, 2024
2 parents 42bd7bd + 16d97ee commit 1c25dfa
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 16 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ allprojects {
group = 'org.vitrivr'

/* Current version of our artifacts. */
version = '0.0.1'
version = '0.0.2-SNAPSHOT'

/* Repositories for build script. */
buildscript {
Expand Down
110 changes: 110 additions & 0 deletions example-configs/ingestion/example/image-captions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"schema": "caption",
"context": {
"contentFactory": "CachedContentFactory",
"resolverName": "disk",
"local": {
"content": {
"path": "../cache"
},
"enumerator": {
"path": "E:\\Joint-Image-EEG-Embedding\\Data\\Images",
"depth": "5",
"regex": ".*n014[0-9]{5}.*"
},
"imageFilePathContent": {
"field": "file"
},

"captionContent": {
"field": "captionSparse"
},
"clip": {
"contentSources": "imageDecoder"
},
"captionSparse": {
"contentSources": "imageDecoder"
},
"captionDense": {
"contentSources": "captionContent"
},
"documentType": {
"contentSources": "imageDecoder"
},
"imagePrompt": {
"template": "Create a short caption to the content of this image (file path: ${imageFilePathContent}) for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources.",
"defaultValue": "no content provided"
},

"imageSourceFilter": {
"type": "SOURCE:IMAGE"
}
}
},
"operators": {
"enumerator": {
"type": "ENUMERATOR",
"factory": "FileSystemEnumerator",
"mediaTypes": ["IMAGE", "VIDEO"]
},
"imageDecoder": {
"type": "DECODER",
"factory": "ImageDecoder"
},
"fileMetadata":{
"type": "EXTRACTOR",
"fieldName": "file"
},
"imageFilePathContent": {
"type": "TRANSFORMER",
"factory":"DescriptorAsContentTransformer"
},

"clip": {
"type": "EXTRACTOR",
"fieldName": "clip"
},

"imagePrompt": {
"type": "TRANSFORMER",
"factory": "TemplateTextTransformer"
},
"captionSparse": {
"type": "EXTRACTOR",
"fieldName": "captionSparse"
},
"captionContent": {
"type": "TRANSFORMER",
"factory": "DescriptorAsContentTransformer"
},
"captionDense": {
"type": "EXTRACTOR",
"fieldName": "captionDense"
},

"imageSourceFilter": {
"type": "TRANSFORMER",
"factory": "TypeFilterTransformer"
}
},
"operations": {
"enumerator-stage": {"operator": "enumerator"},
"image-decoder-stage": {"operator": "imageDecoder", "inputs": ["enumerator-stage"]},
"image-file-metadata-stage": {"operator": "fileMetadata", "inputs": ["image-decoder-stage"]},
"image-file-path-content-stage": {"operator": "imageFilePathContent", "inputs": ["image-file-metadata-stage"]},

"image-clip-stage": {"operator": "clip", "inputs": ["image-file-path-content-stage"]},

"image-prompt-stage": {"operator": "imagePrompt", "inputs": ["image-file-path-content-stage"]},
"image-caption-sparse-stage": {"operator": "captionSparse", "inputs": ["image-file-path-content-stage"]},
"image-caption-content-stage": {"operator": "captionContent", "inputs": ["image-caption-sparse-stage"]},
"image-caption-stage": {"operator": "captionDense", "inputs": ["image-caption-content-stage"]},


"image-final-stage": {"operator": "imageSourceFilter", "inputs": ["image-caption-stage","image-clip-stage"], "merge": "COMBINE"}
},
"output": [
"image-final-stage"
],
"mergeType": "MERGE"
}
73 changes: 73 additions & 0 deletions example-configs/schema/image-captions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"schemas": {
"caption": {
"connection": {
"database": "PgVectorConnectionProvider",
"parameters": {
"Host": "localhost",
"port": "5432",
"username": "postgres",
"password": "vitrivr"
}
},
"fields": {
"file": {
"factory": "FileSourceMetadata"
},
"clip": {
"factory": "DenseEmbedding",
"parameters": {
"host": "http://10.34.64.84:8888/",
"model": "open-clip-vit-b32",
"length": "512",
"timeoutSeconds": "100",
"retries": "1000"
}
},
"captionSparse": {
"factory": "ImageCaption",
"parameters": {
"host": "http://10.34.64.84:8888/",
"timeoutSeconds": "100",
"retries": "1000",
"model": "gpt4o",
"prompt": "Create a short caption to the content of this image for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources."
}
},
"captionDense": {
"factory": "DenseEmbedding",
"parameters": {
"host": "http://10.34.64.84:8888/",
"model": "e5mistral7b-instruct",
"length": "4096",
"timeoutSeconds": "100",
"retries": "1000"
}
}
},
"resolvers": {
"disk": {
"factory": "DiskResolver",
"parameters": {
"location": "../thumbnails"
}
}
},
"exporters": {
"thumbnail": {
"factory": "ThumbnailExporter",
"resolverName": "disk",
"parameters": {
"maxSideResolution": "400",
"mimeType": "JPG"
}
}
},
"extractionPipelines": {
"cap": {
"path": "example-configs/ingestion/example/image-captions.json"
}
}
}
}
}
4 changes: 4 additions & 0 deletions vitrivr-engine-index/gradle.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## Module-specific settings

# nopointergc https://github.com/bytedeco/javacv/issues/2266
org.bytedeco.javacpp.nopointergc=true
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import kotlinx.coroutines.channels.ProducerScope
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.buffer
import kotlinx.coroutines.flow.channelFlow
import org.bytedeco.javacpp.PointerScope
import org.bytedeco.javacv.FFmpegFrameGrabber
import org.bytedeco.javacv.Frame
import org.bytedeco.javacv.FrameGrabber
import org.bytedeco.javacv.Java2DFrameConverter

import org.vitrivr.engine.core.context.IndexContext
import org.vitrivr.engine.core.model.content.Content
import org.vitrivr.engine.core.model.content.element.AudioContent
Expand Down Expand Up @@ -61,7 +63,7 @@ class VideoDecoder : DecoderFactory {
private val audio: Boolean = true,
private val keyFrames: Boolean = false,
private val timeWindowMs: Long = 500L,
private val name : String
private val name: String
) : Decoder {

/** [KLogger] instance. */
Expand Down Expand Up @@ -108,11 +110,17 @@ class VideoDecoder : DecoderFactory {
* @param grabber The [FFmpegFrameGrabber] used to decode the video.
* @param channel The [ProducerScope] used to emit [Retrievable] elements.
*/
private suspend fun decodeFromGrabber(source: Source, sourceRetrievable: Retrievable, grabber: FFmpegFrameGrabber, channel: ProducerScope<Retrievable>) {
private suspend fun decodeFromGrabber(
source: Source,
sourceRetrievable: Retrievable,
grabber: FFmpegFrameGrabber,
channel: ProducerScope<Retrievable>
) {
/* Determine end of time window. */
var windowEnd = TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs)
var error = false


/* Configure FFmpegFrameGrabber. */
grabber.imageMode = FrameGrabber.ImageMode.COLOR
grabber.sampleMode = FrameGrabber.SampleMode.SHORT
Expand All @@ -123,7 +131,8 @@ class VideoDecoder : DecoderFactory {

/* Extract and enrich source metadata. */
source.metadata[Metadata.METADATA_KEY_VIDEO_FPS] = grabber.videoFrameRate
source.metadata[Metadata.METADATA_KEY_AV_DURATION] = TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime)
source.metadata[Metadata.METADATA_KEY_AV_DURATION] =
TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime)
source.metadata[Metadata.METADATA_KEY_IMAGE_WIDTH] = grabber.imageWidth
source.metadata[Metadata.METADATA_KEY_IMAGE_HEIGHT] = grabber.imageHeight
source.metadata[Metadata.METADATA_KEY_AUDIO_CHANNELS] = grabber.audioChannels
Expand All @@ -139,10 +148,19 @@ class VideoDecoder : DecoderFactory {
var audioReady = !(grabber.hasAudio() && this@Instance.audio)

do {
val frame = grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) ?: break
val frame =
grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true)
?: break
when (frame.type) {
Frame.Type.VIDEO -> {
imageBuffer.add(Java2DFrameConverter().use { it.convert(frame) to frame.timestamp })
imageBuffer.add(
(try {
PointerScope().use { scope -> Java2DFrameConverter().convert(frame) to frame.timestamp }
} catch (e: Exception) {
logger.error(e) { "Error converting frame to BufferedImage" }
null
})!!
)
if (frame.timestamp > windowEnd) {
videoReady = true
}
Expand Down Expand Up @@ -204,7 +222,14 @@ class VideoDecoder : DecoderFactory {
* @param timestampEnd The end timestamp.
* @param source The source [Retrievable] the emitted [Retrievable] is part of.
*/
private suspend fun emit(imageBuffer: LinkedList<Pair<BufferedImage, Long>>, audioBuffer: LinkedList<Pair<ShortBuffer, Long>>, grabber: FrameGrabber, timestampEnd: Long, source: Retrievable, channel: ProducerScope<Retrievable>) {
private suspend fun emit(
imageBuffer: LinkedList<Pair<BufferedImage, Long>>,
audioBuffer: LinkedList<Pair<ShortBuffer, Long>>,
grabber: FrameGrabber,
timestampEnd: Long,
source: Retrievable,
channel: ProducerScope<Retrievable>
) {
/* Audio samples. */
var audioSize = 0
val emitImage = mutableListOf<BufferedImage>()
Expand Down Expand Up @@ -233,7 +258,13 @@ class VideoDecoder : DecoderFactory {
val ingested = Ingested(UUID.randomUUID(), "SEGMENT", false)
source.filteredAttribute(SourceAttribute::class.java)?.let { ingested.addAttribute(it) }
ingested.addRelationship(Relationship.ByRef(ingested, "partOf", source, false))
ingested.addAttribute(TimeRangeAttribute(timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), timestampEnd, TimeUnit.MICROSECONDS))
ingested.addAttribute(
TimeRangeAttribute(
timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs),
timestampEnd,
TimeUnit.MICROSECONDS
)
)

/* Prepare and append audio content element. */
if (emitAudio.size > 0) {
Expand All @@ -243,7 +274,11 @@ class VideoDecoder : DecoderFactory {
samples.put(frame)
}
samples.clear()
val audio = this.context.contentFactory.newAudioContent(grabber.audioChannels.toShort(), grabber.sampleRate, samples)
val audio = this.context.contentFactory.newAudioContent(
grabber.audioChannels.toShort(),
grabber.sampleRate,
samples
)
ingested.addContent(audio)
ingested.addAttribute(ContentAuthorAttribute(audio.id, name))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ class FileSystemEnumerator : EnumeratorFactory {
val skip = context[name, "skip"]?.toLongOrNull() ?: 0L
val limit = context[name, "limit"]?.toLongOrNull() ?: Long.MAX_VALUE
val type = context[name, "type"]
logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit}" }
return Instance(path, depth, mediaTypes, skip, limit, type)
val regex = context[name, "regex"]
logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit} and type: $type, regex: $regex" }
return Instance(path, depth, mediaTypes, skip, limit, type, regex)
}

/**
Expand All @@ -57,7 +58,12 @@ class FileSystemEnumerator : EnumeratorFactory {
* @param context The [IndexContext] to use.
* @param inputs Is ignored.
*/
override fun newEnumerator(name: String, context: IndexContext, mediaTypes: List<MediaType>, inputs: Stream<*>?): Enumerator {
/**
 * Creates a new [Enumerator] for this [FileSystemEnumerator].
 *
 * @param name The name of the [Enumerator].
 * @param context The [IndexContext] to use.
 * @param mediaTypes The [MediaType]s to enumerate.
 * @param inputs Is ignored by this implementation; enumeration is driven purely by the configured path.
 * @return A new [Enumerator] instance.
 */
override fun newEnumerator(
    name: String,
    context: IndexContext,
    mediaTypes: List<MediaType>,
    inputs: Stream<*>?
): Enumerator = this.newEnumerator(name, context, mediaTypes)

Expand All @@ -70,14 +76,24 @@ class FileSystemEnumerator : EnumeratorFactory {
private val mediaTypes: Collection<MediaType> = MediaType.allValid,
private val skip: Long = 0,
private val limit: Long = Long.MAX_VALUE,
private val typeName: String? = null
private val typeName: String? = null,
private val regex: String? = null
) : Enumerator {

override fun toFlow(scope: CoroutineScope): Flow<Retrievable> = flow {
logger.debug { "In flow: Start Enumerating with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: $limit" }

val stream = try {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { it.isRegularFile() }.skip(skip).limit(limit)
if (regex == null) {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter {
it.isRegularFile()
}.skip(skip).limit(limit)
} else {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter {
it.isRegularFile() && it.toString().matches(Regex(regex))
}.skip(skip).limit(limit)
}

} catch (ex: NoSuchFileException) {
val mes = "In flow: Path ${this@Instance.path} does not exist."
logger.error { mes }
Expand Down
Loading

0 comments on commit 1c25dfa

Please sign in to comment.