From 19aa546e42a836ede830cd5eabe91be28cedafbc Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 26 Sep 2024 12:07:56 +0200 Subject: [PATCH 1/4] adds an optional regex pattern to 'FileSystemEnumerator' Will only enumerate files which match the pattern. If the option is not set, nothing changes. --- .../ingestion/example/image-captions.json | 110 ++++++++++++++++++ example-configs/schema/image-captions.json | 73 ++++++++++++ .../index/enumerate/FileSystemEnumerator.kt | 26 ++++- 3 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 example-configs/ingestion/example/image-captions.json create mode 100644 example-configs/schema/image-captions.json diff --git a/example-configs/ingestion/example/image-captions.json b/example-configs/ingestion/example/image-captions.json new file mode 100644 index 000000000..84f359c73 --- /dev/null +++ b/example-configs/ingestion/example/image-captions.json @@ -0,0 +1,110 @@ +{ + "schema": "caption", + "context": { + "contentFactory": "CachedContentFactory", + "resolverName": "disk", + "local": { + "content": { + "path": "../cache" + }, + "enumerator": { + "path": "E:\\Joint-Image-EEG-Embedding\\Data\\Images", + "depth": "5", + "regex": ".*n014[0-9]{5}.*" + }, + "imageFilePathContent": { + "field": "file" + }, + + "captionContent": { + "field": "captionSparse" + }, + "clip": { + "contentSources": "imageDecoder" + }, + "captionSparse": { + "contentSources": "imageDecoder" + }, + "captionDense": { + "contentSources": "captionContent" + }, + "documentType": { + "contentSources": "imageDecoder" + }, + "imagePrompt": { + "template": "Create a short caption to the content of this image (file path: ${imageFilePathContent}) for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. 
Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources.", + "defaultValue": "no content provided" + }, + + "imageSourceFilter": { + "type": "SOURCE:IMAGE" + } + } + }, + "operators": { + "enumerator": { + "type": "ENUMERATOR", + "factory": "FileSystemEnumerator", + "mediaTypes": ["IMAGE", "VIDEO"] + }, + "imageDecoder": { + "type": "DECODER", + "factory": "ImageDecoder" + }, + "fileMetadata":{ + "type": "EXTRACTOR", + "fieldName": "file" + }, + "imageFilePathContent": { + "type": "TRANSFORMER", + "factory":"DescriptorAsContentTransformer" + }, + + "clip": { + "type": "EXTRACTOR", + "fieldName": "clip" + }, + + "imagePrompt": { + "type": "TRANSFORMER", + "factory": "TemplateTextTransformer" + }, + "captionSparse": { + "type": "EXTRACTOR", + "fieldName": "captionSparse" + }, + "captionContent": { + "type": "TRANSFORMER", + "factory": "DescriptorAsContentTransformer" + }, + "captionDense": { + "type": "EXTRACTOR", + "fieldName": "captionDense" + }, + + "imageSourceFilter": { + "type": "TRANSFORMER", + "factory": "TypeFilterTransformer" + } + }, + "operations": { + "enumerator-stage": {"operator": "enumerator"}, + "image-decoder-stage": {"operator": "imageDecoder", "inputs": ["enumerator-stage"]}, + "image-file-metadata-stage": {"operator": "fileMetadata", "inputs": ["image-decoder-stage"]}, + "image-file-path-content-stage": {"operator": "imageFilePathContent", "inputs": ["image-file-metadata-stage"]}, + + "image-clip-stage": {"operator": "clip", "inputs": ["image-file-path-content-stage"]}, + + "image-prompt-stage": {"operator": "imagePrompt", "inputs": ["image-file-path-content-stage"]}, + "image-caption-sparse-stage": {"operator": "captionSparse", "inputs": ["image-file-path-content-stage"]}, + "image-caption-content-stage": {"operator": 
"captionContent", "inputs": ["image-caption-sparse-stage"]}, + "image-caption-stage": {"operator": "captionDense", "inputs": ["image-caption-content-stage"]}, + + + "image-final-stage": {"operator": "imageSourceFilter", "inputs": ["image-caption-stage","image-clip-stage"], "merge": "COMBINE"} + }, + "output": [ + "image-final-stage" + ], + "mergeType": "MERGE" +} diff --git a/example-configs/schema/image-captions.json b/example-configs/schema/image-captions.json new file mode 100644 index 000000000..8dc6220c1 --- /dev/null +++ b/example-configs/schema/image-captions.json @@ -0,0 +1,73 @@ +{ + "schemas": { + "caption": { + "connection": { + "database": "PgVectorConnectionProvider", + "parameters": { + "Host": "localhost", + "port": "5432", + "username": "postgres", + "password": "vitrivr" + } + }, + "fields": { + "file": { + "factory": "FileSourceMetadata" + }, + "clip": { + "factory": "DenseEmbedding", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "open-clip-vit-b32", + "length": "512", + "timeoutSeconds": "100", + "retries": "1000" + } + }, + "captionSparse": { + "factory": "ImageCaption", + "parameters": { + "host": "http://10.34.64.84:8888/", + "timeoutSeconds": "100", + "retries": "1000", + "model": "gpt4o", + "prompt": "Create a short caption to the content of this image for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources." 
+ } + }, + "captionDense": { + "factory": "DenseEmbedding", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "e5mistral7b-instruct", + "length": "4096", + "timeoutSeconds": "100", + "retries": "1000" + } + } + }, + "resolvers": { + "disk": { + "factory": "DiskResolver", + "parameters": { + "location": "../thumbnails" + } + } + }, + "exporters": { + "thumbnail": { + "factory": "ThumbnailExporter", + "resolverName": "disk", + "parameters": { + "maxSideResolution": "400", + "mimeType": "JPG" + } + } + }, + "extractionPipelines": { + "cap": { + "path": "example-configs/ingestion/example/image-captions.json" + } + } + } + } +} diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt index 346f59f38..609624d80 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt @@ -47,8 +47,9 @@ class FileSystemEnumerator : EnumeratorFactory { val skip = context[name, "skip"]?.toLongOrNull() ?: 0L val limit = context[name, "limit"]?.toLongOrNull() ?: Long.MAX_VALUE val type = context[name, "type"] - logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit}" } - return Instance(path, depth, mediaTypes, skip, limit, type) + val regex = context[name, "regex"] + logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit} and type: $type, regex: $regex" } + return Instance(path, depth, mediaTypes, skip, limit, type, regex) } /** @@ -57,7 +58,12 @@ class FileSystemEnumerator : EnumeratorFactory { * @param context The [IndexContext] to use. 
* @param inputs Is ignored. */ - override fun newEnumerator(name: String, context: IndexContext, mediaTypes: List, inputs: Stream<*>?): Enumerator { + override fun newEnumerator( + name: String, + context: IndexContext, + mediaTypes: List, + inputs: Stream<*>? + ): Enumerator { return newEnumerator(name, context, mediaTypes) } @@ -70,14 +76,24 @@ class FileSystemEnumerator : EnumeratorFactory { private val mediaTypes: Collection = MediaType.allValid, private val skip: Long = 0, private val limit: Long = Long.MAX_VALUE, - private val typeName: String? = null + private val typeName: String? = null, + private val regex: String? = null ) : Enumerator { override fun toFlow(scope: CoroutineScope): Flow = flow { logger.debug { "In flow: Start Enumerating with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: $limit" } val stream = try { - Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { it.isRegularFile() }.skip(skip).limit(limit) + if (regex == null) { + Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { + it.isRegularFile() + }.skip(skip).limit(limit) + } else { + Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { + it.isRegularFile() && it.toString().matches(Regex(regex)) + }.skip(skip).limit(limit) + } + } catch (ex: NoSuchFileException) { val mes = "In flow: Path ${this@Instance.path} does not exist." 
logger.error { mes } From a648841f2c1dda9023df8be92099040aa985e96d Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 30 Sep 2024 12:02:48 +0200 Subject: [PATCH 2/4] adds PointerScope to address issue #115 --- vitrivr-engine-index/gradle.properties | 4 ++ .../engine/index/decode/VideoDecoder.kt | 51 ++++++++++++++++--- .../index/exporters/VideoPreviewExporter.kt | 18 ++++++- 3 files changed, 63 insertions(+), 10 deletions(-) create mode 100644 vitrivr-engine-index/gradle.properties diff --git a/vitrivr-engine-index/gradle.properties b/vitrivr-engine-index/gradle.properties new file mode 100644 index 000000000..4450fdf81 --- /dev/null +++ b/vitrivr-engine-index/gradle.properties @@ -0,0 +1,4 @@ +## Module-specific settings + +# nopointergc https://github.com/bytedeco/javacv/issues/2266 +org.bytedeco.javacpp.nopointergc=true diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt index d17c23c83..ca760225c 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt @@ -9,10 +9,12 @@ import kotlinx.coroutines.channels.ProducerScope import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.buffer import kotlinx.coroutines.flow.channelFlow +import org.bytedeco.javacpp.PointerScope import org.bytedeco.javacv.FFmpegFrameGrabber import org.bytedeco.javacv.Frame import org.bytedeco.javacv.FrameGrabber import org.bytedeco.javacv.Java2DFrameConverter + import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.Content import org.vitrivr.engine.core.model.content.element.AudioContent @@ -61,7 +63,7 @@ class VideoDecoder : DecoderFactory { private val audio: Boolean = true, private val keyFrames: Boolean = false, private val timeWindowMs: Long = 500L, - private val name : String 
+ private val name: String ) : Decoder { /** [KLogger] instance. */ @@ -108,11 +110,17 @@ class VideoDecoder : DecoderFactory { * @param grabber The [FFmpegFrameGrabber] used to decode the video. * @param channel The [ProducerScope] used to emit [Retrievable] elements. */ - private suspend fun decodeFromGrabber(source: Source, sourceRetrievable: Retrievable, grabber: FFmpegFrameGrabber, channel: ProducerScope) { + private suspend fun decodeFromGrabber( + source: Source, + sourceRetrievable: Retrievable, + grabber: FFmpegFrameGrabber, + channel: ProducerScope + ) { /* Determine end of time window. */ var windowEnd = TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs) var error = false + /* Configure FFmpegFrameGrabber. */ grabber.imageMode = FrameGrabber.ImageMode.COLOR grabber.sampleMode = FrameGrabber.SampleMode.SHORT @@ -123,7 +131,8 @@ class VideoDecoder : DecoderFactory { /* Extract and enrich source metadata. */ source.metadata[Metadata.METADATA_KEY_VIDEO_FPS] = grabber.videoFrameRate - source.metadata[Metadata.METADATA_KEY_AV_DURATION] = TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime) + source.metadata[Metadata.METADATA_KEY_AV_DURATION] = + TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime) source.metadata[Metadata.METADATA_KEY_IMAGE_WIDTH] = grabber.imageWidth source.metadata[Metadata.METADATA_KEY_IMAGE_HEIGHT] = grabber.imageHeight source.metadata[Metadata.METADATA_KEY_AUDIO_CHANNELS] = grabber.audioChannels @@ -139,10 +148,19 @@ class VideoDecoder : DecoderFactory { var audioReady = !(grabber.hasAudio() && this@Instance.audio) do { - val frame = grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) ?: break + val frame = + grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) + ?: break when (frame.type) { Frame.Type.VIDEO -> { - imageBuffer.add(Java2DFrameConverter().use { it.convert(frame) to frame.timestamp }) + imageBuffer.add( + (try { + 
PointerScope().use { scope -> Java2DFrameConverter().convert(frame) to frame.timestamp } + } catch (e: Exception) { + logger.error(e) { "Error converting frame to BufferedImage" } + null + })!! + ) if (frame.timestamp > windowEnd) { videoReady = true } @@ -204,7 +222,14 @@ class VideoDecoder : DecoderFactory { * @param timestampEnd The end timestamp. * @param source The source [Retrievable] the emitted [Retrievable] is part of. */ - private suspend fun emit(imageBuffer: LinkedList>, audioBuffer: LinkedList>, grabber: FrameGrabber, timestampEnd: Long, source: Retrievable, channel: ProducerScope) { + private suspend fun emit( + imageBuffer: LinkedList>, + audioBuffer: LinkedList>, + grabber: FrameGrabber, + timestampEnd: Long, + source: Retrievable, + channel: ProducerScope + ) { /* Audio samples. */ var audioSize = 0 val emitImage = mutableListOf() @@ -233,7 +258,13 @@ class VideoDecoder : DecoderFactory { val ingested = Ingested(UUID.randomUUID(), "SEGMENT", false) source.filteredAttribute(SourceAttribute::class.java)?.let { ingested.addAttribute(it) } ingested.addRelationship(Relationship.ByRef(ingested, "partOf", source, false)) - ingested.addAttribute(TimeRangeAttribute(timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), timestampEnd, TimeUnit.MICROSECONDS)) + ingested.addAttribute( + TimeRangeAttribute( + timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), + timestampEnd, + TimeUnit.MICROSECONDS + ) + ) /* Prepare and append audio content element. 
*/ if (emitAudio.size > 0) { @@ -243,7 +274,11 @@ class VideoDecoder : DecoderFactory { samples.put(frame) } samples.clear() - val audio = this.context.contentFactory.newAudioContent(grabber.audioChannels.toShort(), grabber.sampleRate, samples) + val audio = this.context.contentFactory.newAudioContent( + grabber.audioChannels.toShort(), + grabber.sampleRate, + samples + ) ingested.addContent(audio) ingested.addAttribute(ContentAuthorAttribute(audio.id, name)) } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt index 58a35c835..ee80caec2 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt @@ -8,8 +8,10 @@ import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.onEach +import org.bytedeco.javacpp.PointerScope import org.bytedeco.javacv.FFmpegFrameGrabber import org.bytedeco.javacv.Java2DFrameConverter +import org.bytedeco.javacv.Java2DFrameUtils import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.attributes.SourceAttribute @@ -21,6 +23,7 @@ import org.vitrivr.engine.core.source.file.MimeType import java.awt.image.BufferedImage import java.io.InputStream + private val logger: KLogger = KotlinLogging.logger {} /** @@ -80,6 +83,7 @@ class VideoPreviewExporter : ExporterFactory { val writer = when (mimeType) { MimeType.JPEG, MimeType.JPG -> JpegWriter() + MimeType.PNG -> PngWriter() else -> throw IllegalArgumentException("Unsupported mime type $mimeType") } @@ -122,9 +126,19 @@ class VideoPreviewExporter : ExporterFactory { val frame = 
grabber.grabImage() grabber.stop() - return Java2DFrameConverter().use { - it.convert(frame) + + val img = try { + PointerScope().use { scope -> + Java2DFrameConverter().use { + it.convert(frame) + } + } + } catch (e: Exception) { + logger.error(e) { "Error converting frame to BufferedImage" } + null } + + return img!! } } } From 16f7eb36fab680fab3c3109129d493904dbf0d7b Mon Sep 17 00:00:00 2001 From: Rahel Arnold Date: Tue, 1 Oct 2024 15:04:20 +0200 Subject: [PATCH 3/4] updated snapshot version on dev branch --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index d635e8954..02c53c881 100644 --- a/build.gradle +++ b/build.gradle @@ -14,7 +14,7 @@ allprojects { group = 'org.vitrivr' /* Current version of our artifacts. */ - version = '0.0.1' + version = '0.0.2' /* Repositories for build script. */ buildscript { From 16d97ee5d37897e43baf1a95e41a1a73abf3a3ce Mon Sep 17 00:00:00 2001 From: Rahel Arnold Date: Thu, 3 Oct 2024 14:29:13 +0200 Subject: [PATCH 4/4] include snapshot in build-name --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 02c53c881..1c0794307 100644 --- a/build.gradle +++ b/build.gradle @@ -14,7 +14,7 @@ allprojects { group = 'org.vitrivr' /* Current version of our artifacts. */ - version = '0.0.2' + version = '0.0.2-SNAPSHOT' /* Repositories for build script. */ buildscript {