From 19aa546e42a836ede830cd5eabe91be28cedafbc Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 26 Sep 2024 12:07:56 +0200 Subject: [PATCH 1/4] adds an optional regex pattern to 'FileSystemEnumerator' Will only enumerate files which match the pattern. If the option is not set, nothing changes. --- .../ingestion/example/image-captions.json | 110 ++++++++++++++++++ example-configs/schema/image-captions.json | 73 ++++++++++++ .../index/enumerate/FileSystemEnumerator.kt | 26 ++++- 3 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 example-configs/ingestion/example/image-captions.json create mode 100644 example-configs/schema/image-captions.json diff --git a/example-configs/ingestion/example/image-captions.json b/example-configs/ingestion/example/image-captions.json new file mode 100644 index 000000000..84f359c73 --- /dev/null +++ b/example-configs/ingestion/example/image-captions.json @@ -0,0 +1,110 @@ +{ + "schema": "caption", + "context": { + "contentFactory": "CachedContentFactory", + "resolverName": "disk", + "local": { + "content": { + "path": "../cache" + }, + "enumerator": { + "path": "E:\\Joint-Image-EEG-Embedding\\Data\\Images", + "depth": "5", + "regex": ".*n014[0-9]{5}.*" + }, + "imageFilePathContent": { + "field": "file" + }, + + "captionContent": { + "field": "captionSparse" + }, + "clip": { + "contentSources": "imageDecoder" + }, + "captionSparse": { + "contentSources": "imageDecoder" + }, + "captionDense": { + "contentSources": "captionContent" + }, + "documentType": { + "contentSources": "imageDecoder" + }, + "imagePrompt": { + "template": "Create a short caption to the content of this image (file path: ${imageFilePathContent}) for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. 
Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources.", + "defaultValue": "no content provided" + }, + + "imageSourceFilter": { + "type": "SOURCE:IMAGE" + } + } + }, + "operators": { + "enumerator": { + "type": "ENUMERATOR", + "factory": "FileSystemEnumerator", + "mediaTypes": ["IMAGE", "VIDEO"] + }, + "imageDecoder": { + "type": "DECODER", + "factory": "ImageDecoder" + }, + "fileMetadata":{ + "type": "EXTRACTOR", + "fieldName": "file" + }, + "imageFilePathContent": { + "type": "TRANSFORMER", + "factory":"DescriptorAsContentTransformer" + }, + + "clip": { + "type": "EXTRACTOR", + "fieldName": "clip" + }, + + "imagePrompt": { + "type": "TRANSFORMER", + "factory": "TemplateTextTransformer" + }, + "captionSparse": { + "type": "EXTRACTOR", + "fieldName": "captionSparse" + }, + "captionContent": { + "type": "TRANSFORMER", + "factory": "DescriptorAsContentTransformer" + }, + "captionDense": { + "type": "EXTRACTOR", + "fieldName": "captionDense" + }, + + "imageSourceFilter": { + "type": "TRANSFORMER", + "factory": "TypeFilterTransformer" + } + }, + "operations": { + "enumerator-stage": {"operator": "enumerator"}, + "image-decoder-stage": {"operator": "imageDecoder", "inputs": ["enumerator-stage"]}, + "image-file-metadata-stage": {"operator": "fileMetadata", "inputs": ["image-decoder-stage"]}, + "image-file-path-content-stage": {"operator": "imageFilePathContent", "inputs": ["image-file-metadata-stage"]}, + + "image-clip-stage": {"operator": "clip", "inputs": ["image-file-path-content-stage"]}, + + "image-prompt-stage": {"operator": "imagePrompt", "inputs": ["image-file-path-content-stage"]}, + "image-caption-sparse-stage": {"operator": "captionSparse", "inputs": ["image-file-path-content-stage"]}, + "image-caption-content-stage": {"operator": 
"captionContent", "inputs": ["image-caption-sparse-stage"]}, + "image-caption-stage": {"operator": "captionDense", "inputs": ["image-caption-content-stage"]}, + + + "image-final-stage": {"operator": "imageSourceFilter", "inputs": ["image-caption-stage","image-clip-stage"], "merge": "COMBINE"} + }, + "output": [ + "image-final-stage" + ], + "mergeType": "MERGE" +} diff --git a/example-configs/schema/image-captions.json b/example-configs/schema/image-captions.json new file mode 100644 index 000000000..8dc6220c1 --- /dev/null +++ b/example-configs/schema/image-captions.json @@ -0,0 +1,73 @@ +{ + "schemas": { + "caption": { + "connection": { + "database": "PgVectorConnectionProvider", + "parameters": { + "Host": "localhost", + "port": "5432", + "username": "postgres", + "password": "vitrivr" + } + }, + "fields": { + "file": { + "factory": "FileSourceMetadata" + }, + "clip": { + "factory": "DenseEmbedding", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "open-clip-vit-b32", + "length": "512", + "timeoutSeconds": "100", + "retries": "1000" + } + }, + "captionSparse": { + "factory": "ImageCaption", + "parameters": { + "host": "http://10.34.64.84:8888/", + "timeoutSeconds": "100", + "retries": "1000", + "model": "gpt4o", + "prompt": "Create a short caption to the content of this image for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources." 
+ } + }, + "captionDense": { + "factory": "DenseEmbedding", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "e5mistral7b-instruct", + "length": "4096", + "timeoutSeconds": "100", + "retries": "1000" + } + } + }, + "resolvers": { + "disk": { + "factory": "DiskResolver", + "parameters": { + "location": "../thumbnails" + } + } + }, + "exporters": { + "thumbnail": { + "factory": "ThumbnailExporter", + "resolverName": "disk", + "parameters": { + "maxSideResolution": "400", + "mimeType": "JPG" + } + } + }, + "extractionPipelines": { + "cap": { + "path": "example-configs/ingestion/example/image-captions.json" + } + } + } + } +} diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt index 346f59f38..609624d80 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/enumerate/FileSystemEnumerator.kt @@ -47,8 +47,9 @@ class FileSystemEnumerator : EnumeratorFactory { val skip = context[name, "skip"]?.toLongOrNull() ?: 0L val limit = context[name, "limit"]?.toLongOrNull() ?: Long.MAX_VALUE val type = context[name, "type"] - logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit}" } - return Instance(path, depth, mediaTypes, skip, limit, type) + val regex = context[name, "regex"] + logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit} and type: $type, regex: $regex" } + return Instance(path, depth, mediaTypes, skip, limit, type, regex) } /** @@ -57,7 +58,12 @@ class FileSystemEnumerator : EnumeratorFactory { * @param context The [IndexContext] to use. 
* @param inputs Is ignored. */ - override fun newEnumerator(name: String, context: IndexContext, mediaTypes: List, inputs: Stream<*>?): Enumerator { + override fun newEnumerator( + name: String, + context: IndexContext, + mediaTypes: List, + inputs: Stream<*>? + ): Enumerator { return newEnumerator(name, context, mediaTypes) } @@ -70,14 +76,24 @@ class FileSystemEnumerator : EnumeratorFactory { private val mediaTypes: Collection = MediaType.allValid, private val skip: Long = 0, private val limit: Long = Long.MAX_VALUE, - private val typeName: String? = null + private val typeName: String? = null, + private val regex: String? = null ) : Enumerator { override fun toFlow(scope: CoroutineScope): Flow = flow { logger.debug { "In flow: Start Enumerating with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: $limit" } val stream = try { - Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { it.isRegularFile() }.skip(skip).limit(limit) + if (regex == null) { + Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { + it.isRegularFile() + }.skip(skip).limit(limit) + } else { + Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { + it.isRegularFile() && it.toString().matches(Regex(regex)) + }.skip(skip).limit(limit) + } + } catch (ex: NoSuchFileException) { val mes = "In flow: Path ${this@Instance.path} does not exist." 
logger.error { mes } From a648841f2c1dda9023df8be92099040aa985e96d Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 30 Sep 2024 12:02:48 +0200 Subject: [PATCH 2/4] adds PointerScope to address issue #115 --- vitrivr-engine-index/gradle.properties | 4 ++ .../engine/index/decode/VideoDecoder.kt | 51 ++++++++++++++++--- .../index/exporters/VideoPreviewExporter.kt | 18 ++++++- 3 files changed, 63 insertions(+), 10 deletions(-) create mode 100644 vitrivr-engine-index/gradle.properties diff --git a/vitrivr-engine-index/gradle.properties b/vitrivr-engine-index/gradle.properties new file mode 100644 index 000000000..4450fdf81 --- /dev/null +++ b/vitrivr-engine-index/gradle.properties @@ -0,0 +1,4 @@ +## Module-specific settings + +# nopointergc https://github.com/bytedeco/javacv/issues/2266 +org.bytedeco.javacpp.nopointergc=true diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt index d17c23c83..ca760225c 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt @@ -9,10 +9,12 @@ import kotlinx.coroutines.channels.ProducerScope import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.buffer import kotlinx.coroutines.flow.channelFlow +import org.bytedeco.javacpp.PointerScope import org.bytedeco.javacv.FFmpegFrameGrabber import org.bytedeco.javacv.Frame import org.bytedeco.javacv.FrameGrabber import org.bytedeco.javacv.Java2DFrameConverter + import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.Content import org.vitrivr.engine.core.model.content.element.AudioContent @@ -61,7 +63,7 @@ class VideoDecoder : DecoderFactory { private val audio: Boolean = true, private val keyFrames: Boolean = false, private val timeWindowMs: Long = 500L, - private val name : String 
+ private val name: String ) : Decoder { /** [KLogger] instance. */ @@ -108,11 +110,17 @@ class VideoDecoder : DecoderFactory { * @param grabber The [FFmpegFrameGrabber] used to decode the video. * @param channel The [ProducerScope] used to emit [Retrievable] elements. */ - private suspend fun decodeFromGrabber(source: Source, sourceRetrievable: Retrievable, grabber: FFmpegFrameGrabber, channel: ProducerScope) { + private suspend fun decodeFromGrabber( + source: Source, + sourceRetrievable: Retrievable, + grabber: FFmpegFrameGrabber, + channel: ProducerScope + ) { /* Determine end of time window. */ var windowEnd = TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs) var error = false + /* Configure FFmpegFrameGrabber. */ grabber.imageMode = FrameGrabber.ImageMode.COLOR grabber.sampleMode = FrameGrabber.SampleMode.SHORT @@ -123,7 +131,8 @@ class VideoDecoder : DecoderFactory { /* Extract and enrich source metadata. */ source.metadata[Metadata.METADATA_KEY_VIDEO_FPS] = grabber.videoFrameRate - source.metadata[Metadata.METADATA_KEY_AV_DURATION] = TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime) + source.metadata[Metadata.METADATA_KEY_AV_DURATION] = + TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime) source.metadata[Metadata.METADATA_KEY_IMAGE_WIDTH] = grabber.imageWidth source.metadata[Metadata.METADATA_KEY_IMAGE_HEIGHT] = grabber.imageHeight source.metadata[Metadata.METADATA_KEY_AUDIO_CHANNELS] = grabber.audioChannels @@ -139,10 +148,19 @@ class VideoDecoder : DecoderFactory { var audioReady = !(grabber.hasAudio() && this@Instance.audio) do { - val frame = grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) ?: break + val frame = + grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) + ?: break when (frame.type) { Frame.Type.VIDEO -> { - imageBuffer.add(Java2DFrameConverter().use { it.convert(frame) to frame.timestamp }) + imageBuffer.add( + (try { + 
PointerScope().use { scope -> Java2DFrameConverter().convert(frame) to frame.timestamp } + } catch (e: Exception) { + logger.error(e) { "Error converting frame to BufferedImage" } + null + })!! + ) if (frame.timestamp > windowEnd) { videoReady = true } @@ -204,7 +222,14 @@ class VideoDecoder : DecoderFactory { * @param timestampEnd The end timestamp. * @param source The source [Retrievable] the emitted [Retrievable] is part of. */ - private suspend fun emit(imageBuffer: LinkedList>, audioBuffer: LinkedList>, grabber: FrameGrabber, timestampEnd: Long, source: Retrievable, channel: ProducerScope) { + private suspend fun emit( + imageBuffer: LinkedList>, + audioBuffer: LinkedList>, + grabber: FrameGrabber, + timestampEnd: Long, + source: Retrievable, + channel: ProducerScope + ) { /* Audio samples. */ var audioSize = 0 val emitImage = mutableListOf() @@ -233,7 +258,13 @@ class VideoDecoder : DecoderFactory { val ingested = Ingested(UUID.randomUUID(), "SEGMENT", false) source.filteredAttribute(SourceAttribute::class.java)?.let { ingested.addAttribute(it) } ingested.addRelationship(Relationship.ByRef(ingested, "partOf", source, false)) - ingested.addAttribute(TimeRangeAttribute(timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), timestampEnd, TimeUnit.MICROSECONDS)) + ingested.addAttribute( + TimeRangeAttribute( + timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), + timestampEnd, + TimeUnit.MICROSECONDS + ) + ) /* Prepare and append audio content element. 
*/ if (emitAudio.size > 0) { @@ -243,7 +274,11 @@ class VideoDecoder : DecoderFactory { samples.put(frame) } samples.clear() - val audio = this.context.contentFactory.newAudioContent(grabber.audioChannels.toShort(), grabber.sampleRate, samples) + val audio = this.context.contentFactory.newAudioContent( + grabber.audioChannels.toShort(), + grabber.sampleRate, + samples + ) ingested.addContent(audio) ingested.addAttribute(ContentAuthorAttribute(audio.id, name)) } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt index 58a35c835..ee80caec2 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/exporters/VideoPreviewExporter.kt @@ -8,8 +8,10 @@ import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.onEach +import org.bytedeco.javacpp.PointerScope import org.bytedeco.javacv.FFmpegFrameGrabber import org.bytedeco.javacv.Java2DFrameConverter +import org.bytedeco.javacv.Java2DFrameUtils import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.attributes.SourceAttribute @@ -21,6 +23,7 @@ import org.vitrivr.engine.core.source.file.MimeType import java.awt.image.BufferedImage import java.io.InputStream + private val logger: KLogger = KotlinLogging.logger {} /** @@ -80,6 +83,7 @@ class VideoPreviewExporter : ExporterFactory { val writer = when (mimeType) { MimeType.JPEG, MimeType.JPG -> JpegWriter() + MimeType.PNG -> PngWriter() else -> throw IllegalArgumentException("Unsupported mime type $mimeType") } @@ -122,9 +126,19 @@ class VideoPreviewExporter : ExporterFactory { val frame = 
grabber.grabImage() grabber.stop() - return Java2DFrameConverter().use { - it.convert(frame) + + val img = try { + PointerScope().use { scope -> + Java2DFrameConverter().use { + it.convert(frame) + } + } + } catch (e: Exception) { + logger.error(e) { "Error converting frame to BufferedImage" } + null } + + return img!! } } } From 16f7eb36fab680fab3c3109129d493904dbf0d7b Mon Sep 17 00:00:00 2001 From: Rahel Arnold Date: Tue, 1 Oct 2024 15:04:20 +0200 Subject: [PATCH 3/4] updated snapshot version on dev branch --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index d635e8954..02c53c881 100644 --- a/build.gradle +++ b/build.gradle @@ -14,7 +14,7 @@ allprojects { group = 'org.vitrivr' /* Current version of our artifacts. */ - version = '0.0.1' + version = '0.0.2' /* Repositories for build script. */ buildscript { From 16d97ee5d37897e43baf1a95e41a1a73abf3a3ce Mon Sep 17 00:00:00 2001 From: Rahel Arnold Date: Thu, 3 Oct 2024 14:29:13 +0200 Subject: [PATCH 4/4] include snapshot in build-name --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 02c53c881..1c0794307 100644 --- a/build.gradle +++ b/build.gradle @@ -14,7 +14,7 @@ allprojects { group = 'org.vitrivr' /* Current version of our artifacts. */ - version = '0.0.2' + version = '0.0.2-SNAPSHOT' /* Repositories for build script. */ buildscript {