Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into feature/exif_to_prompt
Browse files Browse the repository at this point in the history
  • Loading branch information
faberf committed Oct 14, 2024
2 parents 42bd7bd + 16d97ee commit 1c25dfa
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 16 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ allprojects {
group = 'org.vitrivr'

/* Current version of our artifacts. */
version = '0.0.1'
version = '0.0.2-SNAPSHOT'

/* Repositories for build script. */
buildscript {
Expand Down
110 changes: 110 additions & 0 deletions example-configs/ingestion/example/image-captions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"schema": "caption",
"context": {
"contentFactory": "CachedContentFactory",
"resolverName": "disk",
"local": {
"content": {
"path": "../cache"
},
"enumerator": {
"path": "E:\\Joint-Image-EEG-Embedding\\Data\\Images",
"depth": "5",
"regex": ".*n014[0-9]{5}.*"
},
"imageFilePathContent": {
"field": "file"
},

"captionContent": {
"field": "captionSparse"
},
"clip": {
"contentSources": "imageDecoder"
},
"captionSparse": {
"contentSources": "imageDecoder"
},
"captionDense": {
"contentSources": "captionContent"
},
"documentType": {
"contentSources": "imageDecoder"
},
"imagePrompt": {
"template": "Create a short caption to the content of this image (file path: ${imageFilePathContent}) for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources.",
"defaultValue": "no content provided"
},

"imageSourceFilter": {
"type": "SOURCE:IMAGE"
}
}
},
"operators": {
"enumerator": {
"type": "ENUMERATOR",
"factory": "FileSystemEnumerator",
"mediaTypes": ["IMAGE", "VIDEO"]
},
"imageDecoder": {
"type": "DECODER",
"factory": "ImageDecoder"
},
"fileMetadata":{
"type": "EXTRACTOR",
"fieldName": "file"
},
"imageFilePathContent": {
"type": "TRANSFORMER",
"factory":"DescriptorAsContentTransformer"
},

"clip": {
"type": "EXTRACTOR",
"fieldName": "clip"
},

"imagePrompt": {
"type": "TRANSFORMER",
"factory": "TemplateTextTransformer"
},
"captionSparse": {
"type": "EXTRACTOR",
"fieldName": "captionSparse"
},
"captionContent": {
"type": "TRANSFORMER",
"factory": "DescriptorAsContentTransformer"
},
"captionDense": {
"type": "EXTRACTOR",
"fieldName": "captionDense"
},

"imageSourceFilter": {
"type": "TRANSFORMER",
"factory": "TypeFilterTransformer"
}
},
"operations": {
"enumerator-stage": {"operator": "enumerator"},
"image-decoder-stage": {"operator": "imageDecoder", "inputs": ["enumerator-stage"]},
"image-file-metadata-stage": {"operator": "fileMetadata", "inputs": ["image-decoder-stage"]},
"image-file-path-content-stage": {"operator": "imageFilePathContent", "inputs": ["image-file-metadata-stage"]},

"image-clip-stage": {"operator": "clip", "inputs": ["image-file-path-content-stage"]},

"image-prompt-stage": {"operator": "imagePrompt", "inputs": ["image-file-path-content-stage"]},
"image-caption-sparse-stage": {"operator": "captionSparse", "inputs": ["image-file-path-content-stage"]},
"image-caption-content-stage": {"operator": "captionContent", "inputs": ["image-caption-sparse-stage"]},
"image-caption-stage": {"operator": "captionDense", "inputs": ["image-caption-content-stage"]},


"image-final-stage": {"operator": "imageSourceFilter", "inputs": ["image-caption-stage","image-clip-stage"], "merge": "COMBINE"}
},
"output": [
"image-final-stage"
],
"mergeType": "MERGE"
}
73 changes: 73 additions & 0 deletions example-configs/schema/image-captions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"schemas": {
"caption": {
"connection": {
"database": "PgVectorConnectionProvider",
"parameters": {
"Host": "localhost",
"port": "5432",
"username": "postgres",
"password": "vitrivr"
}
},
"fields": {
"file": {
"factory": "FileSourceMetadata"
},
"clip": {
"factory": "DenseEmbedding",
"parameters": {
"host": "http://10.34.64.84:8888/",
"model": "open-clip-vit-b32",
"length": "512",
"timeoutSeconds": "100",
"retries": "1000"
}
},
"captionSparse": {
"factory": "ImageCaption",
"parameters": {
"host": "http://10.34.64.84:8888/",
"timeoutSeconds": "100",
"retries": "1000",
"model": "gpt4o",
"prompt": "Create a short caption to the content of this image for the purpose of training a co-embedding model. Use information from the internet to enhance the description, for instance by searching for proper nouns. If web sources turn out to be irrelevant, do not include them. The image is part of the imagenet-object-localization-challenge. Do not include general information about the imagenet-object-localization-challenge. Do not structure the description, put everything in one sentence. Do not mention words such as 'archive', 'documentation', 'archivist', 'search' or 'internet'. Do not mention any sources."
}
},
"captionDense": {
"factory": "DenseEmbedding",
"parameters": {
"host": "http://10.34.64.84:8888/",
"model": "e5mistral7b-instruct",
"length": "4096",
"timeoutSeconds": "100",
"retries": "1000"
}
}
},
"resolvers": {
"disk": {
"factory": "DiskResolver",
"parameters": {
"location": "../thumbnails"
}
}
},
"exporters": {
"thumbnail": {
"factory": "ThumbnailExporter",
"resolverName": "disk",
"parameters": {
"maxSideResolution": "400",
"mimeType": "JPG"
}
}
},
"extractionPipelines": {
"cap": {
"path": "example-configs/ingestion/example/image-captions.json"
}
}
}
}
}
4 changes: 4 additions & 0 deletions vitrivr-engine-index/gradle.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## Module-specific settings

# nopointergc https://github.com/bytedeco/javacv/issues/2266
org.bytedeco.javacpp.nopointergc=true
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import kotlinx.coroutines.channels.ProducerScope
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.buffer
import kotlinx.coroutines.flow.channelFlow
import org.bytedeco.javacpp.PointerScope
import org.bytedeco.javacv.FFmpegFrameGrabber
import org.bytedeco.javacv.Frame
import org.bytedeco.javacv.FrameGrabber
import org.bytedeco.javacv.Java2DFrameConverter

import org.vitrivr.engine.core.context.IndexContext
import org.vitrivr.engine.core.model.content.Content
import org.vitrivr.engine.core.model.content.element.AudioContent
Expand Down Expand Up @@ -61,7 +63,7 @@ class VideoDecoder : DecoderFactory {
private val audio: Boolean = true,
private val keyFrames: Boolean = false,
private val timeWindowMs: Long = 500L,
private val name : String
private val name: String
) : Decoder {

/** [KLogger] instance. */
Expand Down Expand Up @@ -108,11 +110,17 @@ class VideoDecoder : DecoderFactory {
* @param grabber The [FFmpegFrameGrabber] used to decode the video.
* @param channel The [ProducerScope] used to emit [Retrievable] elements.
*/
private suspend fun decodeFromGrabber(source: Source, sourceRetrievable: Retrievable, grabber: FFmpegFrameGrabber, channel: ProducerScope<Retrievable>) {
private suspend fun decodeFromGrabber(
source: Source,
sourceRetrievable: Retrievable,
grabber: FFmpegFrameGrabber,
channel: ProducerScope<Retrievable>
) {
/* Determine end of time window. */
var windowEnd = TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs)
var error = false


/* Configure FFmpegFrameGrabber. */
grabber.imageMode = FrameGrabber.ImageMode.COLOR
grabber.sampleMode = FrameGrabber.SampleMode.SHORT
Expand All @@ -123,7 +131,8 @@ class VideoDecoder : DecoderFactory {

/* Extract and enrich source metadata. */
source.metadata[Metadata.METADATA_KEY_VIDEO_FPS] = grabber.videoFrameRate
source.metadata[Metadata.METADATA_KEY_AV_DURATION] = TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime)
source.metadata[Metadata.METADATA_KEY_AV_DURATION] =
TimeUnit.MICROSECONDS.toMillis(grabber.lengthInTime)
source.metadata[Metadata.METADATA_KEY_IMAGE_WIDTH] = grabber.imageWidth
source.metadata[Metadata.METADATA_KEY_IMAGE_HEIGHT] = grabber.imageHeight
source.metadata[Metadata.METADATA_KEY_AUDIO_CHANNELS] = grabber.audioChannels
Expand All @@ -139,10 +148,19 @@ class VideoDecoder : DecoderFactory {
var audioReady = !(grabber.hasAudio() && this@Instance.audio)

do {
val frame = grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true) ?: break
val frame =
grabber.grabFrame(this@Instance.audio, this@Instance.video, true, this@Instance.keyFrames, true)
?: break
when (frame.type) {
Frame.Type.VIDEO -> {
imageBuffer.add(Java2DFrameConverter().use { it.convert(frame) to frame.timestamp })
imageBuffer.add(
(try {
PointerScope().use { scope -> Java2DFrameConverter().convert(frame) to frame.timestamp }
} catch (e: Exception) {
logger.error(e) { "Error converting frame to BufferedImage" }
null
})!!
)
if (frame.timestamp > windowEnd) {
videoReady = true
}
Expand Down Expand Up @@ -204,7 +222,14 @@ class VideoDecoder : DecoderFactory {
* @param timestampEnd The end timestamp.
* @param source The source [Retrievable] the emitted [Retrievable] is part of.
*/
private suspend fun emit(imageBuffer: LinkedList<Pair<BufferedImage, Long>>, audioBuffer: LinkedList<Pair<ShortBuffer, Long>>, grabber: FrameGrabber, timestampEnd: Long, source: Retrievable, channel: ProducerScope<Retrievable>) {
private suspend fun emit(
imageBuffer: LinkedList<Pair<BufferedImage, Long>>,
audioBuffer: LinkedList<Pair<ShortBuffer, Long>>,
grabber: FrameGrabber,
timestampEnd: Long,
source: Retrievable,
channel: ProducerScope<Retrievable>
) {
/* Audio samples. */
var audioSize = 0
val emitImage = mutableListOf<BufferedImage>()
Expand Down Expand Up @@ -233,7 +258,13 @@ class VideoDecoder : DecoderFactory {
val ingested = Ingested(UUID.randomUUID(), "SEGMENT", false)
source.filteredAttribute(SourceAttribute::class.java)?.let { ingested.addAttribute(it) }
ingested.addRelationship(Relationship.ByRef(ingested, "partOf", source, false))
ingested.addAttribute(TimeRangeAttribute(timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs), timestampEnd, TimeUnit.MICROSECONDS))
ingested.addAttribute(
TimeRangeAttribute(
timestampEnd - TimeUnit.MILLISECONDS.toMicros(this@Instance.timeWindowMs),
timestampEnd,
TimeUnit.MICROSECONDS
)
)

/* Prepare and append audio content element. */
if (emitAudio.size > 0) {
Expand All @@ -243,7 +274,11 @@ class VideoDecoder : DecoderFactory {
samples.put(frame)
}
samples.clear()
val audio = this.context.contentFactory.newAudioContent(grabber.audioChannels.toShort(), grabber.sampleRate, samples)
val audio = this.context.contentFactory.newAudioContent(
grabber.audioChannels.toShort(),
grabber.sampleRate,
samples
)
ingested.addContent(audio)
ingested.addAttribute(ContentAuthorAttribute(audio.id, name))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ class FileSystemEnumerator : EnumeratorFactory {
val skip = context[name, "skip"]?.toLongOrNull() ?: 0L
val limit = context[name, "limit"]?.toLongOrNull() ?: Long.MAX_VALUE
val type = context[name, "type"]
logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit}" }
return Instance(path, depth, mediaTypes, skip, limit, type)
val regex = context[name, "regex"]
logger.info { "Enumerator: FileSystemEnumerator with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: ${if (limit == Long.MAX_VALUE) "none" else limit} and type: $type, regex: $regex" }
return Instance(path, depth, mediaTypes, skip, limit, type, regex)
}

/**
Expand All @@ -57,7 +58,12 @@ class FileSystemEnumerator : EnumeratorFactory {
* @param context The [IndexContext] to use.
* @param inputs Is ignored.
*/
override fun newEnumerator(name: String, context: IndexContext, mediaTypes: List<MediaType>, inputs: Stream<*>?): Enumerator {
/**
 * Creates a new [Enumerator] for this [FileSystemEnumerator].
 *
 * @param name The name of the [Enumerator].
 * @param context The [IndexContext] to use.
 * @param mediaTypes The [MediaType]s to enumerate.
 * @param inputs Is ignored by this implementation; enumeration is driven purely by the configured path.
 * @return A new [Enumerator] instance.
 */
override fun newEnumerator(
    name: String,
    context: IndexContext,
    mediaTypes: List<MediaType>,
    inputs: Stream<*>?
): Enumerator = this.newEnumerator(name, context, mediaTypes)

Expand All @@ -70,14 +76,24 @@ class FileSystemEnumerator : EnumeratorFactory {
private val mediaTypes: Collection<MediaType> = MediaType.allValid,
private val skip: Long = 0,
private val limit: Long = Long.MAX_VALUE,
private val typeName: String? = null
private val typeName: String? = null,
private val regex: String? = null
) : Enumerator {

override fun toFlow(scope: CoroutineScope): Flow<Retrievable> = flow {
logger.debug { "In flow: Start Enumerating with path: $path, depth: $depth, mediaTypes: $mediaTypes, skip: $skip, limit: $limit" }

val stream = try {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter { it.isRegularFile() }.skip(skip).limit(limit)
if (regex == null) {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter {
it.isRegularFile()
}.skip(skip).limit(limit)
} else {
Files.walk(this@Instance.path, this@Instance.depth, FileVisitOption.FOLLOW_LINKS).filter {
it.isRegularFile() && it.toString().matches(Regex(regex))
}.skip(skip).limit(limit)
}

} catch (ex: NoSuchFileException) {
val mes = "In flow: Path ${this@Instance.path} does not exist."
logger.error { mes }
Expand Down
Loading

0 comments on commit 1c25dfa

Please sign in to comment.