From f753b9bb401ee301d231a021ae5f3630f6d7eefc Mon Sep 17 00:00:00 2001 From: faberf Date: Mon, 8 Jul 2024 11:55:20 +0200 Subject: [PATCH 01/34] refactor pipelines --- .../core/features/AbstractBatchedExtractor.kt | 56 +++++++++++-------- .../engine/core/features/AbstractExtractor.kt | 2 +- .../model/content/element/ContentElement.kt | 4 ++ .../model/content/element/Model3DContent.kt | 3 - .../content/impl/cache/CachedAudioContent.kt | 2 + .../content/impl/cache/CachedImageContent.kt | 3 + .../content/impl/cache/CachedTextContent.kt | 3 + .../impl/memory/InMemoryAudioContent.kt | 5 +- .../impl/memory/InMemoryImageContent.kt | 5 +- .../impl/memory/InMemoryMeshContent.kt | 5 +- .../impl/memory/InMemoryTextContent.kt | 7 ++- .../attributes/ContentAuthorAttribute.kt | 23 ++++++++ .../operators/persistence/PersistingSink.kt | 4 ++ .../transform/PassthroughTransformer.kt | 23 ++++++++ ....core.operators.general.TransformerFactory | 3 +- .../index/aggregators/AbstractAggregator.kt | 12 +++- .../aggregators/FirstContentAggregator.kt | 4 +- .../aggregators/LastContentAggregator.kt | 4 +- .../aggregators/MiddleContentAggregator.kt | 4 +- .../image/AverageImageContentAggregator.kt | 4 +- .../RepresentativeImageContentAggregator.kt | 4 +- .../engine/index/decode/ImageDecoder.kt | 6 +- .../engine/index/decode/VideoDecoder.kt | 15 +++-- .../DescriptorAsContentTransformer.kt | 18 +++--- .../features/external/common/FesExtractor.kt | 17 ++++-- .../features/external/implementations/ASR.kt | 6 +- .../implementations/DenseEmbedding.kt | 6 +- .../external/implementations/ImageCaption.kt | 6 +- .../implementations/ImageClassification.kt | 6 +- .../features/external/implementations/OCR.kt | 6 +- .../engine/model3d/decoder/MeshDecoder.kt | 6 +- 31 files changed, 192 insertions(+), 80 deletions(-) create mode 100644 vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt create mode 100644 
vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt index 6d1530bfb..ac23a34ca 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt @@ -3,9 +3,7 @@ package org.vitrivr.engine.core.features import io.github.oshai.kotlinlogging.KLogger import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.coroutines.CoroutineScope -import kotlinx.coroutines.flow.Flow -import kotlinx.coroutines.flow.onCompletion -import kotlinx.coroutines.flow.onEach +import kotlinx.coroutines.flow.* import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.metamodel.Schema @@ -36,33 +34,41 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f * @return [Flow] of [Retrievable] */ final override fun toFlow(scope: CoroutineScope): Flow { + return flow { + val batch = mutableListOf() - val batch = mutableListOf() - - /* Prepare and return flow. 
*/ - return this.input.toFlow(scope).onEach { retrievable -> - try { - if (this.matches(retrievable)) { - batch.add(retrievable) + this@AbstractBatchedExtractor.input.toFlow(scope).collect { retrievable -> + if (retrievable.type == "SOURCE:VIDEO") { + logger.info { "Processing video ${retrievable.id} with field ${field?.fieldName}" } } - if (batch.size >= bufferSize) { - val descriptors = extract(batch) - // zip descriptors and batch - for (i in batch.indices) { - val r = batch[i] - for (d in descriptors[i]) { - r.addDescriptor(d) + try { + if (this@AbstractBatchedExtractor.matches(retrievable)) { + batch.add(retrievable) + } + else { + emit(retrievable) + } + if (batch.size >= bufferSize) { + val descriptors = extract(batch) + // zip descriptors and batch + for (i in batch.indices) { + val r = batch[i] + for (d in descriptors[i]) { + r.addDescriptor(d) + } } + emitAll(batch.asFlow()) + batch.clear() + } + } catch (e: Exception) { + logger.error(e) { "Error during extraction" } + "Error during extraction: $e".let { + logger.error { it } } - batch.clear() } - } catch (e: Exception) { - "Error during extraction: ${e.message}".let { - logger.error { it } - } } - }.onCompletion { - /* Persist buffer if necessary. */ + + // Emit any remaining items in the batch if (batch.isNotEmpty()) { val descriptors = extract(batch) // zip descriptors and batch @@ -72,11 +78,13 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f r.addDescriptor(d) } } + emitAll(batch.asFlow()) batch.clear() } } } + /** * Internal method to check, if [Retrievable] matches this [Extractor] and should thus be processed. 
* diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractExtractor.kt index bafcec66c..a26dd72ac 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractExtractor.kt @@ -34,7 +34,7 @@ abstract class AbstractExtractor, D : Descriptor>(final ov final override fun toFlow(scope: CoroutineScope): Flow = this.input.toFlow(scope).onEach { retrievable -> if (this.matches(retrievable)) { /* Perform extraction. */ - logger.debug{"Extraction for retrievable: $retrievable" } + logger.debug{"Extraction on field ${field?.fieldName} for retrievable: $retrievable" } val descriptors = extract(retrievable) /* Append descriptor. */ diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt index ef5de14af..05bcc626c 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt @@ -2,6 +2,7 @@ package org.vitrivr.engine.core.model.content.element import org.vitrivr.engine.core.model.content.Content import org.vitrivr.engine.core.model.content.ContentType +import java.util.UUID /** * A [Content] element is a piece of [Content] that is tied to some actual [Content]. @@ -19,6 +20,9 @@ sealed interface ContentElement: Content { */ val content: T + val id: UUID + /** The [ContentType] of this [ContentElement]. 
*/ val type: ContentType + } \ No newline at end of file diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/Model3DContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/Model3DContent.kt index b4082087d..df3d67a39 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/Model3DContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/Model3DContent.kt @@ -13,9 +13,6 @@ import java.awt.image.BufferedImage interface Model3DContent: ContentElement{ /** The [ContentType] of a [Model3DContent] is always [ContentType.MESH]. */ - val id: String - get() = this.content.id - override val type: ContentType get() = ContentType.MESH } \ No newline at end of file diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedAudioContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedAudioContent.kt index 3b9128934..c03066ffb 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedAudioContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedAudioContent.kt @@ -8,6 +8,7 @@ import java.nio.ShortBuffer import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardOpenOption +import java.util.* /** * A [AudioContent] implementation that is backed by a cache file. 
@@ -35,6 +36,7 @@ class CachedAudioContent(override val path: Path, override val channels: Short, } return buffer.asReadOnlyBuffer() } + override val id: UUID = UUID.randomUUID() init { val outBuffer = ByteBuffer.allocate(this.size).order(ByteOrder.LITTLE_ENDIAN) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedImageContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedImageContent.kt index 8b1e271b3..815a19942 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedImageContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedImageContent.kt @@ -6,6 +6,7 @@ import java.lang.ref.SoftReference import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardOpenOption +import java.util.* import javax.imageio.ImageIO /** @@ -16,6 +17,8 @@ import javax.imageio.ImageIO */ class CachedImageContent(override val path: Path, image: BufferedImage) : ImageContent, CachedContent { + override val id: UUID = UUID.randomUUID() + /** The [SoftReference] of the [BufferedImage] used for caching. */ private var reference: SoftReference = SoftReference(image) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedTextContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedTextContent.kt index d816f2ae1..385f40acf 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedTextContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/cache/CachedTextContent.kt @@ -5,6 +5,7 @@ import java.lang.ref.SoftReference import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardOpenOption +import java.util.* /** * A [TextContent] implementation that is backed by a cache file. 
@@ -14,6 +15,8 @@ import java.nio.file.StandardOpenOption */ class CachedTextContent(override val path: Path, text: String) : TextContent, CachedContent { + override val id: UUID = UUID.randomUUID() + /** The [SoftReference] of the [String] used for caching. */ private var reference: SoftReference = SoftReference(text) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryAudioContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryAudioContent.kt index deb768698..2a7b71358 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryAudioContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryAudioContent.kt @@ -2,6 +2,7 @@ package org.vitrivr.engine.core.model.content.impl.memory import org.vitrivr.engine.core.model.content.element.AudioContent import java.nio.ShortBuffer +import java.util.* /** * A naive in-memory implementation of the [AudioContent] interface. 
@@ -11,4 +12,6 @@ import java.nio.ShortBuffer * @author Ralph Gasser * @version 1.0.0 */ -data class InMemoryAudioContent(override val channels: Short, override val samplingRate: Int, override val content: ShortBuffer) : AudioContent +data class InMemoryAudioContent(override val channels: Short, override val samplingRate: Int, override val content: ShortBuffer) : AudioContent { + override val id: UUID = UUID.randomUUID() +} diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryImageContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryImageContent.kt index 40a9c2b0b..d60d69c36 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryImageContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryImageContent.kt @@ -2,6 +2,7 @@ package org.vitrivr.engine.core.model.content.impl.memory import org.vitrivr.engine.core.model.content.element.ImageContent import java.awt.image.BufferedImage +import java.util.* /** * A naive in-memory implementation of the [ImageContent] interface. @@ -11,4 +12,6 @@ import java.awt.image.BufferedImage * @author Luca Rossetto. 
* @version 1.0.0 */ -data class InMemoryImageContent(override val content: BufferedImage) : ImageContent +data class InMemoryImageContent(override val content: BufferedImage) : ImageContent { + override val id: UUID = UUID.randomUUID() +} diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryMeshContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryMeshContent.kt index a1818ad3c..9360c236d 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryMeshContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryMeshContent.kt @@ -3,6 +3,7 @@ package org.vitrivr.engine.core.model.content.impl.memory import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.content.element.Model3DContent import org.vitrivr.engine.core.model.mesh.Model3D +import java.util.* /** * A naive in-memory implementation of the [ImageContent] interface. @@ -12,4 +13,6 @@ import org.vitrivr.engine.core.model.mesh.Model3D * @author Luca Rossetto. 
* @version 1.0.0 */ -data class InMemoryMeshContent(override val content: Model3D) : Model3DContent +data class InMemoryMeshContent(override val content: Model3D) : Model3DContent { + override val id: UUID = UUID.randomUUID() +} diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryTextContent.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryTextContent.kt index 3e7d449d9..cfb5aa9ea 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryTextContent.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/impl/memory/InMemoryTextContent.kt @@ -2,6 +2,7 @@ package org.vitrivr.engine.core.model.content.impl.memory import kotlinx.serialization.Serializable import org.vitrivr.engine.core.model.content.element.TextContent +import java.util.UUID /** * A naive in-memory implementation of the [TextContent] interface. @@ -9,5 +10,7 @@ import org.vitrivr.engine.core.model.content.element.TextContent * @author Luca Rossetto. 
* @version 1.0.0 */ -@Serializable -data class InMemoryTextContent(override val content: String) : TextContent + +data class InMemoryTextContent(override val content: String) : TextContent { + override val id: UUID = UUID.randomUUID() +} diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt new file mode 100644 index 000000000..ce1c821dd --- /dev/null +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt @@ -0,0 +1,23 @@ +package org.vitrivr.engine.core.model.retrievable.attributes + +import java.util.* +import kotlin.collections.HashMap + +class ContentAuthorAttribute private constructor( + private val authorMap: HashMap> +) : MergingRetrievableAttribute { + + constructor(contentId: UUID, author: String) : this(hashMapOf(contentId to hashSetOf(author))) + + override fun merge(other: MergingRetrievableAttribute): MergingRetrievableAttribute { + val otherMap = (other as ContentAuthorAttribute).authorMap + for ((contentId, authors) in otherMap) { + authorMap.computeIfAbsent(contentId) { hashSetOf() }.addAll(authors) + } + return ContentAuthorAttribute(authorMap) + } + + fun getAuthors(contentId: UUID): Set { + return authorMap[contentId] ?: emptySet() + } +} \ No newline at end of file diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt index e84a0466a..35c95985d 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt @@ -14,6 +14,8 @@ import org.vitrivr.engine.core.model.retrievable.Ingested 
import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.operators.Operator +private val logger = KotlinLogging.logger {} + /** * A [Operator.Sink] that persists the [Ingested] it receives. * @@ -67,6 +69,8 @@ class PersistingSink(override val input: Operator, val context: Ind val writer = f.let { field -> this.descriptorWriters.computeIfAbsent(field) { it.getWriter() } } as? DescriptorWriter writer?.addAll(d) } + + logger.debug { "Persisted ${retrievables.size} retrievables, ${relationships.size} relationships and ${descriptors.values.sumBy { it.size }} descriptors." } } /** diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt new file mode 100644 index 000000000..b3a9baf08 --- /dev/null +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt @@ -0,0 +1,23 @@ +package org.vitrivr.engine.core.operators.transform + +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.flow.Flow +import org.vitrivr.engine.core.context.Context +import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.operators.Operator +import org.vitrivr.engine.core.operators.general.Transformer +import org.vitrivr.engine.core.operators.general.TransformerFactory + +class PassthroughTransformer : TransformerFactory{ + override fun newTransformer(name: String, input: Operator, context: Context): Transformer { + return Instance(input) + } + + private class Instance(input: Operator) : Transformer { + override val input: Operator = input + + override fun toFlow(scope: CoroutineScope): Flow { + return input.toFlow(scope) + } + } +} diff --git a/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory 
b/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory index eab5cd5dc..0059c7e04 100644 --- a/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory +++ b/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory @@ -1,3 +1,4 @@ org.vitrivr.engine.core.operators.transform.map.MapRelationshipTransformer org.vitrivr.engine.core.operators.transform.filter.TypeFilterTransformer -org.vitrivr.engine.core.operators.transform.filter.DistinctTransformer \ No newline at end of file +org.vitrivr.engine.core.operators.transform.filter.DistinctTransformer +org.vitrivr.engine.core.operators.transform.PassthroughTransformer \ No newline at end of file diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt index 0201e1c27..8abd21b78 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt @@ -7,8 +7,10 @@ import org.vitrivr.engine.core.context.Context import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.retrievable.Ingested import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator import org.vitrivr.engine.core.operators.general.Transformer +import java.util.* /** * An abstract [Transformer] implementation for aggregators; aggregators are used to aggregate the content of [Ingested] objects. 
@@ -16,7 +18,7 @@ import org.vitrivr.engine.core.operators.general.Transformer * @author Ralph Gasser * @version 1.1.0 */ -abstract class AbstractAggregator(override val input: Operator, protected open val context: Context) : Transformer { +abstract class AbstractAggregator(override val input: Operator, protected open val context: Context, protected val name: String, val newContent: Boolean = true) : Transformer { /** * Creates a flow for this [AbstractAggregator]. * @@ -27,8 +29,12 @@ abstract class AbstractAggregator(override val input: Operator, override fun toFlow(scope: CoroutineScope): Flow = this.input.toFlow(scope).map { if (it.content.isNotEmpty()) { val aggregated = this.aggregate(it.content) - it.clearContent() - aggregated.forEach { c -> it.addContent(c) } + aggregated.forEach { c -> + if (newContent) { + it.addContent(c) + } + it.addAttribute(ContentAuthorAttribute(c.id, name)) + } it } else { it diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/FirstContentAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/FirstContentAggregator.kt index 77a5956ec..8ae7256e4 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/FirstContentAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/FirstContentAggregator.kt @@ -25,12 +25,12 @@ class FirstContentAggregator : TransformerFactory { * @param context The [IndexContext] to use. 
* @return [FirstContentAggregator.Instance] */ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context) + override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context, name) /** * The [Instance] returned by the [FirstContentAggregator] */ - private class Instance(override val input: Operator, context: Context) : AbstractAggregator(input, context) { + private class Instance(override val input: Operator, context: Context, name: String) : AbstractAggregator(input, context, name) { override fun aggregate(content: List>): List> = content.groupBy { it.type }.mapNotNull { (_, elements) -> elements.firstOrNull() } } } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/LastContentAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/LastContentAggregator.kt index 230ff250e..a58a09a47 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/LastContentAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/LastContentAggregator.kt @@ -25,12 +25,12 @@ class LastContentAggregator : TransformerFactory { * @param context The [IndexContext] to use. 
* @return [LastContentAggregator.Instance] */ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context) + override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context, name) /** * The [Instance] returns by the [LastContentAggregator] */ - private class Instance(override val input: Operator, context: Context) : AbstractAggregator(input, context) { + private class Instance(override val input: Operator, context: Context, name: String) : AbstractAggregator(input, context, name, newContent = false) { override fun aggregate(content: List>): List> = content.groupBy { it.type }.mapNotNull { (_, elements) -> elements.lastOrNull() } } } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/MiddleContentAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/MiddleContentAggregator.kt index e15b8cd7d..015e5b9cb 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/MiddleContentAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/MiddleContentAggregator.kt @@ -25,12 +25,12 @@ class MiddleContentAggregator : TransformerFactory { * @param context The [IndexContext] to use. 
* @return [MiddleContentAggregator.Instance] */ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context) + override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context, name) /** * The [Instance] returns by the [MiddleContentAggregator] */ - private class Instance(override val input: Operator, context: Context) : AbstractAggregator(input, context) { + private class Instance(override val input: Operator, context: Context, name: String) : AbstractAggregator(input, context, name) { override fun aggregate(content: List>): List> = content.groupBy { it.type }.mapNotNull { (_, elements) -> if (elements.isNotEmpty()) { elements[Math.floorDiv(elements.size, 2)] diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/AverageImageContentAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/AverageImageContentAggregator.kt index 12b0be36b..6ac64d4fa 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/AverageImageContentAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/AverageImageContentAggregator.kt @@ -35,12 +35,12 @@ class AverageImageContentAggregator : TransformerFactory { * @param context The [IndexContext] to use. 
* @return [AverageImageContentAggregator.Instance] */ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context as IndexContext) + override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context as IndexContext, name) /** * The [Instance] returns by the [AverageImageContentAggregator] */ - private class Instance(override val input: Operator, override val context: IndexContext) : AbstractAggregator(input, context) { + private class Instance(override val input: Operator, override val context: IndexContext, name: String) : AbstractAggregator(input, context, name) { override fun aggregate(content: List>): List> { /* Filter out images. */ val images = content.filterIsInstance() diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/RepresentativeImageContentAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/RepresentativeImageContentAggregator.kt index 04bf7cb6a..12269499d 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/RepresentativeImageContentAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/image/RepresentativeImageContentAggregator.kt @@ -31,13 +31,13 @@ class RepresentativeImageContentAggregator : TransformerFactory { * @param context The [IndexContext] to use. 
* @return [RepresentativeImageContentAggregator.Instance] */ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context as IndexContext) + override fun newTransformer(name: String, input: Operator, context: Context): Transformer = Instance(input, context as IndexContext, name) /** * The [Instance] returns by the [RepresentativeImageContentAggregator] */ - private class Instance(override val input: Operator, override val context: IndexContext) : AbstractAggregator(input, context) { + private class Instance(override val input: Operator, override val context: IndexContext, name: String) : AbstractAggregator(input, context, name) { override fun aggregate(content: List>): List> { val images = content.filterIsInstance() if (images.isEmpty()) { diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/ImageDecoder.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/ImageDecoder.kt index a53ec9ed4..9c0116732 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/ImageDecoder.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/ImageDecoder.kt @@ -8,6 +8,7 @@ import kotlinx.coroutines.flow.mapNotNull import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.model.retrievable.attributes.SourceAttribute import org.vitrivr.engine.core.operators.ingest.Decoder import org.vitrivr.engine.core.operators.ingest.DecoderFactory @@ -35,12 +36,12 @@ class ImageDecoder : DecoderFactory { * @param input The input [Enumerator]. * @param context The [IndexContext] to use. 
*/ - override fun newDecoder(name: String, input: Enumerator, context: IndexContext): Decoder = Instance(input, context) + override fun newDecoder(name: String, input: Enumerator, context: IndexContext): Decoder = Instance(input, context, name) /** * The [Decoder] returned by this [ImageDecoder]. */ - private class Instance(override val input: Enumerator, private val context: IndexContext) : Decoder { + private class Instance(override val input: Enumerator, private val context: IndexContext, private val name: String) : Decoder { override fun toFlow(scope: CoroutineScope): Flow = this.input.toFlow(scope).mapNotNull { sourceRetrievable -> val source = sourceRetrievable.filteredAttribute(SourceAttribute::class.java)?.source ?: return@mapNotNull null if (source.type != MediaType.IMAGE) { @@ -53,6 +54,7 @@ class ImageDecoder : DecoderFactory { this.context.contentFactory.newImageContent(ImageIO.read(it)) } sourceRetrievable.addContent(content) + sourceRetrievable.addAttribute(ContentAuthorAttribute(content.id, this.name)) logger.info { "Finished decoding image from source '${source.name}' (${source.sourceId})." } /* Return ingested. 
*/ diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt index 57da3b800..34fe349bf 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt @@ -20,6 +20,7 @@ import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.relationship.Relationship import org.vitrivr.engine.core.model.retrievable.Ingested import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.model.retrievable.attributes.SourceAttribute import org.vitrivr.engine.core.model.retrievable.attributes.time.TimeRangeAttribute import org.vitrivr.engine.core.operators.ingest.Decoder @@ -47,7 +48,7 @@ class VideoDecoder : DecoderFactory { val audio = context[name, "audio"]?.let { it.lowercase() == "true" } ?: true val keyFrames = context[name, "keyFrames"]?.let { it.lowercase() == "true" } ?: false val timeWindowMs = context[name, "timeWindowMs"]?.toLongOrNull() ?: 500L - return Instance(input, context, video, audio, keyFrames, timeWindowMs) + return Instance(input, context, video, audio, keyFrames, timeWindowMs, name) } /** @@ -60,6 +61,7 @@ class VideoDecoder : DecoderFactory { private val audio: Boolean = true, private val keyFrames: Boolean = false, private val timeWindowMs: Long = 500L, + private val name : String ) : Decoder { /** [KLogger] instance. 
*/ @@ -115,7 +117,7 @@ class VideoDecoder : DecoderFactory { grabber.imageMode = FrameGrabber.ImageMode.COLOR grabber.sampleMode = FrameGrabber.SampleMode.SHORT - logger.info { "Start decoding source ${source.name} (${source.sourceId})" } + logger.info { "Start decoding source ${source.name} (${source.sourceId}): ${sourceRetrievable.id}" } try { grabber.start() @@ -177,7 +179,7 @@ class VideoDecoder : DecoderFactory { emit(imageBuffer, audioBuffer, grabber, windowEnd, sourceRetrievable, channel) } - logger.info { "Finished decoding video from source '${source.name}' (${source.sourceId})." } + logger.info { "Finished decoding video from source '${source.name}' (${source.sourceId}): ${sourceRetrievable.id}" } } catch (exception: Exception) { error = true logger.error(exception) { "Failed to decode video from source '${source.name}' (${source.sourceId})." } @@ -241,13 +243,18 @@ class VideoDecoder : DecoderFactory { samples.clear() val audio = this.context.contentFactory.newAudioContent(grabber.audioChannels.toShort(), grabber.sampleRate, samples) ingested.addContent(audio) + ingested.addAttribute(ContentAuthorAttribute(audio.id, name)) } /* Prepare and append image content element. */ for (image in emitImage) { - ingested.addContent(this.context.contentFactory.newImageContent(image)) + val imageContent = this.context.contentFactory.newImageContent(image) + ingested.addContent(imageContent) + ingested.addAttribute(ContentAuthorAttribute(imageContent.id, name)) } + logger.debug { "Emitting ingested ${ingested.id} with ${emitImage.size} images and ${emitAudio.size} audio samples: ${ingested.id}" } + /* Emit ingested. 
*/ channel.send(ingested) } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt index e3fb00ab0..50d9ceeb7 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt @@ -10,7 +10,9 @@ import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.factory.ContentFactory import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor +import org.vitrivr.engine.core.model.retrievable.Ingested import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator import org.vitrivr.engine.core.operators.general.Transformer import org.vitrivr.engine.core.operators.general.TransformerFactory @@ -30,26 +32,22 @@ class DescriptorAsContentTransformer : TransformerFactory { input = input, contentFactory = (context as IndexContext).contentFactory, fieldName = context[name, "field"] ?: throw IllegalArgumentException("The descriptor as content transformer requires a field name."), - removeContent = context[name, "removeContent"]?.toBoolean() ?: false + name = name ) } - private class Instance(override val input: Operator, val contentFactory: ContentFactory, val fieldName : String, val removeContent: Boolean) : Transformer { + private class Instance(override val input: Operator, val contentFactory: ContentFactory, val fieldName : String, val name: String) : Transformer { override fun toFlow(scope: CoroutineScope): Flow = flow { input.toFlow(scope).collect { retrievable : Retrievable -> - if 
(removeContent) { - retrievable.clearContent().also { - logger.debug { "Content of retrievable ${retrievable.id} has been removed." } - } - } retrievable.descriptors.filter{ descriptor -> descriptor.field?.fieldName == fieldName }.forEach{ descriptor -> - retrievable.addContent(convertDescriptorToContent(descriptor)).also { - logger.debug { "Descriptor ${descriptor.id} of retrievable ${retrievable.id} has been converted to content element." } - } + val content = convertDescriptorToContent(descriptor) + retrievable.addContent(content) + retrievable.addAttribute(ContentAuthorAttribute(content.id, name)) + logger.debug { "Descriptor ${descriptor.id} of retrievable ${retrievable.id} has been converted to content element." } } emit(retrievable) } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index e9812bd32..7eff3074d 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -8,6 +8,7 @@ import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.metamodel.Schema import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.RetrievableId +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator import java.util.logging.Logger @@ -21,9 +22,10 @@ private val logger: KLogger = KotlinLogging.logger {} * @param A The type of the [ExternalFesAnalyser] to use. 
*/ abstract class FesExtractor, A:ExternalFesAnalyser>( - input: Operator, - field: Schema.Field?, - bufferSize: Int + input: Operator, + field: Schema.Field?, + bufferSize: Int, + private val contentSources : Set? ) : AbstractBatchedExtractor(input, field, bufferSize) { @@ -55,11 +57,16 @@ abstract class FesExtractor, A:ExternalFesAnaly val allContent : List> = retrievables.map { retrievable -> retrievable.findContent { contentItem -> analyser.contentClasses.any { contentClass -> - contentClass.isInstance(contentItem) + contentClass.isInstance(contentItem) && contentSources?.let { sources -> + retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getAuthors(contentItem.id)?.any { it in sources } + } ?: true } }.map{ it as C} } - logger.debug { "Extracting descriptors from ${retrievables.size} retrievables (${allContent.flatten().size} content elements total)." } + + val idString: String = retrievables.joinToString(", ") { it.id.toString() } + + logger.debug { "Extracting descriptors for field ${this.field?.fieldName} from ${retrievables.size} retrievables (${allContent.flatten().size} content elements total): $idString" } val allDescriptors: List> try { diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ASR.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ASR.kt index 7906e089a..719e8ce73 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ASR.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ASR.kt @@ -76,7 +76,8 @@ class ASR : ExternalFesAnalyser() { context: IndexContext ): Extractor { val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, null, batchSize) { + val contentSources = context.getProperty(name, 
"contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId) } @@ -98,7 +99,8 @@ class ASR : ExternalFesAnalyser() { context: IndexContext ): Extractor { val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, field, batchSize) { + val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt index d510689f5..0ede283ce 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt @@ -139,7 +139,8 @@ companion object { ): FesExtractor, DenseEmbedding> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" 
} val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor, DenseEmbedding>(input, field, batchSize) { + val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() + return object : FesExtractor, DenseEmbedding>(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: FloatVectorDescriptor, retrievableId: RetrievableId): FloatVectorDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } @@ -159,7 +160,8 @@ companion object { context: IndexContext ): FesExtractor, DenseEmbedding>{ val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor, DenseEmbedding>(input, null, batchSize) { + val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() + return object : FesExtractor, DenseEmbedding>(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: FloatVectorDescriptor, retrievableId: RetrievableId): FloatVectorDescriptor { return descriptor.copy(retrievableId = retrievableId) } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt index c61a63d49..a12225d57 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt @@ -79,7 +79,8 @@ class ImageCaption : ExternalFesAnalyser() { ): Extractor { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" 
} val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, field, batchSize) { + val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } @@ -101,7 +102,8 @@ class ImageCaption : ExternalFesAnalyser() { context: IndexContext ): Extractor { val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, null, batchSize) { + val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId) } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageClassification.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageClassification.kt index 34d5af6b5..c4d3d3518 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageClassification.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageClassification.kt @@ -100,7 +100,8 @@ class ImageClassification : ExternalFesAnalyser() context: IndexContext ): Extractor { val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, null, batchSize) { 
+ val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: LabelDescriptor, retrievableId: RetrievableId): LabelDescriptor { return descriptor.copy(retrievableId = retrievableId) } @@ -122,7 +123,8 @@ class ImageClassification : ExternalFesAnalyser() context: IndexContext ): Extractor { val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, field, batchSize) { + val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: LabelDescriptor, retrievableId: RetrievableId): LabelDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/OCR.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/OCR.kt index 22ff2219d..a7abcc9f2 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/OCR.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/OCR.kt @@ -62,7 +62,8 @@ class OCR : ExternalFesAnalyser() { override fun newExtractor(field: Schema.Field, input: Operator, context: IndexContext): Extractor { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" 
} val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, field, batchSize) { + val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } @@ -80,7 +81,8 @@ class OCR : ExternalFesAnalyser() { */ override fun newExtractor(name: String, input: Operator, context: IndexContext): Extractor { val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() - return object : FesExtractor(input, null, batchSize) { + val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() + return object : FesExtractor(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId) } diff --git a/vitrivr-engine-module-m3d/src/main/kotlin/org/vitrivr/engine/model3d/decoder/MeshDecoder.kt b/vitrivr-engine-module-m3d/src/main/kotlin/org/vitrivr/engine/model3d/decoder/MeshDecoder.kt index 10a2fed66..3f8df78d4 100644 --- a/vitrivr-engine-module-m3d/src/main/kotlin/org/vitrivr/engine/model3d/decoder/MeshDecoder.kt +++ b/vitrivr-engine-module-m3d/src/main/kotlin/org/vitrivr/engine/model3d/decoder/MeshDecoder.kt @@ -7,6 +7,7 @@ import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.mapNotNull import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import 
org.vitrivr.engine.core.model.retrievable.attributes.SourceAttribute import org.vitrivr.engine.core.operators.ingest.Decoder import org.vitrivr.engine.core.operators.ingest.DecoderFactory @@ -30,11 +31,11 @@ class MeshDecoder : DecoderFactory { * @param input The input [Enumerator]. * @param context The [IndexContext] */ - override fun newDecoder(name: String, input: Enumerator, context: IndexContext): Decoder = Instance(input, context) + override fun newDecoder(name: String, input: Enumerator, context: IndexContext): Decoder = Instance(input, context, name) /** * The [Decoder] returned by this [MeshDecoder]. */ - private class Instance(override val input: Enumerator, private val context: IndexContext) : Decoder { + private class Instance(override val input: Enumerator, private val context: IndexContext, private val name : String) : Decoder { /** [KLogger] instance. */ private val logger: KLogger = KotlinLogging.logger {} @@ -61,6 +62,7 @@ class MeshDecoder : DecoderFactory { } val modelContent = this.context.contentFactory.newMeshContent(model) sourceRetrievable.addContent(modelContent) + sourceRetrievable.addAttribute(ContentAuthorAttribute(modelContent.id, this.name)) sourceRetrievable } catch (e: IOException) { logger.error(e) { "Failed to decode 3D model from $source due to an IO exception." 
} From d237eda5537fd0c989049e757872a4e9d6bed545 Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 10 Jul 2024 13:23:24 +0200 Subject: [PATCH 02/34] adding textquery --- vitrivr-engine-module-fes/build.gradle | 4 +- .../features/external/common/ApiWrapper.kt | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/vitrivr-engine-module-fes/build.gradle b/vitrivr-engine-module-fes/build.gradle index b45bbc161..081cb8b5f 100644 --- a/vitrivr-engine-module-fes/build.gradle +++ b/vitrivr-engine-module-fes/build.gradle @@ -21,10 +21,10 @@ tasks.register('generateFESClient', GenerateTask) { } compileKotlin { - //dependsOn 'generateFESClient' + dependsOn 'generateFESClient' } -//sourcesJar.dependsOn 'generateFESClient' +sourcesJar.dependsOn 'generateFESClient' /* Add generated Open API client as source directory. */ sourceSets { diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt index 0579f9df6..175200c38 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt @@ -419,4 +419,43 @@ class ApiWrapper(private val hostName:String, private val model: String, private logger.info{ "Batched optical character recognition result: $it" } } } + +// /* +// * Method to get the text query embedding for a given text. +// * @param query The text for which to get the embedding. +// * @param instruction The instruction detailing the retrieval task. +// * @return The embedding for the text. 
+// */ +// fun textQueryEmbedding(query: String, instruction: String): kotlin.collections.List { +// logger.info{ "Starting text query embedding for query: \"$query\"" } +// val input = TextQueryEmbeddingInput(query, instruction) +// +// return executeJob( +// taskName = "Text Query Embedding", +// inp = input, +// startJobFunc = { inp -> textQueryEmbeddingApi.newJobApiTasksTextQueryEmbeddingModelJobsPost(model, inp).body() }, +// getJobResultFunc = { jobId -> textQueryEmbeddingApi.getJobResultsApiTasksTextQueryEmbeddingJobsJobGet(jobId).body().let { JobResult(it.status, it.result) } } +// ).embedding.map{it.toFloat()}.also { +// logger.info{ "Text query embedding result: $it" } +// } +// } +// +// /* +// * Method to get the text query embedding for a list of texts. +// * @param query The list of texts for which to get the embedding. +// * @param instruction The instruction detailing the retrieval task. +// */ +// fun textQueryEmbedding(query: kotlin.collections.List, instruction: String): kotlin.collections.List> { +// logger.info{ "Starting batched text query embedding for queries: \"$query\" (batch size: ${query.size})" } +// val input = BatchedTextQueryEmbeddingInput(query, instruction) +// +// return executeJob( +// taskName = "Batched Text Query Embedding", +// inp = input, +// startJobFunc = { inp -> textQueryEmbeddingApi.newBatchedJobApiTasksTextQueryEmbeddingBatchedModelJobsPost(model, inp).body() }, +// getJobResultFunc = { jobId -> textQueryEmbeddingApi.getBatchedJobResultsApiTasksTextQueryEmbeddingBatchedJobsJobGet(jobId).body().let { JobResult(it.status, it.result) } } +// ).map { it.embedding.map{it.toFloat()}}.also { +// logger.info{ "Batched text query embedding result: $it" } +// } +// } } \ No newline at end of file From 3a5dec1a7e0576178c0b8e8aa7163583719f5b9b Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 10 Jul 2024 16:25:00 +0200 Subject: [PATCH 03/34] retrieval task instructions --- .../features/external/common/ApiWrapper.kt | 79 
++++++++++--------- .../implementations/DenseEmbedding.kt | 13 ++- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt index 175200c38..f7f29f60a 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/ApiWrapper.kt @@ -38,6 +38,7 @@ class ApiWrapper(private val hostName:String, private val model: String, private private val objectDetectionApi = ObjectDetectionApi(basePath = hostName, client = okHttpClient) private val automatedSpeechRecognitionApi = AutomatedSpeechRecognitionApi(basePath = hostName, client = okHttpClient) private val opticalCharacterRecognitionApi = OpticalCharacterRecognitionApi(basePath = hostName, client = okHttpClient) + private val textQueryEmbeddingApi = TextQueryEmbeddingApi(basePath = hostName, client = okHttpClient) init { logger.info{ "Initialized API wrapper with host: $hostName, model: $model, timeout: $timeoutSeconds seconds, polling interval: $pollingIntervalMs ms" } @@ -420,42 +421,44 @@ class ApiWrapper(private val hostName:String, private val model: String, private } } -// /* -// * Method to get the text query embedding for a given text. -// * @param query The text for which to get the embedding. -// * @param instruction The instruction detailing the retrieval task. -// * @return The embedding for the text. 
-// */ -// fun textQueryEmbedding(query: String, instruction: String): kotlin.collections.List { -// logger.info{ "Starting text query embedding for query: \"$query\"" } -// val input = TextQueryEmbeddingInput(query, instruction) -// -// return executeJob( -// taskName = "Text Query Embedding", -// inp = input, -// startJobFunc = { inp -> textQueryEmbeddingApi.newJobApiTasksTextQueryEmbeddingModelJobsPost(model, inp).body() }, -// getJobResultFunc = { jobId -> textQueryEmbeddingApi.getJobResultsApiTasksTextQueryEmbeddingJobsJobGet(jobId).body().let { JobResult(it.status, it.result) } } -// ).embedding.map{it.toFloat()}.also { -// logger.info{ "Text query embedding result: $it" } -// } -// } -// -// /* -// * Method to get the text query embedding for a list of texts. -// * @param query The list of texts for which to get the embedding. -// * @param instruction The instruction detailing the retrieval task. -// */ -// fun textQueryEmbedding(query: kotlin.collections.List, instruction: String): kotlin.collections.List> { -// logger.info{ "Starting batched text query embedding for queries: \"$query\" (batch size: ${query.size})" } -// val input = BatchedTextQueryEmbeddingInput(query, instruction) -// -// return executeJob( -// taskName = "Batched Text Query Embedding", -// inp = input, -// startJobFunc = { inp -> textQueryEmbeddingApi.newBatchedJobApiTasksTextQueryEmbeddingBatchedModelJobsPost(model, inp).body() }, -// getJobResultFunc = { jobId -> textQueryEmbeddingApi.getBatchedJobResultsApiTasksTextQueryEmbeddingBatchedJobsJobGet(jobId).body().let { JobResult(it.status, it.result) } } -// ).map { it.embedding.map{it.toFloat()}}.also { -// logger.info{ "Batched text query embedding result: $it" } -// } -// } + /* + * Method to get the text query embedding for a given text. + * @param query The text for which to get the embedding. + * @param instruction The instruction detailing the retrieval task. + * @return The embedding for the text. 
+ */ + fun textQueryEmbedding(query: String, instruction: String): kotlin.collections.List { + logger.info{ "Starting text query embedding for query: \"$query\"" } + val input = TextQueryEmbeddingInput(query, instruction) + + return executeJob( + taskName = "Text Query Embedding", + inp = input, + startJobFunc = { inp -> textQueryEmbeddingApi.newJobApiTasksTextQueryEmbeddingModelJobsPost(model, inp) }, + getJobResultFunc = { jobId -> textQueryEmbeddingApi.getJobResultsApiTasksTextQueryEmbeddingJobsJobGet(jobId).let { JobResult(it.status, it.result) } } + ).embedding.map{it.toFloat()}.also { + logger.info{ "Text query embedding result: $it" } + } + } + + /* + * Method to get the text query embedding for a list of texts. + * @param query The list of texts for which to get the embedding. + * @param instruction The instruction detailing the retrieval task. + * @return The embedding for the texts. + */ + fun textQueryEmbedding(query: kotlin.collections.List, instruction: String): kotlin.collections.List> { + logger.info{ "Starting batched text query embedding for queries: \"$query\" (batch size: ${query.size})" } + val input = BatchedTextQueryEmbeddingInput(query, instruction) + + return executeJob( + taskName = "Batched Text Query Embedding", + inp = input, + startJobFunc = { inp -> textQueryEmbeddingApi.newBatchedJobApiTasksTextQueryEmbeddingBatchedModelJobsPost(model, inp) }, + getJobResultFunc = { jobId -> textQueryEmbeddingApi.getBatchedJobResultsApiTasksTextQueryEmbeddingBatchedJobsJobGet(jobId).let { JobResult(it.status, it.result) } } + ).map { it.embedding.map{it.toFloat()}}.also { + logger.info{ "Batched text query embedding result: $it" } + } + } + } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt index 0ede283ce..80ce8e274 100644 
--- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/DenseEmbedding.kt @@ -56,7 +56,11 @@ companion object { imageResults = apiWrapper.imageEmbedding(imageContents.map { it.content }) } if (textContents.isNotEmpty()) { - textResults = apiWrapper.textEmbedding(textContents.map { it.content }) + if ("retrievalTaskInstructions" in parameters) { + textResults = apiWrapper.textQueryEmbedding(textContents.map { it.content }, parameters["retrievalTaskInstructions"]!!) + }else{ + textResults = apiWrapper.textEmbedding(textContents.map { it.content }) + } } return content.map { element -> @@ -115,8 +119,13 @@ companion object { */ override fun newRetrieverForContent(field: Schema.Field, FloatVectorDescriptor>, content: Collection>, context: QueryContext): Retriever, FloatVectorDescriptor> { + val parameters = field.parameters.toMutableMap() + /* Prepare query parameters. */ - val vector = analyse(content.first(), field.parameters) + if (context.getProperty(field.fieldName, "retrievalTaskInstructions") != null) { + parameters["retrievalTaskInstructions"] = context.getProperty(field.fieldName, "retrievalTaskInstructions")!! 
+ } + val vector = analyse(content.first(), parameters) val k = context.getProperty(field.fieldName, "limit")?.toLongOrNull() ?: 1000L val fetchVector = context.getProperty(field.fieldName, "returnDescriptor")?.toBooleanStrictOrNull() ?: false From d13422cad119040be3eca3522aed7ed8bb968b97 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Thu, 11 Jul 2024 16:33:20 +0200 Subject: [PATCH 04/34] Initial draft of ContentMergingTransformer --- .../transform/ContentMergingTransformer.kt | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt new file mode 100644 index 000000000..011245154 --- /dev/null +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -0,0 +1,54 @@ +package org.vitrivr.engine.index.transform + +import io.github.oshai.kotlinlogging.KotlinLogging +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.flow +import org.vitrivr.engine.core.context.Context +import org.vitrivr.engine.core.context.IndexContext +import org.vitrivr.engine.core.model.content.factory.ContentFactory +import org.vitrivr.engine.core.model.retrievable.Retrievable +import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute +import org.vitrivr.engine.core.operators.Operator +import org.vitrivr.engine.core.operators.general.Transformer +import org.vitrivr.engine.core.operators.general.TransformerFactory + +private val logger = KotlinLogging.logger {} + +class ContentMergingTransformer : TransformerFactory { + override fun newTransformer(name: String, input: Operator, context: Context): Transformer { + val contentFields = 
context[name, "contentFields"]?.split(",") ?: throw IllegalArgumentException("The content merging transformer requires a list of content fields.") + return Instance( + input = input, + contentFactory = (context as IndexContext).contentFactory, + contentFields = contentFields, + name = name + ) + } + + private class Instance( + override val input: Operator, + val contentFactory: ContentFactory, + val contentFields: List, + val name: String + ) : Transformer { + override fun toFlow(scope: CoroutineScope): Flow = flow { + input.toFlow(scope).collect { retrievable: Retrievable -> + val mergedContent = StringBuilder() + contentFields.forEach { fieldName -> + retrievable.getContent(fieldName)?.let { content -> + mergedContent.append(content) + mergedContent.append("\n") + } + } + if (mergedContent.isNotEmpty()) { + val content = contentFactory.newTextContent(mergedContent.toString().trim()) + retrievable.addContent(content) + retrievable.addAttribute(ContentAuthorAttribute(content.id, name)) + logger.debug { "Contents from fields $contentFields of retrievable ${retrievable.id} have been merged into a single content element." } + } + emit(retrievable) + } + } + } +} \ No newline at end of file From f19fe95314aef6841e1a8215a96e23cf56c1c266 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Mon, 15 Jul 2024 17:02:24 +0200 Subject: [PATCH 05/34] Extended ContentMergingTransformer to include a template and fill matching fields. 
--- .../transform/ContentMergingTransformer.kt | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index 011245154..e00be57b9 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -7,6 +7,7 @@ import kotlinx.coroutines.flow.flow import org.vitrivr.engine.core.context.Context import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.factory.ContentFactory +import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator @@ -18,34 +19,36 @@ private val logger = KotlinLogging.logger {} class ContentMergingTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { val contentFields = context[name, "contentFields"]?.split(",") ?: throw IllegalArgumentException("The content merging transformer requires a list of content fields.") + val template = context[name, "template"] ?: throw IllegalArgumentException("The content merging transformer requires a template.") return Instance( input = input, contentFactory = (context as IndexContext).contentFactory, contentFields = contentFields, + template = template, name = name ) } - private class Instance( - override val input: Operator, - val contentFactory: ContentFactory, - val contentFields: List, - val name: String - ) : Transformer { + private class Instance(override val input: Operator, val contentFactory: ContentFactory, 
val contentFields: List, val template: String, val name: String) : Transformer { override fun toFlow(scope: CoroutineScope): Flow = flow { input.toFlow(scope).collect { retrievable: Retrievable -> - val mergedContent = StringBuilder() + var mergedContent = template + contentFields.forEach { fieldName -> - retrievable.getContent(fieldName)?.let { content -> - mergedContent.append(content) - mergedContent.append("\n") - } + val placeholder = "\$$fieldName" + val contentText = retrievable.descriptors.find { + it.field?.fieldName == fieldName + }?.let { descriptor -> + if (descriptor is StringDescriptor) descriptor.value.value else "" + } ?: "" + mergedContent = mergedContent.replace(placeholder, contentText) } - if (mergedContent.isNotEmpty()) { - val content = contentFactory.newTextContent(mergedContent.toString().trim()) + + if (mergedContent.isNotBlank()) { + val content = contentFactory.newTextContent(mergedContent.trim()) retrievable.addContent(content) retrievable.addAttribute(ContentAuthorAttribute(content.id, name)) - logger.debug { "Contents from fields $contentFields of retrievable ${retrievable.id} have been merged into a single content element." } + logger.debug { "Contents from fields $contentFields of retrievable ${retrievable.id} have been merged into a single content element using template." 
} } emit(retrievable) } From 6fe6bbb064c90b9b99741430b425d4b0eccf22bb Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 17 Jul 2024 09:55:44 +0200 Subject: [PATCH 06/34] added unnormalized fusion and fixed score bug --- .../descriptors/vector/VectorDescriptorReader.kt | 2 +- .../engine/query/aggregate/WeightedScoreFusion.kt | 12 ++++++++++-- .../query/aggregate/WeightedScoreFusionFactory.kt | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/descriptors/vector/VectorDescriptorReader.kt b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/descriptors/vector/VectorDescriptorReader.kt index 53508600e..0a8e344f7 100644 --- a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/descriptors/vector/VectorDescriptorReader.kt +++ b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/descriptors/vector/VectorDescriptorReader.kt @@ -83,7 +83,7 @@ internal class VectorDescriptorReader(field: Schema.Field<*, VectorDescriptor<*> val descriptors = this.connection.client.query(cottontailQuery).asSequence().map { tuple -> val scoreIndex = tuple.indexOf(DISTANCE_COLUMN_NAME) tupleToDescriptor(tuple) to if (scoreIndex > -1) { - tuple.asFloat(DISTANCE_COLUMN_NAME)?.let { DistanceAttribute(it) } + tuple.asDouble(DISTANCE_COLUMN_NAME)?.let { DistanceAttribute(it.toFloat()) } } else { null } diff --git a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt index 0142ef43b..8ba168697 100644 --- a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt +++ b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt @@ -10,12 +10,14 @@ import 
org.vitrivr.engine.core.model.retrievable.RetrievableId import org.vitrivr.engine.core.model.retrievable.attributes.ScoreAttribute import org.vitrivr.engine.core.operators.Operator import org.vitrivr.engine.core.operators.general.Aggregator +import java.util.* import kotlin.math.pow class WeightedScoreFusion( override val inputs: List>, weights: List, - val p: Float + val p: Float, + val normalize: Boolean ) : Aggregator { private val weights: List = when { @@ -81,7 +83,8 @@ class WeightedScoreFusion( else { score = retrieveds.map { ((it.second.filteredAttribute(ScoreAttribute::class.java))?.score ?: 0f).pow(p) * weights[it.first] - }.sum().pow(1 / p) / normalization + }.sum().pow(1 / p) + if (normalize) score /= normalization } first = retrieveds.first().second } @@ -91,6 +94,11 @@ class WeightedScoreFusion( retrieved.filteredAttribute(ScoreAttribute::class.java) retrieved.addAttribute(ScoreAttribute.Unbound(score)) + if(retrieved.id == UUID.fromString("aea15b88-79a2-4da7-8612-95ed65cf6475")){ + println("Score: $score") + println(retrieveds) + } + emit(retrieved) } diff --git a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusionFactory.kt b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusionFactory.kt index a315d6bd1..1d60a6e9a 100644 --- a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusionFactory.kt +++ b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusionFactory.kt @@ -17,9 +17,10 @@ class WeightedScoreFusionFactory : AggregatorFactory { ): Aggregator { val weights = context[name, "weights"]?.split(",")?.mapNotNull { s -> s.trim().toFloatOrNull() } ?: emptyList() val p = context[name, "p"]?.toFloatOrNull() ?: 1f + val normalize = context[name, "normalize"]?.toBoolean() ?: true if (p == Float.POSITIVE_INFINITY && weights.isNotEmpty()) { logger.warn { "Weights are ignored when p is set to infinity" } } - 
return WeightedScoreFusion(inputs, weights, p) + return WeightedScoreFusion(inputs, weights, p, normalize) } } \ No newline at end of file From 62a3f7e3b33b4eb9bcf7353f4637945db68baba9 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Wed, 17 Jul 2024 14:50:07 +0200 Subject: [PATCH 07/34] Tested ContentMergingTransformer on sample pipeline; pipeline config files to be removed at a later stage. --- test-pipeline.json | 127 ++++++++++++++++++ test-schema.json | 86 ++++++++++++ .../transform/ContentMergingTransformer.kt | 6 + ....core.operators.general.TransformerFactory | 1 + 4 files changed, 220 insertions(+) create mode 100644 test-pipeline.json create mode 100644 test-schema.json diff --git a/test-pipeline.json b/test-pipeline.json new file mode 100644 index 000000000..7091e4fc7 --- /dev/null +++ b/test-pipeline.json @@ -0,0 +1,127 @@ +{ + "schema": "test", + "context": { + "contentFactory": "CachedContentFactory", + "resolverName": "disk", + "local": { + "content": { + "path": "../cache" + }, + "enumerator": { + "path": "../benchmark/media_objects", + "depth": "5" + }, + "image_source_filter": { + "type": "SOURCE:IMAGE" + }, + "video_source_filter": { + "type": "SOURCE:VIDEO" + }, + "ocr_content": { + "field": "ocr_sparse", + "removeContent": "true" + }, + "asr_content": { + "field": "asr_sparse", + "removeContent": "true" + }, + "caption_content": { + "field": "caption_sparse", + "removeContent": "true" + }, + "video_decoder": { + "timeWindowMs": "10000" + }, + "ocr_sparse": { + "contentSources": "image_decoder,selector" + }, + "caption_sparse": { + "contentSources": "image_decoder,selector" + }, + "asr_sparse": { + "contentSources": "video_decoder" + }, + "merge_prompt": { + "contentFields": "asr_sparse,caption_sparse,ocr_sparse", + "template": "test $asr_sparse ASR \n $caption_sparse CAPTION \n $ocr_sparse OCR" + } + } + }, + "operators": { + "passthrough": { + "type": "TRANSFORMER", + "factory": "PassthroughTransformer" + }, + "enumerator": { + "type": 
"ENUMERATOR", + "factory": "FileSystemEnumerator", + "mediaTypes": ["IMAGE", "VIDEO"] + }, + "image_decoder": { + "type": "DECODER", + "factory": "ImageDecoder" + }, + "video_decoder": { + "type": "DECODER", + "factory": "VideoDecoder" + }, + "file_metadata":{ + "type": "EXTRACTOR", + "fieldName": "file" + }, + "ocr_sparse": { + "type": "EXTRACTOR", + "fieldName": "ocr_sparse" + }, + "caption_sparse": { + "type": "EXTRACTOR", + "fieldName": "caption_sparse" + }, + "asr_sparse": { + "type": "EXTRACTOR", + "fieldName": "asr_sparse" + }, + "ocr_content": { + "type": "TRANSFORMER", + "factory": "DescriptorAsContentTransformer" + }, + "asr_content": { + "type": "TRANSFORMER", + "factory": "DescriptorAsContentTransformer" + }, + "caption_content": { + "type": "TRANSFORMER", + "factory": "DescriptorAsContentTransformer" + }, + "merge_prompt": { + "type": "TRANSFORMER", + "factory": "ContentMergingTransformer" + }, + "selector": { + "type": "TRANSFORMER", + "factory": "LastContentAggregator" + }, + "time":{ + "type": "EXTRACTOR", + "fieldName": "time" + } + }, + "operations": { + "enumerator-stage": {"operator": "enumerator"}, + "video-decoder-stage": {"operator": "video_decoder", "inputs": ["enumerator-stage"]}, + "time-stage": {"operator": "time","inputs": ["video-decoder-stage"]}, + "image-decoder-stage": {"operator": "image_decoder", "inputs": ["enumerator-stage"]}, + "selector-stage": {"operator": "selector", "inputs": ["time-stage"]}, + "video-ocr-sparse-stage": {"operator": "ocr_sparse", "inputs": ["selector-stage"]}, + "video-ocr-content-stage": {"operator": "ocr_content", "inputs": ["video-ocr-sparse-stage"]}, + "video-caption-sparse-stage": {"operator": "caption_sparse", "inputs": ["selector-stage"]}, + "video-caption-content-stage": {"operator": "caption_content", "inputs": ["video-caption-sparse-stage"]}, + "asr-sparse-stage": {"operator": "asr_sparse", "inputs": ["time-stage"]}, + "asr-content-stage": {"operator": "asr_content", "inputs": 
["asr-sparse-stage"]}, + "prompt": {"operator": "merge_prompt", "inputs": ["asr-content-stage", "video-caption-content-stage", "video-ocr-content-stage"], "merge": "COMBINE"} + }, + "output": [ + "prompt" + ], + "mergeType": "MERGE" +} \ No newline at end of file diff --git a/test-schema.json b/test-schema.json new file mode 100644 index 000000000..475bca28a --- /dev/null +++ b/test-schema.json @@ -0,0 +1,86 @@ +{ + "schemas": [ + { + "name": "test", + "connection": { + "database": "CottontailConnectionProvider", + "parameters": { + "Host": "127.0.0.1", + "port": "1865" + } + }, + "fields": [ + { + "name": "averagecolor", + "factory": "AverageColor" + }, + { + "name": "file", + "factory": "FileSourceMetadata" + }, + { + "name": "time", + "factory": "TemporalMetadata" + }, + { + "name": "video", + "factory": "VideoSourceMetadata" + }, + { + "name": "asr_sparse", + "factory": "ASR", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "whisper", + "timeoutSeconds": "100", + "retries":"1000" + } + }, + { + "name": "caption_sparse", + "factory": "ImageCaption", + "parameters": { + "host": "http://10.34.64.84:8888/", + "timeoutSeconds": "100", + "retries":"1000" + } + }, + { + "name": "ocr_sparse", + "factory": "OCR", + "parameters": { + "host": "http://10.34.64.84:8888/", + "model": "tesseract", + "timeoutSeconds": "100", + "retries":"1000" + } + } + ], + "resolvers": { + "disk": { + "factory": "DiskResolver", + "parameters": { + "location": "../thumbnails" + } + } + }, + "exporters": [ + { + "name": "thumbnail", + "factory": "ThumbnailExporter", + "resolverName": "disk", + "parameters": { + "maxSideResolution": "400", + "mimeType": "JPG" + } + } + ], + "extractionPipelines": [ + { + "name": "full", + "path": "./test-pipeline.json" + } + ] + } + ] +} \ No newline at end of file diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt 
b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index e00be57b9..ddd226bda 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -16,6 +16,12 @@ import org.vitrivr.engine.core.operators.general.TransformerFactory private val logger = KotlinLogging.logger {} +/** + * A [Transformer] that takes an input template with placeholders and inserts content from fields in their place. + * + * @author Laura Rettig + * @version 1.0.0 + */ class ContentMergingTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { val contentFields = context[name, "contentFields"]?.split(",") ?: throw IllegalArgumentException("The content merging transformer requires a list of content fields.") diff --git a/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory b/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory index 8d2630c6e..440b9de87 100644 --- a/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory +++ b/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory @@ -1,4 +1,5 @@ org.vitrivr.engine.index.transform.ContentSamplingTransformer +org.vitrivr.engine.index.transform.ContentMergingTransformer org.vitrivr.engine.index.transform.DescriptorAsContentTransformer org.vitrivr.engine.index.transform.LabelFilterTransformer From 1c83e7e3a953e4182cf599900e73d4683496a062 Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 17 Jul 2024 15:35:24 +0200 Subject: [PATCH 08/34] bidirectional content author maps --- .../attributes/ContentAuthorAttribute.kt | 22 
++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt index ce1c821dd..3805d8043 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/retrievable/attributes/ContentAuthorAttribute.kt @@ -4,20 +4,32 @@ import java.util.* import kotlin.collections.HashMap class ContentAuthorAttribute private constructor( - private val authorMap: HashMap> + private val authorMap: HashMap>, + private val contentMap: HashMap> ) : MergingRetrievableAttribute { - constructor(contentId: UUID, author: String) : this(hashMapOf(contentId to hashSetOf(author))) + constructor(contentId: UUID, author: String) : this(hashMapOf(contentId to hashSetOf(author)), hashMapOf(author to hashSetOf(contentId))) override fun merge(other: MergingRetrievableAttribute): MergingRetrievableAttribute { - val otherMap = (other as ContentAuthorAttribute).authorMap - for ((contentId, authors) in otherMap) { + val otherAuthorMap = (other as ContentAuthorAttribute).authorMap + for ((contentId, authors) in otherAuthorMap) { authorMap.computeIfAbsent(contentId) { hashSetOf() }.addAll(authors) } - return ContentAuthorAttribute(authorMap) + + val otherContentMap = other.contentMap + for ((author, contentIds) in otherContentMap) { + contentMap.computeIfAbsent(author) { hashSetOf() }.addAll(contentIds) + } + + return ContentAuthorAttribute(authorMap, contentMap) } fun getAuthors(contentId: UUID): Set { return authorMap[contentId] ?: emptySet() } + + fun getContentIds(author: String): Set { + return contentMap[author] ?: emptySet() + } + } \ No newline at end of file From a08ab40f9513e52f3093efb5f1332076e95a7127 Mon Sep 17 
00:00:00 2001 From: faberf Date: Wed, 17 Jul 2024 18:12:43 +0200 Subject: [PATCH 09/34] allow image captioning to use content as prompt --- .../external/implementations/ImageCaption.kt | 103 ++++++++++++++---- 1 file changed, 81 insertions(+), 22 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt index a12225d57..c3404ca0b 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt @@ -1,10 +1,13 @@ package org.vitrivr.engine.base.features.external.implementations +import io.github.oshai.kotlinlogging.KLogger +import io.github.oshai.kotlinlogging.KotlinLogging import org.vitrivr.engine.base.features.external.common.ApiWrapper import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.base.features.external.common.FesExtractor import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.context.QueryContext +import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.content.element.TextContent import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor @@ -21,13 +24,17 @@ import org.vitrivr.engine.module.features.feature.fulltext.FulltextRetriever import java.util.* +private val logger: KLogger = KotlinLogging.logger {} + + + /** * Implementation of the [ImageCaption] [ExternalFesAnalyser] that uses the [ApiWrapper] to extract captions from images. 
* * @author Fynn Faber * @version 1.0.0 */ -class ImageCaption : ExternalFesAnalyser() { +class ImageCaption : ExternalFesAnalyser, StringDescriptor>() { companion object { const val PROMPT_PARAMETER_NAME = "prompt" } @@ -35,23 +42,75 @@ class ImageCaption : ExternalFesAnalyser() { override val defaultModel = "blip2" /** - * Analyse the provided [ImageContent] using the provided [apiWrapper] and return a list of [StringDescriptor]s. - * If the prompt parameter is set, the prompt is used for conditional captioning. - * - * @param content List of [ImageContent] to analyse. - * @param apiWrapper [ApiWrapper] to use for the analysis. + * Analyse the provided [ImageContent] using the texts as prompts and the provided [apiWrapper] and return a list of [StringDescriptor]s. + * If the textContent parameter is set, the prompt is used for conditional captioning. + * + * @param content List of [ImageContent] to analyse. + * @param apiWrapper [ApiWrapper] to use for the analysis. + */ + fun makeCaption(imageContent: List, text: List, apiWrapper: ApiWrapper): List { + val withTextIndices = text.mapIndexedNotNull { index, t -> if (t != null) index to t else null } + val withoutTextIndices = text.mapIndexedNotNull { index, t -> if (t == null) index else null } + + val withTextResults = if (withTextIndices.isNotEmpty()) { + val imageContentsWithText = withTextIndices.map { imageContent[it.first].content } + val textsWithText = withTextIndices.map { it.second } + val results = apiWrapper.conditionalImageCaptioning(imageContentsWithText, textsWithText) + withTextIndices.mapIndexed { index, pair -> pair.first to results[index] } + } else { + emptyList() + } + + val withoutTextResults = if (withoutTextIndices.isNotEmpty()) { + val imageContentsWithoutText = withoutTextIndices.map { imageContent[it].content } + val results = apiWrapper.imageCaptioning(imageContentsWithoutText) + withoutTextIndices.mapIndexed { index, i -> i to results[index] } + } else { + emptyList() + } + + val 
mergedResults = (withTextResults + withoutTextResults).sortedBy { it.first } + return mergedResults.map { StringDescriptor(UUID.randomUUID(), null, Value.String(it.second)) } + } + + /** + * Analyse the provided [content] using the provided [apiWrapper] and return a list of [StringDescriptor]s. + * + * @param content Nested list of [ContentElement] to analyse. + * @param apiWrapper [ApiWrapper] to use for the analysis. + * @param parameters Parameters to use for the analysis. */ - override fun analyseFlattened(content: List, apiWrapper: ApiWrapper, parameters: Map): List> { - val prompt = parameters[PROMPT_PARAMETER_NAME] - if (prompt != null) { - val result = apiWrapper.conditionalImageCaptioning(content.map { it.content }, List(content.size) { prompt }) - return result.map { listOf(StringDescriptor(UUID.randomUUID(), null, Value.String(it))) } + override fun analyse( + content: List>>, + apiWrapper: ApiWrapper, + parameters: Map + ): List> { + val promptDefault = parameters[PROMPT_PARAMETER_NAME] + + val imageContents = content.map { it.filterIsInstance() } + + val texts = content.map { it.filterIsInstance().map { it.content } }.mapIndexed { + index, text -> if (text.isEmpty()) { + List(imageContents[index].size) { promptDefault } + } else { + if (text.size != 1) { + logger.warn { "Text content has more than one element. Only the first element will be used as an image captioning prompt." 
} + } + List(imageContents[index].size) { text.first() } } - val result = apiWrapper.imageCaptioning(content.map { it.content }) - return result.map { listOf(StringDescriptor(UUID.randomUUID(), null, Value.String(it))) } + } + + val flatResults = makeCaption(imageContents.flatten(), texts.flatten(), apiWrapper) + var index = 0 + return content.map { innerList -> + innerList.map { _ -> + flatResults[index++] + } + } + } - override val contentClasses = setOf(ImageContent::class) + override val contentClasses = setOf(ImageContent::class, TextContent::class) override val descriptorClass = StringDescriptor::class /** @@ -73,14 +132,14 @@ class ImageCaption : ExternalFesAnalyser() { * @throws [UnsupportedOperationException], if this [ExternalFesAnalyser] does not support the creation of an [Extractor] instance. */ override fun newExtractor( - field: Schema.Field, + field: Schema.Field, StringDescriptor>, input: Operator, context: IndexContext - ): Extractor { + ): Extractor, StringDescriptor> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" 
} val batchSize = context.getProperty(field.fieldName, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() val contentSources = context.getProperty(field.fieldName, "contentSources")?.split(",")?.toSet() - return object : FesExtractor(input, field, batchSize, contentSources) { + return object : FesExtractor, ImageCaption>(input, field, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId, field = field) } @@ -100,10 +159,10 @@ class ImageCaption : ExternalFesAnalyser() { name: String, input: Operator, context: IndexContext - ): Extractor { + ): Extractor, StringDescriptor> { val batchSize = context.getProperty(name, BATCHSIZE_PARAMETER_NAME)?.toIntOrNull() ?: BATCHSIZE_PARAMETER_DEFAULT.toInt() val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() - return object : FesExtractor(input, null, batchSize, contentSources) { + return object : FesExtractor, ImageCaption>(input, null, batchSize, contentSources) { override fun assignRetrievableId(descriptor: StringDescriptor, retrievableId: RetrievableId): StringDescriptor { return descriptor.copy(retrievableId = retrievableId) } @@ -119,7 +178,7 @@ class ImageCaption : ExternalFesAnalyser() { * * @return A new [FulltextRetriever] instance for this [ExternalFesAnalyser] */ - override fun newRetrieverForQuery(field: Schema.Field, query: Query, context: QueryContext): Retriever { + override fun newRetrieverForQuery(field: Schema.Field, StringDescriptor>, query: Query, context: QueryContext): Retriever, StringDescriptor> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" } require(query is SimpleFulltextQuery) { "The query is not a fulltext query. This is a programmer's error!" 
} return FulltextRetriever(field, query, context) @@ -133,7 +192,7 @@ class ImageCaption : ExternalFesAnalyser() { * @param context The [QueryContext] to use with the [Retriever] * @return [FulltextRetriever] */ - override fun newRetrieverForContent(field: Schema.Field, content: Collection, context: QueryContext): Retriever { + override fun newRetrieverForContent(field: Schema.Field, StringDescriptor>, content: Collection>, context: QueryContext): Retriever, StringDescriptor> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" } /* Prepare query parameters. */ @@ -144,4 +203,4 @@ class ImageCaption : ExternalFesAnalyser() { } -} \ No newline at end of file +} From 01d0ee3276a9bf4d9c7cffa3e0272e487ef46f7a Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Thu, 18 Jul 2024 12:53:15 +0200 Subject: [PATCH 10/34] Updated ContentMergingTransformer to transform content to content rather than descriptor to content. 
--- test-pipeline.json | 5 ++-- .../transform/ContentMergingTransformer.kt | 24 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/test-pipeline.json b/test-pipeline.json index 7091e4fc7..97241810f 100644 --- a/test-pipeline.json +++ b/test-pipeline.json @@ -42,8 +42,9 @@ "contentSources": "video_decoder" }, "merge_prompt": { - "contentFields": "asr_sparse,caption_sparse,ocr_sparse", - "template": "test $asr_sparse ASR \n $caption_sparse CAPTION \n $ocr_sparse OCR" + "contentFields": "asr_content,caption_content,ocr_content", + "template": "test $asr_content ASR \n $caption_content CAPTION \n $ocr_content OCR", + "defaultValue": "no content provided" } } }, diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index ddd226bda..b81eb667c 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -6,6 +6,7 @@ import kotlinx.coroutines.flow.Flow import kotlinx.coroutines.flow.flow import org.vitrivr.engine.core.context.Context import org.vitrivr.engine.core.context.IndexContext +import org.vitrivr.engine.core.model.content.ContentType import org.vitrivr.engine.core.model.content.factory.ContentFactory import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor import org.vitrivr.engine.core.model.retrievable.Retrievable @@ -13,6 +14,8 @@ import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribu import org.vitrivr.engine.core.operators.Operator import org.vitrivr.engine.core.operators.general.Transformer import org.vitrivr.engine.core.operators.general.TransformerFactory +import java.time.Year +import javax.swing.text.AbstractDocument.Content private val logger = 
KotlinLogging.logger {} @@ -26,28 +29,35 @@ class ContentMergingTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { val contentFields = context[name, "contentFields"]?.split(",") ?: throw IllegalArgumentException("The content merging transformer requires a list of content fields.") val template = context[name, "template"] ?: throw IllegalArgumentException("The content merging transformer requires a template.") + val defaultValue = context[name, "defaultValue"] ?: "" return Instance( input = input, contentFactory = (context as IndexContext).contentFactory, contentFields = contentFields, template = template, + defaultValue = defaultValue, name = name ) } - private class Instance(override val input: Operator, val contentFactory: ContentFactory, val contentFields: List, val template: String, val name: String) : Transformer { + private class Instance(override val input: Operator, val contentFactory: ContentFactory, val contentFields: List, val template: String, val defaultValue: String, val name: String) : Transformer { override fun toFlow(scope: CoroutineScope): Flow = flow { input.toFlow(scope).collect { retrievable: Retrievable -> var mergedContent = template contentFields.forEach { fieldName -> val placeholder = "\$$fieldName" - val contentText = retrievable.descriptors.find { - it.field?.fieldName == fieldName - }?.let { descriptor -> - if (descriptor is StringDescriptor) descriptor.value.value else "" - } ?: "" - mergedContent = mergedContent.replace(placeholder, contentText) + val contentIds = retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getContentIds(fieldName) + val fieldContent = StringBuilder() + contentIds?.forEach{ id -> + retrievable.content.find { + it.id == id && it.type == ContentType.TEXT + }?.content?.let { + fieldContent.append(it) + } + } + val finalContent = if (fieldContent.isEmpty()) defaultValue else fieldContent.toString() + mergedContent = 
mergedContent.replace(placeholder, finalContent) } if (mergedContent.isNotBlank()) { From f64358c95d766ca771f4b7483ecf2103b9937625 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Thu, 18 Jul 2024 12:57:58 +0200 Subject: [PATCH 11/34] Refactored parameters to reduce redundancy: content to include taken from template rather than given separately. --- test-pipeline.json | 1 - .../index/transform/ContentMergingTransformer.kt | 11 +++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test-pipeline.json b/test-pipeline.json index 97241810f..055f5e54e 100644 --- a/test-pipeline.json +++ b/test-pipeline.json @@ -42,7 +42,6 @@ "contentSources": "video_decoder" }, "merge_prompt": { - "contentFields": "asr_content,caption_content,ocr_content", "template": "test $asr_content ASR \n $caption_content CAPTION \n $ocr_content OCR", "defaultValue": "no content provided" } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index b81eb667c..dc39162e4 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -27,27 +27,29 @@ private val logger = KotlinLogging.logger {} */ class ContentMergingTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { - val contentFields = context[name, "contentFields"]?.split(",") ?: throw IllegalArgumentException("The content merging transformer requires a list of content fields.") val template = context[name, "template"] ?: throw IllegalArgumentException("The content merging transformer requires a template.") val defaultValue = context[name, "defaultValue"] ?: "" return Instance( input = input, contentFactory = (context as 
IndexContext).contentFactory, - contentFields = contentFields, template = template, defaultValue = defaultValue, name = name ) } - private class Instance(override val input: Operator, val contentFactory: ContentFactory, val contentFields: List, val template: String, val defaultValue: String, val name: String) : Transformer { + private class Instance(override val input: Operator, val contentFactory: ContentFactory, val template: String, val defaultValue: String, val name: String) : Transformer { override fun toFlow(scope: CoroutineScope): Flow = flow { input.toFlow(scope).collect { retrievable: Retrievable -> var mergedContent = template + val regex = "\\$\\{([^}]+)\\}".toRegex() + val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() + contentFields.forEach { fieldName -> val placeholder = "\$$fieldName" val contentIds = retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getContentIds(fieldName) + val fieldContent = StringBuilder() contentIds?.forEach{ id -> retrievable.content.find { @@ -56,6 +58,7 @@ class ContentMergingTransformer : TransformerFactory { fieldContent.append(it) } } + val finalContent = if (fieldContent.isEmpty()) defaultValue else fieldContent.toString() mergedContent = mergedContent.replace(placeholder, finalContent) } @@ -64,7 +67,7 @@ class ContentMergingTransformer : TransformerFactory { val content = contentFactory.newTextContent(mergedContent.trim()) retrievable.addContent(content) retrievable.addAttribute(ContentAuthorAttribute(content.id, name)) - logger.debug { "Contents from fields $contentFields of retrievable ${retrievable.id} have been merged into a single content element using template." } + logger.debug { "Contents from retrievable ${retrievable.id} have been merged into a single content element using template." 
} } emit(retrievable) } From 93ec49a6707e7dcee16a13147bdf3fb38b65ee83 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Wed, 24 Jul 2024 13:10:56 +0200 Subject: [PATCH 12/34] Refactor regex to not be applied for each retrievable --- .../engine/index/transform/ContentMergingTransformer.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index dc39162e4..b0f2f4dca 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -28,24 +28,24 @@ private val logger = KotlinLogging.logger {} class ContentMergingTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { val template = context[name, "template"] ?: throw IllegalArgumentException("The content merging transformer requires a template.") + val regex = "\\$\\{([^}]+)\\}".toRegex() + val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() val defaultValue = context[name, "defaultValue"] ?: "" return Instance( input = input, contentFactory = (context as IndexContext).contentFactory, template = template, + contentFields = contentFields, defaultValue = defaultValue, name = name ) } - private class Instance(override val input: Operator, val contentFactory: ContentFactory, val template: String, val defaultValue: String, val name: String) : Transformer { + private class Instance(override val input: Operator, val contentFactory: ContentFactory, val template: String, val contentFields: List, val defaultValue: String, val name: String) : Transformer { override fun toFlow(scope: CoroutineScope): Flow = flow { input.toFlow(scope).collect { retrievable: 
Retrievable -> var mergedContent = template - val regex = "\\$\\{([^}]+)\\}".toRegex() - val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() - contentFields.forEach { fieldName -> val placeholder = "\$$fieldName" val contentIds = retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getContentIds(fieldName) From 7f4f1b31f4a38f39d89f2e5a6bd8f8f403e102a4 Mon Sep 17 00:00:00 2001 From: faberf Date: Mon, 29 Jul 2024 08:48:04 +0200 Subject: [PATCH 13/34] bug fixes for content pipelines --- .../engine/core/features/AbstractBatchedExtractor.kt | 4 +--- .../vitrivr/engine/index/aggregators/AbstractAggregator.kt | 6 +++++- .../kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt | 1 + .../engine/index/transform/ContentMergingTransformer.kt | 2 +- .../index/transform/DescriptorAsContentTransformer.kt | 3 +++ .../engine/base/features/external/common/FesExtractor.kt | 7 ++++--- .../base/features/external/implementations/ImageCaption.kt | 2 +- 7 files changed, 16 insertions(+), 9 deletions(-) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt index ac23a34ca..97a7d55e5 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt @@ -38,9 +38,6 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f val batch = mutableListOf() this@AbstractBatchedExtractor.input.toFlow(scope).collect { retrievable -> - if (retrievable.type == "SOURCE:VIDEO") { - logger.info { "Processing video ${retrievable.id} with field ${field?.fieldName}" } - } try { if (this@AbstractBatchedExtractor.matches(retrievable)) { batch.add(retrievable) @@ -49,6 +46,7 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f emit(retrievable) } if (batch.size 
>= bufferSize) { + logger.debug { "Batch size reached for field ${field?.fieldName}, extracting descriptors" } val descriptors = extract(batch) // zip descriptors and batch for (i in batch.indices) { diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt index 8abd21b78..96c62a400 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/aggregators/AbstractAggregator.kt @@ -27,8 +27,12 @@ abstract class AbstractAggregator(override val input: Operator, * @param scope [CoroutineScope] to use for the [Flow]. */ override fun toFlow(scope: CoroutineScope): Flow = this.input.toFlow(scope).map { + val contentSources = context.getProperty(name, "contentSources")?.split(",")?.toSet() + val contentIds = contentSources?.flatMap { source -> it.filteredAttribute(ContentAuthorAttribute::class.java)?.getContentIds(source) ?: emptySet() }?.toSet() + + if (it.content.isNotEmpty()) { - val aggregated = this.aggregate(it.content) + val aggregated = this.aggregate(it.content.filter { c -> contentIds?.contains(c.id) ?: true}) aggregated.forEach { c -> if (newContent) { it.addContent(c) diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt index 34fe349bf..49a0ab888 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/decode/VideoDecoder.kt @@ -189,6 +189,7 @@ class VideoDecoder : DecoderFactory { /* Send source retrievable downstream as a signal that file has been decoded. 
*/ if (!error) { + logger.debug { "Emitting source ${sourceRetrievable.id} as signal that video has been decoded." } channel.send(sourceRetrievable) } } diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt index dc39162e4..5919197dc 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt @@ -47,7 +47,7 @@ class ContentMergingTransformer : TransformerFactory { val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() contentFields.forEach { fieldName -> - val placeholder = "\$$fieldName" + val placeholder = "\${${fieldName}}" val contentIds = retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getContentIds(fieldName) val fieldContent = StringBuilder() diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt index 50d9ceeb7..646821030 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt @@ -10,6 +10,7 @@ import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.factory.ContentFactory import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor +import org.vitrivr.engine.core.model.descriptor.struct.metadata.source.FileSourceMetadataDescriptor import org.vitrivr.engine.core.model.retrievable.Ingested import 
org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute @@ -56,8 +57,10 @@ class DescriptorAsContentTransformer : TransformerFactory { private fun convertDescriptorToContent(descriptor: Descriptor): ContentElement<*> { return when (descriptor) { is StringDescriptor -> contentFactory.newTextContent(descriptor.value.value) + is FileSourceMetadataDescriptor -> contentFactory.newTextContent(descriptor.path.value) else -> throw IllegalArgumentException("Descriptor type not supported.") } + } } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index 7eff3074d..1b00b1750 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -10,6 +10,7 @@ import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.RetrievableId import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator +import java.util.* import java.util.logging.Logger private val logger: KLogger = KotlinLogging.logger {} @@ -55,11 +56,11 @@ abstract class FesExtractor, A:ExternalFesAnaly val analyser = field!!.analyser as A val allContent : List> = retrievables.map { retrievable -> + val authors = retrievable.filteredAttribute(ContentAuthorAttribute::class.java) + val retrievableContentIds : Set? 
= contentSources?.flatMap { authors?.getContentIds(it)?: emptySet() }?.toSet() retrievable.findContent { contentItem -> analyser.contentClasses.any { contentClass -> - contentClass.isInstance(contentItem) && contentSources?.let { sources -> - retrievable.filteredAttribute(ContentAuthorAttribute::class.java)?.getAuthors(contentItem.id)?.any { it in sources } - } ?: true + contentClass.isInstance(contentItem) && retrievableContentIds?.contains(contentItem.id) ?: true } }.map{ it as C} } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt index c3404ca0b..09003ec62 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ImageCaption.kt @@ -102,7 +102,7 @@ class ImageCaption : ExternalFesAnalyser, StringDescriptor>() val flatResults = makeCaption(imageContents.flatten(), texts.flatten(), apiWrapper) var index = 0 - return content.map { innerList -> + return imageContents.map { innerList -> innerList.map { _ -> flatResults[index++] } From e34f8912734ed065b9447101e0598974c8825380 Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 31 Jul 2024 14:57:21 +0200 Subject: [PATCH 14/34] simplified passthrough, renamed templatetext --- .../config/ingest/IngestionPipelineBuilder.kt | 46 +++++++++++--- .../core/config/ingest/operation/Operation.kt | 62 ++++++++++++------- .../ingest/operation/OperationConfig.kt | 2 +- .../transform/PassthroughTransformer.kt | 23 ------- ....core.operators.general.TransformerFactory | 3 +- ...nsformer.kt => TemplateTextTransformer.kt} | 7 +-- ....core.operators.general.TransformerFactory | 2 +- .../query/aggregate/WeightedScoreFusion.kt | 5 -- 8 files changed, 81 insertions(+), 69 
deletions(-) delete mode 100644 vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt rename vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/{ContentMergingTransformer.kt => TemplateTextTransformer.kt} (92%) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt index a786bd5ff..7a2755461 100755 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt @@ -3,7 +3,9 @@ package org.vitrivr.engine.core.config.ingest import io.github.oshai.kotlinlogging.KLogger import io.github.oshai.kotlinlogging.KotlinLogging import org.vitrivr.engine.core.config.IndexContextFactory +import org.vitrivr.engine.core.config.ingest.operation.BaseOperation import org.vitrivr.engine.core.config.ingest.operation.Operation +import org.vitrivr.engine.core.config.ingest.operation.PassthroughOperation import org.vitrivr.engine.core.config.ingest.operator.OperatorConfig import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.element.ContentElement @@ -55,6 +57,7 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { @Suppress("UNCHECKED_CAST") fun build(stream: Stream<*>? = null): List> { return parseOperations().map { root -> + val config = root.opConfig as? OperatorConfig.Enumerator ?: throw IllegalArgumentException("Root stage must always be an enumerator!") val built = HashMap>() built[root.name] = buildEnumerator(root.opName, config, stream) @@ -82,11 +85,11 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { /** * This is an internal function that can be called recursively to build the [Operator] DAG. 
* - * @param operation The [Operation] to build. + * @param operation The [IOperation] to build. * @param memoizationTable The memoization table that holds the already built operators. * @return The built [Operator]. */ - private fun buildInternal(operation: Operation, memoizationTable: MutableMap>, breakAt: Operation? = null) { + private fun buildInternal(operation: BaseOperation, memoizationTable: MutableMap>, breakAt: BaseOperation? = null) { /* Find all required input operations and merge them (if necessary). */ if (operation == breakAt) return val inputs = operation.input.map { @@ -107,11 +110,23 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { } /* Prepare and cache operator. */ - var operator = buildOperator(operation.opName, op, operation.opConfig) - if (operation.output.size > 1) { - operator = BroadcastOperator(operator) + when(operation) { + is Operation -> { + val operator = buildOperator(operation.opName, op, operation.opConfig) + if (operation.output.size > 1) { + memoizationTable[operation.name] = BroadcastOperator(operator) + } else { + memoizationTable[operation.name] = operator + } + } + is PassthroughOperation -> { + if (operation.output.size > 1) { + memoizationTable[operation.name] = BroadcastOperator(op) + } else { + memoizationTable[operation.name] = op + } + } } - memoizationTable[operation.name] = operator /* Process output operators. */ for (output in operation.output) { @@ -133,17 +148,28 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { /* Build trees with entry points as roots. 
*/ return entrypoints.map { - val stages = HashMap() - val root = Operation(it.key, it.value.operator, config.operators[it.value.operator] ?: throw IllegalArgumentException("Undefined operator '${it.value.operator}'"), it.value.merge) + val stages = HashMap() + val root = Operation(it.key, it.value.operator as String, config.operators[it.value.operator] ?: throw IllegalArgumentException("Undefined operator '${it.value.operator}'"), it.value.merge) stages[it.key] = root for (operation in this.config.operations) { if (!stages.containsKey(operation.key)) { - stages[operation.key] = Operation(operation.key, operation.value.operator, config.operators[operation.value.operator] ?: throw IllegalArgumentException("Undefined operator '${operation.value.operator}'"), operation.value.merge) + when(operation.value.operator) { + is String -> + stages[operation.key] = Operation( + operation.key, + operation.value.operator as String, + config.operators[operation.value.operator as String] ?: throw IllegalArgumentException("Undefined operator '${operation.value.operator}'"), + operation.value.merge + ) + + null -> + stages[operation.key] = PassthroughOperation(operation.key, operation.value.merge) + } } for (inputKey in operation.value.inputs) { if (!stages.containsKey(inputKey)) { val op = this.config.operations[inputKey] ?: throw IllegalArgumentException("Undefined operation '${inputKey}'") - stages[inputKey] = Operation(inputKey, op.operator, config.operators[op.operator] ?: throw IllegalArgumentException("Undefined operator '${op.operator}'"), op.merge) + stages[inputKey] = Operation(inputKey, op.operator as String, config.operators[op.operator] ?: throw IllegalArgumentException("Undefined operator '${op.operator}'"), op.merge) } stages[operation.key]?.addInput(stages[inputKey]!!) 
} diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt index c619d6ec1..f40f713fc 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt @@ -5,44 +5,62 @@ import org.vitrivr.engine.core.operators.transform.shape.MergeType import java.util.* /** - * This [Operation] class represents a single operation in the ingest pipeline. - * - * @author Ralph Gasser - * @version 1.0.0 + * This sealed class represents a base operation in the ingest pipeline. */ -data class Operation(val name: String, val opName: String, val opConfig: OperatorConfig, val merge: MergeType? = null) { +sealed class BaseOperation(val name: String, val merge: MergeType?) { - /** A [LinkedList] of all input [Operation]s. */ - private val _input = LinkedList() + /** A [LinkedList] of all input [BaseOperation]s. */ + private val _input = LinkedList() - /** A [LinkedList] of all output [Operation]s. */ - private val _output = LinkedList() + /** A [LinkedList] of all output [BaseOperation]s. */ + private val _output = LinkedList() - /** A [List] of all input [Operation]s. */ - val input: List + /** A [List] of all input [BaseOperation]s. */ + val input: List get() = Collections.unmodifiableList(this._input) - /** A [List] of all output [Operation]s. */ - val output: List + /** A [List] of all output [BaseOperation]s. */ + val output: List get() = Collections.unmodifiableList(this._output) /** - * Adds an input [Operation] to this [Operation]. + * Adds an input [BaseOperation] to this [BaseOperation]. * - * @param operation The [Operation] to add. + * @param operation The [BaseOperation] to add. 
*/ - fun addInput(operation: Operation) { + fun addInput(operation: BaseOperation) { this._input.add(operation) - operation._output.add(this) + operation.internalAddOutput(this) } /** - * Adds an output [Operation] to this [Operation]. + * Adds an output [BaseOperation] to this [BaseOperation]. * - * @param operation The [Operation] to add. + * @param operation The [BaseOperation] to add. */ - fun addOutput(operation: Operation) { + fun addOutput(operation: BaseOperation) { + this._output.add(operation) + operation.internalAddInput(this) + } + + protected fun internalAddInput(operation: BaseOperation) { + this._input.add(operation) + } + + protected fun internalAddOutput(operation: BaseOperation) { this._output.add(operation) - operation._input.add(this) } -} \ No newline at end of file +} + +/** + * This [Operation] class represents a single operation in the ingest pipeline. + * + * @param opName The specific operation name. + * @param opConfig The configuration for the operation. + */ +class Operation(name: String, val opName: String, val opConfig: OperatorConfig, merge: MergeType? = null) : BaseOperation(name, merge) + +/** + * This [PassthroughOperation] class represents a passthrough operation in the ingest pipeline. + */ +class PassthroughOperation(name: String, merge: MergeType? = null) : BaseOperation(name, merge) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/OperationConfig.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/OperationConfig.kt index a5a6d0920..8bc203683 100755 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/OperationConfig.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/OperationConfig.kt @@ -15,7 +15,7 @@ import org.vitrivr.engine.core.operators.transform.shape.MergeType @Serializable data class OperationConfig( /** The name of the [OperatorConfig] at this stage. 
Must be a name of the [IngestionConfig.operators] property. */ - val operator: String, + val operator: String? = null, /** The names of the [OperationConfig] that follow this operation. Must be a name of the [IngestionConfig.operations] property. */ val inputs: List = emptyList(), diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt deleted file mode 100644 index b3a9baf08..000000000 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/transform/PassthroughTransformer.kt +++ /dev/null @@ -1,23 +0,0 @@ -package org.vitrivr.engine.core.operators.transform - -import kotlinx.coroutines.CoroutineScope -import kotlinx.coroutines.flow.Flow -import org.vitrivr.engine.core.context.Context -import org.vitrivr.engine.core.model.retrievable.Retrievable -import org.vitrivr.engine.core.operators.Operator -import org.vitrivr.engine.core.operators.general.Transformer -import org.vitrivr.engine.core.operators.general.TransformerFactory - -class PassthroughTransformer : TransformerFactory{ - override fun newTransformer(name: String, input: Operator, context: Context): Transformer { - return Instance(input) - } - - private class Instance(input: Operator) : Transformer { - override val input: Operator = input - - override fun toFlow(scope: CoroutineScope): Flow { - return input.toFlow(scope) - } - } -} diff --git a/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory b/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory index 0059c7e04..eab5cd5dc 100644 --- a/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory +++ 
b/vitrivr-engine-core/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory @@ -1,4 +1,3 @@ org.vitrivr.engine.core.operators.transform.map.MapRelationshipTransformer org.vitrivr.engine.core.operators.transform.filter.TypeFilterTransformer -org.vitrivr.engine.core.operators.transform.filter.DistinctTransformer -org.vitrivr.engine.core.operators.transform.PassthroughTransformer \ No newline at end of file +org.vitrivr.engine.core.operators.transform.filter.DistinctTransformer \ No newline at end of file diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt similarity index 92% rename from vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt rename to vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt index 4c9643650..dc4166e29 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/ContentMergingTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt @@ -8,14 +8,11 @@ import org.vitrivr.engine.core.context.Context import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.ContentType import org.vitrivr.engine.core.model.content.factory.ContentFactory -import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute import org.vitrivr.engine.core.operators.Operator import org.vitrivr.engine.core.operators.general.Transformer import org.vitrivr.engine.core.operators.general.TransformerFactory -import java.time.Year -import javax.swing.text.AbstractDocument.Content private val logger = 
KotlinLogging.logger {} @@ -25,9 +22,9 @@ private val logger = KotlinLogging.logger {} * @author Laura Rettig * @version 1.0.0 */ -class ContentMergingTransformer : TransformerFactory { +class TemplateTextTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { - val template = context[name, "template"] ?: throw IllegalArgumentException("The content merging transformer requires a template.") + val template = context[name, "template"] ?: throw IllegalArgumentException("The template text transformer requires a template.") val regex = "\\$\\{([^}]+)\\}".toRegex() val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() val defaultValue = context[name, "defaultValue"] ?: "" diff --git a/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory b/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory index 440b9de87..b67681106 100644 --- a/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory +++ b/vitrivr-engine-index/src/main/resources/META-INF/services/org.vitrivr.engine.core.operators.general.TransformerFactory @@ -1,5 +1,5 @@ org.vitrivr.engine.index.transform.ContentSamplingTransformer -org.vitrivr.engine.index.transform.ContentMergingTransformer +org.vitrivr.engine.index.transform.TemplateTextTransformer org.vitrivr.engine.index.transform.DescriptorAsContentTransformer org.vitrivr.engine.index.transform.LabelFilterTransformer diff --git a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt index 8ba168697..d1ff30a8e 100644 --- a/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt +++ 
b/vitrivr-engine-query/src/main/kotlin/org/vitrivr/engine/query/aggregate/WeightedScoreFusion.kt @@ -94,11 +94,6 @@ class WeightedScoreFusion( retrieved.filteredAttribute(ScoreAttribute::class.java) retrieved.addAttribute(ScoreAttribute.Unbound(score)) - if(retrieved.id == UUID.fromString("aea15b88-79a2-4da7-8612-95ed65cf6475")){ - println("Score: $score") - println(retrieveds) - } - emit(retrieved) } From 7b132bd072d779547833341fdc5948583a7df447 Mon Sep 17 00:00:00 2001 From: faberf Date: Thu, 8 Aug 2024 17:07:03 +0200 Subject: [PATCH 15/34] finished merge with dev --- .../DescriptorAsContentTransformer.kt | 2 ++ .../features/external/common/FesExtractor.kt | 3 ++- .../implementations/caption/ImageCaption.kt | 13 ++++++------ .../caption/ImageCaptionExtractor.kt | 20 ++++++++++++++----- .../ImageClassificationExtractor.kt | 2 +- .../dense/DenseEmbeddingExtractor.kt | 2 +- .../implementations/ocr/OCRExtractor.kt | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt index 646821030..657205497 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/DescriptorAsContentTransformer.kt @@ -10,6 +10,7 @@ import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.factory.ContentFactory import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.descriptor.scalar.StringDescriptor +import org.vitrivr.engine.core.model.descriptor.scalar.TextDescriptor import org.vitrivr.engine.core.model.descriptor.struct.metadata.source.FileSourceMetadataDescriptor import org.vitrivr.engine.core.model.retrievable.Ingested import 
org.vitrivr.engine.core.model.retrievable.Retrievable @@ -57,6 +58,7 @@ class DescriptorAsContentTransformer : TransformerFactory { private fun convertDescriptorToContent(descriptor: Descriptor): ContentElement<*> { return when (descriptor) { is StringDescriptor -> contentFactory.newTextContent(descriptor.value.value) + is TextDescriptor -> contentFactory.newTextContent(descriptor.value.value) is FileSourceMetadataDescriptor -> contentFactory.newTextContent(descriptor.path.value) else -> throw IllegalArgumentException("Descriptor type not supported.") } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index dc747dc36..cc8a1aac9 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -7,6 +7,7 @@ import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Comp import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.POLLINGINTERVAL_MS_PARAMETER_NAME import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.RETRIES_PARAMETER_DEFAULT import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.RETRIES_PARAMETER_NAME +import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.TIMEOUT_MS_PARAMETER_DEFAULT import org.vitrivr.engine.core.features.AbstractExtractor import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.descriptor.Descriptor @@ -42,7 +43,7 @@ abstract class FesExtractor, D : Descriptor>( /** */ protected val timeoutMs: Long - get() = this.parameters[POLLINGINTERVAL_MS_PARAMETER_NAME]?.toLongOrNull() ?: 
POLLINGINTERVAL_MS_PARAMETER_DEFAULT + get() = this.parameters[POLLINGINTERVAL_MS_PARAMETER_NAME]?.toLongOrNull() ?: TIMEOUT_MS_PARAMETER_DEFAULT /** */ protected val pollingIntervalMs: Long diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaption.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaption.kt index aa5610f87..34d7cb8d4 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaption.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaption.kt @@ -5,6 +5,7 @@ import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.context.QueryContext import org.vitrivr.engine.core.features.fulltext.FulltextRetriever +import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.content.element.TextContent import org.vitrivr.engine.core.model.descriptor.scalar.TextDescriptor @@ -25,11 +26,11 @@ import java.util.* * @author Fynn Faber * @version 1.0.0 */ -class ImageCaption : ExternalFesAnalyser() { +class ImageCaption : ExternalFesAnalyser, TextDescriptor>() { companion object { const val PROMPT_PARAMETER_NAME = "prompt" } - override val contentClasses = setOf(ImageContent::class) + override val contentClasses = setOf(ImageContent::class, TextContent::class) override val descriptorClass = TextDescriptor::class /** @@ -58,7 +59,7 @@ class ImageCaption : ExternalFesAnalyser() { * @param context The [IndexContext] to use with the [ImageCaptionExtractor]. 
* @return [ImageCaptionExtractor] */ - override fun newExtractor(field: Schema.Field, input: Operator, context: IndexContext) = ImageCaptionExtractor(input, field, this, merge(field, context)) + override fun newExtractor(field: Schema.Field, TextDescriptor>, input: Operator, context: IndexContext) = ImageCaptionExtractor(input, field, this, merge(field, context)) /** * Generates and returns a new [FulltextRetriever] instance for this [ExternalFesAnalyser]. @@ -69,7 +70,7 @@ class ImageCaption : ExternalFesAnalyser() { * * @return A new [FulltextRetriever] instance for this [ExternalFesAnalyser] */ - override fun newRetrieverForQuery(field: Schema.Field, query: Query, context: QueryContext): Retriever { + override fun newRetrieverForQuery(field: Schema.Field, TextDescriptor>, query: Query, context: QueryContext): Retriever, TextDescriptor> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" } require(query is SimpleFulltextQuery) { "The query is not a fulltext query. This is a programmer's error!" } return FulltextRetriever(field, query, context) @@ -79,11 +80,11 @@ class ImageCaption : ExternalFesAnalyser() { * Generates and returns a new [FulltextRetriever] instance for this [ExternalFesAnalyser]. * * @param field The [Schema.Field] to create an [Retriever] for. 
- * @param content An array of [ImageContent] elements to use with the [Retriever] + * @param content An array of [ContentElement] elements to use with the [Retriever] * @param context The [QueryContext] to use with the [Retriever] * @return [FulltextRetriever] */ - override fun newRetrieverForContent(field: Schema.Field, content: Collection, context: QueryContext): Retriever { + override fun newRetrieverForContent(field: Schema.Field, TextDescriptor>, content: Collection>, context: QueryContext): Retriever, TextDescriptor> { require(field.analyser == this) { "The field '${field.fieldName}' analyser does not correspond with this analyser. This is a programmer's error!" } /* Prepare query parameters. */ val text = content.filterIsInstance().firstOrNull() ?: throw IllegalArgumentException("No text content found in the provided content.") diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt index 4d0a03f0f..95551b124 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt @@ -5,7 +5,9 @@ import org.vitrivr.engine.base.features.external.api.ImageCaptioningApi import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.base.features.external.common.FesExtractor import org.vitrivr.engine.base.features.external.implementations.caption.ImageCaption.Companion.PROMPT_PARAMETER_NAME +import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.element.ImageContent +import org.vitrivr.engine.core.model.content.element.TextContent import 
org.vitrivr.engine.core.model.content.impl.memory.InMemoryTextContent import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.descriptor.scalar.TextDescriptor @@ -21,10 +23,10 @@ import java.util.* */ class ImageCaptionExtractor( input: Operator, - field: Schema.Field?, - analyser: ExternalFesAnalyser, + field: Schema.Field, TextDescriptor>?, + analyser: ExternalFesAnalyser, TextDescriptor>, parameters: Map -) : FesExtractor(input, field, analyser, parameters) { +) : FesExtractor, TextDescriptor>(input, field, analyser, parameters) { /** The [ImageCaptioningApi] used to perform extraction with. */ private val captioningApi by lazy { ImageCaptioningApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) } @@ -39,8 +41,16 @@ class ImageCaptionExtractor( * @return List of resulting [Descriptor]s. */ override fun extract(retrievable: Retrievable): List { - val prompt = this.field?.parameters?.get(PROMPT_PARAMETER_NAME)?.let { InMemoryTextContent(it) } - return retrievable.content.mapNotNull { + + val content = this.filterContent(retrievable) + val textContent = content.filterIsInstance() + if (textContent.size > 1) { + logger.warn { "Text content has more than one element. Only the first element will be used as an image captioning prompt." 
} + } + + val prompt = (textContent.firstOrNull()?.content ?: this.field?.parameters?.get(PROMPT_PARAMETER_NAME) )?.let { InMemoryTextContent(it) } + + return content.mapNotNull { if (it is ImageContent) { val result = if (prompt == null) { this.captioningApi.analyse(it) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt index 28d083ff8..1e42dc5fc 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt @@ -41,7 +41,7 @@ class ImageClassificationExtractor( val classes = this.parameters[CLASSES_PARAMETER_NAME]?.split(",") ?: throw IllegalArgumentException("No classes provided.") val topK = this.parameters[TOPK_PARAMETER_NAME]?.toInt() ?: 1 val threshold = this.parameters[THRESHOLD_PARAMETER_NAME]?.toFloat() ?: 0.0f - return retrievable.content.flatMap { content -> + return this.filterContent(retrievable).flatMap { content -> if (content is ImageContent) { val result = this.api.analyse(content to classes) result?.mapIndexed { index, score -> diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt index 49fab0246..2292bee94 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt +++ 
b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt @@ -38,7 +38,7 @@ class DenseEmbeddingExtractor( * @param retrievable The [Retrievable] to process. * @return List of resulting [Descriptor]s. */ - override fun extract(retrievable: Retrievable): List = retrievable.content.mapNotNull { + override fun extract(retrievable: Retrievable): List = this.filterContent(retrievable).mapNotNull { val result = when (it) { is ImageContent -> this.imageApi.analyse(it) is TextContent -> this.textApi.analyse(it) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt index e768d0a75..e0afafbd8 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt @@ -33,7 +33,7 @@ class OCRExtractor( * @return List of resulting [Descriptor]s. 
*/ override fun extract(retrievable: Retrievable): List { - val content = retrievable.content.filterIsInstance() + val content = this.filterContent(retrievable) return content.mapNotNull { audio -> val result = this.api.analyse(audio) if (result != null) { From f2a10ec1fcba90211bbdfdd5c720719d2c010e0d Mon Sep 17 00:00:00 2001 From: faberf Date: Thu, 8 Aug 2024 20:48:15 +0200 Subject: [PATCH 16/34] reimplemented batched extraction --- .../core/features/AbstractBatchedExtractor.kt | 13 +++- .../base/features/external/api/AbstractApi.kt | 57 ++++++++++++++ .../base/features/external/api/AsrApi.kt | 36 +++++++++ .../api/ConditionalImageCaptioningApi.kt | 35 +++++++++ .../features/external/api/FaceEmbeddingApi.kt | 33 ++++++++ .../external/api/ImageCaptioningApi.kt | 35 +++++++++ .../external/api/ImageEmbeddingApi.kt | 26 +++++++ .../external/api/ObjectDetectionApi.kt | 27 +++++++ .../base/features/external/api/OcrApi.kt | 33 ++++++++ .../features/external/api/TextEmbeddingApi.kt | 33 ++++++++ .../external/api/ZeroShotClassificationApi.kt | 40 ++++++++++ .../features/external/common/FesExtractor.kt | 3 +- .../implementations/asr/ASRExtractor.kt | 20 ++--- .../caption/ImageCaptionExtractor.kt | 77 +++++++++++++------ .../ImageClassificationExtractor.kt | 33 ++++---- .../dense/DenseEmbeddingExtractor.kt | 47 ++++++++--- .../implementations/ocr/OCRExtractor.kt | 23 +++--- 17 files changed, 495 insertions(+), 76 deletions(-) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt index 97a7d55e5..eb876c014 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt @@ -6,6 +6,7 @@ import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.flow.* import 
org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.descriptor.Descriptor +import org.vitrivr.engine.core.model.metamodel.Analyser import org.vitrivr.engine.core.model.metamodel.Schema import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.operators.Operator @@ -19,10 +20,14 @@ import java.util.* * @author Ralph Gasser * @version 1.0.0 */ -abstract class AbstractBatchedExtractor, D : Descriptor>(final override val input: Operator, final override val field: Schema.Field?, private val bufferSize: Int = 100) : +abstract class AbstractBatchedExtractor, D : Descriptor>(final override val input: Operator, final override val analyser: Analyser, final override val field: Schema.Field? = null, private val bufferSize: Int = 100) : Extractor { private val logger: KLogger = KotlinLogging.logger {} + init { + require(field == null || this.field.analyser == this.analyser) { "Field and analyser do not match! This is a programmer's error!" } + } + /** * A default [Extractor] implementation for batched extraction. It executes the following steps: * @@ -86,10 +91,14 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f /** * Internal method to check, if [Retrievable] matches this [Extractor] and should thus be processed. * + * By default, a [Retrievable] matches this [Extractor] if it contains at least one [ContentElement] that matches the [Analyser.contentClasses]. + * * @param retrievable The [Retrievable] to check. * @return True on match, false otherwise, */ - protected abstract fun matches(retrievable: Retrievable): Boolean + protected open fun matches(retrievable: Retrievable): Boolean = retrievable.content.any { content -> + this.analyser.contentClasses.any { it.isInstance(content) } + } /** * Internal method to perform extraction on batch of [Retrievable]. 
diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AbstractApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AbstractApi.kt index 542ff601b..ecd93c32f 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AbstractApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AbstractApi.kt @@ -85,6 +85,47 @@ abstract class AbstractApi(protected val host: String, protected val model null } + fun analyseBatched(input: List): List = runBlocking { + var retriesLeft = retries + outer@ while (retriesLeft > 0) { + /* Start job. */ + val jobStatus = this@AbstractApi.startBatchedJob(input) + if (jobStatus.status == JobState.failed) { + retriesLeft -= 1 + continue + } + + /* Poll for result. */ + var jobResult = this@AbstractApi.pollBatchedJob(jobStatus.id) + inner@ while (jobResult.status != JobState.complete) { + if (jobResult.status == JobState.failed) { + logger.error { "$model job on host $host with ID: ${jobStatus.id} failed." } + retriesLeft -= 1 + continue@outer + } + + logger.debug { "Waiting for $model job completion on host $host with ID ${jobStatus.id}. Current status: ${jobResult.status}" } + delay(this@AbstractApi.pollingIntervalMs) + jobResult = this@AbstractApi.pollBatchedJob(jobStatus.id) + } + + /* Extract results. */ + val result = jobResult.result + if (result == null) { + logger.error { "$model job on host $host with ID: ${jobStatus.id} returned no result." } + retriesLeft -= 1 + continue@outer + } else { + logger.info { "Job result: $result" } + } + + /* Return results. */ + return@runBlocking result + } + throw IllegalStateException("Failed to analyse batched input.") + + } + /** * This method is used to start a job on the API. 
* @@ -93,6 +134,14 @@ abstract class AbstractApi(protected val host: String, protected val model */ protected abstract suspend fun startJob(input: I): JobStatus + /** + * This method is used to start a batched job on the API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + protected abstract suspend fun startBatchedJob(input: List): JobStatus + /** * This method is used to poll for results of a job on the API. * @@ -100,4 +149,12 @@ abstract class AbstractApi(protected val host: String, protected val model * @return The [JobResult] */ protected abstract suspend fun pollJob(jobId: String): JobResult + + /** + * This method is used to poll for results of a batched job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + protected abstract suspend fun pollBatchedJob(jobId: String): JobResult> } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AsrApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AsrApi.kt index 88b30cb26..2e88e4214 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AsrApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/AsrApi.kt @@ -2,6 +2,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.AutomatedSpeechRecognitionApi import org.openapitools.client.models.AutomatedSpeechRecognitionInput +import org.openapitools.client.models.BatchedAutomatedSpeechRecognitionInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus import org.vitrivr.engine.base.features.external.api.model.JobResult @@ -35,6 +36,22 @@ class AsrApi(host: String, model: String, timeoutMs: Long, pollingIntervalMs: Lo } } + /** + * This method is used to start a batched ASR job on the API. 
+ * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List): JobStatus { + logger.debug { "Starting batched ASR job for audio." } + val wrapped = BatchedAutomatedSpeechRecognitionInput(input.map { it.toDataURL() }) + return try { + this.automatedSpeechRecognitionApi.newBatchedJobApiTasksAutomatedSpeechRecognitionBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an ASR job on the API. * @@ -53,4 +70,23 @@ class AsrApi(host: String, model: String, timeoutMs: Long, pollingIntervalMs: Lo } catch (e: Throwable) { JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched ASR job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.automatedSpeechRecognitionApi.getBatchedJobResultsApiTasksAutomatedSpeechRecognitionBatchedJobsJobGet(jobId).body().let { result -> + val values = result.result?.map { it.transcript.trim() } + if (!values.isNullOrEmpty()) { + JobResult(result.status, values.map { Value.Text(it) }) + } else { + JobResult(result.status, null) + } + } + } catch (e: Throwable) { + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt index 587f44973..9b359bdd8 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt @@ -1,6 +1,7 @@ package 
org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ConditionalImageCaptioningApi +import org.openapitools.client.models.BatchedConditionalImageCaptioningInput import org.openapitools.client.models.ConditionalImageCaptioningInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus @@ -36,6 +37,23 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long } } + /** + * This method is used to start a batched conditional image captioning job on the API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List>): JobStatus { + logger.debug { "Starting batched conditional image captioning job for images." } + val wrapped = BatchedConditionalImageCaptioningInput(image = input.map{it.first.toDataUrl()}, text = input.map{it.second.content}) + return try { + this.conditionalImageCaptioningApi.newBatchedJobApiTasksConditionalImageCaptioningBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched conditional image captioning job." } + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of a conditional image captioning job on the API. * @@ -55,4 +73,21 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long logger.error(e) { "Failed to poll for status of conditional image captioning job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched conditional image captioning job on the API. + * + * @param jobId The ID of the job to poll. 
+ * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> { + this.conditionalImageCaptioningApi.getBatchedJobResultsApiTasksConditionalImageCaptioningBatchedJobsJobGet(jobId).body().let { result -> + val value = result.result?.map { it.caption.trim() } + if (value != null) { + return JobResult(result.status, value.map { Value.Text(it) }) + } else { + return JobResult(result.status, null) + } + } + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/FaceEmbeddingApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/FaceEmbeddingApi.kt index e28e0f1e3..ac80fcdcf 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/FaceEmbeddingApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/FaceEmbeddingApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.FaceEmbeddingApi +import org.openapitools.client.models.BatchedFaceEmbeddingInput import org.openapitools.client.models.FaceEmbeddingInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus @@ -36,6 +37,23 @@ class FaceEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInte } } + /** + * This method is used to start a batched face embedding job on the API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List): JobStatus { + val wrapped = BatchedFaceEmbeddingInput(input.map { it.toDataUrl() }) + return try { + logger.debug { "Starting batched face embedding for images." } + this.faceEmbeddingApi.newBatchedJobApiTasksFaceEmbeddingBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched face embedding job." 
} + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of a face embedding job on the API. * @@ -50,4 +68,19 @@ class FaceEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInte logger.error(e) { "Failed to poll for status of face embedding job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched face embedding job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.faceEmbeddingApi.getBatchedJobResultsApiTasksFaceEmbeddingBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { r -> r.embedding.let { e -> Value.FloatVector(FloatArray(e.size) { i -> e[i].toFloat() }) } }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched face embedding job." } + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageCaptioningApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageCaptioningApi.kt index 27bf8f295..a2602c5e6 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageCaptioningApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageCaptioningApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ImageCaptioningApi +import org.openapitools.client.models.BatchedImageCaptioningInput import org.openapitools.client.models.ImageCaptioningInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus @@ -35,6 +36,23 @@ class ImageCaptioningApi(host: String, model: String, timeoutMs: Long, pollingIn } } + /** + * This 
method is used to start a batched image captioning job on the API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List): JobStatus { + logger.debug { "Starting batched image captioning job for images." } + val wrapped = BatchedImageCaptioningInput(input.map { it.toDataUrl() }) + return try { + this.imageCaptioningApi.newBatchedJobApiTasksImageCaptioningBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched image captioning job." } + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an image captioning job on the API. * @@ -54,4 +72,21 @@ class ImageCaptioningApi(host: String, model: String, timeoutMs: Long, pollingIn logger.error(e) { "Failed to poll for status of image captioning job." } JobResult(JobState.failed, null) } + + + /** + * This method is used to poll for results of a batched image captioning job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.imageCaptioningApi.getBatchedJobResultsApiTasksImageCaptioningBatchedJobsJobGet(jobId).body().let { result -> + val values = result.result?.map { Value.Text(it.caption.trim() ?: "") } + JobResult(result.status, values) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched image captioning job." 
} + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageEmbeddingApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageEmbeddingApi.kt index 4dbe22ee6..1d5754ef7 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageEmbeddingApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ImageEmbeddingApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ImageEmbeddingApi +import org.openapitools.client.models.BatchedImageEmbeddingInput import org.openapitools.client.models.ImageEmbeddingInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus @@ -36,6 +37,16 @@ class ImageEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInt } } + override suspend fun startBatchedJob(input: List): JobStatus { + val wrapped = BatchedImageEmbeddingInput(input.map { it.toDataUrl() }) + return try { + logger.debug { "Starting batched image embedding for images." } + this.imageEmbeddingApi.newBatchedJobApiTasksImageEmbeddingBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched image embedding job." } + JobStatus("unknown", JobState.failed) + } + } /** * This method is used to poll for results of an image embedding job on the API. * @@ -50,4 +61,19 @@ class ImageEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInt logger.error(e) { "Failed to poll for status of image embedding job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched image embedding job on the API. + * + * @param jobId The ID of the job to poll. 
+ * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.imageEmbeddingApi.getBatchedJobResultsApiTasksImageEmbeddingBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { r -> Value.FloatVector(FloatArray(r.embedding.size) { i -> r.embedding[i].toFloat() }) }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched image embedding job." } + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ObjectDetectionApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ObjectDetectionApi.kt index 429766f21..0449508ef 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ObjectDetectionApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ObjectDetectionApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ObjectDetectionApi +import org.openapitools.client.models.BatchedObjectDetectionInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus import org.openapitools.client.models.ObjectDetectionInput @@ -36,6 +37,17 @@ class ObjectDetectionApi(host: String, model: String, timeoutMs: Long, pollingIn } } + override suspend fun startBatchedJob(input: List): JobStatus { + logger.debug { "Starting batched object detection job for images." } + val wrapped = BatchedObjectDetectionInput(input.map { it.toDataUrl() }) + return try { + this.objectDetectionApi.newBatchedJobApiTasksObjectDetectionBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched object detection job." 
} + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an object detection job on the API. * @@ -50,4 +62,19 @@ class ObjectDetectionApi(host: String, model: String, timeoutMs: Long, pollingIn logger.error(e) { "Failed to poll for status of object detection job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched object detection job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult>> = try { + this.objectDetectionApi.getBatchedJobResultsApiTasksObjectDetectionBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { it.labels.map { Value.String(it.trim()) } }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched object detection job." } + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/OcrApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/OcrApi.kt index a6e0291c9..2da3e8109 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/OcrApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/OcrApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.OpticalCharacterRecognitionApi +import org.openapitools.client.models.BatchedOpticalCharacterRecognitionInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus import org.openapitools.client.models.OpticalCharacterRecognitionInput @@ -35,6 +36,23 @@ class OcrApi(host: String, model: String, timeoutMs: Long, pollingIntervalMs: Lo } } + /** + * This method is used to start a batched OCR job on the 
API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List): JobStatus { + logger.debug { "Starting batched OCR job for images." } + val wrapped = BatchedOpticalCharacterRecognitionInput(input.map { it.toDataUrl() }) + return try { + this.opticalCharacterRecognitionApi.newBatchedJobApiTasksOpticalCharacterRecognitionBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched OCR job." } + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an OCR job on the API. * @@ -54,4 +72,19 @@ class OcrApi(host: String, model: String, timeoutMs: Long, pollingIntervalMs: Lo logger.error(e) { "Failed to poll for status of OCR job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched OCR job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.opticalCharacterRecognitionApi.getBatchedJobResultsApiTasksOpticalCharacterRecognitionBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { Value.Text(it.text.trim()) }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched OCR job." 
} + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/TextEmbeddingApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/TextEmbeddingApi.kt index 1cf22254f..587064745 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/TextEmbeddingApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/TextEmbeddingApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.TextEmbeddingApi +import org.openapitools.client.models.BatchedTextEmbeddingInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus import org.openapitools.client.models.TextEmbeddingInput @@ -36,6 +37,23 @@ class TextEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInte } } + /** + * This method is used to start a batched text embedding job on the API. + * + * @param input The input for the job. + * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List): JobStatus { + val wrapped = BatchedTextEmbeddingInput(input.map { it.content }) + return try { + logger.debug { "Starting batched text embedding for texts." } + this.textEmbeddingApi.newBatchedJobApiTasksTextEmbeddingBatchedModelJobsPost(this.model, wrapped).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched text embedding job." } + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an text embedding job on the API. * @@ -50,4 +68,19 @@ class TextEmbeddingApi(host: String, model: String, timeoutMs: Long, pollingInte logger.error(e) { "Failed to poll for status of text embedding job." 
} JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched text embedding job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult> = try { + this.textEmbeddingApi.getBatchedJobResultsApiTasksTextEmbeddingBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { r -> Value.FloatVector(FloatArray(r.embedding.size) { i -> r.embedding[i].toFloat() }) }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched text embedding job." } + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ZeroShotClassificationApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ZeroShotClassificationApi.kt index 0d7c08bad..4edf0646a 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ZeroShotClassificationApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ZeroShotClassificationApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ZeroShotImageClassificationApi +import org.openapitools.client.models.BatchedZeroShotImageClassificationInput import org.openapitools.client.models.JobState import org.openapitools.client.models.JobStatus import org.openapitools.client.models.ZeroShotImageClassificationInput @@ -36,6 +37,30 @@ class ZeroShotClassificationApi(host: String, model: String, timeoutMs: Long, po } } + /** + * This method is used to start a batched zero shot image classification job on the API. + * + * @param input The input for the job. 
+ * @return The [JobStatus] + */ + override suspend fun startBatchedJob(input: List>>): JobStatus { + logger.debug { "Starting batched zero shot image classification job for images." } + val classes = input.map { it.second }.toSet() + if (classes.size > 1) { + throw IllegalArgumentException("All classes must be the same for batched zero shot image classification.") + } + val wrapped = BatchedZeroShotImageClassificationInput(input.map { it.first.toDataUrl() }, classes.first()) + return try { + this.zeroShotImageClassificationApi.newBatchedJobApiTasksZeroShotImageClassificationBatchedModelJobsPost( + this.model, + wrapped + ).body() + } catch (e: Throwable) { + logger.error(e) { "Failed to start batched zero shot image classification job." } + JobStatus("unknown", JobState.failed) + } + } + /** * This method is used to poll for results of an object detection job on the API. * @@ -50,4 +75,19 @@ class ZeroShotClassificationApi(host: String, model: String, timeoutMs: Long, po logger.error(e) { "Failed to poll for status of object detection job." } JobResult(JobState.failed, null) } + + /** + * This method is used to poll for results of a batched object detection job on the API. + * + * @param jobId The ID of the job to poll. + * @return The [JobResult] + */ + override suspend fun pollBatchedJob(jobId: String): JobResult>> = try { + this.zeroShotImageClassificationApi.getBatchedJobResultsApiTasksZeroShotImageClassificationBatchedJobsJobGet(jobId).body().let { result -> + JobResult(result.status, result.result?.map { it.probabilities.map { Value.Double(it) } }) + } + } catch (e: Throwable) { + logger.error(e) { "Failed to poll for status of batched object detection job." 
} + JobResult(JobState.failed, null) + } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index cc8a1aac9..ac812a335 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -8,6 +8,7 @@ import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Comp import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.RETRIES_PARAMETER_DEFAULT import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.RETRIES_PARAMETER_NAME import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser.Companion.TIMEOUT_MS_PARAMETER_DEFAULT +import org.vitrivr.engine.core.features.AbstractBatchedExtractor import org.vitrivr.engine.core.features.AbstractExtractor import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.descriptor.Descriptor @@ -28,7 +29,7 @@ abstract class FesExtractor, D : Descriptor>( field: Schema.Field?, analyser: ExternalFesAnalyser, protected val parameters: Map, -) : AbstractExtractor(input, analyser, field) { +) : AbstractBatchedExtractor(input, analyser, field, parameters["batchSize"]?.toIntOrNull() ?: 1) { private val contentSources = parameters["contentSources"]?.split(",")?.toSet() diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt index a5330de62..ef1ee891a 100644 --- 
a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt @@ -26,19 +26,21 @@ class ASRExtractor( /** The [AsrApi] used to perform extraction with. */ private val api = AsrApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) + /** * Internal method to perform extraction on [Retrievable]. ** - * @param retrievable The [Retrievable] to process. - * @return List of resulting [Descriptor]s. + * @param retrievables The [Retrievable]s to process. + * @return List of resulting [Descriptor]s grouped by [Retrievable]. */ - override fun extract(retrievable: Retrievable): List { - return this.filterContent(retrievable).mapNotNull { audio -> - val result = this.api.analyse(audio) - if (result != null) { - TextDescriptor(UUID.randomUUID(), retrievable.id, result, this.field) - } else { - null + override fun extract(retrievables: List): List> { + val flatResults = this.api.analyseBatched(retrievables.flatMap { this.filterContent(it) }).mapNotNull { result -> TextDescriptor(UUID.randomUUID(), null, result, this.field)} + + var index = 0 + + return retrievables.map { retrievable -> + this.filterContent(retrievable).map { + flatResults[index++].also { it.retrievableId = retrievable.id } } } } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt index 95551b124..ba85a0ebb 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt +++ 
b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt @@ -1,5 +1,6 @@ package org.vitrivr.engine.base.features.external.implementations.caption +import io.github.oshai.kotlinlogging.KotlinLogging import org.vitrivr.engine.base.features.external.api.ConditionalImageCaptioningApi import org.vitrivr.engine.base.features.external.api.ImageCaptioningApi import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser @@ -16,6 +17,8 @@ import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.operators.Operator import java.util.* +val logger = KotlinLogging.logger {} + /** * * @author Ralph Gasser @@ -34,37 +37,63 @@ class ImageCaptionExtractor( /** The [ConditionalImageCaptioningApi] used to perform extraction with. */ private val conditionalCaptioningApi by lazy { ConditionalImageCaptioningApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) } - /** - * Internal method to perform extraction on [Retrievable]. - ** - * @param retrievable The [Retrievable] to process. - * @return List of resulting [Descriptor]s. - */ - override fun extract(retrievable: Retrievable): List { + private fun makeCaption(imageContent: List, text: List) : List { + val withTextIndices = text.mapIndexedNotNull { index, t -> if (t != null) index to t else null } + val withoutTextIndices = text.mapIndexedNotNull { index, t -> if (t == null) index else null } + - val content = this.filterContent(retrievable) - val textContent = content.filterIsInstance() - if (textContent.size > 1) { - logger.warn { "Text content has more than one element. Only the first element will be used as an image captioning prompt." 
} + val withTextResults = if (withTextIndices.isEmpty()) { + emptyList() + } else { + this.conditionalCaptioningApi.analyseBatched(withTextIndices.map { imageContent[it.first] to InMemoryTextContent(it.second) }) + } + val withoutTextResults = if (withoutTextIndices.isEmpty()) { + emptyList() + } else { + this.captioningApi.analyseBatched(withoutTextIndices.map { imageContent[it] }) } - val prompt = (textContent.firstOrNull()?.content ?: this.field?.parameters?.get(PROMPT_PARAMETER_NAME) )?.let { InMemoryTextContent(it) } + // merge results so they are in the same order as the input + val results = mutableListOf() + var withTextIndex = 0 + var withoutTextIndex = 0 + for (i in text.indices) { + if (text[i] != null) { + results.add(TextDescriptor(UUID.randomUUID(),null,withTextResults[withTextIndex++])) + } else { + results.add(TextDescriptor(UUID.randomUUID(),null,withoutTextResults[withoutTextIndex++])) + } + } + return results + } + + override fun extract(retrievables: List): List> { + + val content = retrievables.map { this.filterContent(it) } + val imageContents = content.map { it.filterIsInstance() } - return content.mapNotNull { - if (it is ImageContent) { - val result = if (prompt == null) { - this.captioningApi.analyse(it) - } else { - this.conditionalCaptioningApi.analyse(it to prompt) + val texts : List> = content.map { it.filterIsInstance().map { it.content } }.mapIndexed { index, text -> if (text.isEmpty()) { + List(imageContents[index].size) { this.parameters[PROMPT_PARAMETER_NAME] } + } else { + if (text.size != 1) { + logger.warn { "Text content has more than one element. Only the first element will be used as an image captioning prompt." 
} } - if (result != null) { - TextDescriptor(UUID.randomUUID(), retrievable.id, result, this.field) - } else { + List(imageContents[index].size) { text.first() } + } + } + + val flatResults = makeCaption(imageContents.flatten(), texts.flatten()) + + var index = 0 + + return retrievables.map { retrievable -> + this.filterContent(retrievable).map { + if (it !is ImageContent) { null + } else{ + flatResults[index++].also { it.retrievableId = retrievable.id } } - } else { - null - } + }.filterNotNull() } } } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt index 1e42dc5fc..74fe12501 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt @@ -4,8 +4,6 @@ import org.vitrivr.engine.base.features.external.api.ZeroShotClassificationApi import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.base.features.external.common.FesExtractor import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.CLASSES_PARAMETER_NAME -import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.THRESHOLD_PARAMETER_NAME -import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.TOPK_PARAMETER_NAME import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.descriptor.Descriptor import 
org.vitrivr.engine.core.model.descriptor.struct.LabelDescriptor @@ -31,26 +29,21 @@ class ImageClassificationExtractor( /** The [ZeroShotClassificationApi] used to perform extraction with. */ private val api by lazy { ZeroShotClassificationApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) } - /** - * Internal method to perform extraction on [Retrievable]. - ** - * @param retrievable The [Retrievable] to process. - * @return List of resulting [Descriptor]s. - */ - override fun extract(retrievable: Retrievable): List { + + + override fun extract(retrievables: List): List> { val classes = this.parameters[CLASSES_PARAMETER_NAME]?.split(",") ?: throw IllegalArgumentException("No classes provided.") - val topK = this.parameters[TOPK_PARAMETER_NAME]?.toInt() ?: 1 - val threshold = this.parameters[THRESHOLD_PARAMETER_NAME]?.toFloat() ?: 0.0f - return this.filterContent(retrievable).flatMap { content -> - if (content is ImageContent) { - val result = this.api.analyse(content to classes) - result?.mapIndexed { index, score -> - LabelDescriptor(UUID.randomUUID(), retrievable.id, mapOf("label" to Value.String(classes[index]), "confidence" to score), this.field) - }?.filter { it.confidence.value >= threshold }?.sortedByDescending { it.confidence.value }?.take(topK) - ?: emptyList() - } else { - emptyList() + val flatResults = this.api.analyseBatched(retrievables.flatMap { this.filterContent(it).map{it to classes}}).map { result -> + LabelDescriptor(UUID.randomUUID(), null, result.mapIndexed { index, double -> classes[index] to double}.toMap(), this.field) + } + + var index = 0 + return retrievables.map { retrievable -> + this.filterContent(retrievable).map { + flatResults[index++].also{it.retrievableId = retrievable.id} } } } + + } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt 
b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt index 2292bee94..5594d5e47 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt @@ -32,22 +32,49 @@ class DenseEmbeddingExtractor( /** The [AsrApi] used to perform extraction with. */ private val imageApi by lazy { ImageEmbeddingApi(this.host, model, this.timeoutMs, this.pollingIntervalMs, this.retries) } + /** * Internal method to perform extraction on [Retrievable]. ** - * @param retrievable The [Retrievable] to process. - * @return List of resulting [Descriptor]s. + * @param retrievables The [Retrievable]s to process. + * @return List of resulting [Descriptor]s grouped by [Retrievable]. */ - override fun extract(retrievable: Retrievable): List = this.filterContent(retrievable).mapNotNull { - val result = when (it) { - is ImageContent -> this.imageApi.analyse(it) - is TextContent -> this.textApi.analyse(it) - else -> null + override fun extract(retrievables: List): List> { + val content = retrievables.flatMap { this.filterContent(it) } + val textContent = content.mapIndexed { index, contentElement -> if (contentElement is TextContent) index to contentElement else null }.filterNotNull().toMap() + val imageContent = content.mapIndexed { index, contentElement -> if (contentElement is ImageContent) index to contentElement else null }.filterNotNull().toMap() + + val textResults: List = if (textContent.isNotEmpty()) { + this.textApi.analyseBatched(textContent.map { it.value }) + .map { FloatVectorDescriptor(UUID.randomUUID(), null, it, this.field) } + } else { + emptyList() } - if (result != null) { - FloatVectorDescriptor(UUID.randomUUID(), retrievable.id, result, this.field) + + val imageResults: List = if 
(imageContent.isNotEmpty()) { + this.imageApi.analyseBatched(imageContent.map { it.value }) + .map { FloatVectorDescriptor(UUID.randomUUID(), null, it, this.field) } } else { - null + emptyList() + } + + + val textResultMap = textContent.keys.zip(textResults).toMap() + val imageResultMap = imageContent.keys.zip(imageResults).toMap() + + return retrievables.indices.map { index -> + val descriptors = mutableListOf() + textResultMap[index]?.let { + it.retrievableId = retrievables[index].id + descriptors.add(it) + } + imageResultMap[index]?.let { + it.retrievableId = retrievables[index].id + descriptors.add(it) + } + descriptors } } + + } \ No newline at end of file diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt index e0afafbd8..77a2acee2 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt @@ -26,20 +26,23 @@ class OCRExtractor( /** The [OcrApi] used to perform extraction with. */ private val api = OcrApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) + /** * Internal method to perform extraction on [Retrievable]. ** - * @param retrievable The [Retrievable] to process. - * @return List of resulting [Descriptor]s. + * @param retrievables The [Retrievable]s to process. + * @return List of resulting [Descriptor]s grouped by [Retrievable]. 
*/ - override fun extract(retrievable: Retrievable): List { - val content = this.filterContent(retrievable) - return content.mapNotNull { audio -> - val result = this.api.analyse(audio) - if (result != null) { - TextDescriptor(UUID.randomUUID(), retrievable.id, result, this.field) - } else { - null + override fun extract(retrievables: List): List> { + val flatResults = this.api.analyseBatched(retrievables.flatMap { this.filterContent(it) }).mapNotNull { result -> + TextDescriptor(UUID.randomUUID(), null, result, this.field) + } + + var index = 0 + + return retrievables.map { retrievable -> + this.filterContent(retrievable).map { + flatResults[index++].also { it.retrievableId = retrievable.id } } } } From 6b369925ebbac09605208cc3d5475457c51f16a6 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 12 Aug 2024 11:01:13 +0200 Subject: [PATCH 17/34] WIP on feature/contentpipelines --- .../ImageClassificationExtractor.kt | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt index 74fe12501..22bf495e6 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt @@ -5,7 +5,6 @@ import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.base.features.external.common.FesExtractor import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.CLASSES_PARAMETER_NAME import 
org.vitrivr.engine.core.model.content.element.ImageContent -import org.vitrivr.engine.core.model.descriptor.Descriptor import org.vitrivr.engine.core.model.descriptor.struct.LabelDescriptor import org.vitrivr.engine.core.model.metamodel.Schema import org.vitrivr.engine.core.model.retrievable.Retrievable @@ -27,23 +26,36 @@ class ImageClassificationExtractor( /** The [ZeroShotClassificationApi] used to perform extraction with. */ - private val api by lazy { ZeroShotClassificationApi(this.host, this.model, this.timeoutMs, this.pollingIntervalMs, this.retries) } - + private val api by lazy { + ZeroShotClassificationApi( + this.host, + this.model, + this.timeoutMs, + this.pollingIntervalMs, + this.retries + ) + } override fun extract(retrievables: List): List> { - val classes = this.parameters[CLASSES_PARAMETER_NAME]?.split(",") ?: throw IllegalArgumentException("No classes provided.") - val flatResults = this.api.analyseBatched(retrievables.flatMap { this.filterContent(it).map{it to classes}}).map { result -> - LabelDescriptor(UUID.randomUUID(), null, result.mapIndexed { index, double -> classes[index] to double}.toMap(), this.field) - } - - var index = 0 - return retrievables.map { retrievable -> - this.filterContent(retrievable).map { - flatResults[index++].also{it.retrievableId = retrievable.id} + val classes = this.parameters[CLASSES_PARAMETER_NAME]?.split(",") + ?: throw IllegalArgumentException("No classes provided.") + + val flatResults = this.api.analyseBatched( + retrievables.flatMap { + this.filterContent(it).map { it to classes } + }).map { result -> + result.mapIndexed { index, confidence -> + LabelDescriptor( + UUID.randomUUID(), + null, + mapOf( + "label" to Value.String(classes[index]), + "confidence" to Value.Float(confidence.value.toFloat()) + ), + ) } } + return flatResults } - - } \ No newline at end of file From d93d9407827103ad023c52b4005de400da6b9799 Mon Sep 17 00:00:00 2001 From: Raphael Date: Wed, 14 Aug 2024 11:11:17 +0200 Subject: [PATCH 
18/34] improves log message --- .../api/ConditionalImageCaptioningApi.kt | 55 ++++++++++++++----- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt index 9b359bdd8..a80d49cd8 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/api/ConditionalImageCaptioningApi.kt @@ -1,6 +1,7 @@ package org.vitrivr.engine.base.features.external.api import org.openapitools.client.apis.ConditionalImageCaptioningApi +import org.openapitools.client.infrastructure.map import org.openapitools.client.models.BatchedConditionalImageCaptioningInput import org.openapitools.client.models.ConditionalImageCaptioningInput import org.openapitools.client.models.JobState @@ -16,9 +17,20 @@ import org.vitrivr.engine.core.model.types.Value * @author Ralph Gasser * @version 1.0.0 */ -class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long, pollingIntervalMs: Long, retries: Int) : AbstractApi, Value.Text>(host, model, timeoutMs, pollingIntervalMs, retries) { +class ConditionalImageCaptioningApi( + host: String, + model: String, + timeoutMs: Long, + pollingIntervalMs: Long, + retries: Int +) : AbstractApi, Value.Text>(host, model, timeoutMs, pollingIntervalMs, retries) { /** The API used for FES conditional image captioning. 
*/ - private val conditionalImageCaptioningApi by lazy { ConditionalImageCaptioningApi(baseUrl = this.host, httpClientConfig = this.httpClientConfig) } + private val conditionalImageCaptioningApi by lazy { + ConditionalImageCaptioningApi( + baseUrl = this.host, + httpClientConfig = this.httpClientConfig + ) + } /** * This method is used to start a conditional image captioning job on the API. @@ -30,7 +42,10 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long logger.debug { "Starting conditional image captioning job for image." } val wrapped = ConditionalImageCaptioningInput(input.first.toDataUrl(), input.second.content) return try { - this.conditionalImageCaptioningApi.newJobApiTasksConditionalImageCaptioningModelJobsPost(this.model, wrapped).body() + this.conditionalImageCaptioningApi.newJobApiTasksConditionalImageCaptioningModelJobsPost( + this.model, + wrapped + ).body() } catch (e: Throwable) { logger.error(e) { "Failed to start conditional image captioning job." } JobStatus("unknown", JobState.failed) @@ -45,11 +60,18 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long */ override suspend fun startBatchedJob(input: List>): JobStatus { logger.debug { "Starting batched conditional image captioning job for images." } - val wrapped = BatchedConditionalImageCaptioningInput(image = input.map{it.first.toDataUrl()}, text = input.map{it.second.content}) + val wrapped = BatchedConditionalImageCaptioningInput(image = input.map { it.first.toDataUrl() }, + text = input.map { it.second.content }) return try { - this.conditionalImageCaptioningApi.newBatchedJobApiTasksConditionalImageCaptioningBatchedModelJobsPost(this.model, wrapped).body() - } catch (e: Throwable) { - logger.error(e) { "Failed to start batched conditional image captioning job." 
} + val result = + this.conditionalImageCaptioningApi.newBatchedJobApiTasksConditionalImageCaptioningBatchedModelJobsPost( + this.model, + wrapped + ) + return result.takeIf { it.success }?.body() + ?: throw IllegalStateException("Api Error. Status: ${result.response.status}") + } catch (ex: Throwable) { + logger.error(ex) { "Error in startBatchedJob" } JobStatus("unknown", JobState.failed) } } @@ -61,14 +83,15 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long * @return The [JobResult] */ override suspend fun pollJob(jobId: String): JobResult = try { - this.conditionalImageCaptioningApi.getJobResultsApiTasksConditionalImageCaptioningJobsJobGet(jobId).body().let { result -> - val value = result.result?.caption?.trim() - if (!value.isNullOrBlank()) { - JobResult(result.status, Value.Text(value)) - } else { - JobResult(result.status, null) + this.conditionalImageCaptioningApi.getJobResultsApiTasksConditionalImageCaptioningJobsJobGet(jobId).body() + .let { result -> + val value = result.result?.caption?.trim() + if (!value.isNullOrBlank()) { + JobResult(result.status, Value.Text(value)) + } else { + JobResult(result.status, null) + } } - } } catch (e: Throwable) { logger.error(e) { "Failed to poll for status of conditional image captioning job." 
} JobResult(JobState.failed, null) @@ -81,7 +104,9 @@ class ConditionalImageCaptioningApi(host: String, model: String, timeoutMs: Long * @return The [JobResult] */ override suspend fun pollBatchedJob(jobId: String): JobResult> { - this.conditionalImageCaptioningApi.getBatchedJobResultsApiTasksConditionalImageCaptioningBatchedJobsJobGet(jobId).body().let { result -> + this.conditionalImageCaptioningApi.getBatchedJobResultsApiTasksConditionalImageCaptioningBatchedJobsJobGet( + jobId + ).body().let { result -> val value = result.result?.map { it.caption.trim() } if (value != null) { return JobResult(result.status, value.map { Value.Text(it) }) From 23522daff8adebb0261354f9568c31a7802098e1 Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Wed, 14 Aug 2024 14:04:39 +0200 Subject: [PATCH 19/34] Define default string and regex as constant --- .../engine/index/transform/TemplateTextTransformer.kt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt index dc4166e29..a7ff9f37b 100644 --- a/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt +++ b/vitrivr-engine-index/src/main/kotlin/org/vitrivr/engine/index/transform/TemplateTextTransformer.kt @@ -16,6 +16,9 @@ import org.vitrivr.engine.core.operators.general.TransformerFactory private val logger = KotlinLogging.logger {} +private val TEMPLATE_REGEX = "\\$\\{([^}]+)\\}".toRegex() +private const val DEFAULT_VALUE = "No content available." + /** * A [Transformer] that takes an input template with placeholders and inserts content from fields in their place. 
* @@ -25,9 +28,8 @@ private val logger = KotlinLogging.logger {} class TemplateTextTransformer : TransformerFactory { override fun newTransformer(name: String, input: Operator, context: Context): Transformer { val template = context[name, "template"] ?: throw IllegalArgumentException("The template text transformer requires a template.") - val regex = "\\$\\{([^}]+)\\}".toRegex() - val contentFields = regex.findAll(template).map { it.groupValues[1] }.toList() - val defaultValue = context[name, "defaultValue"] ?: "" + val contentFields = TEMPLATE_REGEX.findAll(template).map { it.groupValues[1] }.toList() + val defaultValue = context[name, "defaultValue"] ?: DEFAULT_VALUE return Instance( input = input, contentFactory = (context as IndexContext).contentFactory, From e147e2ad3b167b8ded2581862e82282ac933a5fb Mon Sep 17 00:00:00 2001 From: Laura Rettig Date: Wed, 14 Aug 2024 14:48:07 +0200 Subject: [PATCH 20/34] Deleted test schema and pipeline files --- test-pipeline.json | 127 --------------------------------------------- test-schema.json | 86 ------------------------------ 2 files changed, 213 deletions(-) delete mode 100644 test-pipeline.json delete mode 100644 test-schema.json diff --git a/test-pipeline.json b/test-pipeline.json deleted file mode 100644 index 055f5e54e..000000000 --- a/test-pipeline.json +++ /dev/null @@ -1,127 +0,0 @@ -{ - "schema": "test", - "context": { - "contentFactory": "CachedContentFactory", - "resolverName": "disk", - "local": { - "content": { - "path": "../cache" - }, - "enumerator": { - "path": "../benchmark/media_objects", - "depth": "5" - }, - "image_source_filter": { - "type": "SOURCE:IMAGE" - }, - "video_source_filter": { - "type": "SOURCE:VIDEO" - }, - "ocr_content": { - "field": "ocr_sparse", - "removeContent": "true" - }, - "asr_content": { - "field": "asr_sparse", - "removeContent": "true" - }, - "caption_content": { - "field": "caption_sparse", - "removeContent": "true" - }, - "video_decoder": { - "timeWindowMs": "10000" - }, - 
"ocr_sparse": { - "contentSources": "image_decoder,selector" - }, - "caption_sparse": { - "contentSources": "image_decoder,selector" - }, - "asr_sparse": { - "contentSources": "video_decoder" - }, - "merge_prompt": { - "template": "test $asr_content ASR \n $caption_content CAPTION \n $ocr_content OCR", - "defaultValue": "no content provided" - } - } - }, - "operators": { - "passthrough": { - "type": "TRANSFORMER", - "factory": "PassthroughTransformer" - }, - "enumerator": { - "type": "ENUMERATOR", - "factory": "FileSystemEnumerator", - "mediaTypes": ["IMAGE", "VIDEO"] - }, - "image_decoder": { - "type": "DECODER", - "factory": "ImageDecoder" - }, - "video_decoder": { - "type": "DECODER", - "factory": "VideoDecoder" - }, - "file_metadata":{ - "type": "EXTRACTOR", - "fieldName": "file" - }, - "ocr_sparse": { - "type": "EXTRACTOR", - "fieldName": "ocr_sparse" - }, - "caption_sparse": { - "type": "EXTRACTOR", - "fieldName": "caption_sparse" - }, - "asr_sparse": { - "type": "EXTRACTOR", - "fieldName": "asr_sparse" - }, - "ocr_content": { - "type": "TRANSFORMER", - "factory": "DescriptorAsContentTransformer" - }, - "asr_content": { - "type": "TRANSFORMER", - "factory": "DescriptorAsContentTransformer" - }, - "caption_content": { - "type": "TRANSFORMER", - "factory": "DescriptorAsContentTransformer" - }, - "merge_prompt": { - "type": "TRANSFORMER", - "factory": "ContentMergingTransformer" - }, - "selector": { - "type": "TRANSFORMER", - "factory": "LastContentAggregator" - }, - "time":{ - "type": "EXTRACTOR", - "fieldName": "time" - } - }, - "operations": { - "enumerator-stage": {"operator": "enumerator"}, - "video-decoder-stage": {"operator": "video_decoder", "inputs": ["enumerator-stage"]}, - "time-stage": {"operator": "time","inputs": ["video-decoder-stage"]}, - "image-decoder-stage": {"operator": "image_decoder", "inputs": ["enumerator-stage"]}, - "selector-stage": {"operator": "selector", "inputs": ["time-stage"]}, - "video-ocr-sparse-stage": {"operator": 
"ocr_sparse", "inputs": ["selector-stage"]}, - "video-ocr-content-stage": {"operator": "ocr_content", "inputs": ["video-ocr-sparse-stage"]}, - "video-caption-sparse-stage": {"operator": "caption_sparse", "inputs": ["selector-stage"]}, - "video-caption-content-stage": {"operator": "caption_content", "inputs": ["video-caption-sparse-stage"]}, - "asr-sparse-stage": {"operator": "asr_sparse", "inputs": ["time-stage"]}, - "asr-content-stage": {"operator": "asr_content", "inputs": ["asr-sparse-stage"]}, - "prompt": {"operator": "merge_prompt", "inputs": ["asr-content-stage", "video-caption-content-stage", "video-ocr-content-stage"], "merge": "COMBINE"} - }, - "output": [ - "prompt" - ], - "mergeType": "MERGE" -} \ No newline at end of file diff --git a/test-schema.json b/test-schema.json deleted file mode 100644 index 475bca28a..000000000 --- a/test-schema.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "schemas": [ - { - "name": "test", - "connection": { - "database": "CottontailConnectionProvider", - "parameters": { - "Host": "127.0.0.1", - "port": "1865" - } - }, - "fields": [ - { - "name": "averagecolor", - "factory": "AverageColor" - }, - { - "name": "file", - "factory": "FileSourceMetadata" - }, - { - "name": "time", - "factory": "TemporalMetadata" - }, - { - "name": "video", - "factory": "VideoSourceMetadata" - }, - { - "name": "asr_sparse", - "factory": "ASR", - "parameters": { - "host": "http://10.34.64.84:8888/", - "model": "whisper", - "timeoutSeconds": "100", - "retries":"1000" - } - }, - { - "name": "caption_sparse", - "factory": "ImageCaption", - "parameters": { - "host": "http://10.34.64.84:8888/", - "timeoutSeconds": "100", - "retries":"1000" - } - }, - { - "name": "ocr_sparse", - "factory": "OCR", - "parameters": { - "host": "http://10.34.64.84:8888/", - "model": "tesseract", - "timeoutSeconds": "100", - "retries":"1000" - } - } - ], - "resolvers": { - "disk": { - "factory": "DiskResolver", - "parameters": { - "location": "../thumbnails" - } - } - }, - 
"exporters": [ - { - "name": "thumbnail", - "factory": "ThumbnailExporter", - "resolverName": "disk", - "parameters": { - "maxSideResolution": "400", - "mimeType": "JPG" - } - } - ], - "extractionPipelines": [ - { - "name": "full", - "path": "./test-pipeline.json" - } - ] - } - ] -} \ No newline at end of file From af3970970e1bbb6b07aed0d93255fb19f307ffd9 Mon Sep 17 00:00:00 2001 From: faberf Date: Wed, 14 Aug 2024 15:39:35 +0200 Subject: [PATCH 21/34] small refactoring --- .../config/ingest/IngestionPipelineBuilder.kt | 50 +++++++++-------- .../core/config/ingest/operation/Operation.kt | 54 ++++++------------- .../core/features/AbstractBatchedExtractor.kt | 12 ++--- .../model/content/element/ContentElement.kt | 6 ++- .../operators/persistence/PersistingSink.kt | 2 - 5 files changed, 50 insertions(+), 74 deletions(-) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt index 7a2755461..657286d4f 100755 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/IngestionPipelineBuilder.kt @@ -3,9 +3,7 @@ package org.vitrivr.engine.core.config.ingest import io.github.oshai.kotlinlogging.KLogger import io.github.oshai.kotlinlogging.KotlinLogging import org.vitrivr.engine.core.config.IndexContextFactory -import org.vitrivr.engine.core.config.ingest.operation.BaseOperation import org.vitrivr.engine.core.config.ingest.operation.Operation -import org.vitrivr.engine.core.config.ingest.operation.PassthroughOperation import org.vitrivr.engine.core.config.ingest.operator.OperatorConfig import org.vitrivr.engine.core.context.IndexContext import org.vitrivr.engine.core.model.content.element.ContentElement @@ -60,6 +58,7 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { val 
config = root.opConfig as? OperatorConfig.Enumerator ?: throw IllegalArgumentException("Root stage must always be an enumerator!") val built = HashMap>() + root.opName ?: throw IllegalArgumentException("Root stage cannot be passthrough!") built[root.name] = buildEnumerator(root.opName, config, stream) for (output in root.output) { @@ -85,11 +84,11 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { /** * This is an internal function that can be called recursively to build the [Operator] DAG. * - * @param operation The [IOperation] to build. + * @param operation The [BaseOperation] to build. * @param memoizationTable The memoization table that holds the already built operators. * @return The built [Operator]. */ - private fun buildInternal(operation: BaseOperation, memoizationTable: MutableMap>, breakAt: BaseOperation? = null) { + private fun buildInternal(operation: Operation, memoizationTable: MutableMap>, breakAt: Operation? = null) { /* Find all required input operations and merge them (if necessary). */ if (operation == breakAt) return val inputs = operation.input.map { @@ -110,24 +109,22 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { } /* Prepare and cache operator. 
*/ - when(operation) { - is Operation -> { - val operator = buildOperator(operation.opName, op, operation.opConfig) - if (operation.output.size > 1) { - memoizationTable[operation.name] = BroadcastOperator(operator) - } else { - memoizationTable[operation.name] = operator - } + if (operation.opName != null && operation.opConfig != null) { + val operator = buildOperator(operation.opName, op, operation.opConfig) + if (operation.output.size > 1) { + memoizationTable[operation.name] = BroadcastOperator(operator) + } else { + memoizationTable[operation.name] = operator } - is PassthroughOperation -> { - if (operation.output.size > 1) { - memoizationTable[operation.name] = BroadcastOperator(op) - } else { - memoizationTable[operation.name] = op - } + } else { + if (operation.output.size > 1) { + memoizationTable[operation.name] = BroadcastOperator(op) + } else { + memoizationTable[operation.name] = op } } + /* Process output operators. */ for (output in operation.output) { buildInternal(output, memoizationTable, operation) @@ -148,28 +145,29 @@ class IngestionPipelineBuilder(val config: IngestionConfig) { /* Build trees with entry points as roots. 
*/ return entrypoints.map { - val stages = HashMap() - val root = Operation(it.key, it.value.operator as String, config.operators[it.value.operator] ?: throw IllegalArgumentException("Undefined operator '${it.value.operator}'"), it.value.merge) + val stages = HashMap() + it.value.operator ?: throw IllegalArgumentException("Entrypoints must have an operator!") + val root = Operation(it.key, it.value.operator!!, config.operators[it.value.operator] ?: throw IllegalArgumentException("Undefined operator '${it.value.operator}'"), it.value.merge) stages[it.key] = root for (operation in this.config.operations) { if (!stages.containsKey(operation.key)) { when(operation.value.operator) { is String -> stages[operation.key] = Operation( - operation.key, - operation.value.operator as String, - config.operators[operation.value.operator as String] ?: throw IllegalArgumentException("Undefined operator '${operation.value.operator}'"), - operation.value.merge + name = operation.key, + opName = operation.value.operator!!, + opConfig = config.operators[operation.value.operator!!] 
?: throw IllegalArgumentException("Undefined operator '${operation.value.operator}'"), + merge = operation.value.merge ) null -> - stages[operation.key] = PassthroughOperation(operation.key, operation.value.merge) + stages[operation.key] = Operation(name = operation.key, opName = null, opConfig = null, merge = operation.value.merge) } } for (inputKey in operation.value.inputs) { if (!stages.containsKey(inputKey)) { val op = this.config.operations[inputKey] ?: throw IllegalArgumentException("Undefined operation '${inputKey}'") - stages[inputKey] = Operation(inputKey, op.operator as String, config.operators[op.operator] ?: throw IllegalArgumentException("Undefined operator '${op.operator}'"), op.merge) + stages[inputKey] = Operation(inputKey, op.operator!!, config.operators[op.operator] ?: throw IllegalArgumentException("Undefined operator '${op.operator}'"), op.merge) } stages[operation.key]?.addInput(stages[inputKey]!!) } diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt index f40f713fc..41a9f6280 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/config/ingest/operation/Operation.kt @@ -7,60 +7,40 @@ import java.util.* /** * This sealed class represents a base operation in the ingest pipeline. */ -sealed class BaseOperation(val name: String, val merge: MergeType?) { +class Operation(val name: String, val opName: String?, val opConfig: OperatorConfig?, val merge: MergeType?) { - /** A [LinkedList] of all input [BaseOperation]s. */ - private val _input = LinkedList() + /** A [LinkedList] of all input [Operation]s. */ + private val _input = LinkedList() - /** A [LinkedList] of all output [BaseOperation]s. 
*/ - private val _output = LinkedList() + /** A [LinkedList] of all output [Operation]s. */ + private val _output = LinkedList() - /** A [List] of all input [BaseOperation]s. */ - val input: List + /** A [List] of all input [Operation]s. */ + val input: List get() = Collections.unmodifiableList(this._input) - /** A [List] of all output [BaseOperation]s. */ - val output: List + /** A [List] of all output [Operation]s. */ + val output: List get() = Collections.unmodifiableList(this._output) /** - * Adds an input [BaseOperation] to this [BaseOperation]. + * Adds an input [Operation] to this [Operation]. * - * @param operation The [BaseOperation] to add. + * @param operation The [Operation] to add. */ - fun addInput(operation: BaseOperation) { + fun addInput(operation: Operation) { this._input.add(operation) - operation.internalAddOutput(this) + operation._output.add(this) } /** - * Adds an output [BaseOperation] to this [BaseOperation]. + * Adds an output [Operation] to this [Operation]. * - * @param operation The [BaseOperation] to add. + * @param operation The [Operation] to add. */ - fun addOutput(operation: BaseOperation) { - this._output.add(operation) - operation.internalAddInput(this) - } - - protected fun internalAddInput(operation: BaseOperation) { - this._input.add(operation) - } - - protected fun internalAddOutput(operation: BaseOperation) { + fun addOutput(operation: Operation) { this._output.add(operation) + operation._input.add(this) } } -/** - * This [Operation] class represents a single operation in the ingest pipeline. - * - * @param opName The specific operation name. - * @param opConfig The configuration for the operation. - */ -class Operation(name: String, val opName: String, val opConfig: OperatorConfig, merge: MergeType? = null) : BaseOperation(name, merge) - -/** - * This [PassthroughOperation] class represents a passthrough operation in the ingest pipeline. - */ -class PassthroughOperation(name: String, merge: MergeType? 
= null) : BaseOperation(name, merge) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt index eb876c014..f2afc9800 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt @@ -53,10 +53,8 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f if (batch.size >= bufferSize) { logger.debug { "Batch size reached for field ${field?.fieldName}, extracting descriptors" } val descriptors = extract(batch) - // zip descriptors and batch - for (i in batch.indices) { - val r = batch[i] - for (d in descriptors[i]) { + batch.forEachIndexed { i, r -> + descriptors[i].forEach { d -> r.addDescriptor(d) } } @@ -74,10 +72,8 @@ abstract class AbstractBatchedExtractor, D : Descriptor>(f // Emit any remaining items in the batch if (batch.isNotEmpty()) { val descriptors = extract(batch) - // zip descriptors and batch - for (i in batch.indices) { - val r = batch[i] - for (d in descriptors[i]) { + batch.forEachIndexed { i, r -> + descriptors[i].forEach { d -> r.addDescriptor(d) } } diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt index 05bcc626c..a50c3ba59 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/content/element/ContentElement.kt @@ -1,9 +1,13 @@ package org.vitrivr.engine.core.model.content.element +import kotlinx.serialization.Serializable import org.vitrivr.engine.core.model.content.Content import org.vitrivr.engine.core.model.content.ContentType +import 
org.vitrivr.engine.core.model.serializer.UUIDSerializer import java.util.UUID +typealias ContentId = @Serializable(UUIDSerializer::class) UUID + /** * A [Content] element is a piece of [Content] that is tied to some actual [Content]. * @@ -20,7 +24,7 @@ sealed interface ContentElement: Content { */ val content: T - val id: UUID + val id: ContentId /** The [ContentType] of this [ContentElement]. */ val type: ContentType diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt index cdb638278..430a8328c 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/operators/persistence/PersistingSink.kt @@ -14,8 +14,6 @@ import org.vitrivr.engine.core.model.retrievable.Ingested import org.vitrivr.engine.core.model.retrievable.Retrievable import org.vitrivr.engine.core.operators.Operator -private val logger = KotlinLogging.logger {} - /** * A [Operator.Sink] that persists the [Ingested] it receives. 
* From 9c6238eb9a7df9a604f07ef98067c5a339f46ac9 Mon Sep 17 00:00:00 2001 From: Raphael Date: Wed, 14 Aug 2024 15:59:43 +0200 Subject: [PATCH 22/34] Adds filter threshold and topk --- .../classification/ImageClassificationExtractor.kt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt index 22bf495e6..b42ea7481 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt @@ -4,6 +4,8 @@ import org.vitrivr.engine.base.features.external.api.ZeroShotClassificationApi import org.vitrivr.engine.base.features.external.common.ExternalFesAnalyser import org.vitrivr.engine.base.features.external.common.FesExtractor import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.CLASSES_PARAMETER_NAME +import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.THRESHOLD_PARAMETER_NAME +import org.vitrivr.engine.base.features.external.implementations.classification.ImageClassification.Companion.TOPK_PARAMETER_NAME import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.descriptor.struct.LabelDescriptor import org.vitrivr.engine.core.model.metamodel.Schema @@ -41,6 +43,9 @@ class ImageClassificationExtractor( val classes = this.parameters[CLASSES_PARAMETER_NAME]?.split(",") ?: throw IllegalArgumentException("No classes provided.") + val topK = 
this.parameters[TOPK_PARAMETER_NAME]?.toInt() ?: 1 + val threshold = this.parameters[THRESHOLD_PARAMETER_NAME]?.toFloat() ?: 0.0f + val flatResults = this.api.analyseBatched( retrievables.flatMap { this.filterContent(it).map { it to classes } @@ -54,7 +59,7 @@ class ImageClassificationExtractor( "confidence" to Value.Float(confidence.value.toFloat()) ), ) - } + }.filter { it.confidence.value >= threshold }.sortedByDescending { it.confidence.value }.take(topK) } return flatResults } From e2815f7e96eed2ccc5bf5df4b4bcbcddda5069ae Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 15 Aug 2024 09:47:49 +0200 Subject: [PATCH 23/34] adds missing field --- .../external/implementations/caption/ImageCaptionExtractor.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt index ba85a0ebb..2bcd35916 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt @@ -59,9 +59,9 @@ class ImageCaptionExtractor( var withoutTextIndex = 0 for (i in text.indices) { if (text[i] != null) { - results.add(TextDescriptor(UUID.randomUUID(),null,withTextResults[withTextIndex++])) + results.add(TextDescriptor(UUID.randomUUID(),null,withTextResults[withTextIndex++],this.field)) } else { - results.add(TextDescriptor(UUID.randomUUID(),null,withoutTextResults[withoutTextIndex++])) + results.add(TextDescriptor(UUID.randomUUID(),null,withoutTextResults[withoutTextIndex++],this.field)) } } return results From 442a38f3b1f3ffe4179ba03ce4babe82b382d680 Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 15 
Aug 2024 10:50:27 +0200 Subject: [PATCH 24/34] adds escaping entity name for pg --- .../implementations/caption/ImageCaptionExtractor.kt | 2 +- .../database/pgvector/descriptor/PgDescriptorWriter.kt | 2 +- .../database/pgvector/retrievable/RetrievableWriter.kt | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt index 2bcd35916..8a130e283 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt @@ -61,7 +61,7 @@ class ImageCaptionExtractor( if (text[i] != null) { results.add(TextDescriptor(UUID.randomUUID(),null,withTextResults[withTextIndex++],this.field)) } else { - results.add(TextDescriptor(UUID.randomUUID(),null,withoutTextResults[withoutTextIndex++],this.field)) + results.add(TextDescriptor(UUID.randomUUID(),null,withoutTextResults[withoutTextIndex++])) } } return results diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt index b2300d00a..68df8a0a2 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt @@ -165,7 +165,7 @@ open class PgDescriptorWriter(final override val field: Schema.F * @return [PreparedStatement] */ protected fun prepareInsertStatement(): 
PreparedStatement { - val statement = StringBuilder("INSERT INTO $tableName ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") + val statement = StringBuilder("INSERT INTO \"$tableName\" ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") for (field in this.prototype.layout()) { statement.append(", \"${field.name}\"") } diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt index 5de14b540..feb20d393 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt @@ -20,7 +20,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun add(item: Retrievable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO $RETRIEVABLE_ENTITY_NAME ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO \"$RETRIEVABLE_ENTITY_NAME\" ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> stmt.setObject(1, item.id) stmt.setString(2, item.type) return stmt.executeUpdate() == 1 @@ -38,7 +38,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun addAll(items: Iterable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO $RETRIEVABLE_ENTITY_NAME ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO \"$RETRIEVABLE_ENTITY_NAME\" ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> for (item in items) { stmt.setObject(1, 
item.id) stmt.setString(2, item.type) @@ -116,7 +116,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun connect(relationship: Relationship): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO $RELATIONSHIP_ENTITY_NAME ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO \"$RELATIONSHIP_ENTITY_NAME\" ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> stmt.setObject(1, relationship.objectId) stmt.setString(2, relationship.predicate) stmt.setObject(3, relationship.subjectId) @@ -136,7 +136,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun connectAll(relationships: Iterable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO $RELATIONSHIP_ENTITY_NAME ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO \"$RELATIONSHIP_ENTITY_NAME\" ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> for (relationship in relationships) { stmt.setObject(1, relationship.objectId) stmt.setString(2, relationship.predicate) From 0854d6957edd37074d18e892c4429e8466ad6390 Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 15 Aug 2024 12:50:18 +0200 Subject: [PATCH 25/34] Adds retrievableId for persisting --- .../classification/ImageClassificationExtractor.kt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt index b42ea7481..9684ef944 100644 
--- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/classification/ImageClassificationExtractor.kt @@ -49,15 +49,16 @@ class ImageClassificationExtractor( val flatResults = this.api.analyseBatched( retrievables.flatMap { this.filterContent(it).map { it to classes } - }).map { result -> - result.mapIndexed { index, confidence -> + }).mapIndexed { idx, result -> + result.mapIndexed { idy, confidence -> LabelDescriptor( UUID.randomUUID(), - null, + retrievables[idx].id, mapOf( - "label" to Value.String(classes[index]), + "label" to Value.String(classes[idy]), "confidence" to Value.Float(confidence.value.toFloat()) ), + this.field ) }.filter { it.confidence.value >= threshold }.sortedByDescending { it.confidence.value }.take(topK) } From 7ade79b3a9b129e0de4c570c32db46e619d27ec8 Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 15 Aug 2024 13:14:42 +0200 Subject: [PATCH 26/34] removes postgres escapeing --- .../database/pgvector/PgVectorConnection.kt | 4 +-- .../descriptor/PgDescriptorInitializer.kt | 36 +++++++++---------- .../pgvector/descriptor/PgDescriptorWriter.kt | 6 ++-- .../scalar/ScalarDescriptorReader.kt | 4 +-- .../struct/StructDescriptorReader.kt | 4 +-- .../vector/VectorDescriptorReader.kt | 4 +-- .../retrievable/RetrievableInitializer.kt | 14 ++++---- .../pgvector/retrievable/RetrievableWriter.kt | 8 ++--- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt index 1b256dc45..a693a91cd 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt +++ 
b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt @@ -37,7 +37,7 @@ class PgVectorConnection(provider: PgVectorConnectionProvider, schemaName: Strin /* Create necessary schema. */ try { - this.jdbc.prepareStatement("CREATE SCHEMA \"${schemaName}\";").use { + this.jdbc.prepareStatement("CREATE SCHEMA ${schemaName};").use { it.execute() } } catch (e: SQLException) { @@ -50,7 +50,7 @@ class PgVectorConnection(provider: PgVectorConnectionProvider, schemaName: Strin } try { - this.jdbc.prepareStatement("SET search_path TO \"$schemaName\", public;").use { + this.jdbc.prepareStatement("SET search_path TO $schemaName, public;").use { it.execute() } } catch (e: SQLException) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt index 4e7879dd3..053b3c8dd 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt @@ -37,29 +37,29 @@ open class PgDescriptorInitializer(final override val field: Sch * Initializes the PostgreSQL table entity backing this [PgDescriptorInitializer]. */ override fun initialize() { - val statement = StringBuilder("CREATE TABLE IF NOT EXISTS \"${tableName}\" (") + val statement = StringBuilder("CREATE TABLE IF NOT EXISTS ${tableName} (") statement.append("$DESCRIPTOR_ID_COLUMN_NAME uuid NOT NULL, ") statement.append("$RETRIEVABLE_ID_COLUMN_NAME uuid NOT NULL, ") /* Add columns for each field in the struct. 
*/ for (field in this.prototype.layout()) { when (field.type) { - Type.String -> statement.append("\"${field.name}\" varchar(255), ") - Type.Text -> statement.append("\"${field.name}\" text, ") - Type.Boolean -> statement.append("\"${field.name}\" boolean, ") - Type.Byte -> statement.append("\"${field.name}\" smallint, ") - Type.Short -> statement.append("\"${field.name}\" smallint, ") - Type.Int -> statement.append("\"${field.name}\" integer, ") - Type.Long -> statement.append("\"${field.name}\" bigint, ") - Type.Float -> statement.append("\"${field.name}\" real, ") - Type.Double -> statement.append("\"${field.name}\" double precision, ") - Type.Datetime -> statement.append("\"${field.name}\" datetime, ") - Type.UUID -> statement.append("\"${field.name}\" uuid, ") - is Type.BooleanVector -> statement.append("\"${field.name}\" bit(${field.type.dimensions}), ") - is Type.DoubleVector -> statement.append("\"${field.name}\" vector(${field.type.dimensions}), ") - is Type.FloatVector -> statement.append("\"${field.name}\" vector(${field.type.dimensions}), ") - is Type.IntVector -> statement.append("\"${field.name}\" vector(${field.type.dimensions}), ") - is Type.LongVector -> statement.append("\"${field.name}\" vector(${field.type.dimensions}), ") + Type.String -> statement.append("${field.name} varchar(255), ") + Type.Text -> statement.append("${field.name} text, ") + Type.Boolean -> statement.append("${field.name} boolean, ") + Type.Byte -> statement.append("${field.name} smallint, ") + Type.Short -> statement.append("${field.name} smallint, ") + Type.Int -> statement.append("${field.name} integer, ") + Type.Long -> statement.append("${field.name} bigint, ") + Type.Float -> statement.append("${field.name} real, ") + Type.Double -> statement.append("${field.name} double precision, ") + Type.Datetime -> statement.append("${field.name} datetime, ") + Type.UUID -> statement.append("${field.name} uuid, ") + is Type.BooleanVector -> statement.append("${field.name} 
bit(${field.type.dimensions}), ") + is Type.DoubleVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") + is Type.FloatVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") + is Type.IntVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") + is Type.LongVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") } } @@ -109,7 +109,7 @@ open class PgDescriptorInitializer(final override val field: Sch override fun deinitialize() { try { /* Create 'retrievable' entity and index. */ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS \"${tableName}\" CASCADE;").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS ${tableName} CASCADE;").use { it.execute() } } catch (e: SQLException) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt index 68df8a0a2..31ed62ea5 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt @@ -153,7 +153,7 @@ open class PgDescriptorWriter(final override val field: Schema.F protected fun prepareUpdateStatement(): PreparedStatement { val statement = StringBuilder("UPDATE $tableName SET $RETRIEVABLE_ID_COLUMN_NAME = ?") for (field in this.prototype.layout()) { - statement.append(", \"${field.name}\" = ?") + statement.append(", ${field.name} = ?") } statement.append("WHERE $DESCRIPTOR_ID_COLUMN_NAME = ?;") return this.connection.jdbc.prepareStatement(statement.toString()) @@ -165,9 +165,9 @@ open class PgDescriptorWriter(final override val field: Schema.F * @return [PreparedStatement] */ protected fun 
prepareInsertStatement(): PreparedStatement { - val statement = StringBuilder("INSERT INTO \"$tableName\" ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") + val statement = StringBuilder("INSERT INTO $tableName ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") for (field in this.prototype.layout()) { - statement.append(", \"${field.name}\"") + statement.append(", ${field.name}") } statement.append(") VALUES (?, ?") for (field in this.field.analyser.prototype(this.field).layout()) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt index c6729dd54..0e701613f 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt @@ -62,7 +62,7 @@ class ScalarDescriptorReader(field: Schema.Field<*, ScalarDescriptor<*>>, connec * @return [Sequence] of [ScalarDescriptor]s. */ private fun queryFulltext(query: SimpleFulltextQuery): Sequence> { - val statement = "SELECT * FROM \"$tableName\" WHERE $VALUE_ATTRIBUTE_NAME @@ plainto_tsquery(?)" + val statement = "SELECT * FROM $tableName WHERE $VALUE_ATTRIBUTE_NAME @@ plainto_tsquery(?)" return sequence { this@ScalarDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setString(1, query.value.value) @@ -82,7 +82,7 @@ class ScalarDescriptorReader(field: Schema.Field<*, ScalarDescriptor<*>>, connec * @return [Sequence] of [ScalarDescriptor]s. */ private fun queryBoolean(query: SimpleBooleanQuery<*>): Sequence> { - val statement = "SELECT * FROM \"$tableName\" WHERE $VALUE_ATTRIBUTE_NAME ${query.comparison.toSql()} ?" 
+ val statement = "SELECT * FROM $tableName WHERE $VALUE_ATTRIBUTE_NAME ${query.comparison.toSql()} ?" return sequence { this@ScalarDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt index 7ac1461b6..f15b444b1 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt @@ -86,7 +86,7 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio */ private fun queryFulltext(query: SimpleFulltextQuery): Sequence { require(query.attributeName != null) { "Query attribute must not be null for a fulltext query on a struct descriptor." } - val statement = "SELECT * FROM \"$tableName\" WHERE ${query.attributeName} @@ plainto_tsquery(?)" + val statement = "SELECT * FROM $tableName WHERE ${query.attributeName} @@ plainto_tsquery(?)" return sequence { this@StructDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setString(1, query.value.value) @@ -107,7 +107,7 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio */ private fun queryBoolean(query: SimpleBooleanQuery<*>): Sequence { require(query.attributeName != null) { "Query attribute must not be null for a fulltext query on a struct descriptor." } - val statement = "SELECT * FROM \"$tableName\" WHERE ${query.attributeName} ${query.comparison.toSql()} ?" + val statement = "SELECT * FROM $tableName WHERE ${query.attributeName} ${query.comparison.toSql()} ?" 
return sequence { this@StructDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt index 6d69d35be..f74a60c41 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt @@ -97,7 +97,7 @@ class VectorDescriptorReader(field: Schema.Field<*, VectorDescriptor<*>>, connec */ private fun queryProximity(query: ProximityQuery<*>): Sequence> = sequence { val statement = - "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? AS $DISTANCE_COLUMN_NAME FROM \"${tableName}\" ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" + "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? AS $DISTANCE_COLUMN_NAME FROM ${tableName} ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" this@VectorDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) stmt.executeQuery().use { result -> @@ -117,7 +117,7 @@ class VectorDescriptorReader(field: Schema.Field<*, VectorDescriptor<*>>, connec private fun queryAndJoinProximity(query: ProximityQuery<*>): Sequence { val descriptors = mutableListOf, Float>>() val statement = - "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? 
AS $DISTANCE_COLUMN_NAME FROM \"${tableName}\" ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" + "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? AS $DISTANCE_COLUMN_NAME FROM ${tableName} ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" this@VectorDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) stmt.executeQuery().use { result -> diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableInitializer.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableInitializer.kt index afd8408f6..1e53c6e1c 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableInitializer.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableInitializer.kt @@ -18,12 +18,12 @@ internal class RetrievableInitializer(private val connection: PgVectorConnection override fun initialize() { try { /* Create 'retrievable' entity and index. */ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "CREATE TABLE IF NOT EXISTS \"${RETRIEVABLE_ENTITY_NAME}\" ($RETRIEVABLE_ID_COLUMN_NAME uuid NOT NULL, type VARCHAR(100), PRIMARY KEY ($RETRIEVABLE_ID_COLUMN_NAME));").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "CREATE TABLE IF NOT EXISTS ${RETRIEVABLE_ENTITY_NAME} ($RETRIEVABLE_ID_COLUMN_NAME uuid NOT NULL, type VARCHAR(100), PRIMARY KEY ($RETRIEVABLE_ID_COLUMN_NAME));").use { it.execute() } /* Create 'relationship' entity. 
*/ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "CREATE TABLE IF NOT EXISTS \"${RELATIONSHIP_ENTITY_NAME}\" ($OBJECT_ID_COLUMN_NAME uuid NOT NULL, $PREDICATE_COLUMN_NAME VARCHAR(100) NOT NULL, $SUBJECT_ID_COLUMN_NAME uuid NOT NULL, PRIMARY KEY ($OBJECT_ID_COLUMN_NAME, $PREDICATE_COLUMN_NAME, $SUBJECT_ID_COLUMN_NAME), FOREIGN KEY($OBJECT_ID_COLUMN_NAME) REFERENCES $RETRIEVABLE_ENTITY_NAME($RETRIEVABLE_ID_COLUMN_NAME) ON DELETE CASCADE, FOREIGN KEY($SUBJECT_ID_COLUMN_NAME) REFERENCES $RETRIEVABLE_ENTITY_NAME($RETRIEVABLE_ID_COLUMN_NAME) ON DELETE CASCADE);") + this.connection.jdbc.prepareStatement(/* sql = postgres */ "CREATE TABLE IF NOT EXISTS ${RELATIONSHIP_ENTITY_NAME} ($OBJECT_ID_COLUMN_NAME uuid NOT NULL, $PREDICATE_COLUMN_NAME VARCHAR(100) NOT NULL, $SUBJECT_ID_COLUMN_NAME uuid NOT NULL, PRIMARY KEY ($OBJECT_ID_COLUMN_NAME, $PREDICATE_COLUMN_NAME, $SUBJECT_ID_COLUMN_NAME), FOREIGN KEY($OBJECT_ID_COLUMN_NAME) REFERENCES $RETRIEVABLE_ENTITY_NAME($RETRIEVABLE_ID_COLUMN_NAME) ON DELETE CASCADE, FOREIGN KEY($SUBJECT_ID_COLUMN_NAME) REFERENCES $RETRIEVABLE_ENTITY_NAME($RETRIEVABLE_ID_COLUMN_NAME) ON DELETE CASCADE);") .use { it.execute() } @@ -38,12 +38,12 @@ internal class RetrievableInitializer(private val connection: PgVectorConnection override fun deinitialize() { try { /* Create 'retrievable' entity and index. */ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS \"${RETRIEVABLE_ENTITY_NAME}\" CASCADE;").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS ${RETRIEVABLE_ENTITY_NAME} CASCADE;").use { it.execute() } /* Create 'relationship' entity. 
*/ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS \"${RELATIONSHIP_ENTITY_NAME}\" CASCADE;").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS ${RELATIONSHIP_ENTITY_NAME} CASCADE;").use { it.execute() } } catch (e: SQLException) { @@ -58,14 +58,14 @@ internal class RetrievableInitializer(private val connection: PgVectorConnection */ override fun isInitialized(): Boolean { try { - this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM \"${RETRIEVABLE_ENTITY_NAME}\";").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM ${RETRIEVABLE_ENTITY_NAME};").use { it.execute() } } catch (e: SQLException) { return false } try { - this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM \"$RELATIONSHIP_ENTITY_NAME\";").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM $RELATIONSHIP_ENTITY_NAME;").use { it.execute() } } catch (e: SQLException) { @@ -79,7 +79,7 @@ internal class RetrievableInitializer(private val connection: PgVectorConnection */ override fun truncate() { try { - this.connection.jdbc.prepareStatement(/* sql = postgres */ "TRUNCATE \"${RETRIEVABLE_ENTITY_NAME}\", \"${RELATIONSHIP_ENTITY_NAME}\"").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "TRUNCATE ${RETRIEVABLE_ENTITY_NAME}, ${RELATIONSHIP_ENTITY_NAME}").use { it.execute() } } catch (e: SQLException) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt index feb20d393..5de14b540 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt +++ 
b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/retrievable/RetrievableWriter.kt @@ -20,7 +20,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun add(item: Retrievable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO \"$RETRIEVABLE_ENTITY_NAME\" ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO $RETRIEVABLE_ENTITY_NAME ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> stmt.setObject(1, item.id) stmt.setString(2, item.type) return stmt.executeUpdate() == 1 @@ -38,7 +38,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun addAll(items: Iterable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO \"$RETRIEVABLE_ENTITY_NAME\" ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO $RETRIEVABLE_ENTITY_NAME ($RETRIEVABLE_ID_COLUMN_NAME, $RETRIEVABLE_TYPE_COLUMN_NAME) VALUES (?, ?);").use { stmt -> for (item in items) { stmt.setObject(1, item.id) stmt.setString(2, item.type) @@ -116,7 +116,7 @@ internal class RetrievableWriter(override val connection: PgVectorConnection): R */ override fun connect(relationship: Relationship): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO \"$RELATIONSHIP_ENTITY_NAME\" ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO $RELATIONSHIP_ENTITY_NAME ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> stmt.setObject(1, relationship.objectId) stmt.setString(2, relationship.predicate) stmt.setObject(3, relationship.subjectId) @@ -136,7 +136,7 @@ internal class 
RetrievableWriter(override val connection: PgVectorConnection): R */ override fun connectAll(relationships: Iterable): Boolean { try { - this.connection.jdbc.prepareStatement("INSERT INTO \"$RELATIONSHIP_ENTITY_NAME\" ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> + this.connection.jdbc.prepareStatement("INSERT INTO $RELATIONSHIP_ENTITY_NAME ($OBJECT_ID_COLUMN_NAME,$PREDICATE_COLUMN_NAME,$SUBJECT_ID_COLUMN_NAME) VALUES (?,?,?)").use { stmt -> for (relationship in relationships) { stmt.setObject(1, relationship.objectId) stmt.setString(2, relationship.predicate) From 1131708db751385691eee8cd81ebed948f500188 Mon Sep 17 00:00:00 2001 From: Raphael Date: Thu, 15 Aug 2024 13:26:42 +0200 Subject: [PATCH 27/34] Escapes and lowercases field names Preventing naming conflicts e.g. "end" --- .../descriptor/PgDescriptorInitializer.kt | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt index 053b3c8dd..7d20abea0 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt @@ -44,22 +44,22 @@ open class PgDescriptorInitializer(final override val field: Sch /* Add columns for each field in the struct. 
*/ for (field in this.prototype.layout()) { when (field.type) { - Type.String -> statement.append("${field.name} varchar(255), ") - Type.Text -> statement.append("${field.name} text, ") - Type.Boolean -> statement.append("${field.name} boolean, ") - Type.Byte -> statement.append("${field.name} smallint, ") - Type.Short -> statement.append("${field.name} smallint, ") - Type.Int -> statement.append("${field.name} integer, ") - Type.Long -> statement.append("${field.name} bigint, ") - Type.Float -> statement.append("${field.name} real, ") - Type.Double -> statement.append("${field.name} double precision, ") - Type.Datetime -> statement.append("${field.name} datetime, ") - Type.UUID -> statement.append("${field.name} uuid, ") - is Type.BooleanVector -> statement.append("${field.name} bit(${field.type.dimensions}), ") - is Type.DoubleVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") - is Type.FloatVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") - is Type.IntVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") - is Type.LongVector -> statement.append("${field.name} vector(${field.type.dimensions}), ") + Type.String -> statement.append("\"${field.name.lowercase()}\" varchar(255), ") + Type.Text -> statement.append("\"${field.name.lowercase()}\" text, ") + Type.Boolean -> statement.append("\"${field.name.lowercase()}\" boolean, ") + Type.Byte -> statement.append("\"${field.name.lowercase()}\" smallint, ") + Type.Short -> statement.append("\"${field.name.lowercase()}\" smallint, ") + Type.Int -> statement.append("\"${field.name.lowercase()}\" integer, ") + Type.Long -> statement.append("\"${field.name.lowercase()}\" bigint, ") + Type.Float -> statement.append("\"${field.name.lowercase()}\" real, ") + Type.Double -> statement.append("\"${field.name.lowercase()}\" double precision, ") + Type.Datetime -> statement.append("\"${field.name.lowercase()}\" datetime, ") + Type.UUID -> 
statement.append("\"${field.name.lowercase()}\" uuid, ") + is Type.BooleanVector -> statement.append("\"${field.name.lowercase()}\" bit(${field.type.dimensions}), ") + is Type.DoubleVector -> statement.append("\"${field.name.lowercase()}\" vector(${field.type.dimensions}), ") + is Type.FloatVector -> statement.append("\"${field.name.lowercase()}\" vector(${field.type.dimensions}), ") + is Type.IntVector -> statement.append("\"${field.name.lowercase()}\" vector(${field.type.dimensions}), ") + is Type.LongVector -> statement.append("\"${field.name.lowercase()}\" vector(${field.type.dimensions}), ") } } From 29f800b5b1128def051b3af3a26f5edf91488861 Mon Sep 17 00:00:00 2001 From: Raphael Date: Fri, 16 Aug 2024 11:47:12 +0200 Subject: [PATCH 28/34] escapes schema in PGVectorConnection --- .../engine/database/pgvector/PgVectorConnection.kt | 4 ++-- .../pgvector/descriptor/PgDescriptorInitializer.kt | 10 +++++----- .../src/test/resources/test-schema-postgres.json | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt index a693a91cd..a88aa0731 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/PgVectorConnection.kt @@ -37,7 +37,7 @@ class PgVectorConnection(provider: PgVectorConnectionProvider, schemaName: Strin /* Create necessary schema. 
*/ try { - this.jdbc.prepareStatement("CREATE SCHEMA ${schemaName};").use { + this.jdbc.prepareStatement("CREATE SCHEMA \"${schemaName.lowercase()}\";").use { it.execute() } } catch (e: SQLException) { @@ -50,7 +50,7 @@ class PgVectorConnection(provider: PgVectorConnectionProvider, schemaName: Strin } try { - this.jdbc.prepareStatement("SET search_path TO $schemaName, public;").use { + this.jdbc.prepareStatement("SET search_path TO \"${schemaName.lowercase()}\", public;").use { it.execute() } } catch (e: SQLException) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt index 7d20abea0..2a522b216 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorInitializer.kt @@ -37,7 +37,7 @@ open class PgDescriptorInitializer(final override val field: Sch * Initializes the PostgreSQL table entity backing this [PgDescriptorInitializer]. */ override fun initialize() { - val statement = StringBuilder("CREATE TABLE IF NOT EXISTS ${tableName} (") + val statement = StringBuilder("CREATE TABLE IF NOT EXISTS \"${tableName.lowercase()}\" (") statement.append("$DESCRIPTOR_ID_COLUMN_NAME uuid NOT NULL, ") statement.append("$RETRIEVABLE_ID_COLUMN_NAME uuid NOT NULL, ") @@ -97,7 +97,7 @@ open class PgDescriptorInitializer(final override val field: Sch } this.connection.jdbc.prepareStatement(/* sql = postgres */ indexStatement).use { it.execute() } } catch (e: SQLException) { - LOGGER.error(e) { "Failed to create index ${index.type} for entity '$tableName' due to exception." } + LOGGER.error(e) { "Failed to create index ${index.type} for entity \"${tableName.lowercase()}\" due to exception." 
} throw e } } @@ -109,7 +109,7 @@ open class PgDescriptorInitializer(final override val field: Sch override fun deinitialize() { try { /* Create 'retrievable' entity and index. */ - this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS ${tableName} CASCADE;").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "DROP TABLE IF EXISTS \"${tableName.lowercase()}\" CASCADE;").use { it.execute() } } catch (e: SQLException) { @@ -124,7 +124,7 @@ open class PgDescriptorInitializer(final override val field: Sch */ override fun isInitialized(): Boolean { try { - this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM $tableName").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "SELECT count(*) FROM \"${tableName.lowercase()}\"").use { it.execute() } } catch (e: SQLException) { @@ -138,7 +138,7 @@ open class PgDescriptorInitializer(final override val field: Sch */ override fun truncate() { try { - this.connection.jdbc.prepareStatement(/* sql = postgres */ "TRUNCATE $tableName").use { + this.connection.jdbc.prepareStatement(/* sql = postgres */ "TRUNCATE \"${tableName.lowercase()}\"").use { it.execute() } } catch (e: SQLException) { diff --git a/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json b/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json index 50f5d0f5a..e089edd92 100644 --- a/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json +++ b/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json @@ -5,7 +5,7 @@ "host": "127.0.0.1", "port": "5432", "username": "postgres", - "password": "vitrivr" + "password": "admin" } }, "fields": { From 71270340a57fb4869b746cac2f80f3fc9bb6210d Mon Sep 17 00:00:00 2001 From: Raphael Date: Fri, 16 Aug 2024 12:59:29 +0200 Subject: [PATCH 29/34] changes credentials for test db --- .../engine/database/pgvector/Constants.kt | 8 ++--- 
.../pgvector/descriptor/PgDescriptorWriter.kt | 18 +++++----- .../scalar/ScalarDescriptorReader.kt | 4 +-- .../struct/StructDescriptorReader.kt | 36 +++++++++---------- .../test/resources/test-schema-postgres.json | 2 +- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/Constants.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/Constants.kt index 735a41cde..07df55db5 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/Constants.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/Constants.kt @@ -4,7 +4,7 @@ package org.vitrivr.engine.database.pgvector const val RETRIEVABLE_ENTITY_NAME = "retrievable" /** The column name of a retrievable ID. */ -const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableId" +const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableid" /** The column name of a retrievable ID. */ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" @@ -13,10 +13,10 @@ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" const val RELATIONSHIP_ENTITY_NAME = "relationships" /** The column name of a retrievable ID. */ -const val SUBJECT_ID_COLUMN_NAME = "subjectId" +const val SUBJECT_ID_COLUMN_NAME = "subjectid" /** The column name of a retrievable ID. */ -const val OBJECT_ID_COLUMN_NAME = "objectId" +const val OBJECT_ID_COLUMN_NAME = "objectid" /** The column name of a retrievable ID. */ const val PREDICATE_COLUMN_NAME = "predicate" @@ -25,7 +25,7 @@ const val PREDICATE_COLUMN_NAME = "predicate" const val DESCRIPTOR_ENTITY_PREFIX = "descriptor" /** The column name of a descriptor ID. 
*/ -const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorId" +const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorid" /** The column name used to describe a distance.*/ const val DISTANCE_COLUMN_NAME = "distance" \ No newline at end of file diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt index 31ed62ea5..6eb1529a2 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/PgDescriptorWriter.kt @@ -43,7 +43,7 @@ open class PgDescriptorWriter(final override val field: Schema.F return stmt.executeUpdate() == 1 } } catch (e: SQLException) { - LOGGER.error(e) { "Failed to INSERT descriptor ${item.id} into '$tableName' due to SQL error." } + LOGGER.error(e) { "Failed to INSERT descriptor ${item.id} into \"${tableName.lowercase()}\" due to SQL error." } return false } } @@ -74,7 +74,7 @@ open class PgDescriptorWriter(final override val field: Schema.F return stmt.executeBatch().all { it == 1 } } } catch (e: SQLException) { - LOGGER.error(e) { "Failed to INSERT descriptors into '$tableName' due to SQL error." } + LOGGER.error(e) { "Failed to INSERT descriptors into \"${tableName.lowercase()}\" due to SQL error." } return false } } @@ -102,7 +102,7 @@ open class PgDescriptorWriter(final override val field: Schema.F return stmt.execute() } } catch (e: SQLException) { - LOGGER.error(e) { "Failed to UPDATE descriptors in '$tableName' due to SQL error." } + LOGGER.error(e) { "Failed to UPDATE descriptors in \"${tableName.lowercase()}\" due to SQL error." 
} return false } } @@ -115,7 +115,7 @@ open class PgDescriptorWriter(final override val field: Schema.F */ override fun delete(item: D): Boolean { try { - this.connection.jdbc.prepareStatement("DELETE FROM $tableName WHERE $DESCRIPTOR_ID_COLUMN_NAME = ?;").use { stmt -> + this.connection.jdbc.prepareStatement("DELETE FROM \"${tableName.lowercase()}\" WHERE $DESCRIPTOR_ID_COLUMN_NAME = ?;").use { stmt -> stmt.setObject(1, item.id) return stmt.executeUpdate() == 1 } @@ -133,7 +133,7 @@ open class PgDescriptorWriter(final override val field: Schema.F */ override fun deleteAll(items: Iterable): Boolean { try { - this.connection.jdbc.prepareStatement("DELETE FROM $tableName WHERE $DESCRIPTOR_ID_COLUMN_NAME = ANY (?);").use { stmt -> + this.connection.jdbc.prepareStatement("DELETE FROM \"${tableName.lowercase()}\" WHERE $DESCRIPTOR_ID_COLUMN_NAME = ANY (?);").use { stmt -> val values = items.map { it.id }.toTypedArray() stmt.setArray(1, this.connection.jdbc.createArrayOf("uuid", values)) return stmt.executeUpdate() > 0 @@ -151,9 +151,9 @@ open class PgDescriptorWriter(final override val field: Schema.F * @return [PreparedStatement] */ protected fun prepareUpdateStatement(): PreparedStatement { - val statement = StringBuilder("UPDATE $tableName SET $RETRIEVABLE_ID_COLUMN_NAME = ?") + val statement = StringBuilder("UPDATE \"${tableName.lowercase()}\" SET $RETRIEVABLE_ID_COLUMN_NAME = ?") for (field in this.prototype.layout()) { - statement.append(", ${field.name} = ?") + statement.append(", \"${field.name.lowercase()}\" = ?") } statement.append("WHERE $DESCRIPTOR_ID_COLUMN_NAME = ?;") return this.connection.jdbc.prepareStatement(statement.toString()) @@ -165,9 +165,9 @@ open class PgDescriptorWriter(final override val field: Schema.F * @return [PreparedStatement] */ protected fun prepareInsertStatement(): PreparedStatement { - val statement = StringBuilder("INSERT INTO $tableName ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") + val statement = 
StringBuilder("INSERT INTO \"${tableName.lowercase()}\" ($DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME") for (field in this.prototype.layout()) { - statement.append(", ${field.name}") + statement.append(", \"${field.name.lowercase()}\"") } statement.append(") VALUES (?, ?") for (field in this.field.analyser.prototype(this.field).layout()) { diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt index 0e701613f..a50964dbe 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/scalar/ScalarDescriptorReader.kt @@ -62,7 +62,7 @@ class ScalarDescriptorReader(field: Schema.Field<*, ScalarDescriptor<*>>, connec * @return [Sequence] of [ScalarDescriptor]s. */ private fun queryFulltext(query: SimpleFulltextQuery): Sequence> { - val statement = "SELECT * FROM $tableName WHERE $VALUE_ATTRIBUTE_NAME @@ plainto_tsquery(?)" + val statement = "SELECT * FROM \"${tableName.lowercase()}\" WHERE $VALUE_ATTRIBUTE_NAME @@ plainto_tsquery(?)" return sequence { this@ScalarDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setString(1, query.value.value) @@ -82,7 +82,7 @@ class ScalarDescriptorReader(field: Schema.Field<*, ScalarDescriptor<*>>, connec * @return [Sequence] of [ScalarDescriptor]s. */ private fun queryBoolean(query: SimpleBooleanQuery<*>): Sequence> { - val statement = "SELECT * FROM $tableName WHERE $VALUE_ATTRIBUTE_NAME ${query.comparison.toSql()} ?" + val statement = "SELECT * FROM \"${tableName.lowercase()}\" WHERE $VALUE_ATTRIBUTE_NAME ${query.comparison.toSql()} ?" 
return sequence { this@ScalarDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt index f15b444b1..522fec14f 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt @@ -55,22 +55,22 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio /* Append dynamic parameters of struct. */ for (field in this.prototype.layout()) { values[field.name] = when(field.type) { - Type.String -> result.getString(field.name)?.let { Value.String(it) } - Type.Text -> result.getString(field.name)?.let { Value.Text(it) } - Type.Boolean -> result.getBoolean(field.name).let { Value.Boolean(it) } - Type.Byte -> result.getByte(field.name).let { Value.Byte(it) } - Type.Short -> result.getShort(field.name).let { Value.Short(it) } - Type.Int -> result.getInt(field.name).let { Value.Int(it) } - Type.Long -> result.getLong(field.name).let { Value.Long(it) } - Type.Float -> result.getFloat(field.name).let { Value.Float(it) } - Type.Double -> result.getDouble(field.name).let { Value.Double(it) } - Type.Datetime -> result.getDate(field.name).toInstant().let { Value.DateTime(Date(it.toEpochMilli())) } - Type.UUID -> result.getObject(field.name, UUID::class.java).let { Value.UUIDValue(it) } - is Type.BooleanVector -> result.getObject(field.name, PgBitVector::class.java).toBooleanVector() - is Type.IntVector -> result.getObject(field.name, PgVector::class.java)?.toIntVector() - is Type.LongVector -> result.getObject(field.name, 
PgVector::class.java)?.toLongVector() - is Type.FloatVector -> result.getObject(field.name, PgVector::class.java)?.toFloatVector() - is Type.DoubleVector -> result.getObject(field.name, PgVector::class.java)?.toDoubleVector() + Type.String -> result.getString(field.name.lowercase())?.let { Value.String(it) } + Type.Text -> result.getString(field.name.lowercase())?.let { Value.Text(it) } + Type.Boolean -> result.getBoolean(field.name.lowercase()).let { Value.Boolean(it) } + Type.Byte -> result.getByte(field.name.lowercase()).let { Value.Byte(it) } + Type.Short -> result.getShort(field.name.lowercase()).let { Value.Short(it) } + Type.Int -> result.getInt(field.name.lowercase()).let { Value.Int(it) } + Type.Long -> result.getLong(field.name.lowercase()).let { Value.Long(it) } + Type.Float -> result.getFloat(field.name.lowercase()).let { Value.Float(it) } + Type.Double -> result.getDouble(field.name.lowercase()).let { Value.Double(it) } + Type.Datetime -> result.getDate(field.name.lowercase()).toInstant().let { Value.DateTime(Date(it.toEpochMilli())) } + Type.UUID -> result.getObject(field.name.lowercase(), UUID::class.java).let { Value.UUIDValue(it) } + is Type.BooleanVector -> result.getObject(field.name.lowercase(), PgBitVector::class.java).toBooleanVector() + is Type.IntVector -> result.getObject(field.name.lowercase(), PgVector::class.java)?.toIntVector() + is Type.LongVector -> result.getObject(field.name.lowercase(), PgVector::class.java)?.toLongVector() + is Type.FloatVector -> result.getObject(field.name.lowercase(), PgVector::class.java)?.toFloatVector() + is Type.DoubleVector -> result.getObject(field.name.lowercase(), PgVector::class.java)?.toDoubleVector() } as Value<*>? } @@ -86,7 +86,7 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio */ private fun queryFulltext(query: SimpleFulltextQuery): Sequence { require(query.attributeName != null) { "Query attribute must not be null for a fulltext query on a struct descriptor." 
} - val statement = "SELECT * FROM $tableName WHERE ${query.attributeName} @@ plainto_tsquery(?)" + val statement = "SELECT * FROM \"${tableName.lowercase()}\" WHERE ${query.attributeName} @@ plainto_tsquery(?)" return sequence { this@StructDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setString(1, query.value.value) @@ -107,7 +107,7 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio */ private fun queryBoolean(query: SimpleBooleanQuery<*>): Sequence { require(query.attributeName != null) { "Query attribute must not be null for a fulltext query on a struct descriptor." } - val statement = "SELECT * FROM $tableName WHERE ${query.attributeName} ${query.comparison.toSql()} ?" + val statement = "SELECT * FROM field.name.lowercase() WHERE ${query.attributeName} ${query.comparison.toSql()} ?" return sequence { this@StructDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) diff --git a/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json b/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json index e089edd92..50f5d0f5a 100644 --- a/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json +++ b/vitrivr-engine-module-pgvector/src/test/resources/test-schema-postgres.json @@ -5,7 +5,7 @@ "host": "127.0.0.1", "port": "5432", "username": "postgres", - "password": "admin" + "password": "vitrivr" } }, "fields": { From 7b79384d7c73e9ce2a22e788357128509af58825 Mon Sep 17 00:00:00 2001 From: Raphael Date: Fri, 16 Aug 2024 13:08:12 +0200 Subject: [PATCH 30/34] maintains lc pg naming convention --- .../org/vitrivr/engine/plugin/cottontaildb/Common.kt | 8 ++++---- .../org/vitrivr/engine/database/jsonl/JsonlConnection.kt | 4 ++-- .../pgvector/descriptor/vector/VectorDescriptorReader.kt | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt index 911a377cc..5e3d224df 100644 --- a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt +++ b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt @@ -16,7 +16,7 @@ import java.util.* const val RETRIEVABLE_ENTITY_NAME = "retrievable" /** The column name of a retrievable ID. */ -const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableId" +const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableid" /** The column name of a retrievable ID. */ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" @@ -25,10 +25,10 @@ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" const val RELATIONSHIP_ENTITY_NAME = "relationships" /** The column name of a retrievable ID. */ -const val SUBJECT_ID_COLUMN_NAME = "subjectId" +const val SUBJECT_ID_COLUMN_NAME = "subjectid" /** The column name of a retrievable ID. */ -const val OBJECT_ID_COLUMN_NAME = "objectId" +const val OBJECT_ID_COLUMN_NAME = "objectid" /** The column name of a retrievable ID. */ const val PREDICATE_COLUMN_NAME = "predicate" @@ -37,7 +37,7 @@ const val PREDICATE_COLUMN_NAME = "predicate" const val DESCRIPTOR_ENTITY_PREFIX = "descriptor" /** The column name of a descriptor ID. 
*/ -const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorId" +const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorid" /** The column name used to describe a distance.*/ const val DISTANCE_COLUMN_NAME = "distance" diff --git a/vitrivr-engine-module-jsonl/src/main/kotlin/org/vitrivr/engine/database/jsonl/JsonlConnection.kt b/vitrivr-engine-module-jsonl/src/main/kotlin/org/vitrivr/engine/database/jsonl/JsonlConnection.kt index 8cf018bd6..eb5b42fe6 100644 --- a/vitrivr-engine-module-jsonl/src/main/kotlin/org/vitrivr/engine/database/jsonl/JsonlConnection.kt +++ b/vitrivr-engine-module-jsonl/src/main/kotlin/org/vitrivr/engine/database/jsonl/JsonlConnection.kt @@ -48,10 +48,10 @@ class JsonlConnection( companion object { /** The column name of a retrievable ID. */ - const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableId" + const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableid" /** The column name of a descriptor ID. */ - const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorId" + const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorid" } } \ No newline at end of file diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt index f74a60c41..5176b5d3f 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/vector/VectorDescriptorReader.kt @@ -97,7 +97,7 @@ class VectorDescriptorReader(field: Schema.Field<*, VectorDescriptor<*>>, connec */ private fun queryProximity(query: ProximityQuery<*>): Sequence> = sequence { val statement = - "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? 
AS $DISTANCE_COLUMN_NAME FROM ${tableName} ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" + "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? AS $DISTANCE_COLUMN_NAME FROM \"${tableName.lowercase()}\" ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" this@VectorDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) stmt.executeQuery().use { result -> @@ -117,7 +117,7 @@ class VectorDescriptorReader(field: Schema.Field<*, VectorDescriptor<*>>, connec private fun queryAndJoinProximity(query: ProximityQuery<*>): Sequence { val descriptors = mutableListOf, Float>>() val statement = - "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? AS $DISTANCE_COLUMN_NAME FROM ${tableName} ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" + "SELECT $DESCRIPTOR_ID_COLUMN_NAME, $RETRIEVABLE_ID_COLUMN_NAME, $VECTOR_ATTRIBUTE_NAME, $VECTOR_ATTRIBUTE_NAME ${query.distance.toSql()} ? 
AS $DISTANCE_COLUMN_NAME FROM \"${tableName.lowercase()}\" ORDER BY $DISTANCE_COLUMN_NAME ${query.order} LIMIT ${query.k}" this@VectorDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) stmt.executeQuery().use { result -> From 29b105e2b1224412e7959b74c47e88c09420e133 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 19 Aug 2024 13:29:41 +0200 Subject: [PATCH 31/34] bugfix --- .../pgvector/descriptor/struct/StructDescriptorReader.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt index 522fec14f..d0f5bc8e3 100644 --- a/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt +++ b/vitrivr-engine-module-pgvector/src/main/kotlin/org/vitrivr/engine/database/pgvector/descriptor/struct/StructDescriptorReader.kt @@ -107,7 +107,7 @@ class StructDescriptorReader(field: Schema.Field<*, StructDescriptor>, connectio */ private fun queryBoolean(query: SimpleBooleanQuery<*>): Sequence { require(query.attributeName != null) { "Query attribute must not be null for a fulltext query on a struct descriptor." } - val statement = "SELECT * FROM field.name.lowercase() WHERE ${query.attributeName} ${query.comparison.toSql()} ?" + val statement = "SELECT * FROM \"${tableName.lowercase()}\" WHERE ${query.attributeName} ${query.comparison.toSql()} ?" 
return sequence { this@StructDescriptorReader.connection.jdbc.prepareStatement(statement).use { stmt -> stmt.setValue(1, query.value) From bf8b78faa4b19febbefb34e8f9f0521d0c0f03d9 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 19 Aug 2024 14:06:40 +0200 Subject: [PATCH 32/34] bugfix --- .../org/vitrivr/engine/plugin/cottontaildb/Common.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt index 5e3d224df..911a377cc 100644 --- a/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt +++ b/vitrivr-engine-module-cottontaildb/src/main/kotlin/org/vitrivr/engine/plugin/cottontaildb/Common.kt @@ -16,7 +16,7 @@ import java.util.* const val RETRIEVABLE_ENTITY_NAME = "retrievable" /** The column name of a retrievable ID. */ -const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableid" +const val RETRIEVABLE_ID_COLUMN_NAME = "retrievableId" /** The column name of a retrievable ID. */ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" @@ -25,10 +25,10 @@ const val RETRIEVABLE_TYPE_COLUMN_NAME = "type" const val RELATIONSHIP_ENTITY_NAME = "relationships" /** The column name of a retrievable ID. */ -const val SUBJECT_ID_COLUMN_NAME = "subjectid" +const val SUBJECT_ID_COLUMN_NAME = "subjectId" /** The column name of a retrievable ID. */ -const val OBJECT_ID_COLUMN_NAME = "objectid" +const val OBJECT_ID_COLUMN_NAME = "objectId" /** The column name of a retrievable ID. */ const val PREDICATE_COLUMN_NAME = "predicate" @@ -37,7 +37,7 @@ const val PREDICATE_COLUMN_NAME = "predicate" const val DESCRIPTOR_ENTITY_PREFIX = "descriptor" /** The column name of a descriptor ID. 
*/ -const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorid" +const val DESCRIPTOR_ID_COLUMN_NAME = "descriptorId" /** The column name used to describe a distance.*/ const val DISTANCE_COLUMN_NAME = "distance" From 5fecf897998cc4be757bedeecbd6374bb831ac34 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 19 Aug 2024 16:20:43 +0200 Subject: [PATCH 33/34] debug CI --- .../engine/base/features/external/common/FesExtractor.kt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index 3647ab585..6ebc4d0b6 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -33,6 +33,8 @@ abstract class FesExtractor, D : Descriptor<*>>( ) : AbstractExtractor(input, analyser, field) { /** Host of the FES API. 
*/ + private val contentSources = parameters[CONTENT_AUTHORS_KEY]?.split(",")?.toSet() + protected val host: String get() = this.parameters[HOST_PARAMETER_NAME] ?: HOST_PARAMETER_DEFAULT From 3f0076fe383b84728de05878736ea4c9f65a2a60 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 19 Aug 2024 16:46:45 +0200 Subject: [PATCH 34/34] adds change due immutable retrievable --- .../engine/core/features/AbstractBatchedExtractor.kt | 3 ++- .../engine/core/model/descriptor/scalar/TextDescriptor.kt | 1 + .../engine/base/features/external/common/FesExtractor.kt | 3 +-- .../features/external/implementations/asr/ASRExtractor.kt | 2 +- .../implementations/caption/ImageCaptionExtractor.kt | 2 +- .../implementations/dense/DenseEmbeddingExtractor.kt | 8 ++++---- .../features/external/implementations/ocr/OCRExtractor.kt | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt index 31e7a38b1..6056a2ef4 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/features/AbstractBatchedExtractor.kt @@ -19,10 +19,11 @@ import org.vitrivr.engine.core.operators.ingest.Extractor * @author Ralph Gasser * @version 1.0.0 */ -abstract class AbstractBatchedExtractor, D : Descriptor<*>>(final override val input: Operator, final override val field: Schema.Field?, private val bufferSize: Int = 100) : Extractor { +abstract class AbstractBatchedExtractor, D : Descriptor<*>>(final override val input: Operator, final override val analyser: Analyser, final override val field: Schema.Field?, private val bufferSize: Int = 100) : Extractor { private val logger: KLogger = KotlinLogging.logger {} + init { require(field == null || this.field.analyser == this.analyser) { "Field and 
analyser do not match! This is a programmer's error!" } } diff --git a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/descriptor/scalar/TextDescriptor.kt b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/descriptor/scalar/TextDescriptor.kt index 49d7f23f6..0131da118 100644 --- a/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/descriptor/scalar/TextDescriptor.kt +++ b/vitrivr-engine-core/src/main/kotlin/org/vitrivr/engine/core/model/descriptor/scalar/TextDescriptor.kt @@ -26,6 +26,7 @@ data class TextDescriptor( companion object { private val SCHEMA = listOf(Attribute(VALUE_ATTRIBUTE_NAME, Type.Text)) } + /** * Returns the [Attribute] [List ]of this [StringDescriptor]. diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt index 6ebc4d0b6..879e80fb6 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/common/FesExtractor.kt @@ -30,8 +30,7 @@ abstract class FesExtractor, D : Descriptor<*>>( field: Schema.Field?, analyser: ExternalFesAnalyser, protected val parameters: Map, -) : AbstractExtractor(input, analyser, field) { - /** Host of the FES API. 
*/ +) : AbstractBatchedExtractor(input, analyser, field, parameters["batchSize"]?.toIntOrNull() ?: 1) { private val contentSources = parameters[CONTENT_AUTHORS_KEY]?.split(",")?.toSet() diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt index ef1ee891a..07e2d3555 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/asr/ASRExtractor.kt @@ -40,7 +40,7 @@ class ASRExtractor( return retrievables.map { retrievable -> this.filterContent(retrievable).map { - flatResults[index++].also { it.retrievableId = retrievable.id } + flatResults[index++].also { TextDescriptor(it.id, retrievable.id, it.value, it.field) } } } } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt index 8a130e283..8b62b41f9 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/caption/ImageCaptionExtractor.kt @@ -91,7 +91,7 @@ class ImageCaptionExtractor( if (it !is ImageContent) { null } else{ - flatResults[index++].also { it.retrievableId = retrievable.id } + flatResults[index++].also { TextDescriptor(it.id, retrievable.id, it.value, it.field) } } }.filterNotNull() } diff --git 
a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt index 5594d5e47..06740c068 100644 --- a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/dense/DenseEmbeddingExtractor.kt @@ -9,6 +9,7 @@ import org.vitrivr.engine.core.model.content.element.ContentElement import org.vitrivr.engine.core.model.content.element.ImageContent import org.vitrivr.engine.core.model.content.element.TextContent import org.vitrivr.engine.core.model.descriptor.Descriptor +import org.vitrivr.engine.core.model.descriptor.scalar.TextDescriptor import org.vitrivr.engine.core.model.descriptor.vector.FloatVectorDescriptor import org.vitrivr.engine.core.model.metamodel.Schema import org.vitrivr.engine.core.model.retrievable.Retrievable @@ -65,12 +66,11 @@ class DenseEmbeddingExtractor( return retrievables.indices.map { index -> val descriptors = mutableListOf() textResultMap[index]?.let { - it.retrievableId = retrievables[index].id - descriptors.add(it) + + descriptors.add(FloatVectorDescriptor(it.id, retrievables[index].id, it.vector, it.field)) } imageResultMap[index]?.let { - it.retrievableId = retrievables[index].id - descriptors.add(it) + descriptors.add(FloatVectorDescriptor(it.id, retrievables[index].id, it.vector, it.field)) } descriptors } diff --git a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt index 77a2acee2..22f368e4f 100644 --- 
a/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt +++ b/vitrivr-engine-module-fes/src/main/kotlin/org/vitrivr/engine/base/features/external/implementations/ocr/OCRExtractor.kt @@ -42,7 +42,7 @@ class OCRExtractor( return retrievables.map { retrievable -> this.filterContent(retrievable).map { - flatResults[index++].also { it.retrievableId = retrievable.id } + flatResults[index++].also { TextDescriptor(it.id, retrievable.id, it.value, it.field) } } } }