diff --git a/docs/reference/query-dsl/semantic-query.asciidoc b/docs/reference/query-dsl/semantic-query.asciidoc index 11e19d6356081..914f4429f7f9c 100644 --- a/docs/reference/query-dsl/semantic-query.asciidoc +++ b/docs/reference/query-dsl/semantic-query.asciidoc @@ -117,79 +117,3 @@ GET my-index/_search } ------------------------------------------------------------ // TEST[skip: Requires inference endpoints] - - -[discrete] -[[advanced-search]] -==== Advanced search on `semantic_text` fields - -The `semantic` query uses default settings for searching on `semantic_text` fields for ease of use. -If you want to fine-tune a search on a `semantic_text` field, you need to know the task type used by the `inference_id` configured in `semantic_text`. -You can find the task type using the <>, and check the `task_type` associated with the {infer} service. -Depending on the `task_type`, use either the <> or the <> query for greater flexibility and customization. - -NOTE: While it is possible to use the `sparse_vector` query or the `knn` query -on a `semantic_text` field, it is not supported to use the `semantic_query` on a -`sparse_vector` or `dense_vector` field type. - - -[discrete] -[[search-sparse-inference]] -===== Search with `sparse_embedding` inference - -When the {infer} endpoint uses a `sparse_embedding` model, you can use a <> on a <> field in the following way: - -[source,console] ------------------------------------------------------------- -GET test-index/_search -{ - "query": { - "nested": { - "path": "inference_field.inference.chunks", - "query": { - "sparse_vector": { - "field": "inference_field.inference.chunks.embeddings", - "inference_id": "my-inference-id", - "query": "mountain lake" - } - } - } - } -} ------------------------------------------------------------- -// TEST[skip: Requires inference endpoints] - -You can customize the `sparse_vector` query to include specific settings, like <>. - - -[discrete] -[[search-text-inferece]] -===== Search with `text_embedding` inference - -When the {infer} endpoint uses a `text_embedding` model, you can use a <> on a `semantic_text` field in the following way: - -[source,console] ------------------------------------------------------------- -GET test-index/_search -{ - "query": { - "nested": { - "path": "inference_field.inference.chunks", - "query": { - "knn": { - "field": "inference_field.inference.chunks.embeddings", - "query_vector_builder": { - "text_embedding": { - "model_id": "my_inference_id", - "model_text": "mountain lake" - } - } - } - } - } - } -} ------------------------------------------------------------- -// TEST[skip: Requires inference endpoints] - -You can customize the `knn` query to include specific settings, like `num_candidates` and `k`. diff --git a/docs/reference/search/search-your-data/semantic-search-semantic-text.asciidoc b/docs/reference/search/search-your-data/semantic-search-semantic-text.asciidoc index ce69988388f6e..50a9da4af2fba 100644 --- a/docs/reference/search/search-your-data/semantic-search-semantic-text.asciidoc +++ b/docs/reference/search/search-your-data/semantic-search-semantic-text.asciidoc @@ -151,89 +151,7 @@ GET semantic-embeddings/_search <2> The query text. 
As a result, you receive the top 10 documents that are closest in meaning to the -query from the `semantic-embedding` index: - -[source,console-result] ------------------------------------------------------------- -"hits": [ - { - "_index": "semantic-embeddings", - "_id": "Jy5065EBBFPLbFsdh_f9", - "_score": 21.487484, - "_source": { - "id": 8836652, - "content": { - "text": "There are a few foods and food groups that will help to fight inflammation and delayed onset muscle soreness (both things that are inevitable after a long, hard workout) when you incorporate them into your postworkout eats, whether immediately after your run or at a meal later in the day. Advertisement. Advertisement.", - "inference": { - "inference_id": "my-elser-endpoint", - "model_settings": { - "task_type": "sparse_embedding" - }, - "chunks": [ - { - "text": "There are a few foods and food groups that will help to fight inflammation and delayed onset muscle soreness (both things that are inevitable after a long, hard workout) when you incorporate them into your postworkout eats, whether immediately after your run or at a meal later in the day. Advertisement. Advertisement.", - "embeddings": { - (...) - } - } - ] - } - } - } - }, - { - "_index": "semantic-embeddings", - "_id": "Ji5065EBBFPLbFsdh_f9", - "_score": 18.211695, - "_source": { - "id": 8836651, - "content": { - "text": "During Your Workout. There are a few things you can do during your workout to help prevent muscle injury and soreness. According to personal trainer and writer for Iron Magazine, Marc David, doing warm-ups and cool-downs between sets can help keep muscle soreness to a minimum.", - "inference": { - "inference_id": "my-elser-endpoint", - "model_settings": { - "task_type": "sparse_embedding" - }, - "chunks": [ - { - "text": "During Your Workout. There are a few things you can do during your workout to help prevent muscle injury and soreness. According to personal trainer and writer for Iron Magazine, Marc David, doing warm-ups and cool-downs between sets can help keep muscle soreness to a minimum.", - "embeddings": { - (...) - } - } - ] - } - } - } - }, - { - "_index": "semantic-embeddings", - "_id": "Wi5065EBBFPLbFsdh_b9", - "_score": 13.089405, - "_source": { - "id": 8800197, - "content": { - "text": "This is especially important if the soreness is due to a weightlifting routine. For this time period, do not exert more than around 50% of the level of effort (weight, distance and speed) that caused the muscle groups to be sore.", - "inference": { - "inference_id": "my-elser-endpoint", - "model_settings": { - "task_type": "sparse_embedding" - }, - "chunks": [ - { - "text": "This is especially important if the soreness is due to a weightlifting routine. For this time period, do not exert more than around 50% of the level of effort (weight, distance and speed) that caused the muscle groups to be sore.", - "embeddings": { - (...) - } - } - ] - } - } - } - } -] ------------------------------------------------------------- -// NOTCONSOLE +query from the `semantic-embedding` index. 
[discrete] [[semantic-text-further-examples]] diff --git a/muted-tests.yml b/muted-tests.yml index 685ce6e389d49..8d91d03f0ea30 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -302,6 +302,9 @@ tests: - class: org.elasticsearch.xpack.inference.InferenceRestIT method: test {p0=inference/30_semantic_text_inference/Calculates embeddings using the default ELSER 2 endpoint} issue: https://github.com/elastic/elasticsearch/issues/117349 +- class: org.elasticsearch.xpack.inference.InferenceRestIT + method: test {p0=inference/30_semantic_text_inference_bwc/Calculates embeddings using the default ELSER 2 endpoint} + issue: https://github.com/elastic/elasticsearch/issues/117349 - class: org.elasticsearch.search.basic.SearchWithRandomDisconnectsIT method: testSearchWithRandomDisconnects issue: https://github.com/elastic/elasticsearch/issues/116175 diff --git a/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java b/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java index 74143cc5c059b..89cee714a9ff2 100644 --- a/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java +++ b/server/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java @@ -48,9 +48,11 @@ import org.elasticsearch.index.engine.VersionConflictEngineException; import org.elasticsearch.index.get.GetResult; import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.MapperException; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MappingLookup; +import org.elasticsearch.index.mapper.RoutingFieldMapper; import org.elasticsearch.index.mapper.SourceToParse; import org.elasticsearch.index.seqno.SequenceNumbers; import org.elasticsearch.index.shard.IndexShard; @@ -326,7 +328,8 @@ static boolean executeBulkItemRequest( if (opType == DocWriteRequest.OpType.UPDATE) { final UpdateRequest updateRequest = (UpdateRequest) context.getCurrent(); try { - updateResult = updateHelper.prepare(updateRequest, context.getPrimary(), nowInMillisSupplier); + var gFields = getStoredFieldsSpec(context.getPrimary()); + updateResult = updateHelper.prepare(updateRequest, context.getPrimary(), nowInMillisSupplier, gFields); } catch (Exception failure) { // we may fail translating a update to index or delete operation // we use index result to communicate failure while translating update request @@ -401,6 +404,16 @@ static boolean executeBulkItemRequest( return true; } + private static String[] getStoredFieldsSpec(IndexShard indexShard) { + if (InferenceMetadataFieldsMapper.isEnabled(indexShard.mapperService().mappingLookup())) { + if (indexShard.mapperService().mappingLookup().inferenceFields().size() > 0) { + // Retrieves the inference metadata field containing the inference results for all semantic fields defined in the mapping. 
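+                // e.g. with a semantic_text field in the mapping this loads ["_routing", "_inference_fields"],
+                // so the update path can reuse the stored inference results instead of recomputing them (illustrative note).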
+ return new String[] { RoutingFieldMapper.NAME, InferenceMetadataFieldsMapper.NAME }; + } + } + return new String[] { RoutingFieldMapper.NAME }; + } + private static boolean handleMappingUpdateRequired( BulkPrimaryExecutionContext context, MappingUpdatePerformer mappingUpdater, diff --git a/server/src/main/java/org/elasticsearch/action/update/TransportUpdateAction.java b/server/src/main/java/org/elasticsearch/action/update/TransportUpdateAction.java index 0749512635f83..ee84bcd15824d 100644 --- a/server/src/main/java/org/elasticsearch/action/update/TransportUpdateAction.java +++ b/server/src/main/java/org/elasticsearch/action/update/TransportUpdateAction.java @@ -44,6 +44,7 @@ import org.elasticsearch.index.IndexService; import org.elasticsearch.index.engine.VersionConflictEngineException; import org.elasticsearch.index.mapper.InferenceFieldMapper; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MappingLookup; import org.elasticsearch.index.shard.IndexShard; @@ -374,7 +375,7 @@ private static UpdateHelper.Result deleteInferenceResults( IndexMetadata indexMetadata, MappingLookup mappingLookup ) { - if (result.getResponseResult() != DocWriteResponse.Result.UPDATED) { + if (result.getResponseResult() != DocWriteResponse.Result.UPDATED || InferenceMetadataFieldsMapper.isEnabled(mappingLookup)) { return result; } @@ -403,7 +404,7 @@ private static UpdateHelper.Result deleteInferenceResults( String inferenceFieldName = entry.getKey(); Mapper mapper = mappingLookup.getMapper(inferenceFieldName); - if (mapper instanceof InferenceFieldMapper inferenceFieldMapper) { + if (mapper instanceof InferenceFieldMapper) { String[] sourceFields = entry.getValue().getSourceFields(); for (String sourceField : sourceFields) { if (sourceField.equals(inferenceFieldName) == false @@ -412,7 +413,7 @@ private static UpdateHelper.Result deleteInferenceResults( // This has two important side effects: // - The inference field value will remain parsable by its mapper // - The inference results will be removed, forcing them to be re-generated downstream - updatedSource.put(inferenceFieldName, inferenceFieldMapper.getOriginalValue(updatedSource)); + updatedSource.put(inferenceFieldName, getOriginalValueLegacy(inferenceFieldName, updatedSource)); updatedSourceModified = true; break; } @@ -435,4 +436,24 @@ private static UpdateHelper.Result deleteInferenceResults( return returnedResult; } + + /** + * Get the field's original value (i.e. the value the user specified) from the provided source. 
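+     * For example, given a legacy-format source entry such as {@code "field": {"text": "mountain lake", "inference": {...}}},
+     * this returns {@code "mountain lake"} (illustrative value).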
+ * + * @param sourceAsMap The source as a map + * @return The field's original value, or {@code null} if none was provided + */ + private static Object getOriginalValueLegacy(String fullPath, Map sourceAsMap) { + // TODO: Fix bug here when semantic text field is in an object + Object fieldValue = sourceAsMap.get(fullPath); + if (fieldValue == null) { + return null; + } else if (fieldValue instanceof Map == false) { + // Don't try to further validate the non-map value, that will be handled when the source is fully parsed + return fieldValue; + } + + Map fieldValueMap = XContentMapValues.nodeMapValue(fieldValue, "Field [" + fullPath + "]"); + return XContentMapValues.extractValue("text", fieldValueMap); + } } diff --git a/server/src/main/java/org/elasticsearch/action/update/UpdateHelper.java b/server/src/main/java/org/elasticsearch/action/update/UpdateHelper.java index d28d196c42f2e..aff1db629ab6d 100644 --- a/server/src/main/java/org/elasticsearch/action/update/UpdateHelper.java +++ b/server/src/main/java/org/elasticsearch/action/update/UpdateHelper.java @@ -63,7 +63,15 @@ public UpdateHelper(ScriptService scriptService, DocumentParsingProvider documen * Prepares an update request by converting it into an index or delete request or an update response (no action). */ public Result prepare(UpdateRequest request, IndexShard indexShard, LongSupplier nowInMillis) throws IOException { - final GetResult getResult = indexShard.getService().getForUpdate(request.id(), request.ifSeqNo(), request.ifPrimaryTerm()); + // TODO: Don't hard-code gFields + return prepare(request, indexShard, nowInMillis, new String[] { RoutingFieldMapper.NAME }); + } + + /** + * Prepares an update request by converting it into an index or delete request or an update response (no action). 
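+     * @param gFields The stored fields to load from the existing document, e.g. {@code _routing} and, when the new
+     *                semantic text format is enabled, {@code _inference_fields}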
+ */ + public Result prepare(UpdateRequest request, IndexShard indexShard, LongSupplier nowInMillis, String[] gFields) throws IOException { + final GetResult getResult = indexShard.getService().getForUpdate(request.id(), request.ifSeqNo(), request.ifPrimaryTerm(), gFields); return prepare(indexShard, request, getResult, nowInMillis); } diff --git a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java index 029cf3aa75119..18cd73bb8881d 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java @@ -34,6 +34,7 @@ import org.elasticsearch.index.fielddata.IndexFieldDataService; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.IgnoredSourceFieldMapper; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.SourceFieldMapper; import org.elasticsearch.index.similarity.SimilarityService; @@ -191,6 +192,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings { IgnoredSourceFieldMapper.SKIP_IGNORED_SOURCE_READ_SETTING, SourceFieldMapper.INDEX_MAPPER_SOURCE_MODE_SETTING, IndexSettings.RECOVERY_USE_SYNTHETIC_SOURCE_SETTING, + InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT, // validate that built-in similarities don't get redefined Setting.groupSetting("index.similarity.", (s) -> { diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 354c712c4f45a..6c03d6fbca73f 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -124,6 +124,7 @@ private static IndexVersion def(int id, Version luceneVersion) { public static final IndexVersion DEPRECATE_SOURCE_MODE_MAPPER = def(8_521_00_0, Version.LUCENE_9_12_0); public static final IndexVersion USE_SYNTHETIC_SOURCE_FOR_RECOVERY_BACKPORT = def(8_522_00_0, Version.LUCENE_9_12_0); public static final IndexVersion UPGRADE_TO_LUCENE_9_12_1 = def(8_523_00_0, Version.LUCENE_9_12_1); + public static final IndexVersion INFERENCE_METADATA_FIELDS_BACKPORT = def(8_524_00_0, Version.LUCENE_9_12_1); /* * STOP! READ THIS FIRST! 
No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ diff --git a/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java b/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java index c7acd730fadb5..3c2ac53a005d2 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/index/engine/TranslogDirectoryReader.java @@ -441,7 +441,7 @@ private void readStoredFieldsDirectly(StoredFieldVisitor visitor) throws IOExcep SourceFieldMapper mapper = mappingLookup.getMapping().getMetadataMapperByClass(SourceFieldMapper.class); if (mapper != null) { try { - sourceBytes = mapper.applyFilters(sourceBytes, null); + sourceBytes = mapper.applyFilters(mappingLookup, sourceBytes, null); } catch (IOException e) { throw new IOException("Failed to reapply filters after reading from translog", e); } diff --git a/server/src/main/java/org/elasticsearch/index/get/ShardGetService.java b/server/src/main/java/org/elasticsearch/index/get/ShardGetService.java index 43b5d2c7d3f78..9ed9fc6dabf3d 100644 --- a/server/src/main/java/org/elasticsearch/index/get/ShardGetService.java +++ b/server/src/main/java/org/elasticsearch/index/get/ShardGetService.java @@ -9,7 +9,9 @@ package org.elasticsearch.index.get; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.IndexSearcher; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.document.DocumentField; @@ -26,6 +28,7 @@ import org.elasticsearch.index.fieldvisitor.LeafStoredFieldLoader; import org.elasticsearch.index.fieldvisitor.StoredFieldLoader; import org.elasticsearch.index.mapper.IgnoredFieldMapper; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperMetrics; @@ -39,6 +42,7 @@ import org.elasticsearch.index.shard.MultiEngineGet; import org.elasticsearch.search.fetch.subphase.FetchSourceContext; import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.search.lookup.SourceFilter; import java.io.IOException; import java.util.ArrayList; @@ -190,9 +194,13 @@ public GetResult getFromTranslog( } public GetResult getForUpdate(String id, long ifSeqNo, long ifPrimaryTerm) throws IOException { + return getForUpdate(id, ifSeqNo, ifPrimaryTerm, new String[] { RoutingFieldMapper.NAME }); + } + + public GetResult getForUpdate(String id, long ifSeqNo, long ifPrimaryTerm, String[] gFields) throws IOException { return get( id, - new String[] { RoutingFieldMapper.NAME }, + gFields, true, Versions.MATCH_ANY, VersionType.INTERNAL, @@ -288,11 +296,18 @@ private GetResult innerGetFetch( boolean forceSyntheticSource ) throws IOException { assert get.exists() : "method should only be called if document could be retrieved"; - // check first if stored fields to be loaded don't contain an object field MappingLookup mappingLookup = mapperService.mappingLookup(); + final IndexVersion indexVersion = indexSettings.getIndexVersionCreated(); + final Set storedFieldSet = new HashSet<>(); + boolean hasInferenceMetadataFields = false; if (storedFields != null) { for (String field : storedFields) { + if (field.equals(InferenceMetadataFieldsMapper.NAME) + && 
InferenceMetadataFieldsMapper.isEnabled(indexShard.mapperService().mappingLookup())) { + hasInferenceMetadataFields = true; + continue; + } Mapper fieldMapper = mappingLookup.getMapper(field); if (fieldMapper == null) { if (mappingLookup.objectMappers().get(field) != null) { @@ -300,6 +315,7 @@ private GetResult innerGetFetch( throw new IllegalArgumentException("field [" + field + "] isn't a leaf field"); } } + storedFieldSet.add(field); } } @@ -313,8 +329,8 @@ private GetResult innerGetFetch( () -> mappingLookup.getMapping().syntheticFieldLoader(sourceFilter), mapperMetrics.sourceFieldMetrics() ) - : mappingLookup.newSourceLoader(fetchSourceContext.filter(), mapperMetrics.sourceFieldMetrics()); - StoredFieldLoader storedFieldLoader = buildStoredFieldLoader(storedFields, fetchSourceContext, loader); + : mappingLookup.newSourceLoader(sourceFilter, mapperMetrics.sourceFieldMetrics()); + StoredFieldLoader storedFieldLoader = buildStoredFieldLoader(storedFieldSet, fetchSourceContext, loader); LeafStoredFieldLoader leafStoredFieldLoader = storedFieldLoader.getLoader(docIdAndVersion.reader.getContext(), null); try { leafStoredFieldLoader.advanceTo(docIdAndVersion.docId); @@ -323,7 +339,6 @@ private GetResult innerGetFetch( } // put stored fields into result objects - final IndexVersion indexVersion = indexSettings.getIndexVersionCreated(); if (leafStoredFieldLoader.storedFields().isEmpty() == false) { Set needed = new HashSet<>(); if (storedFields != null) { @@ -372,6 +387,19 @@ private GetResult innerGetFetch( if (mapperService.mappingLookup().isSourceEnabled() && fetchSourceContext.fetchSource()) { Source source = loader.leaf(docIdAndVersion.reader, new int[] { docIdAndVersion.docId }) .source(leafStoredFieldLoader, docIdAndVersion.docId); + + SourceFilter filter = fetchSourceContext.filter(); + if (filter != null) { + source = source.filter(filter); + } + + if (hasInferenceMetadataFields) { + /** + * Adds the {@link InferenceMetadataFieldsMapper#NAME} field from the document fields + * to the original _source if it has been requested. 
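+                 * For example, the returned _source map then carries a top-level {@code "_inference_fields"} entry
+                 * alongside the document's own fields (illustrative shape).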
+ */ + source = addInferenceMetadataFields(mapperService, docIdAndVersion.reader.getContext(), docIdAndVersion.docId, source); + } sourceBytes = source.internalSourceRef(); } @@ -404,18 +432,38 @@ private static DocumentField loadIgnoredMetadataField(final DocIdAndVersion docI return new DocumentField(IgnoredFieldMapper.NAME, ignoredValues); } - private static StoredFieldLoader buildStoredFieldLoader(String[] fields, FetchSourceContext fetchSourceContext, SourceLoader loader) { - Set fieldsToLoad = new HashSet<>(); - if (fields != null && fields.length > 0) { - Collections.addAll(fieldsToLoad, fields); + private static Source addInferenceMetadataFields(MapperService mapperService, LeafReaderContext readerContext, int docId, Source source) + throws IOException { + var mappingLookup = mapperService.mappingLookup(); + var inferenceMetadata = (InferenceMetadataFieldsMapper) mappingLookup.getMapping() + .getMetadataMapperByName(InferenceMetadataFieldsMapper.NAME); + if (inferenceMetadata == null || mapperService.mappingLookup().inferenceFields().isEmpty()) { + return source; } + var inferenceLoader = inferenceMetadata.fieldType() + .valueFetcher(mappingLookup, mapperService.getBitSetProducer(), new IndexSearcher(readerContext.reader())); + inferenceLoader.setNextReader(readerContext); + List values = inferenceLoader.fetchValues(source, docId, List.of()); + if (values.size() == 1) { + var newSource = source.source(); + newSource.put(InferenceMetadataFieldsMapper.NAME, values.get(0)); + return Source.fromMap(newSource, source.sourceContentType()); + } + return source; + } + + private static StoredFieldLoader buildStoredFieldLoader( + Set fields, + FetchSourceContext fetchSourceContext, + SourceLoader loader + ) { if (fetchSourceContext.fetchSource()) { - fieldsToLoad.addAll(loader.requiredStoredFields()); + fields.addAll(loader.requiredStoredFields()); } else { - if (fieldsToLoad.isEmpty()) { + if (fields.isEmpty()) { return StoredFieldLoader.empty(); } } - return StoredFieldLoader.create(fetchSourceContext.fetchSource(), fieldsToLoad); + return StoredFieldLoader.create(fetchSourceContext.fetchSource(), fields); } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/InferenceFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/InferenceFieldMapper.java index 249ef5004e59c..f7c6eef7dfd49 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/InferenceFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/InferenceFieldMapper.java @@ -12,7 +12,6 @@ import org.elasticsearch.cluster.metadata.InferenceFieldMetadata; import org.elasticsearch.inference.InferenceService; -import java.util.Map; import java.util.Set; /** @@ -26,12 +25,4 @@ public interface InferenceFieldMapper { * @param sourcePaths The source path that populates the input for the field (before inference) */ InferenceFieldMetadata getMetadata(Set sourcePaths); - - /** - * Get the field's original value (i.e. the value the user specified) from the provided source. 
- * - * @param sourceAsMap The source as a map - * @return The field's original value, or {@code null} if none was provided - */ - Object getOriginalValue(Map sourceAsMap); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/InferenceMetadataFieldsMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/InferenceMetadataFieldsMapper.java new file mode 100644 index 0000000000000..e1a9c3eb81e79 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/InferenceMetadataFieldsMapper.java @@ -0,0 +1,107 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.join.BitSetProducer; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexVersions; +import org.elasticsearch.index.query.SearchExecutionContext; + +import java.util.Map; +import java.util.function.Function; + +/** + * An abstract {@link MetadataFieldMapper} used as a placeholder for implementation + * in the inference module. It is required by {@link SourceFieldMapper} to identify + * the field name for removal from _source. + */ +public abstract class InferenceMetadataFieldsMapper extends MetadataFieldMapper { + /** + * Internal index setting to control the format used for semantic text fields. + * Determines whether to use the legacy format (default: true). + * This setting is immutable and can only be defined at index creation + * to ensure the internal format remains consistent throughout the index's lifecycle. + */ + public static final Setting USE_LEGACY_SEMANTIC_TEXT_FORMAT = Setting.boolSetting( + "index.mapping.semantic_text.use_legacy_format", + // don't use the new format by default yet + true, + Setting.Property.Final, + Setting.Property.IndexScope, + Setting.Property.InternalIndex + ); + + public static final String NAME = "_inference_fields"; + public static final String CONTENT_TYPE = "_inference_fields"; + + protected InferenceMetadataFieldsMapper(MappedFieldType inferenceFieldType) { + super(inferenceFieldType); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + public InferenceMetadataFieldType fieldType() { + return (InferenceMetadataFieldType) super.fieldType(); + } + + public abstract static class InferenceMetadataFieldType extends MappedFieldType { + public InferenceMetadataFieldType() { + super(NAME, false, false, false, TextSearchInfo.NONE, Map.of()); + } + + /** + * Returns a {@link ValueFetcher} without requiring the construction of a full {@link SearchExecutionContext}. + */ + public abstract ValueFetcher valueFetcher( + MappingLookup mappingLookup, + Function bitSetCache, + IndexSearcher searcher + ); + } + + /** + * Checks if the {@link InferenceMetadataFieldsMapper} is enabled for the given {@link Settings}. + * + * This indicates whether the new format for semantic text fields is active. 
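+     * (i.e. inference results are stored in the {@code _inference_fields} metadata field rather than inline in the
+     * field value, as in the legacy format).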
+ * The new format is enabled if: + * 1. The index version is on or after {@link IndexVersions#INFERENCE_METADATA_FIELDS_BACKPORT}, and + * 2. The legacy semantic text format is disabled. + * + * @param settings the index settings to evaluate + * @return {@code true} if the new format is enabled; {@code false} otherwise + */ + public static boolean isEnabled(Settings settings) { + return IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings).onOrAfter(IndexVersions.INFERENCE_METADATA_FIELDS_BACKPORT) + && USE_LEGACY_SEMANTIC_TEXT_FORMAT.get(settings) == false; + } + + /** + * Checks if the {@link InferenceMetadataFieldsMapper} is enabled based on the provided {@link Mapping}. + * + * This indicates whether the new format for semantic text fields is active by verifying the existence + * of the {@link InferenceMetadataFieldsMapper} in the mapping's metadata. + * + * @param mappingLookup the mapping to evaluate + * @return {@code true} if the {@link InferenceMetadataFieldsMapper} is present; {@code false} otherwise + */ + public static boolean isEnabled(MappingLookup mappingLookup) { + return mappingLookup != null + && mappingLookup.getMapping() + .getMetadataMapperByName(InferenceMetadataFieldsMapper.NAME) instanceof InferenceMetadataFieldsMapper; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperService.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperService.java index 3f6a061d529ae..b6539156c055d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperService.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperService.java @@ -181,6 +181,7 @@ public boolean isAutoUpdate() { private final IndexVersion indexVersionCreated; private final MapperRegistry mapperRegistry; private final Supplier mappingParserContextSupplier; + private final Function bitSetProducer; private final MapperMetrics mapperMetrics; private volatile DocumentMapper mapper; @@ -255,6 +256,7 @@ public MapperService( this::getMetadataMappers, this::resolveDocumentType ); + this.bitSetProducer = bitSetProducer; this.mapperMetrics = mapperMetrics; } @@ -836,6 +838,10 @@ public MapperRegistry getMapperRegistry() { return mapperRegistry; } + public Function getBitSetProducer() { + return bitSetProducer; + } + public MapperMetrics getMapperMetrics() { return mapperMetrics; } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/Mapping.java b/server/src/main/java/org/elasticsearch/index/mapper/Mapping.java index 1278ebf0a393a..907f7265e98af 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/Mapping.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/Mapping.java @@ -106,7 +106,7 @@ public T getMetadataMapperByClass(Class clazz return (T) metadataMappersMap.get(clazz); } - MetadataFieldMapper getMetadataMapperByName(String mapperName) { + public MetadataFieldMapper getMetadataMapperByName(String mapperName) { return metadataMappersByName.get(mapperName); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/NestedObjectMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/NestedObjectMapper.java index 03818f7b5c83f..08b0fb32a10c5 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/NestedObjectMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/NestedObjectMapper.java @@ -245,6 +245,10 @@ public MapperBuilderContext createChildContext(String name, Dynamic dynamic) { this.indexSettings = indexSettings; } + public IndexSettings indexSettings() { + return 
indexSettings; + } + public Query parentTypeFilter() { return parentTypeFilter; } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/ObjectMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/ObjectMapper.java index 46b70193ba0e8..6a107dbaa9e63 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/ObjectMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/ObjectMapper.java @@ -855,7 +855,7 @@ protected void doXContent(XContentBuilder builder, Params params) throws IOExcep } - ObjectMapper findParentMapper(String leafFieldPath) { + public ObjectMapper findParentMapper(String leafFieldPath) { var pathComponents = leafFieldPath.split("\\."); int startPathComponent = 0; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/SourceFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/SourceFieldMapper.java index 648dcd304f7b7..9d1bb24e899a2 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/SourceFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/SourceFieldMapper.java @@ -404,7 +404,7 @@ public boolean isComplete() { public void preParse(DocumentParserContext context) throws IOException { BytesReference originalSource = context.sourceToParse().source(); XContentType contentType = context.sourceToParse().getXContentType(); - final BytesReference adaptedSource = applyFilters(originalSource, contentType); + final BytesReference adaptedSource = applyFilters(context.mappingLookup(), originalSource, contentType); if (adaptedSource != null) { final BytesRef ref = adaptedSource.toBytesRef(); @@ -432,13 +432,32 @@ public void preParse(DocumentParserContext context) throws IOException { } @Nullable - public BytesReference applyFilters(@Nullable BytesReference originalSource, @Nullable XContentType contentType) throws IOException { - if (stored() == false) { + public BytesReference applyFilters( + @Nullable MappingLookup mappingLookup, + @Nullable BytesReference originalSource, + @Nullable XContentType contentType + ) throws IOException { + if (stored() == false || originalSource == null) { return null; } - if (originalSource != null && sourceFilter != null) { + var modSourceFilter = sourceFilter; + if (mappingLookup != null + && InferenceMetadataFieldsMapper.isEnabled(mappingLookup) + && mappingLookup.inferenceFields().isEmpty() == false) { + /** + * Removes {@link InferenceMetadataFieldsMapper} content from _source. + * This content is re-generated at query time (if requested) using stored fields and doc values. + */ + String[] modExcludes = new String[excludes != null ? 
excludes.length + 1 : 1]; + if (excludes != null) { + System.arraycopy(excludes, 0, modExcludes, 0, excludes.length); + } + modExcludes[modExcludes.length - 1] = InferenceMetadataFieldsMapper.NAME; + modSourceFilter = new SourceFilter(includes, modExcludes); + } + if (modSourceFilter != null) { // Percolate and tv APIs may not set the source and that is ok, because these APIs will not index any data - return Source.fromBytes(originalSource, contentType).filter(sourceFilter).internalSourceRef(); + return Source.fromBytes(originalSource, contentType).filter(modSourceFilter).internalSourceRef(); } else { return originalSource; } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index d883dc9c64006..266fee7b3fc70 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -78,6 +78,11 @@ public Builder(String name) { super(name); } + public Builder setStored(boolean value) { + stored.setValue(value); + return this; + } + @Override protected Parameter[] getParameters() { return new Parameter[] { stored, meta }; diff --git a/server/src/main/java/org/elasticsearch/search/SearchHit.java b/server/src/main/java/org/elasticsearch/search/SearchHit.java index 381b4bc0e9008..d060637098e0a 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchHit.java +++ b/server/src/main/java/org/elasticsearch/search/SearchHit.java @@ -517,6 +517,10 @@ public void addDocumentFields(Map docFields, Map plugins) { registerFetchSubPhase(new StoredFieldsPhase()); registerFetchSubPhase(new FetchDocValuesPhase()); registerFetchSubPhase(new ScriptFieldsPhase()); - registerFetchSubPhase(new FetchSourcePhase()); registerFetchSubPhase(new FetchFieldsPhase()); + // register after fetch fields to handle metadata fields that need to be copied into _source (e.g. _inference_fields).
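+        // (FetchFieldsPhase materializes _inference_fields as a document field first; FetchSourcePhase can then move it
+        // back into _source, see FetchSourcePhase#replaceInferenceMetadataFields.)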
+ registerFetchSubPhase(new FetchSourcePhase()); registerFetchSubPhase(new FetchVersionPhase()); registerFetchSubPhase(new SeqNoPrimaryTermPhase()); registerFetchSubPhase(new MatchedQueriesPhase()); diff --git a/server/src/main/java/org/elasticsearch/search/fetch/FetchContext.java b/server/src/main/java/org/elasticsearch/search/fetch/FetchContext.java index 0bbbff3a5d5f4..2523c62015215 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/FetchContext.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/FetchContext.java @@ -10,6 +10,7 @@ package org.elasticsearch.search.fetch; import org.apache.lucene.search.Query; +import org.elasticsearch.index.cache.bitset.BitsetFilterCache; import org.elasticsearch.index.mapper.SourceFieldMapper; import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.query.ParsedQuery; @@ -87,6 +88,10 @@ private static StoredFieldsContext buildStoredFieldsContext(SearchContext in) { return sfc; } + public BitsetFilterCache bitsetFilterCache() { + return searchContext.bitsetFilterCache(); + } + /** * The name of the index that documents are being fetched from */ diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhase.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhase.java index 79e51036a91be..df99a718887e1 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhase.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhase.java @@ -10,6 +10,7 @@ package org.elasticsearch.search.fetch.subphase; import org.apache.lucene.index.LeafReaderContext; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.fetch.FetchContext; import org.elasticsearch.search.fetch.FetchSubPhase; @@ -63,6 +64,7 @@ private void hitExecute(HitContext hitContext) { // If this is a parent document and there are no source filters, then add the source as-is. if (nestedHit == false && sourceFilter == null) { + source = replaceInferenceMetadataFields(hitContext.hit(), source); hitContext.hit().sourceRef(source.internalSourceRef()); fastPath++; return; @@ -77,10 +79,30 @@ private void hitExecute(HitContext hitContext) { } if (nestedHit) { source = extractNested(source, hitContext.hit().getNestedIdentity()); + } else { + source = replaceInferenceMetadataFields(hitContext.hit(), source); } hitContext.hit().sourceRef(source.internalSourceRef()); } + /** + * Transfers the {@link InferenceMetadataFieldsMapper#NAME} field from the document fields + * to the original _source if it has been requested. 
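+     * The field is removed from the hit's document fields, so it is only surfaced through _source.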
+ */ + private Source replaceInferenceMetadataFields(SearchHit hit, Source source) { + if (InferenceMetadataFieldsMapper.isEnabled(fetchContext.getSearchExecutionContext().getMappingLookup()) == false) { + return source; + } + + var field = hit.removeDocumentField(InferenceMetadataFieldsMapper.NAME); + if (field == null || field.getValues().isEmpty()) { + return source; + } + var newSource = source.source(); + newSource.put(InferenceMetadataFieldsMapper.NAME, field.getValues().get(0)); + return Source.fromMap(newSource, source.sourceContentType()); + } + @Override public Map getDebugInfo() { return Map.of("fast_path", fastPath); diff --git a/server/src/test/java/org/elasticsearch/action/bulk/TransportShardBulkActionTests.java b/server/src/test/java/org/elasticsearch/action/bulk/TransportShardBulkActionTests.java index c81aa9a21c428..75b0d79afbe39 100644 --- a/server/src/test/java/org/elasticsearch/action/bulk/TransportShardBulkActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/bulk/TransportShardBulkActionTests.java @@ -91,8 +91,16 @@ public class TransportShardBulkActionTests extends IndexShardTestCase { private final ShardId shardId = new ShardId("index", "_na_", 0); private final Settings idxSettings = indexSettings(IndexVersion.current(), 1, 0).build(); - private IndexMetadata indexMetadata() throws IOException { - return IndexMetadata.builder("index").putMapping(""" + private IndexMetadata indexMetadata(String mapping) { + IndexMetadata.Builder builder = IndexMetadata.builder("index").settings(idxSettings).primaryTerm(0, 1); + if (mapping != null) { + builder.putMapping(mapping); + } + return builder.build(); + } + + private IndexMetadata indexMetadata() { + return indexMetadata(""" { "properties": { "foo": { @@ -105,7 +113,7 @@ private IndexMetadata indexMetadata() throws IOException { } } } - }""").settings(idxSettings).primaryTerm(0, 1).build(); + }"""); } public void testExecuteBulkIndexRequest() throws Exception { @@ -494,7 +502,7 @@ public void testNoopUpdateRequest() throws Exception { IndexShard shard = mockShard(null, null); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( noopUpdateResponse, DocWriteResponse.Result.NOOP, @@ -549,7 +557,7 @@ public void testUpdateRequestWithFailure() throws Exception { ); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( updateResponse, randomBoolean() ? DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -610,7 +618,7 @@ public void testUpdateRequestWithConflictFailure() throws Exception { ); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( updateResponse, randomBoolean() ? DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -673,7 +681,7 @@ public void testUpdateRequestWithSuccess() throws Exception { ); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( updateResponse, created ? 
DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -730,7 +738,7 @@ public void testUpdateWithDelete() throws Exception { when(shard.applyDeleteOperationOnPrimary(anyLong(), any(), any(), anyLong(), anyLong())).thenReturn(deleteResult); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( updateResponse, DocWriteResponse.Result.DELETED, @@ -776,7 +784,7 @@ public void testFailureDuringUpdateProcessing() throws Exception { UpdateHelper updateHelper = mock(UpdateHelper.class); final ElasticsearchException err = new ElasticsearchException("oops"); - when(updateHelper.prepare(any(), eq(shard), any())).thenThrow(err); + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenThrow(err); BulkItemRequest[] items = new BulkItemRequest[] { primaryRequest }; BulkShardRequest bulkShardRequest = new BulkShardRequest(shardId, RefreshPolicy.NONE, items); @@ -908,7 +916,7 @@ public void testRetries() throws Exception { }); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( updateResponse, randomBoolean() ? DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -1121,7 +1129,7 @@ public void testNoopMappingUpdateInfiniteLoopPrevention() throws Exception { ); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( new IndexRequest("index").id("id").source(Requests.INDEX_CONTENT_TYPE, "field", "value"), randomBoolean() ? DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -1187,7 +1195,7 @@ public void testNoopMappingUpdateSuccessOnRetry() throws Exception { ); UpdateHelper updateHelper = mock(UpdateHelper.class); - when(updateHelper.prepare(any(), eq(shard), any())).thenReturn( + when(updateHelper.prepare(any(), eq(shard), any(), any())).thenReturn( new UpdateHelper.Result( new IndexRequest("index").id("id").source(Requests.INDEX_CONTENT_TYPE, "field", "value"), randomBoolean() ? 
DocWriteResponse.Result.CREATED : DocWriteResponse.Result.UPDATED, @@ -1227,6 +1235,9 @@ private IndexShard mockShard(IndexSettings indexSettings, MapperService mapperSe if (indexSettings != null) { when(shard.indexSettings()).thenReturn(indexSettings); + } else { + IndexSettings defaultIndexSettings = new IndexSettings(indexMetadata(null), Settings.EMPTY); + when(shard.indexSettings()).thenReturn(defaultIndexSettings); } if (mapperService != null) { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/MappingLookupInferenceFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/MappingLookupInferenceFieldMapperTests.java index b1470c1ee5b3b..755b83e8eb7ad 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/MappingLookupInferenceFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/MappingLookupInferenceFieldMapperTests.java @@ -105,11 +105,6 @@ public InferenceFieldMetadata getMetadata(Set sourcePaths) { return new InferenceFieldMetadata(fullPath(), INFERENCE_ID, SEARCH_INFERENCE_ID, sourcePaths.toArray(new String[0])); } - @Override - public Object getOriginalValue(Map sourceAsMap) { - return null; - } - @Override protected void parseCreateField(DocumentParserContext context) {} diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhaseTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhaseTests.java index deada75279e33..971f3bab7b6a3 100644 --- a/server/src/test/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhaseTests.java +++ b/server/src/test/java/org/elasticsearch/search/fetch/subphase/FetchSourcePhaseTests.java @@ -11,8 +11,12 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.memory.MemoryIndex; +import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.fetch.FetchContext; @@ -190,6 +194,12 @@ private HitContext hitExecuteMultiple( when(fetchContext.getIndexName()).thenReturn("index"); SearchExecutionContext sec = mock(SearchExecutionContext.class); when(sec.isSourceEnabled()).thenReturn(sourceBuilder != null); + IndexSettings indexSettings = new IndexSettings( + IndexMetadata.builder("index").settings(indexSettings(IndexVersion.current(), 1, 0)).build(), + Settings.EMPTY + ); + when(sec.indexVersionCreated()).thenReturn(indexSettings.getIndexVersionCreated()); + when(sec.getIndexSettings()).thenReturn(indexSettings); when(fetchContext.getSearchExecutionContext()).thenReturn(sec); final SearchHit searchHit = SearchHit.unpooled(1, null, nestedIdentity); diff --git a/x-pack/plugin/build.gradle b/x-pack/plugin/build.gradle index 21b876038688e..81b804741f871 100644 --- a/x-pack/plugin/build.gradle +++ b/x-pack/plugin/build.gradle @@ -211,5 +211,6 @@ tasks.named("yamlRestTestV7CompatTransform").configure({ task -> task.skipTest("esql/61_enrich_ip/Invalid IP strings", "We switched from exceptions to null+warnings for ENRICH runtime errors") task.skipTest("esql/180_match_operator/match with non text field", "Match operator can now be used on non-text fields") task.skipTest("esql/180_match_operator/match with functions", "Error message 
changed") + task.skipTest("esql/40_unsupported_types/semantic_text declared in mapping", "The semantic text field format changed") }) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java index f5923a4942634..23f9e91dc32da 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java @@ -395,8 +395,6 @@ public List getNamedXContent() { ); } - // TODO: The WeightedTokensBuilder is slated for removal after the SparseVectorQueryBuilder is available. - // The logic to create a Boolean query based on weighted tokens will remain and/or be moved to server. @Override public List> getQueries() { return List.of( diff --git a/x-pack/plugin/inference/build.gradle b/x-pack/plugin/inference/build.gradle index 36eada1243664..c8239eee9b0e8 100644 --- a/x-pack/plugin/inference/build.gradle +++ b/x-pack/plugin/inference/build.gradle @@ -35,6 +35,7 @@ dependencies { testImplementation(testArtifact(project(':server'))) testImplementation(project(':x-pack:plugin:inference:qa:test-service-plugin')) testImplementation project(':modules:reindex') + testImplementation project(':modules:mapper-extras') clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin') api "com.ibm.icu:icu4j:${versions.icu4j}" diff --git a/x-pack/plugin/inference/src/internalClusterTest/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterIT.java b/x-pack/plugin/inference/src/internalClusterTest/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterIT.java index 24248f832a8dd..480648a22ee45 100644 --- a/x-pack/plugin/inference/src/internalClusterTest/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterIT.java +++ b/x-pack/plugin/inference/src/internalClusterTest/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterIT.java @@ -7,6 +7,8 @@ package org.elasticsearch.xpack.inference.action.filter; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + import org.elasticsearch.action.admin.indices.refresh.RefreshRequest; import org.elasticsearch.action.bulk.BulkItemResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; @@ -17,6 +19,9 @@ import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.update.UpdateRequestBuilder; import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.inference.SimilarityMeasure; import org.elasticsearch.plugins.Plugin; @@ -29,9 +34,9 @@ import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -40,9 +45,19 @@ import static org.hamcrest.Matchers.equalTo; public class ShardBulkInferenceActionFilterIT extends ESIntegTestCase { - public static final String INDEX_NAME = "test-index"; + private final boolean useLegacyFormat; + + public ShardBulkInferenceActionFilterIT(boolean useLegacyFormat) { + this.useLegacyFormat = useLegacyFormat; + } + + 
@ParametersFactory + public static Iterable parameters() throws Exception { + return List.of(new Object[] { true }, new Object[] { false }); + } + @Before public void setup() throws Exception { Utils.storeSparseModel(client()); @@ -61,8 +76,16 @@ protected Collection> nodePlugins() { return Arrays.asList(Utils.TestInferencePlugin.class); } + @Override + public Settings indexSettings() { + return Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, randomIntBetween(1, 10)) + .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat) + .build(); + } + public void testBulkOperations() throws Exception { - Map shardsSettings = Collections.singletonMap(IndexMetadata.SETTING_NUMBER_OF_SHARDS, randomIntBetween(1, 10)); indicesAdmin().prepareCreate(INDEX_NAME) .setMapping( String.format( @@ -85,7 +108,6 @@ public void testBulkOperations() throws Exception { TestDenseInferenceServiceExtension.TestInferenceService.NAME ) ) - .setSettings(shardsSettings) .get(); int totalBulkReqs = randomIntBetween(2, 100); @@ -151,5 +173,4 @@ public void testBulkOperations() throws Exception { searchResponse.decRef(); } } - } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java index 5513ab6fe7de2..ffa43d57e322f 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java @@ -23,6 +23,7 @@ import org.elasticsearch.core.TimeValue; import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MetadataFieldMapper; import org.elasticsearch.indices.SystemIndexDescriptor; import org.elasticsearch.inference.InferenceServiceExtension; import org.elasticsearch.inference.InferenceServiceRegistry; @@ -72,6 +73,7 @@ import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter; import org.elasticsearch.xpack.inference.logging.ThrottlerManager; import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper; +import org.elasticsearch.xpack.inference.mapper.SemanticInferenceMetadataFieldsMapper; import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; import org.elasticsearch.xpack.inference.queries.SemanticMatchQueryRewriteInterceptor; import org.elasticsearch.xpack.inference.queries.SemanticQueryBuilder; @@ -252,7 +254,7 @@ public Collection createComponents(PluginServices services) { } inferenceServiceRegistry.set(registry); - var actionFilter = new ShardBulkInferenceActionFilter(registry, modelRegistry); + var actionFilter = new ShardBulkInferenceActionFilter(services.clusterService(), registry, modelRegistry); shardBulkInferenceActionFilter.set(actionFilter); var meterRegistry = services.telemetryProvider().getMeterRegistry(); @@ -384,6 +386,11 @@ public void close() { IOUtils.closeWhileHandlingException(inferenceServiceRegistry.get(), throttlerToClose); } + @Override + public Map getMetadataMappers() { + return Map.of(SemanticInferenceMetadataFieldsMapper.NAME, SemanticInferenceMetadataFieldsMapper.PARSER); + } + @Override public Map getMappers() { return Map.of( diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java 
b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java index a9195ea24af3a..22d6157b335ca 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java @@ -24,11 +24,13 @@ import org.elasticsearch.action.support.RefCountingRunnable; import org.elasticsearch.action.update.UpdateRequest; import org.elasticsearch.cluster.metadata.InferenceFieldMetadata; +import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.util.concurrent.AtomicArray; import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.core.Nullable; import org.elasticsearch.core.Releasable; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.inference.ChunkedInference; import org.elasticsearch.inference.InferenceService; import org.elasticsearch.inference.InferenceServiceRegistry; @@ -40,11 +42,11 @@ import org.elasticsearch.xpack.core.inference.results.ChunkedInferenceError; import org.elasticsearch.xpack.inference.mapper.SemanticTextField; import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; +import org.elasticsearch.xpack.inference.mapper.SemanticTextUtils; import org.elasticsearch.xpack.inference.registry.ModelRegistry; import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -53,8 +55,6 @@ import java.util.Map; import java.util.stream.Collectors; -import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunks; - /** * A {@link MappedActionFilter} that intercepts {@link BulkShardRequest} to apply inference on fields specified * as {@link SemanticTextFieldMapper} in the index mapping. 
For each semantic text field referencing fields in @@ -68,15 +68,26 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter { protected static final int DEFAULT_BATCH_SIZE = 512; + private final ClusterService clusterService; private final InferenceServiceRegistry inferenceServiceRegistry; private final ModelRegistry modelRegistry; private final int batchSize; - public ShardBulkInferenceActionFilter(InferenceServiceRegistry inferenceServiceRegistry, ModelRegistry modelRegistry) { - this(inferenceServiceRegistry, modelRegistry, DEFAULT_BATCH_SIZE); + public ShardBulkInferenceActionFilter( + ClusterService clusterService, + InferenceServiceRegistry inferenceServiceRegistry, + ModelRegistry modelRegistry + ) { + this(clusterService, inferenceServiceRegistry, modelRegistry, DEFAULT_BATCH_SIZE); } - public ShardBulkInferenceActionFilter(InferenceServiceRegistry inferenceServiceRegistry, ModelRegistry modelRegistry, int batchSize) { + public ShardBulkInferenceActionFilter( + ClusterService clusterService, + InferenceServiceRegistry inferenceServiceRegistry, + ModelRegistry modelRegistry, + int batchSize + ) { + this.clusterService = clusterService; this.inferenceServiceRegistry = inferenceServiceRegistry; this.modelRegistry = modelRegistry; this.batchSize = batchSize; @@ -112,7 +123,9 @@ private void processBulkShardRequest( BulkShardRequest bulkShardRequest, Runnable onCompletion ) { - new AsyncBulkShardInferenceAction(fieldInferenceMap, bulkShardRequest, onCompletion).run(); + var index = clusterService.state().getMetadata().index(bulkShardRequest.index()); + boolean useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(index.getSettings()) == false; + new AsyncBulkShardInferenceAction(useLegacyFormat, fieldInferenceMap, bulkShardRequest, onCompletion).run(); } private record InferenceProvider(InferenceService service, Model model) {} @@ -121,26 +134,29 @@ private record InferenceProvider(InferenceService service, Model model) {} * A field inference request on a single input. * @param index The index of the request in the original bulk request. * @param field The target field. + * @param sourceField The source field. * @param input The input to run inference on. * @param inputOrder The original order of the input. - * @param isOriginalFieldInput Whether the input is part of the original values of the field. + * @param offsetAdjustment The adjustment to apply to the chunk text offsets. */ - private record FieldInferenceRequest(int index, String field, String input, int inputOrder, boolean isOriginalFieldInput) {} + private record FieldInferenceRequest(int index, String field, String sourceField, String input, int inputOrder, int offsetAdjustment) {} /** * The field inference response. * @param field The target field. + * @param sourceField The input that was used to run inference. * @param input The input that was used to run inference. * @param inputOrder The original order of the input. - * @param isOriginalFieldInput Whether the input is part of the original values of the field. + * @param offsetAdjustment The adjustment to apply to the chunk text offsets. * @param model The model used to run inference. * @param chunkedResults The actual results. 
*/ private record FieldInferenceResponse( String field, + String sourceField, String input, int inputOrder, - boolean isOriginalFieldInput, + int offsetAdjustment, Model model, ChunkedInference chunkedResults ) {} @@ -165,16 +181,19 @@ void addFailure(Exception exc) { } private class AsyncBulkShardInferenceAction implements Runnable { + private final boolean useLegacyFormat; private final Map fieldInferenceMap; private final BulkShardRequest bulkShardRequest; private final Runnable onCompletion; private final AtomicArray inferenceResults; private AsyncBulkShardInferenceAction( + boolean useLegacyFormat, Map fieldInferenceMap, BulkShardRequest bulkShardRequest, Runnable onCompletion ) { + this.useLegacyFormat = useLegacyFormat; this.fieldInferenceMap = fieldInferenceMap; this.bulkShardRequest = bulkShardRequest; this.inferenceResults = new AtomicArray<>(bulkShardRequest.items().length); @@ -295,9 +314,10 @@ public void onResponse(List results) { acc.addOrUpdateResponse( new FieldInferenceResponse( request.field(), + request.sourceField(), request.input(), request.inputOrder(), - request.isOriginalFieldInput(), + request.offsetAdjustment(), inferenceProvider.model, result ) @@ -357,8 +377,7 @@ private void addInferenceResponseFailure(int id, Exception failure) { /** * Applies the {@link FieldInferenceResponseAccumulator} to the provided {@link BulkItemRequest}. * If the response contains failures, the bulk item request is marked as failed for the downstream action. - * Otherwise, the source of the request is augmented with the field inference results under the - * {@link SemanticTextField#INFERENCE_FIELD} field. + * Otherwise, the source of the request is augmented with the field inference results. */ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceResponseAccumulator response) throws IOException { if (response.failures().isEmpty() == false) { @@ -370,25 +389,49 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons final IndexRequest indexRequest = getIndexRequestOrNull(item.request()); var newDocMap = indexRequest.sourceAsMap(); + Map inferenceFieldsMap = new HashMap<>(); for (var entry : response.responses.entrySet()) { var fieldName = entry.getKey(); var responses = entry.getValue(); var model = responses.get(0).model(); // ensure that the order in the original field is consistent in case of multiple inputs Collections.sort(responses, Comparator.comparingInt(FieldInferenceResponse::inputOrder)); - List inputs = responses.stream().filter(r -> r.isOriginalFieldInput).map(r -> r.input).collect(Collectors.toList()); - List results = responses.stream().map(r -> r.chunkedResults).collect(Collectors.toList()); + Map> chunkMap = new LinkedHashMap<>(); + for (var resp : responses) { + var lst = chunkMap.computeIfAbsent(resp.sourceField, k -> new ArrayList<>()); + lst.addAll( + SemanticTextField.toSemanticTextFieldChunks( + resp.input, + resp.offsetAdjustment, + resp.chunkedResults, + indexRequest.getContentType(), + useLegacyFormat + ) + ); + } + List inputs = responses.stream() + .filter(r -> r.sourceField().equals(fieldName)) + .map(r -> r.input) + .collect(Collectors.toList()); var result = new SemanticTextField( + useLegacyFormat, fieldName, - inputs, + useLegacyFormat ? 
inputs : null, new SemanticTextField.InferenceResult( model.getInferenceEntityId(), new SemanticTextField.ModelSettings(model), - toSemanticTextFieldChunks(results, indexRequest.getContentType()) + chunkMap ), indexRequest.getContentType() ); - SemanticTextFieldMapper.insertValue(fieldName, newDocMap, result); + if (useLegacyFormat) { + SemanticTextUtils.insertValue(fieldName, newDocMap, result); + } else { + inferenceFieldsMap.put(fieldName, result); + } + } + if (useLegacyFormat == false) { + newDocMap.put(InferenceMetadataFieldsMapper.NAME, inferenceFieldsMap); } indexRequest.source(newDocMap, indexRequest.getContentType()); } @@ -437,17 +480,30 @@ private Map> createFieldInferenceRequests(Bu for (var entry : fieldInferenceMap.values()) { String field = entry.getName(); String inferenceId = entry.getInferenceId(); - var originalFieldValue = XContentMapValues.extractValue(field, docMap); - if (originalFieldValue instanceof Map || (originalFieldValue == null && entry.getSourceFields().length == 1)) { - // Inference has already been computed, or there is no inference required. - continue; + + if (useLegacyFormat) { + var originalFieldValue = XContentMapValues.extractValue(field, docMap); + if (originalFieldValue instanceof Map || (originalFieldValue == null && entry.getSourceFields().length == 1)) { + // Inference has already been computed, or there is no inference required. + continue; + } + } else { + var inferenceMetadataFieldsValue = XContentMapValues.extractValue( + InferenceMetadataFieldsMapper.NAME + "." + field, + docMap + ); + if (inferenceMetadataFieldsValue != null) { + // Inference has already been computed + continue; + } } + int order = 0; for (var sourceField : entry.getSourceFields()) { - boolean isOriginalFieldInput = sourceField.equals(field); + // TODO: Detect when the field is provided with an explicit null value var valueObj = XContentMapValues.extractValue(sourceField, docMap); if (valueObj == null) { - if (isUpdateRequest) { + if (isUpdateRequest && useLegacyFormat) { addInferenceResponseFailure( item.id(), new ElasticsearchStatusException( @@ -464,14 +520,21 @@ private Map> createFieldInferenceRequests(Bu ensureResponseAccumulatorSlot(itemIndex); final List values; try { - values = nodeStringValues(field, valueObj); + values = SemanticTextUtils.nodeStringValues(field, valueObj); } catch (Exception exc) { addInferenceResponseFailure(item.id(), exc); break; } + List fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>()); - for (var v : values) { - fieldRequests.add(new FieldInferenceRequest(itemIndex, field, v, order++, isOriginalFieldInput)); + int offsetAdjustment = 0; + for (String v : values) { + fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment)); + + // When using the inference metadata fields format, all the input values are concatenated so that the + // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment + // to apply to account for this. + offsetAdjustment += v.length() + 1; // Add one for separator char length } } } @@ -480,41 +543,6 @@ private Map> createFieldInferenceRequests(Bu } } - /** - * This method converts the given {@code valueObj} into a list of strings. - * If {@code valueObj} is not a string or a collection of strings, it throws an ElasticsearchStatusException. 
- */ - private static List nodeStringValues(String field, Object valueObj) { - if (valueObj instanceof Number || valueObj instanceof Boolean) { - return List.of(valueObj.toString()); - } else if (valueObj instanceof String value) { - return List.of(value); - } else if (valueObj instanceof Collection values) { - List valuesString = new ArrayList<>(); - for (var v : values) { - if (v instanceof Number || v instanceof Boolean) { - valuesString.add(v.toString()); - } else if (v instanceof String value) { - valuesString.add(value); - } else { - throw new ElasticsearchStatusException( - "Invalid format for field [{}], expected [String] got [{}]", - RestStatus.BAD_REQUEST, - field, - valueObj.getClass().getSimpleName() - ); - } - } - return valuesString; - } - throw new ElasticsearchStatusException( - "Invalid format for field [{}], expected [String] got [{}]", - RestStatus.BAD_REQUEST, - field, - valueObj.getClass().getSimpleName() - ); - } - static IndexRequest getIndexRequestOrNull(DocWriteRequest docWriteRequest) { if (docWriteRequest instanceof IndexRequest indexRequest) { return indexRequest; diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java index f2bfa72ec617a..cd3cde65e12ea 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java @@ -53,8 +53,9 @@ private record OffsetAndScore(int offset, float score) {} @Override public boolean canHighlight(MappedFieldType fieldType) { - if (fieldType instanceof SemanticTextFieldMapper.SemanticTextFieldType) { - return true; + if (fieldType instanceof SemanticTextFieldMapper.SemanticTextFieldType semanticTextFieldType) { + // TODO: Implement highlighting when using inference metadata fields + return semanticTextFieldType.useLegacyFormat(); } return false; } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java new file mode 100644 index 0000000000000..7a1a9b056d0a1 --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java @@ -0,0 +1,177 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.join.BitSetProducer; +import org.elasticsearch.common.xcontent.XContentParserUtils; +import org.elasticsearch.index.mapper.ContentPath; +import org.elasticsearch.index.mapper.DocumentParserContext; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.MappingLookup; +import org.elasticsearch.index.mapper.ValueFetcher; +import org.elasticsearch.index.query.QueryShardException; +import org.elasticsearch.index.query.SearchExecutionContext; +import org.elasticsearch.search.fetch.StoredFieldsSpec; +import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.XContentLocation; +import org.elasticsearch.xcontent.XContentParser; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +/** + * An {@link InferenceMetadataFieldsMapper} that delegates parsing of underlying fields + * to the corresponding {@link SemanticTextFieldMapper}. + */ +public class SemanticInferenceMetadataFieldsMapper extends InferenceMetadataFieldsMapper { + private static final SemanticInferenceMetadataFieldsMapper INSTANCE = new SemanticInferenceMetadataFieldsMapper(); + + public static final TypeParser PARSER = new FixedTypeParser( + c -> InferenceMetadataFieldsMapper.isEnabled(c.getSettings()) ? INSTANCE : null + ); + + static class FieldType extends InferenceMetadataFieldType { + private static final FieldType INSTANCE = new FieldType(); + + FieldType() { + super(); + } + + @Override + public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { + return valueFetcher(context.getMappingLookup(), context::bitsetFilter, context.searcher()); + } + + @Override + public ValueFetcher valueFetcher(MappingLookup mappingLookup, Function bitSetCache, IndexSearcher searcher) { + Map fieldFetchers = new HashMap<>(); + for (var inferenceField : mappingLookup.inferenceFields().keySet()) { + MappedFieldType ft = mappingLookup.getFieldType(inferenceField); + if (ft instanceof SemanticTextFieldMapper.SemanticTextFieldType semanticTextFieldType) { + fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher)); + } else { + throw new IllegalArgumentException( + "Invalid inference field [" + ft.name() + "]. Expected field type [semantic_text] but got [" + ft.typeName() + "]" + ); + } + } + if (fieldFetchers.isEmpty()) { + return ValueFetcher.EMPTY; + } + return new ValueFetcher() { + @Override + public void setNextReader(LeafReaderContext context) { + fieldFetchers.values().forEach(f -> f.setNextReader(context)); + } + + @Override + public List fetchValues(Source source, int doc, List ignoredValues) throws IOException { + Map result = new HashMap<>(); + for (var entry : fieldFetchers.entrySet()) { + var values = entry.getValue().fetchValues(source, doc, ignoredValues); + if (values.size() > 0) { + assert values.size() == 1; + result.put(entry.getKey(), values.get(0)); + } + } + return result.isEmpty() ? 
List.of() : List.of(result); + } + + @Override + public StoredFieldsSpec storedFieldsSpec() { + return StoredFieldsSpec.NO_REQUIREMENTS; + } + }; + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public Query termQuery(Object value, SearchExecutionContext context) { + throw new QueryShardException( + context, + "[" + name() + "] field which is of type [" + typeName() + "], does not support term queries" + ); + } + } + + private SemanticInferenceMetadataFieldsMapper() { + super(FieldType.INSTANCE); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + protected boolean supportsParsingObject() { + return true; + } + + @Override + protected void parseCreateField(DocumentParserContext context) throws IOException { + final boolean isWithinLeaf = context.path().isWithinLeafObject(); + try { + // make sure that we don't expand dots in field names while parsing + context.path().setWithinLeafObject(true); + XContentParser parser = context.parser(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.currentToken(), parser); + while (parser.nextToken() != XContentParser.Token.END_OBJECT) { + XContentParserUtils.ensureExpectedToken(XContentParser.Token.FIELD_NAME, parser.currentToken(), parser); + String fieldName = parser.currentName(); + + // Set the path to that of semantic text field so the parser acts as if we are parsing the semantic text field value + // directly. We can safely split on all "." chars because semantic text fields cannot be used when subobjects == false. + String[] fieldNameParts = fieldName.split("\\."); + setPath(context.path(), fieldNameParts); + + var parent = context.parent().findParentMapper(fieldName); + if (parent == null) { + throw new IllegalArgumentException("Field [" + fieldName + "] does not have a parent mapper"); + } + String suffix = parent != context.parent() ? 
fieldName.substring(parent.fullPath().length() + 1) : fieldName; + var mapper = parent.getMapper(suffix); + if (mapper instanceof SemanticTextFieldMapper fieldMapper) { + XContentLocation xContentLocation = context.parser().getTokenLocation(); + var input = fieldMapper.parseSemanticTextField(context); + if (input != null) { + fieldMapper.parseCreateFieldFromContext(context, input, xContentLocation); + } + } else { + throw new IllegalArgumentException( + "Field [" + fieldName + "] is not a [" + SemanticTextFieldMapper.CONTENT_TYPE + "] field" + ); + } + } + } finally { + context.path().setWithinLeafObject(isWithinLeaf); + setPath(context.path(), new String[] { InferenceMetadataFieldsMapper.NAME }); + } + } + + private static void setPath(ContentPath contentPath, String[] newPath) { + while (contentPath.length() > 0) { + contentPath.remove(); + } + + for (String pathPart : newPath) { + contentPath.add(pathPart); + } + } +} diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java index d651729dee259..d99889f11d3f2 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java @@ -10,8 +10,10 @@ import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.xcontent.XContentHelper; +import org.elasticsearch.common.xcontent.XContentParserUtils; import org.elasticsearch.common.xcontent.support.XContentMapValues; -import org.elasticsearch.core.Tuple; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.inference.ChunkedInference; import org.elasticsearch.inference.Model; @@ -31,6 +33,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -46,13 +51,18 @@ * the inference results under the {@link SemanticTextField#INFERENCE_FIELD}. * * @param fieldName The original field name. - * @param originalValues The original values associated with the field name. + * @param originalValues The original values associated with the field name for indices created before + * {@link IndexVersions#INFERENCE_METADATA_FIELDS_BACKPORT}, null otherwise. * @param inference The inference result. * @param contentType The {@link XContentType} used to store the embeddings chunks. 
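+     * <p>
+     * For illustration only (hypothetical identifiers, not part of this change): with the new format the
+     * serialized inference result keys chunks by source field and stores offsets into the concatenated
+     * input rather than the chunk text itself, along the lines of:
+     * </p>
+     * <pre>
+     * "inference": {
+     *   "inference_id": "my-inference-id",
+     *   "model_settings": { "task_type": "sparse_embedding" },
+     *   "chunks": {
+     *     "my_field": [
+     *       { "start_offset": 0, "end_offset": 42, "embeddings": { ... } }
+     *     ]
+     *   }
+     * }
+     * </pre>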
*/ -public record SemanticTextField(String fieldName, List originalValues, InferenceResult inference, XContentType contentType) - implements - ToXContentObject { +public record SemanticTextField( + boolean useLegacyFormat, + String fieldName, + @Nullable List originalValues, + InferenceResult inference, + XContentType contentType +) implements ToXContentObject { static final String TEXT_FIELD = "text"; static final String INFERENCE_FIELD = "inference"; @@ -61,15 +71,20 @@ public record SemanticTextField(String fieldName, List originalValues, I static final String CHUNKS_FIELD = "chunks"; static final String CHUNKED_EMBEDDINGS_FIELD = "embeddings"; public static final String CHUNKED_TEXT_FIELD = "text"; + static final String CHUNKED_OFFSET_FIELD = "offset"; + static final String CHUNKED_START_OFFSET_FIELD = "start_offset"; + static final String CHUNKED_END_OFFSET_FIELD = "end_offset"; static final String MODEL_SETTINGS_FIELD = "model_settings"; static final String TASK_TYPE_FIELD = "task_type"; static final String DIMENSIONS_FIELD = "dimensions"; static final String SIMILARITY_FIELD = "similarity"; static final String ELEMENT_TYPE_FIELD = "element_type"; - public record InferenceResult(String inferenceId, ModelSettings modelSettings, List chunks) {} + public record InferenceResult(String inferenceId, ModelSettings modelSettings, Map> chunks) {} - record Chunk(String text, BytesReference rawEmbeddings) {} + public record Chunk(@Nullable String text, int startOffset, int endOffset, BytesReference rawEmbeddings) {} + + public record Offset(String sourceFieldName, int startOffset, int endOffset) {} public record ModelSettings( TaskType taskType, @@ -187,12 +202,14 @@ public static String getEmbeddingsFieldName(String fieldName) { return getChunksFieldName(fieldName) + "." + CHUNKED_EMBEDDINGS_FIELD; } - static SemanticTextField parse(XContentParser parser, Tuple context) throws IOException { - return SEMANTIC_TEXT_FIELD_PARSER.parse(parser, context); + public static String getOffsetsFieldName(String fieldName) { + return getChunksFieldName(fieldName) + "." + CHUNKED_OFFSET_FIELD; } - static ModelSettings parseModelSettings(XContentParser parser) throws IOException { - return MODEL_SETTINGS_PARSER.parse(parser, null); + record ParserContext(boolean useLegacyFormat, String fieldName, XContentType xContentType) {} + + static SemanticTextField parse(XContentParser parser, ParserContext context) throws IOException { + return SEMANTIC_TEXT_FIELD_PARSER.parse(parser, context); } static ModelSettings parseModelSettingsFromMap(Object node) { @@ -207,63 +224,102 @@ static ModelSettings parseModelSettingsFromMap(Object node) { map, XContentType.JSON ); - return parseModelSettings(parser); + return MODEL_SETTINGS_PARSER.parse(parser, null); } catch (Exception exc) { throw new ElasticsearchException(exc); } } + @Override + public List originalValues() { + return originalValues != null ? originalValues : Collections.emptyList(); + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); - if (originalValues.isEmpty() == false) { + List originalValues = originalValues(); + if (useLegacyFormat && originalValues.isEmpty() == false) { builder.field(TEXT_FIELD, originalValues.size() == 1 ? 
originalValues.get(0) : originalValues); } builder.startObject(INFERENCE_FIELD); builder.field(INFERENCE_ID_FIELD, inference.inferenceId); builder.field(MODEL_SETTINGS_FIELD, inference.modelSettings); - builder.startArray(CHUNKS_FIELD); - for (var chunk : inference.chunks) { - builder.startObject(); - builder.field(CHUNKED_TEXT_FIELD, chunk.text); - XContentParser parser = XContentHelper.createParserNotCompressed( - XContentParserConfiguration.EMPTY, - chunk.rawEmbeddings, - contentType - ); - builder.field(CHUNKED_EMBEDDINGS_FIELD).copyCurrentStructure(parser); + if (useLegacyFormat) { + builder.startArray(CHUNKS_FIELD); + } else { + builder.startObject(CHUNKS_FIELD); + } + for (var entry : inference.chunks.entrySet()) { + if (useLegacyFormat == false) { + builder.startArray(entry.getKey()); + } + for (var chunk : entry.getValue()) { + builder.startObject(); + if (useLegacyFormat) { + builder.field(TEXT_FIELD, chunk.text); + } else { + builder.field(CHUNKED_START_OFFSET_FIELD, chunk.startOffset); + builder.field(CHUNKED_END_OFFSET_FIELD, chunk.endOffset); + } + XContentParser parser = XContentHelper.createParserNotCompressed( + XContentParserConfiguration.EMPTY, + chunk.rawEmbeddings, + contentType + ); + builder.field(CHUNKED_EMBEDDINGS_FIELD).copyCurrentStructure(parser); + builder.endObject(); + } + if (useLegacyFormat == false) { + builder.endArray(); + } + } + if (useLegacyFormat) { + builder.endArray(); + } else { builder.endObject(); } - builder.endArray(); builder.endObject(); builder.endObject(); return builder; } @SuppressWarnings("unchecked") - private static final ConstructingObjectParser> SEMANTIC_TEXT_FIELD_PARSER = - new ConstructingObjectParser<>( - SemanticTextFieldMapper.CONTENT_TYPE, - true, - (args, context) -> new SemanticTextField( - context.v1(), - (List) (args[0] == null ? List.of() : args[0]), + private static final ConstructingObjectParser SEMANTIC_TEXT_FIELD_PARSER = + new ConstructingObjectParser<>(SemanticTextFieldMapper.CONTENT_TYPE, true, (args, context) -> { + List originalValues = (List) args[0]; + if (context.useLegacyFormat() == false) { + if (originalValues != null && originalValues.isEmpty() == false) { + throw new IllegalArgumentException("Unknown field [" + TEXT_FIELD + "]"); + } + originalValues = null; + } + return new SemanticTextField( + context.useLegacyFormat(), + context.fieldName(), + originalValues, (InferenceResult) args[1], - context.v2() - ) - ); + context.xContentType() + ); + }); @SuppressWarnings("unchecked") - private static final ConstructingObjectParser INFERENCE_RESULT_PARSER = new ConstructingObjectParser<>( + private static final ConstructingObjectParser INFERENCE_RESULT_PARSER = new ConstructingObjectParser<>( INFERENCE_FIELD, true, - args -> new InferenceResult((String) args[0], (ModelSettings) args[1], (List) args[2]) + args -> new InferenceResult((String) args[0], (ModelSettings) args[1], (Map>) args[2]) ); - private static final ConstructingObjectParser CHUNKS_PARSER = new ConstructingObjectParser<>( + private static final ConstructingObjectParser CHUNKS_PARSER = new ConstructingObjectParser<>( CHUNKS_FIELD, true, - args -> new Chunk((String) args[0], (BytesReference) args[1]) + (args, context) -> { + String text = (String) args[0]; + if (context.useLegacyFormat() && text == null) { + throw new IllegalArgumentException("Missing chunk text"); + } + return new Chunk(text, args[1] != null ? (int) args[1] : -1, args[2] != null ? 
(int) args[2] : -1, (BytesReference) args[3]); + } ); private static final ConstructingObjectParser MODEL_SETTINGS_PARSER = new ConstructingObjectParser<>( @@ -284,15 +340,26 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws SEMANTIC_TEXT_FIELD_PARSER.declareStringArray(optionalConstructorArg(), new ParseField(TEXT_FIELD)); SEMANTIC_TEXT_FIELD_PARSER.declareObject( constructorArg(), - (p, c) -> INFERENCE_RESULT_PARSER.parse(p, null), + (p, c) -> INFERENCE_RESULT_PARSER.parse(p, c), new ParseField(INFERENCE_FIELD) ); INFERENCE_RESULT_PARSER.declareString(constructorArg(), new ParseField(INFERENCE_ID_FIELD)); - INFERENCE_RESULT_PARSER.declareObject(constructorArg(), MODEL_SETTINGS_PARSER, new ParseField(MODEL_SETTINGS_FIELD)); - INFERENCE_RESULT_PARSER.declareObjectArray(constructorArg(), CHUNKS_PARSER, new ParseField(CHUNKS_FIELD)); + INFERENCE_RESULT_PARSER.declareObject( + constructorArg(), + (p, c) -> MODEL_SETTINGS_PARSER.parse(p, null), + new ParseField(MODEL_SETTINGS_FIELD) + ); + INFERENCE_RESULT_PARSER.declareField(constructorArg(), (p, c) -> { + if (c.useLegacyFormat()) { + return Map.of(c.fieldName, parseChunksArrayLegacy(p, c)); + } + return parseChunksMap(p, c); + }, new ParseField(CHUNKS_FIELD), ObjectParser.ValueType.OBJECT_ARRAY); - CHUNKS_PARSER.declareString(constructorArg(), new ParseField(CHUNKED_TEXT_FIELD)); + CHUNKS_PARSER.declareString(optionalConstructorArg(), new ParseField(TEXT_FIELD)); + CHUNKS_PARSER.declareInt(optionalConstructorArg(), new ParseField(CHUNKED_START_OFFSET_FIELD)); + CHUNKS_PARSER.declareInt(optionalConstructorArg(), new ParseField(CHUNKED_END_OFFSET_FIELD)); CHUNKS_PARSER.declareField(constructorArg(), (p, c) -> { XContentBuilder b = XContentBuilder.builder(p.contentType().xContent()); b.copyCurrentStructure(p); @@ -305,18 +372,64 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws MODEL_SETTINGS_PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), new ParseField(ELEMENT_TYPE_FIELD)); } + private static Map> parseChunksMap(XContentParser parser, ParserContext context) throws IOException { + Map> resultMap = new LinkedHashMap<>(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.currentToken(), parser); + while (parser.nextToken() != XContentParser.Token.END_OBJECT) { + XContentParserUtils.ensureExpectedToken(XContentParser.Token.FIELD_NAME, parser.currentToken(), parser); + String fieldName = parser.currentName(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_ARRAY, parser.nextToken(), parser); + var chunks = resultMap.computeIfAbsent(fieldName, k -> new ArrayList<>()); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + chunks.add(CHUNKS_PARSER.parse(parser, context)); + } + } + return resultMap; + } + + private static List parseChunksArrayLegacy(XContentParser parser, ParserContext context) throws IOException { + List results = new ArrayList<>(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_ARRAY, parser.currentToken(), parser); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + results.add(CHUNKS_PARSER.parse(parser, context)); + } + return results; + } + /** * Converts the provided {@link ChunkedInference} into a list of {@link Chunk}. 
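+     * <p>
+     * A sketch of the offset arithmetic under assumed inputs (hypothetical values): for source values
+     * {@code ["foo", "barbaz"]} the caller passes {@code offsetAdjustment = 0} for {@code "foo"} and
+     * {@code offsetAdjustment = 4} ({@code "foo".length() + 1} for the separator) for {@code "barbaz"},
+     * so with the new index format a chunk covering all of {@code "barbaz"} is stored with
+     * {@code start_offset = 4} and {@code end_offset = 10}.
+     * </p>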
*/ - public static List toSemanticTextFieldChunks(List results, XContentType contentType) throws IOException { + public static List toSemanticTextFieldChunks( + String input, + int offsetAdjustment, + ChunkedInference results, + XContentType contentType, + boolean useLegacyFormat + ) throws IOException { List chunks = new ArrayList<>(); - for (var result : results) { - for (var it = result.chunksAsMatchedTextAndByteReference(contentType.xContent()); it.hasNext();) { - var chunkAsByteReference = it.next(); - chunks.add(new Chunk(chunkAsByteReference.matchedText(), chunkAsByteReference.bytesReference())); - } + Iterator it = results.chunksAsMatchedTextAndByteReference(contentType.xContent()); + while (it.hasNext()) { + chunks.add(toSemanticTextFieldChunk(input, offsetAdjustment, it.next(), useLegacyFormat)); } return chunks; } + public static Chunk toSemanticTextFieldChunk( + String input, + int offsetAdjustment, + ChunkedInference.Chunk chunk, + boolean useLegacyFormat + ) { + String text = null; + int startOffset = -1; + int endOffset = -1; + if (useLegacyFormat) { + text = input.substring(chunk.textOffset().start(), chunk.textOffset().end()); + } else { + startOffset = chunk.textOffset().start() + offsetAdjustment; + endOffset = chunk.textOffset().end() + offsetAdjustment; + } + + return new Chunk(text, startOffset, endOffset, chunk.bytesReference()); + } } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index 899c5d4f21b31..b47c55c302273 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -8,16 +8,23 @@ package org.elasticsearch.xpack.inference.mapper; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; import org.apache.lucene.search.join.BitSetProducer; import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.util.BitSet; import org.elasticsearch.cluster.metadata.InferenceFieldMetadata; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.xcontent.XContentHelper; -import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.common.xcontent.XContentParserUtils; +import org.elasticsearch.core.CheckedConsumer; import org.elasticsearch.core.Nullable; -import org.elasticsearch.core.Tuple; import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersion; @@ -29,6 +36,7 @@ import org.elasticsearch.index.mapper.DocumentParsingException; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.InferenceFieldMapper; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; @@ -38,6 +46,7 @@ import 
org.elasticsearch.index.mapper.NestedObjectMapper; import org.elasticsearch.index.mapper.ObjectMapper; import org.elasticsearch.index.mapper.SimpleMappedFieldType; +import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.mapper.ValueFetcher; @@ -49,19 +58,25 @@ import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.inference.InferenceResults; import org.elasticsearch.inference.SimilarityMeasure; +import org.elasticsearch.search.fetch.StoredFieldsSpec; +import org.elasticsearch.search.lookup.Source; import org.elasticsearch.search.vectors.KnnVectorQueryBuilder; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; import org.elasticsearch.xcontent.XContentLocation; import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults; import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults; import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -71,7 +86,7 @@ import static org.elasticsearch.search.SearchService.DEFAULT_SIZE; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD; -import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_TEXT_FIELD; +import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_OFFSET_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKS_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.INFERENCE_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.INFERENCE_ID_FIELD; @@ -80,6 +95,7 @@ import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.TEXT_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getChunksFieldName; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getEmbeddingsFieldName; +import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getOffsetsFieldName; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getOriginalTextFieldName; import static org.elasticsearch.xpack.inference.services.elasticsearch.ElasticsearchInternalService.DEFAULT_ELSER_ID; @@ -100,16 +116,14 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie public static final String CONTENT_TYPE = "semantic_text"; public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID; - private final IndexSettings indexSettings; - public static final TypeParser PARSER = new TypeParser( - (n, c) -> new Builder(n, c.indexVersionCreated(), c::bitSetProducer, c.getIndexSettings()), + (n, c) -> new Builder(n, c::bitSetProducer, c.getIndexSettings()), List.of(notInMultiFields(CONTENT_TYPE), notFromDynamicTemplates(CONTENT_TYPE)) ); public static class Builder extends FieldMapper.Builder { + private final boolean useLegacyFormat; private final IndexVersion indexVersionCreated; - private final IndexSettings 
indexSettings; private final Parameter inferenceId = Parameter.stringParam( INFERENCE_ID_FIELD, @@ -154,26 +168,21 @@ public static class Builder extends FieldMapper.Builder { public static Builder from(SemanticTextFieldMapper mapper) { Builder builder = new Builder( mapper.leafName(), - mapper.fieldType().indexVersionCreated, mapper.fieldType().getChunksField().bitsetProducer(), - mapper.indexSettings + mapper.fieldType().getChunksField().indexSettings() ); builder.init(mapper); return builder; } - public Builder( - String name, - IndexVersion indexVersionCreated, - Function bitSetProducer, - IndexSettings indexSettings - ) { + public Builder(String name, Function bitSetProducer, IndexSettings indexSettings) { super(name); - this.indexVersionCreated = indexVersionCreated; - this.indexSettings = indexSettings; + this.indexVersionCreated = indexSettings.getIndexVersionCreated(); + this.useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(indexSettings.getSettings()) == false; this.inferenceFieldBuilder = c -> createInferenceField( c, - indexVersionCreated, + indexSettings.getIndexVersionCreated(), + useLegacyFormat, modelSettings.get(), bitSetProducer, indexSettings @@ -238,10 +247,10 @@ public SemanticTextFieldMapper build(MapperBuilderContext context) { modelSettings.getValue(), inferenceField, indexVersionCreated, + useLegacyFormat, meta.getValue() ), - builderParams(this, context), - indexSettings + builderParams(this, context) ); } @@ -265,14 +274,8 @@ private SemanticTextFieldMapper copySettings(SemanticTextFieldMapper mapper, Map } } - private SemanticTextFieldMapper( - String simpleName, - MappedFieldType mappedFieldType, - BuilderParams builderParams, - IndexSettings indexSettings - ) { + private SemanticTextFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) { super(simpleName, mappedFieldType, builderParams); - this.indexSettings = indexSettings; } @Override @@ -289,21 +292,48 @@ public FieldMapper.Builder getMergeBuilder() { @Override protected void parseCreateField(DocumentParserContext context) throws IOException { - XContentParser parser = context.parser(); - if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + final XContentParser parser = context.parser(); + final XContentLocation xContentLocation = parser.getTokenLocation(); + + if (fieldType().useLegacyFormat == false) { + // Detect if field value is an object, which we don't support parsing + if (parser.currentToken() == XContentParser.Token.START_OBJECT) { + throw new DocumentParsingException( + xContentLocation, + "[" + CONTENT_TYPE + "] field [" + fullPath() + "] does not support object values" + ); + } + + // ignore the rest of the field value + parser.skipChildren(); return; } - XContentLocation xContentLocation = parser.getTokenLocation(); - final SemanticTextField field; + final SemanticTextField field = parseSemanticTextField(context); + if (field != null) { + parseCreateFieldFromContext(context, field, xContentLocation); + } + } + + SemanticTextField parseSemanticTextField(DocumentParserContext context) throws IOException { + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + return null; + } boolean isWithinLeaf = context.path().isWithinLeafObject(); try { context.path().setWithinLeafObject(true); - field = SemanticTextField.parse(parser, new Tuple<>(fullPath(), context.parser().contentType())); + return SemanticTextField.parse( + context.parser(), + new 
SemanticTextField.ParserContext(fieldType().useLegacyFormat, fullPath(), context.parser().contentType()) + ); } finally { context.path().setWithinLeafObject(isWithinLeaf); } + } + void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextField field, XContentLocation xContentLocation) + throws IOException { final String fullFieldName = fieldType().name(); if (field.inference().inferenceId().equals(fieldType().getInferenceId()) == false) { throw new DocumentParsingException( @@ -324,9 +354,8 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio context.path().remove(); Builder builder = (Builder) new Builder( leafName(), - fieldType().indexVersionCreated, fieldType().getChunksField().bitsetProducer(), - indexSettings + fieldType().getChunksField().indexSettings() ).init(this); try { mapper = builder.setModelSettings(field.inference().modelSettings()) @@ -357,17 +386,44 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio var chunksField = mapper.fieldType().getChunksField(); var embeddingsField = mapper.fieldType().getEmbeddingsField(); - for (var chunk : field.inference().chunks()) { - try ( - XContentParser subParser = XContentHelper.createParserNotCompressed( - XContentParserConfiguration.EMPTY, - chunk.rawEmbeddings(), - context.parser().contentType() - ) - ) { - DocumentParserContext subContext = context.createNestedContext(chunksField).switchParser(subParser); - subParser.nextToken(); - embeddingsField.parse(subContext); + var offsetsField = mapper.fieldType().getOffsetsField(); + for (var entry : field.inference().chunks().entrySet()) { + for (var chunk : entry.getValue()) { + var nestedContext = context.createNestedContext(chunksField); + try ( + XContentParser subParser = XContentHelper.createParserNotCompressed( + XContentParserConfiguration.EMPTY, + chunk.rawEmbeddings(), + context.parser().contentType() + ) + ) { + DocumentParserContext subContext = nestedContext.switchParser(subParser); + subParser.nextToken(); + embeddingsField.parse(subContext); + } + + if (fieldType().useLegacyFormat) { + continue; + } + + try (XContentBuilder builder = XContentFactory.contentBuilder(context.parser().contentType())) { + builder.startObject(); + builder.field("field", entry.getKey()); + builder.field("start", chunk.startOffset()); + builder.field("end", chunk.endOffset()); + builder.endObject(); + try ( + XContentParser subParser = XContentHelper.createParserNotCompressed( + XContentParserConfiguration.EMPTY, + BytesReference.bytes(builder), + context.parser().contentType() + ) + ) { + DocumentParserContext subContext = nestedContext.switchParser(subParser); + subParser.nextToken(); + offsetsField.parse(subContext); + } + } } } } @@ -390,20 +446,6 @@ public InferenceFieldMetadata getMetadata(Set sourcePaths) { return new InferenceFieldMetadata(fullPath(), fieldType().getInferenceId(), fieldType().getSearchInferenceId(), copyFields); } - @Override - public Object getOriginalValue(Map sourceAsMap) { - Object fieldValue = sourceAsMap.get(fullPath()); - if (fieldValue == null) { - return null; - } else if (fieldValue instanceof Map == false) { - // Don't try to further validate the non-map value, that will be handled when the source is fully parsed - return fieldValue; - } - - Map fieldValueMap = XContentMapValues.nodeMapValue(fieldValue, "Field [" + fullPath() + "]"); - return XContentMapValues.extractValue(TEXT_FIELD, fieldValueMap); - } - @Override protected void doValidate(MappingLookup mappers) { int 
parentPathIndex = fullPath().lastIndexOf(leafName()); @@ -429,6 +471,7 @@ public static class SemanticTextFieldType extends SimpleMappedFieldType { private final SemanticTextField.ModelSettings modelSettings; private final ObjectMapper inferenceField; private final IndexVersion indexVersionCreated; + private final boolean useLegacyFormat; public SemanticTextFieldType( String name, @@ -437,6 +480,7 @@ public SemanticTextFieldType( SemanticTextField.ModelSettings modelSettings, ObjectMapper inferenceField, IndexVersion indexVersionCreated, + boolean useLegacyFormat, Map meta ) { super(name, true, false, false, TextSearchInfo.NONE, meta); @@ -445,6 +489,11 @@ public SemanticTextFieldType( this.modelSettings = modelSettings; this.inferenceField = inferenceField; this.indexVersionCreated = indexVersionCreated; + this.useLegacyFormat = useLegacyFormat; + } + + public boolean useLegacyFormat() { + return useLegacyFormat; } @Override @@ -476,6 +525,10 @@ public FieldMapper getEmbeddingsField() { return (FieldMapper) getChunksField().getMapper(CHUNKED_EMBEDDINGS_FIELD); } + public FieldMapper getOffsetsField() { + return (FieldMapper) getChunksField().getMapper(CHUNKED_OFFSET_FIELD); + } + @Override public Query termQuery(Object value, SearchExecutionContext context) { throw new IllegalArgumentException(CONTENT_TYPE + " fields do not support term query"); @@ -498,8 +551,30 @@ public Query existsQuery(SearchExecutionContext context) { @Override public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { - // Redirect the fetcher to load the original values of the field - return SourceValueFetcher.toString(getOriginalTextFieldName(name()), context, format); + if (useLegacyFormat) { + // Redirect the fetcher to load the original values of the field + return SourceValueFetcher.toString(getOriginalTextFieldName(name()), context, format); + } + return SourceValueFetcher.toString(name(), context, null); + } + + ValueFetcher valueFetcherWithInferenceResults(Function bitSetCache, IndexSearcher searcher) { + var embeddingsField = getEmbeddingsField(); + if (embeddingsField == null) { + return ValueFetcher.EMPTY; + } + try { + var embeddingsLoader = embeddingsField.syntheticFieldLoader(); + var bitSetFilter = bitSetCache.apply(getChunksField().parentTypeFilter()); + var childWeight = searcher.createWeight( + getChunksField().nestedTypeFilter(), + org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES, + 1 + ); + return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader); + } catch (IOException exc) { + throw new UncheckedIOException(exc); + } } @Override @@ -622,158 +697,170 @@ private String generateInvalidQueryInferenceResultsMessage(StringBuilder baseMes @Override public BlockLoader blockLoader(MappedFieldType.BlockLoaderContext blContext) { - SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name().concat(".text"))); + String name = useLegacyFormat ? name().concat(".text") : name(); + SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name)); return new BlockSourceReader.BytesRefsBlockLoader(fetcher, BlockSourceReader.lookupMatchingAll()); } - } - - /** - *

-     * Insert or replace the path's value in the map with the provided new value. The map will be modified in-place.
-     * If the complete path does not exist in the map, it will be added to the deepest (sub-)map possible.
-     * </p>
-     * <p>
-     * For example, given the map:
-     * </p>
-     * <pre>
-     * {
-     *   "path1": {
-     *     "path2": {
-     *       "key1": "value1"
-     *     }
-     *   }
-     * }
-     * </pre>
-     * <p>
-     * And the caller wanted to insert {@code "path1.path2.path3.key2": "value2"}, the method would emit the modified map:
-     * </p>
-     * <pre>
-     * {
-     *   "path1": {
-     *     "path2": {
-     *       "key1": "value1",
-     *       "path3.key2": "value2"
-     *     }
-     *   }
-     * }
-     * </pre>
- * - * @param path the value's path in the map. - * @param map the map to search and modify in-place. - * @param newValue the new value to assign to the path. - * - * @throws IllegalArgumentException If either the path cannot be fully traversed or there is ambiguity about where to insert the new - * value. - */ - public static void insertValue(String path, Map map, Object newValue) { - String[] pathElements = path.split("\\."); - if (pathElements.length == 0) { - return; - } - List suffixMaps = extractSuffixMaps(pathElements, 0, map); - if (suffixMaps.isEmpty()) { - // This should never happen. Throw in case it does for some reason. - throw new IllegalStateException("extractSuffixMaps returned an empty suffix map list"); - } else if (suffixMaps.size() == 1) { - SuffixMap suffixMap = suffixMaps.get(0); - suffixMap.map().put(suffixMap.suffix(), newValue); - } else { - throw new IllegalArgumentException( - "Path [" + path + "] could be inserted in " + suffixMaps.size() + " distinct ways, it is ambiguous which one to use" - ); - } - } + private class SemanticTextFieldValueFetcher implements ValueFetcher { + private final BitSetProducer parentBitSetProducer; + private final Weight childWeight; + private final SourceLoader.SyntheticFieldLoader fieldLoader; - private record SuffixMap(String suffix, Map map) {} + private BitSet bitSet; + private Scorer childScorer; + private SourceLoader.SyntheticFieldLoader.DocValuesLoader dvLoader; + private OffsetSourceField.OffsetSourceLoader offsetsLoader; - private static List extractSuffixMaps(String[] pathElements, int index, Object currentValue) { - if (currentValue instanceof List valueList) { - List suffixMaps = new ArrayList<>(valueList.size()); - for (Object o : valueList) { - suffixMaps.addAll(extractSuffixMaps(pathElements, index, o)); + private SemanticTextFieldValueFetcher( + BitSetProducer bitSetProducer, + Weight childWeight, + SourceLoader.SyntheticFieldLoader fieldLoader + ) { + this.parentBitSetProducer = bitSetProducer; + this.childWeight = childWeight; + this.fieldLoader = fieldLoader; } - return suffixMaps; - } else if (currentValue instanceof Map) { - @SuppressWarnings("unchecked") - Map map = (Map) currentValue; - List suffixMaps = new ArrayList<>(map.size()); - - String key = pathElements[index]; - while (index < pathElements.length) { - if (map.containsKey(key)) { - if (index + 1 == pathElements.length) { - // We found the complete path - suffixMaps.add(new SuffixMap(key, map)); - } else { - // We've matched that path partially, keep traversing to try to match it fully - suffixMaps.addAll(extractSuffixMaps(pathElements, index + 1, map.get(key))); + @Override + public void setNextReader(LeafReaderContext context) { + try { + bitSet = parentBitSetProducer.getBitSet(context); + childScorer = childWeight.scorer(context); + if (childScorer != null) { + childScorer.iterator().nextDoc(); } + dvLoader = fieldLoader.docValuesLoader(context.reader(), null); + var terms = context.reader().terms(getOffsetsFieldName(name())); + offsetsLoader = terms != null ? OffsetSourceField.loader(terms) : null; + } catch (IOException exc) { + throw new UncheckedIOException(exc); } + } - if (++index < pathElements.length) { - key += "." 
+ pathElements[index]; + @Override + public List fetchValues(Source source, int doc, List ignoredValues) throws IOException { + if (childScorer == null || offsetsLoader == null || doc == 0) { + return List.of(); + } + int previousParent = bitSet.prevSetBit(doc - 1); + var it = childScorer.iterator(); + if (it.docID() < previousParent) { + it.advance(previousParent); + } + Map> chunkMap = new LinkedHashMap<>(); + while (it.docID() < doc) { + if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) { + throw new IllegalStateException( + "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]" + ); + } + var offset = offsetsLoader.advanceTo(it.docID()); + if (offset == null) { + throw new IllegalStateException( + "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]" + ); + } + var chunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>()); + chunks.add( + new SemanticTextField.Chunk( + null, + offset.start(), + offset.end(), + rawEmbeddings(fieldLoader::write, source.sourceContentType()) + ) + ); + if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + break; + } } + if (chunkMap.isEmpty()) { + return List.of(); + } + return List.of( + new SemanticTextField( + useLegacyFormat, + name(), + null, + new SemanticTextField.InferenceResult(inferenceId, modelSettings, chunkMap), + source.sourceContentType() + ) + ); } - if (suffixMaps.isEmpty()) { - // We checked for all remaining elements in the path, and they do not exist. This means we found a leaf map that we should - // add the value to. - suffixMaps.add(new SuffixMap(key, map)); + private BytesReference rawEmbeddings(CheckedConsumer writer, XContentType xContentType) + throws IOException { + try (var result = XContentFactory.contentBuilder(xContentType)) { + try (var builder = XContentFactory.contentBuilder(xContentType)) { + builder.startObject(); + writer.accept(builder); + builder.endObject(); + try ( + XContentParser parser = XContentHelper.createParserNotCompressed( + XContentParserConfiguration.EMPTY, + BytesReference.bytes(builder), + xContentType + ) + ) { + XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.nextToken(), parser); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.FIELD_NAME, parser.nextToken(), parser); + parser.nextToken(); + result.copyCurrentStructure(parser); + } + return BytesReference.bytes(result); + } + } } - return suffixMaps; - } else { - throw new IllegalArgumentException( - "Path [" - + String.join(".", Arrays.copyOfRange(pathElements, 0, index)) - + "] has value [" - + currentValue - + "] of type [" - + currentValue.getClass().getSimpleName() - + "], which cannot be traversed into further" - ); + @Override + public StoredFieldsSpec storedFieldsSpec() { + return StoredFieldsSpec.NO_REQUIREMENTS; + } } } private static ObjectMapper createInferenceField( MapperBuilderContext context, IndexVersion indexVersionCreated, + boolean useLegacyFormat, @Nullable SemanticTextField.ModelSettings modelSettings, Function bitSetProducer, IndexSettings indexSettings ) { return new ObjectMapper.Builder(INFERENCE_FIELD, Optional.of(ObjectMapper.Subobjects.ENABLED)).dynamic(ObjectMapper.Dynamic.FALSE) - .add(createChunksField(indexVersionCreated, modelSettings, bitSetProducer, indexSettings)) + .add(createChunksField(indexVersionCreated, useLegacyFormat, modelSettings, bitSetProducer, indexSettings)) .build(context); } private static NestedObjectMapper.Builder createChunksField( IndexVersion 
indexVersionCreated, + boolean useLegacyFormat, @Nullable SemanticTextField.ModelSettings modelSettings, Function bitSetProducer, IndexSettings indexSettings ) { NestedObjectMapper.Builder chunksField = new NestedObjectMapper.Builder( - CHUNKS_FIELD, - indexVersionCreated, + SemanticTextField.CHUNKS_FIELD, + indexSettings.getIndexVersionCreated(), bitSetProducer, indexSettings ); chunksField.dynamic(ObjectMapper.Dynamic.FALSE); - KeywordFieldMapper.Builder chunkTextField = new KeywordFieldMapper.Builder(CHUNKED_TEXT_FIELD, indexVersionCreated).indexed(false) - .docValues(false); if (modelSettings != null) { - chunksField.add(createEmbeddingsField(indexVersionCreated, modelSettings)); + chunksField.add(createEmbeddingsField(indexSettings.getIndexVersionCreated(), modelSettings)); + } + if (useLegacyFormat) { + var chunkTextField = new KeywordFieldMapper.Builder(TEXT_FIELD, indexVersionCreated).indexed(false).docValues(false); + chunksField.add(chunkTextField); + } else { + chunksField.add(new OffsetSourceFieldMapper.Builder(CHUNKED_OFFSET_FIELD)); } - chunksField.add(chunkTextField); return chunksField; } private static Mapper.Builder createEmbeddingsField(IndexVersion indexVersionCreated, SemanticTextField.ModelSettings modelSettings) { return switch (modelSettings.taskType()) { - case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD); + case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD).setStored(true); case TEXT_EMBEDDING -> { DenseVectorFieldMapper.Builder denseVectorMapperBuilder = new DenseVectorFieldMapper.Builder( CHUNKED_EMBEDDINGS_FIELD, diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtils.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtils.java new file mode 100644 index 0000000000000..14a3304e94df4 --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtils.java @@ -0,0 +1,163 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.elasticsearch.ElasticsearchStatusException; +import org.elasticsearch.rest.RestStatus; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +public interface SemanticTextUtils { + /** + * This method converts the given {@code valueObj} into a list of strings. 
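+     * <p>
+     * A sketch of the expected conversions (hypothetical values): {@code nodeStringValues("f", "a")} yields
+     * {@code ["a"]}, {@code nodeStringValues("f", List.of("a", 1, true))} yields {@code ["a", "1", "true"]},
+     * and any other value type is rejected with a {@code 400 Bad Request} status exception.
+     * </p>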
+     */
+    static List<String> nodeStringValues(String field, Object valueObj) {
+        if (valueObj instanceof Number || valueObj instanceof Boolean) {
+            return List.of(valueObj.toString());
+        } else if (valueObj instanceof String value) {
+            return List.of(value);
+        } else if (valueObj instanceof Collection<?> values) {
+            List<String> valuesString = new ArrayList<>();
+            for (var v : values) {
+                if (v instanceof Number || v instanceof Boolean) {
+                    valuesString.add(v.toString());
+                } else if (v instanceof String value) {
+                    valuesString.add(value);
+                } else {
+                    throw new ElasticsearchStatusException(
+                        "Invalid format for field [{}], expected [String|Number|Boolean] got [{}]",
+                        RestStatus.BAD_REQUEST,
+                        field,
+                        valueObj.getClass().getSimpleName()
+                    );
+                }
+            }
+            return valuesString;
+        }
+        throw new ElasticsearchStatusException(
+            "Invalid format for field [{}], expected [String|Number|Boolean] got [{}]",
+            RestStatus.BAD_REQUEST,
+            field,
+            valueObj.getClass().getSimpleName()
+        );
+    }
+
+    /**
+     * <p>
+     * Insert or replace the path's value in the map with the provided new value. The map will be modified in-place.
+     * If the complete path does not exist in the map, it will be added to the deepest (sub-)map possible.
+     * </p>
+     * <p>
+     * For example, given the map:
+     * </p>
+     * <pre>
+     * {
+     *   "path1": {
+     *     "path2": {
+     *       "key1": "value1"
+     *     }
+     *   }
+     * }
+     * </pre>
+     * <p>
+     * And the caller wanted to insert {@code "path1.path2.path3.key2": "value2"}, the method would emit the modified map:
+     * </p>
+     * <pre>
+     * {
+     *   "path1": {
+     *     "path2": {
+     *       "key1": "value1",
+     *       "path3.key2": "value2"
+     *     }
+     *   }
+     * }
+     * </pre>
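+     * <p>
+     * A minimal usage sketch ({@code sourceMap} is a hypothetical mutable map holding the JSON shown above):
+     * </p>
+     * <pre>
+     * Map&lt;String, Object&gt; sourceMap = parser.map(); // e.g. parsed from the document source with an XContentParser
+     * SemanticTextUtils.insertValue("path1.path2.path3.key2", sourceMap, "value2");
+     * // sourceMap now holds the dotted leaf key "path3.key2" under "path1.path2"
+     * </pre>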
+     *
+     * @param path the value's path in the map.
+     * @param map the map to search and modify in-place.
+     * @param newValue the new value to assign to the path.
+     *
+     * @throws IllegalArgumentException If either the path cannot be fully traversed or there is ambiguity about where to insert the new
+     * value.
+     */
+    static void insertValue(String path, Map<String, Object> map, Object newValue) {
+        String[] pathElements = path.split("\\.");
+        if (pathElements.length == 0) {
+            return;
+        }
+
+        List<SuffixMap> suffixMaps = extractSuffixMaps(pathElements, 0, map);
+        if (suffixMaps.isEmpty()) {
+            // This should never happen. Throw in case it does for some reason.
+            throw new IllegalStateException("extractSuffixMaps returned an empty suffix map list");
+        } else if (suffixMaps.size() == 1) {
+            SuffixMap suffixMap = suffixMaps.get(0);
+            suffixMap.map().put(suffixMap.suffix(), newValue);
+        } else {
+            throw new IllegalArgumentException(
+                "Path [" + path + "] could be inserted in " + suffixMaps.size() + " distinct ways, it is ambiguous which one to use"
+            );
+        }
+    }
+
+    record SuffixMap(String suffix, Map<String, Object> map) {}
+
+    private static List<SuffixMap> extractSuffixMaps(String[] pathElements, int index, Object currentValue) {
+        if (currentValue instanceof List<?> valueList) {
+            List<SuffixMap> suffixMaps = new ArrayList<>(valueList.size());
+            for (Object o : valueList) {
+                suffixMaps.addAll(extractSuffixMaps(pathElements, index, o));
+            }
+
+            return suffixMaps;
+        } else if (currentValue instanceof Map) {
+            @SuppressWarnings("unchecked")
+            Map<String, Object> map = (Map<String, Object>) currentValue;
+            List<SuffixMap> suffixMaps = new ArrayList<>(map.size());
+
+            String key = pathElements[index];
+            while (index < pathElements.length) {
+                if (map.containsKey(key)) {
+                    if (index + 1 == pathElements.length) {
+                        // We found the complete path
+                        suffixMaps.add(new SuffixMap(key, map));
+                    } else {
+                        // We've matched that path partially, keep traversing to try to match it fully
+                        suffixMaps.addAll(extractSuffixMaps(pathElements, index + 1, map.get(key)));
+                    }
+                }
+
+                if (++index < pathElements.length) {
+                    key += "." + pathElements[index];
+                }
+            }
+
+            if (suffixMaps.isEmpty()) {
+                // We checked for all remaining elements in the path, and they do not exist. This means we found a leaf map that we should
+                // add the value to.
+ suffixMaps.add(new SuffixMap(key, map)); + } + + return suffixMaps; + } else { + throw new IllegalArgumentException( + "Path [" + + String.join(".", Arrays.copyOfRange(pathElements, 0, index)) + + "] has value [" + + currentValue + + "] of type [" + + currentValue.getClass().getSimpleName() + + "], which cannot be traversed into further" + ); + } + } +} diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java index 0b7d136ffb04c..478c81f7c5a32 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java @@ -7,6 +7,8 @@ package org.elasticsearch.xpack.inference.action.filter; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + import org.elasticsearch.ResourceNotFoundException; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.bulk.BulkItemRequest; @@ -16,10 +18,18 @@ import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.support.ActionFilterChain; import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.InferenceFieldMetadata; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.inference.ChunkedInference; import org.elasticsearch.inference.InferenceService; @@ -71,8 +81,18 @@ import static org.mockito.Mockito.when; public class ShardBulkInferenceActionFilterTests extends ESTestCase { + private final boolean useLegacyFormat; private ThreadPool threadPool; + public ShardBulkInferenceActionFilterTests(boolean useLegacyFormat) { + this.useLegacyFormat = useLegacyFormat; + } + + @ParametersFactory + public static Iterable parameters() throws Exception { + return List.of(new Object[] { true }, new Object[] { false }); + } + @Before public void setupThreadPool() { threadPool = new TestThreadPool(getTestName()); @@ -85,7 +105,7 @@ public void tearDownThreadPool() throws Exception { @SuppressWarnings({ "unchecked", "rawtypes" }) public void testFilterNoop() throws Exception { - ShardBulkInferenceActionFilter filter = createFilter(threadPool, Map.of(), DEFAULT_BATCH_SIZE); + ShardBulkInferenceActionFilter filter = createFilter(threadPool, Map.of(), DEFAULT_BATCH_SIZE, useLegacyFormat); CountDownLatch chainExecuted = new CountDownLatch(1); ActionFilterChain actionFilterChain = (task, action, request, listener) -> { try { @@ -114,7 +134,8 @@ public void testInferenceNotFound() throws Exception { ShardBulkInferenceActionFilter filter = createFilter( threadPool, Map.of(model.getInferenceEntityId(), model), - randomIntBetween(1, 10) + randomIntBetween(1, 10), + 
useLegacyFormat ); CountDownLatch chainExecuted = new CountDownLatch(1); ActionFilterChain actionFilterChain = (task, action, request, listener) -> { @@ -144,7 +165,7 @@ public void testInferenceNotFound() throws Exception { ); BulkItemRequest[] items = new BulkItemRequest[10]; for (int i = 0; i < items.length; i++) { - items[i] = randomBulkItemRequest(Map.of(), inferenceFieldMap)[0]; + items[i] = randomBulkItemRequest(useLegacyFormat, Map.of(), inferenceFieldMap)[0]; } BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items); request.setInferenceFieldMap(inferenceFieldMap); @@ -155,10 +176,12 @@ public void testInferenceNotFound() throws Exception { @SuppressWarnings({ "unchecked", "rawtypes" }) public void testItemFailures() throws Exception { StaticModel model = StaticModel.createRandomInstance(); + ShardBulkInferenceActionFilter filter = createFilter( threadPool, Map.of(model.getInferenceEntityId(), model), - randomIntBetween(1, 10) + randomIntBetween(1, 10), + useLegacyFormat ); model.putResult("I am a failure", new ChunkedInferenceError(new IllegalArgumentException("boom"))); model.putResult("I am a success", randomChunkedInferenceEmbeddingSparse(List.of("I am a success"))); @@ -178,7 +201,10 @@ public void testItemFailures() throws Exception { // item 1 is a success assertNull(bulkShardRequest.items()[1].getPrimaryResponse()); IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request()); - assertThat(XContentMapValues.extractValue("field1.text", actualRequest.sourceAsMap()), equalTo("I am a success")); + assertThat( + XContentMapValues.extractValue(useLegacyFormat ? "field1.text" : "field1", actualRequest.sourceAsMap()), + equalTo("I am a success") + ); // item 2 is a failure assertNotNull(bulkShardRequest.items()[2].getPrimaryResponse()); @@ -227,12 +253,12 @@ public void testManyRandomDocs() throws Exception { BulkItemRequest[] originalRequests = new BulkItemRequest[numRequests]; BulkItemRequest[] modifiedRequests = new BulkItemRequest[numRequests]; for (int id = 0; id < numRequests; id++) { - BulkItemRequest[] res = randomBulkItemRequest(inferenceModelMap, inferenceFieldMap); + BulkItemRequest[] res = randomBulkItemRequest(useLegacyFormat, inferenceModelMap, inferenceFieldMap); originalRequests[id] = res[0]; modifiedRequests[id] = res[1]; } - ShardBulkInferenceActionFilter filter = createFilter(threadPool, inferenceModelMap, randomIntBetween(10, 30)); + ShardBulkInferenceActionFilter filter = createFilter(threadPool, inferenceModelMap, randomIntBetween(10, 30), useLegacyFormat); CountDownLatch chainExecuted = new CountDownLatch(1); ActionFilterChain actionFilterChain = (task, action, request, listener) -> { try { @@ -263,7 +289,12 @@ public void testManyRandomDocs() throws Exception { } @SuppressWarnings("unchecked") - private static ShardBulkInferenceActionFilter createFilter(ThreadPool threadPool, Map modelMap, int batchSize) { + private static ShardBulkInferenceActionFilter createFilter( + ThreadPool threadPool, + Map modelMap, + int batchSize, + boolean useLegacyFormat + ) { ModelRegistry modelRegistry = mock(ModelRegistry.class); Answer unparsedModelAnswer = invocationOnMock -> { String id = (String) invocationOnMock.getArguments()[0]; @@ -319,24 +350,50 @@ private static ShardBulkInferenceActionFilter createFilter(ThreadPool threadPool InferenceServiceRegistry inferenceServiceRegistry = mock(InferenceServiceRegistry.class); 
when(inferenceServiceRegistry.getService(any())).thenReturn(Optional.of(inferenceService)); - ShardBulkInferenceActionFilter filter = new ShardBulkInferenceActionFilter(inferenceServiceRegistry, modelRegistry, batchSize); - return filter; + + return new ShardBulkInferenceActionFilter( + createClusterService(useLegacyFormat), + inferenceServiceRegistry, + modelRegistry, + batchSize + ); + } + + private static ClusterService createClusterService(boolean useLegacyFormat) { + IndexMetadata indexMetadata = mock(IndexMetadata.class); + var settings = Settings.builder() + .put(IndexMetadata.SETTING_INDEX_VERSION_CREATED.getKey(), IndexVersion.current()) + .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat) + .build(); + when(indexMetadata.getSettings()).thenReturn(settings); + + Metadata metadata = mock(Metadata.class); + when(metadata.index(any(String.class))).thenReturn(indexMetadata); + + ClusterState clusterState = ClusterState.builder(new ClusterName("test")).metadata(metadata).build(); + ClusterService clusterService = mock(ClusterService.class); + when(clusterService.state()).thenReturn(clusterState); + + return clusterService; } private static BulkItemRequest[] randomBulkItemRequest( + boolean useLegacyFormat, Map modelMap, Map fieldInferenceMap ) throws IOException { Map docMap = new LinkedHashMap<>(); Map expectedDocMap = new LinkedHashMap<>(); XContentType requestContentType = randomFrom(XContentType.values()); + + Map inferenceMetadataFields = new HashMap<>(); for (var entry : fieldInferenceMap.values()) { String field = entry.getName(); var model = modelMap.get(entry.getInferenceId()); Object inputObject = randomSemanticTextInput(); String inputText = inputObject.toString(); docMap.put(field, inputObject); - expectedDocMap.put(field, inputText); + expectedDocMap.put(field, useLegacyFormat ? 
inputText : inputObject); if (model == null) { // ignore results, the doc should fail with a resource not found exception continue; @@ -349,6 +406,7 @@ private static BulkItemRequest[] randomBulkItemRequest( if (model.hasResult(inputText)) { var results = model.getResults(inputText); semanticTextField = semanticTextFieldFromChunkedInferenceResults( + useLegacyFormat, field, model, List.of(inputText), @@ -356,11 +414,19 @@ private static BulkItemRequest[] randomBulkItemRequest( requestContentType ); } else { - semanticTextField = randomSemanticText(field, model, List.of(inputText), requestContentType); - model.putResult(inputText, toChunkedResult(semanticTextField)); + Map> inputTextMap = Map.of(field, List.of(inputText)); + semanticTextField = randomSemanticText(useLegacyFormat, field, model, List.of(inputText), requestContentType); + model.putResult(inputText, toChunkedResult(useLegacyFormat, inputTextMap, semanticTextField)); } - expectedDocMap.put(field, semanticTextField); + if (useLegacyFormat) { + expectedDocMap.put(field, semanticTextField); + } else { + inferenceMetadataFields.put(field, semanticTextField); + } + } + if (useLegacyFormat == false) { + expectedDocMap.put(InferenceMetadataFieldsMapper.NAME, inferenceMetadataFields); } int requestId = randomIntBetween(0, Integer.MAX_VALUE); diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index e438090c99163..2358ede81c7a4 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -26,6 +26,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.MapperServiceTestCase; import org.elasticsearch.index.mapper.SourceToParse; @@ -166,7 +167,8 @@ public void testSparseVector() throws Exception { private MapperService createDefaultMapperService() throws IOException { var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")); - return createMapperService(mappings.utf8ToString()); + var settings = Settings.builder().put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), true).build(); + return createMapperService(settings, mappings.utf8ToString()); } private float[] readDenseVector(Object value) { diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldMapperTests.java new file mode 100644 index 0000000000000..6504ccc4dd39f --- /dev/null +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldMapperTests.java @@ -0,0 +1,45 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.MapperServiceTestCase;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.xpack.inference.InferencePlugin;
+
+import java.util.Collection;
+import java.util.Collections;
+
+public class SemanticInferenceMetadataFieldMapperTests extends MapperServiceTestCase {
+    @Override
+    protected Collection<Plugin> getPlugins() {
+        return Collections.singletonList(new InferencePlugin(Settings.EMPTY));
+    }
+
+    @Override
+    public void testFieldHasValue() {
+        assertTrue(
+            getMappedFieldType().fieldHasValue(
+                new FieldInfos(new FieldInfo[] { getFieldInfoWithName(SemanticInferenceMetadataFieldsMapper.NAME) })
+            )
+        );
+    }
+
+    @Override
+    public void testFieldHasValueWithEmptyFieldInfos() {
+        assertFalse(getMappedFieldType().fieldHasValue(FieldInfos.EMPTY));
+    }
+
+    @Override
+    public MappedFieldType getMappedFieldType() {
+        return new SemanticInferenceMetadataFieldsMapper.FieldType();
+    }
+}
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
index b0f1f3c8c0cbe..09073b800f009 100644
--- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java
@@ -7,6 +7,8 @@
 package org.elasticsearch.xpack.inference.mapper;
 
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexableField;
@@ -29,10 +31,12 @@
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.lucene.search.Queries;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.DocumentParsingException;
 import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.LuceneDocument;
 import org.elasticsearch.index.mapper.MappedFieldType;
@@ -57,35 +61,30 @@
 import org.elasticsearch.search.NestedDocuments;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.xcontent.XContentBuilder;
-import org.elasticsearch.xcontent.XContentFactory;
-import org.elasticsearch.xcontent.XContentParser;
 import org.elasticsearch.xcontent.XContentType;
 import org.elasticsearch.xcontent.json.JsonXContent;
+import org.elasticsearch.xpack.core.XPackClientPlugin;
 import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryWrapper;
 import org.elasticsearch.xpack.inference.InferencePlugin;
 import org.elasticsearch.xpack.inference.model.TestModel;
 import org.junit.AssumptionViolatedException;
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import 
java.util.Objects; import java.util.Set; import java.util.function.BiConsumer; -import java.util.stream.Stream; -import static java.util.Collections.singletonList; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD; -import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_TEXT_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKS_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.INFERENCE_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.INFERENCE_ID_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.MODEL_SETTINGS_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.SEARCH_INFERENCE_ID_FIELD; +import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.TEXT_FIELD; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getChunksFieldName; import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.getEmbeddingsFieldName; import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.DEFAULT_ELSER_2_INFERENCE_ID; @@ -95,9 +94,35 @@ import static org.hamcrest.Matchers.instanceOf; public class SemanticTextFieldMapperTests extends MapperTestCase { + private final boolean useLegacyFormat; + + public SemanticTextFieldMapperTests(boolean useLegacyFormat) { + this.useLegacyFormat = useLegacyFormat; + } + + @ParametersFactory + public static Iterable parameters() throws Exception { + return List.of(new Object[] { true }, new Object[] { false }); + } + @Override protected Collection getPlugins() { - return singletonList(new InferencePlugin(Settings.EMPTY)); + return List.of(new InferencePlugin(Settings.EMPTY), new XPackClientPlugin()); + } + + private MapperService createMapperService(XContentBuilder mappings, boolean useLegacyFormat) throws IOException { + var settings = Settings.builder() + .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat) + .build(); + return createMapperService(settings, mappings); + } + + @Override + protected Settings getIndexSettings() { + return Settings.builder() + .put(super.getIndexSettings()) + .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat) + .build(); } @Override @@ -158,7 +183,8 @@ public MappedFieldType getMappedFieldType() { null, null, null, - IndexVersion.current(), + getVersion(), + false, Map.of() ); } @@ -175,7 +201,7 @@ public void testDefaults() throws Exception { final XContentBuilder fieldMapping = fieldMapping(this::minimalMapping); final XContentBuilder expectedMapping = fieldMapping(this::metaMapping); - MapperService mapperService = createMapperService(fieldMapping); + MapperService mapperService = createMapperService(fieldMapping, useLegacyFormat); DocumentMapper mapper = mapperService.documentMapper(); assertEquals(Strings.toString(expectedMapping), mapper.mappingSource().toString()); assertSemanticTextField(mapperService, fieldName, false); @@ -207,7 +233,7 @@ public void testSetInferenceEndpoints() throws IOException { { final XContentBuilder fieldMapping = fieldMapping(b -> b.field("type", "semantic_text").field(INFERENCE_ID_FIELD, inferenceId)); - final MapperService mapperService = createMapperService(fieldMapping); + final MapperService mapperService = createMapperService(fieldMapping, useLegacyFormat); assertSemanticTextField(mapperService, fieldName, false); 
assertInferenceEndpoints(mapperService, fieldName, inferenceId, inferenceId); assertSerialization.accept(fieldMapping, mapperService); @@ -221,7 +247,7 @@ public void testSetInferenceEndpoints() throws IOException { .field(INFERENCE_ID_FIELD, DEFAULT_ELSER_2_INFERENCE_ID) .field(SEARCH_INFERENCE_ID_FIELD, searchInferenceId) ); - final MapperService mapperService = createMapperService(fieldMapping); + final MapperService mapperService = createMapperService(fieldMapping, useLegacyFormat); assertSemanticTextField(mapperService, fieldName, false); assertInferenceEndpoints(mapperService, fieldName, DEFAULT_ELSER_2_INFERENCE_ID, searchInferenceId); assertSerialization.accept(expectedMapping, mapperService); @@ -232,7 +258,7 @@ public void testSetInferenceEndpoints() throws IOException { .field(INFERENCE_ID_FIELD, inferenceId) .field(SEARCH_INFERENCE_ID_FIELD, searchInferenceId) ); - MapperService mapperService = createMapperService(fieldMapping); + MapperService mapperService = createMapperService(fieldMapping, useLegacyFormat); assertSemanticTextField(mapperService, fieldName, false); assertInferenceEndpoints(mapperService, fieldName, inferenceId, searchInferenceId); assertSerialization.accept(fieldMapping, mapperService); @@ -243,7 +269,10 @@ public void testInvalidInferenceEndpoints() { { Exception e = expectThrows( MapperParsingException.class, - () -> createMapperService(fieldMapping(b -> b.field("type", "semantic_text").field(INFERENCE_ID_FIELD, (String) null))) + () -> createMapperService( + fieldMapping(b -> b.field("type", "semantic_text").field(INFERENCE_ID_FIELD, (String) null)), + useLegacyFormat + ) ); assertThat( e.getMessage(), @@ -253,14 +282,20 @@ public void testInvalidInferenceEndpoints() { { Exception e = expectThrows( MapperParsingException.class, - () -> createMapperService(fieldMapping(b -> b.field("type", "semantic_text").field(INFERENCE_ID_FIELD, ""))) + () -> createMapperService( + fieldMapping(b -> b.field("type", "semantic_text").field(INFERENCE_ID_FIELD, "")), + useLegacyFormat + ) ); assertThat(e.getMessage(), containsString("[inference_id] on mapper [field] of type [semantic_text] must not be empty")); } { Exception e = expectThrows( MapperParsingException.class, - () -> createMapperService(fieldMapping(b -> b.field("type", "semantic_text").field(SEARCH_INFERENCE_ID_FIELD, ""))) + () -> createMapperService( + fieldMapping(b -> b.field("type", "semantic_text").field(SEARCH_INFERENCE_ID_FIELD, "")), + useLegacyFormat + ) ); assertThat(e.getMessage(), containsString("[search_inference_id] on mapper [field] of type [semantic_text] must not be empty")); } @@ -275,14 +310,15 @@ public void testCannotBeUsedInMultiFields() { b.field("inference_id", "my_inference_id"); b.endObject(); b.endObject(); - }))); + }), useLegacyFormat)); assertThat(e.getMessage(), containsString("Field [semantic] of type [semantic_text] can't be used in multifields")); } public void testUpdatesToInferenceIdNotSupported() throws IOException { String fieldName = randomAlphaOfLengthBetween(5, 15); MapperService mapperService = createMapperService( - mapping(b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", "test_model").endObject()) + mapping(b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", "test_model").endObject()), + useLegacyFormat ); assertSemanticTextField(mapperService, fieldName, false); Exception e = expectThrows( @@ -326,7 +362,8 @@ public void testUpdateModelSettings() throws IOException { for (int depth = 1; depth < 5; 
depth++) { String fieldName = randomFieldName(depth); MapperService mapperService = createMapperService( - mapping(b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", "test_model").endObject()) + mapping(b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", "test_model").endObject()), + useLegacyFormat ); assertSemanticTextField(mapperService, fieldName, false); { @@ -415,7 +452,7 @@ public void testUpdateSearchInferenceId() throws IOException { for (int depth = 1; depth < 5; depth++) { String fieldName = randomFieldName(depth); - MapperService mapperService = createMapperService(buildMapping.apply(fieldName, null)); + MapperService mapperService = createMapperService(buildMapping.apply(fieldName, null), useLegacyFormat); assertSemanticTextField(mapperService, fieldName, false); assertInferenceEndpoints(mapperService, fieldName, inferenceId, inferenceId); @@ -471,12 +508,20 @@ private static void assertSemanticTextField(MapperService mapperService, String .get(getChunksFieldName(fieldName)); assertThat(chunksMapper, equalTo(semanticFieldMapper.fieldType().getChunksField())); assertThat(chunksMapper.fullPath(), equalTo(getChunksFieldName(fieldName))); - Mapper textMapper = chunksMapper.getMapper(CHUNKED_TEXT_FIELD); - assertNotNull(textMapper); - assertThat(textMapper, instanceOf(KeywordFieldMapper.class)); - KeywordFieldMapper textFieldMapper = (KeywordFieldMapper) textMapper; - assertFalse(textFieldMapper.fieldType().isIndexed()); - assertFalse(textFieldMapper.fieldType().hasDocValues()); + + Mapper textMapper = chunksMapper.getMapper(TEXT_FIELD); + if (semanticTextFieldType.useLegacyFormat()) { + assertNotNull(textMapper); + assertThat(textMapper, instanceOf(KeywordFieldMapper.class)); + KeywordFieldMapper textFieldMapper = (KeywordFieldMapper) textMapper; + assertFalse(textFieldMapper.fieldType().isIndexed()); + assertFalse(textFieldMapper.fieldType().hasDocValues()); + } else { + assertNull(textMapper); + var offsetMapper = semanticTextFieldType.getOffsetsField(); + assertThat(offsetMapper, instanceOf(OffsetSourceFieldMapper.class)); + } + if (expectedModelSettings) { assertNotNull(semanticFieldMapper.fieldType().getModelSettings()); Mapper embeddingsMapper = chunksMapper.getMapper(CHUNKED_EMBEDDINGS_FIELD); @@ -523,7 +568,7 @@ public void testSuccessfulParse() throws IOException { addSemanticTextMapping(b, fieldName2, model2.getInferenceEntityId(), setSearchInferenceId ? 
searchInferenceId : null); }); - MapperService mapperService = createMapperService(mapping); + MapperService mapperService = createMapperService(mapping, useLegacyFormat); assertSemanticTextField(mapperService, fieldName1, false); assertInferenceEndpoints( mapperService, @@ -543,10 +588,11 @@ public void testSuccessfulParse() throws IOException { ParsedDocument doc = documentMapper.parse( source( b -> addSemanticTextInferenceResults( + useLegacyFormat, b, List.of( - randomSemanticText(fieldName1, model1, List.of("a b", "c"), XContentType.JSON), - randomSemanticText(fieldName2, model2, List.of("d e f"), XContentType.JSON) + randomSemanticText(useLegacyFormat, fieldName1, model1, List.of("a b", "c"), XContentType.JSON), + randomSemanticText(useLegacyFormat, fieldName2, model2, List.of("d e f"), XContentType.JSON) ) ) ) @@ -626,52 +672,64 @@ public void testSuccessfulParse() throws IOException { } public void testMissingInferenceId() throws IOException { - DocumentMapper documentMapper = createDocumentMapper(mapping(b -> addSemanticTextMapping(b, "field", "my_id", null))); + final MapperService mapperService = createMapperService( + mapping(b -> addSemanticTextMapping(b, "field", "my_id", null)), + useLegacyFormat + ); + IllegalArgumentException ex = expectThrows( DocumentParsingException.class, IllegalArgumentException.class, - () -> documentMapper.parse( - source( - b -> b.startObject("field") - .startObject(INFERENCE_FIELD) - .field(MODEL_SETTINGS_FIELD, new SemanticTextField.ModelSettings(TaskType.SPARSE_EMBEDDING, null, null, null)) - .field(CHUNKS_FIELD, List.of()) - .endObject() - .endObject() + () -> mapperService.documentMapper() + .parse( + semanticTextInferenceSource( + useLegacyFormat, + b -> b.startObject("field") + .startObject(INFERENCE_FIELD) + .field(MODEL_SETTINGS_FIELD, new SemanticTextField.ModelSettings(TaskType.SPARSE_EMBEDDING, null, null, null)) + .field(CHUNKS_FIELD, useLegacyFormat ? 
List.of() : Map.of()) + .endObject() + .endObject() + ) ) - ) ); assertThat(ex.getCause().getMessage(), containsString("Required [inference_id]")); } public void testMissingModelSettings() throws IOException { - DocumentMapper documentMapper = createDocumentMapper(mapping(b -> addSemanticTextMapping(b, "field", "my_id", null))); + MapperService mapperService = createMapperService(mapping(b -> addSemanticTextMapping(b, "field", "my_id", null)), useLegacyFormat); IllegalArgumentException ex = expectThrows( DocumentParsingException.class, IllegalArgumentException.class, - () -> documentMapper.parse( - source(b -> b.startObject("field").startObject(INFERENCE_FIELD).field(INFERENCE_ID_FIELD, "my_id").endObject().endObject()) - ) + () -> mapperService.documentMapper() + .parse( + semanticTextInferenceSource( + useLegacyFormat, + b -> b.startObject("field").startObject(INFERENCE_FIELD).field(INFERENCE_ID_FIELD, "my_id").endObject().endObject() + ) + ) ); assertThat(ex.getCause().getMessage(), containsString("Required [model_settings, chunks]")); } public void testMissingTaskType() throws IOException { - DocumentMapper documentMapper = createDocumentMapper(mapping(b -> addSemanticTextMapping(b, "field", "my_id", null))); + MapperService mapperService = createMapperService(mapping(b -> addSemanticTextMapping(b, "field", "my_id", null)), useLegacyFormat); IllegalArgumentException ex = expectThrows( DocumentParsingException.class, IllegalArgumentException.class, - () -> documentMapper.parse( - source( - b -> b.startObject("field") - .startObject(INFERENCE_FIELD) - .field(INFERENCE_ID_FIELD, "my_id") - .startObject(MODEL_SETTINGS_FIELD) - .endObject() - .endObject() - .endObject() + () -> mapperService.documentMapper() + .parse( + semanticTextInferenceSource( + useLegacyFormat, + b -> b.startObject("field") + .startObject(INFERENCE_FIELD) + .field(INFERENCE_ID_FIELD, "my_id") + .startObject(MODEL_SETTINGS_FIELD) + .endObject() + .endObject() + .endObject() + ) ) - ) ); assertThat(ex.getCause().getMessage(), containsString("failed to parse field [model_settings]")); } @@ -731,7 +789,7 @@ private MapperService mapperServiceForFieldWithModelSettings( mappingParams += ",search_inference_id=" + searchInferenceId; } - MapperService mapperService = createMapperService(mapping(b -> {})); + MapperService mapperService = createMapperService(mapping(b -> {}), useLegacyFormat); mapperService.merge( "_doc", new CompressedXContent(Strings.toString(PutMappingRequest.simpleMapping(fieldName, mappingParams))), @@ -739,14 +797,19 @@ private MapperService mapperServiceForFieldWithModelSettings( ); SemanticTextField semanticTextField = new SemanticTextField( + useLegacyFormat, fieldName, List.of(), - new SemanticTextField.InferenceResult(inferenceId, modelSettings, List.of()), + new SemanticTextField.InferenceResult(inferenceId, modelSettings, Map.of()), XContentType.JSON ); XContentBuilder builder = JsonXContent.contentBuilder().startObject(); - builder.field(semanticTextField.fieldName()); - builder.value(semanticTextField); + if (useLegacyFormat) { + builder.field(semanticTextField.fieldName()); + builder.value(semanticTextField); + } else { + builder.field(InferenceMetadataFieldsMapper.NAME, Map.of(semanticTextField.fieldName(), semanticTextField)); + } builder.endObject(); SourceToParse sourceToParse = new SourceToParse("test", BytesReference.bytes(builder), XContentType.JSON); @@ -798,266 +861,6 @@ public void testExistsQueryDenseVector() throws IOException { assertThat(existsQuery, 
instanceOf(ESToParentBlockJoinQuery.class)); } - public void testInsertValueMapTraversal() throws IOException { - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("test", "value").endObject(); - - Map map = toSourceMap(Strings.toString(builder)); - SemanticTextFieldMapper.insertValue("test", map, "value2"); - assertThat(getMapValue(map, "test"), equalTo("value2")); - SemanticTextFieldMapper.insertValue("something.else", map, "something_else_value"); - assertThat(getMapValue(map, "something\\.else"), equalTo("something_else_value")); - } - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - builder.startObject("path1").startObject("path2").field("test", "value").endObject().endObject(); - builder.endObject(); - - Map map = toSourceMap(Strings.toString(builder)); - SemanticTextFieldMapper.insertValue("path1.path2.test", map, "value2"); - assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2")); - SemanticTextFieldMapper.insertValue("path1.path2.test_me", map, "test_me_value"); - assertThat(getMapValue(map, "path1.path2.test_me"), equalTo("test_me_value")); - SemanticTextFieldMapper.insertValue("path1.non_path2.test", map, "test_value"); - assertThat(getMapValue(map, "path1.non_path2\\.test"), equalTo("test_value")); - - SemanticTextFieldMapper.insertValue("path1.path2", map, Map.of("path3", "bar")); - assertThat(getMapValue(map, "path1.path2"), equalTo(Map.of("path3", "bar"))); - - SemanticTextFieldMapper.insertValue("path1", map, "baz"); - assertThat(getMapValue(map, "path1"), equalTo("baz")); - - SemanticTextFieldMapper.insertValue("path3.path4", map, Map.of("test", "foo")); - assertThat(getMapValue(map, "path3\\.path4"), equalTo(Map.of("test", "foo"))); - } - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - builder.startObject("path1").array("test", "value1", "value2").endObject(); - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - - SemanticTextFieldMapper.insertValue("path1.test", map, List.of("value3", "value4", "value5")); - assertThat(getMapValue(map, "path1.test"), equalTo(List.of("value3", "value4", "value5"))); - - SemanticTextFieldMapper.insertValue("path2.test", map, List.of("value6", "value7", "value8")); - assertThat(getMapValue(map, "path2\\.test"), equalTo(List.of("value6", "value7", "value8"))); - } - } - - public void testInsertValueListTraversal() throws IOException { - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - { - builder.startObject("path1"); - { - builder.startArray("path2"); - builder.startObject().field("test", "value1").endObject(); - builder.endArray(); - } - builder.endObject(); - } - { - builder.startObject("path3"); - { - builder.startArray("path4"); - builder.startObject().field("test", "value1").endObject(); - builder.endArray(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - - SemanticTextFieldMapper.insertValue("path1.path2.test", map, "value2"); - assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2")); - SemanticTextFieldMapper.insertValue("path1.path2.test2", map, "value3"); - assertThat(getMapValue(map, "path1.path2.test2"), equalTo("value3")); - assertThat(getMapValue(map, "path1.path2"), equalTo(List.of(Map.of("test", "value2", "test2", "value3")))); - - SemanticTextFieldMapper.insertValue("path3.path4.test", map, "value4"); - assertThat(getMapValue(map, "path3.path4.test"), equalTo("value4")); - } - { - XContentBuilder builder 
= XContentFactory.jsonBuilder().startObject(); - { - builder.startObject("path1"); - { - builder.startArray("path2"); - builder.startArray(); - builder.startObject().field("test", "value1").endObject(); - builder.endArray(); - builder.endArray(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - - SemanticTextFieldMapper.insertValue("path1.path2.test", map, "value2"); - assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2")); - SemanticTextFieldMapper.insertValue("path1.path2.test2", map, "value3"); - assertThat(getMapValue(map, "path1.path2.test2"), equalTo("value3")); - assertThat(getMapValue(map, "path1.path2"), equalTo(List.of(List.of(Map.of("test", "value2", "test2", "value3"))))); - } - } - - public void testInsertValueFieldsWithDots() throws IOException { - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("xxx.yyy", "value1").endObject(); - Map map = toSourceMap(Strings.toString(builder)); - - SemanticTextFieldMapper.insertValue("xxx.yyy", map, "value2"); - assertThat(getMapValue(map, "xxx\\.yyy"), equalTo("value2")); - - SemanticTextFieldMapper.insertValue("xxx", map, "value3"); - assertThat(getMapValue(map, "xxx"), equalTo("value3")); - } - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - { - builder.startObject("path1.path2"); - { - builder.startObject("path3.path4"); - builder.field("test", "value1"); - builder.endObject(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - - SemanticTextFieldMapper.insertValue("path1.path2.path3.path4.test", map, "value2"); - assertThat(getMapValue(map, "path1\\.path2.path3\\.path4.test"), equalTo("value2")); - - SemanticTextFieldMapper.insertValue("path1.path2.path3.path4.test2", map, "value3"); - assertThat(getMapValue(map, "path1\\.path2.path3\\.path4.test2"), equalTo("value3")); - assertThat(getMapValue(map, "path1\\.path2.path3\\.path4"), equalTo(Map.of("test", "value2", "test2", "value3"))); - } - } - - public void testInsertValueAmbiguousPath() throws IOException { - // Mixed dotted object notation - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - { - builder.startObject("path1.path2"); - { - builder.startObject("path3"); - builder.field("test1", "value1"); - builder.endObject(); - } - builder.endObject(); - } - { - builder.startObject("path1"); - { - builder.startObject("path2.path3"); - builder.field("test2", "value2"); - builder.endObject(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - final Map originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder))); - - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, - () -> SemanticTextFieldMapper.insertValue("path1.path2.path3.test1", map, "value3") - ); - assertThat( - ex.getMessage(), - equalTo("Path [path1.path2.path3.test1] could be inserted in 2 distinct ways, it is ambiguous which one to use") - ); - - ex = assertThrows( - IllegalArgumentException.class, - () -> SemanticTextFieldMapper.insertValue("path1.path2.path3.test3", map, "value4") - ); - assertThat( - ex.getMessage(), - equalTo("Path [path1.path2.path3.test3] could be inserted in 2 distinct ways, it is ambiguous which one to use") - ); - - assertThat(map, equalTo(originalMap)); - } - - // traversal through lists - { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - { - 
builder.startObject("path1.path2"); - { - builder.startArray("path3"); - builder.startObject().field("test1", "value1").endObject(); - builder.endArray(); - } - builder.endObject(); - } - { - builder.startObject("path1"); - { - builder.startArray("path2.path3"); - builder.startObject().field("test2", "value2").endObject(); - builder.endArray(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - final Map originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder))); - - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, - () -> SemanticTextFieldMapper.insertValue("path1.path2.path3.test1", map, "value3") - ); - assertThat( - ex.getMessage(), - equalTo("Path [path1.path2.path3.test1] could be inserted in 2 distinct ways, it is ambiguous which one to use") - ); - - ex = assertThrows( - IllegalArgumentException.class, - () -> SemanticTextFieldMapper.insertValue("path1.path2.path3.test3", map, "value4") - ); - assertThat( - ex.getMessage(), - equalTo("Path [path1.path2.path3.test3] could be inserted in 2 distinct ways, it is ambiguous which one to use") - ); - - assertThat(map, equalTo(originalMap)); - } - } - - public void testInsertValueCannotTraversePath() throws IOException { - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - { - builder.startObject("path1"); - { - builder.startArray("path2"); - builder.startArray(); - builder.startObject().field("test", "value1").endObject(); - builder.endArray(); - builder.endArray(); - } - builder.endObject(); - } - builder.endObject(); - Map map = toSourceMap(Strings.toString(builder)); - final Map originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder))); - - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, - () -> SemanticTextFieldMapper.insertValue("path1.path2.test.test2", map, "value2") - ); - assertThat( - ex.getMessage(), - equalTo("Path [path1.path2.test] has value [value1] of type [String], which cannot be traversed into further") - ); - - assertThat(map, equalTo(originalMap)); - } - @Override protected void assertExistsQuery(MappedFieldType fieldType, Query query, LuceneDocument fields) { // Until a doc is indexed, the query is rewritten as match no docs @@ -1079,11 +882,23 @@ private static void addSemanticTextMapping( mappingBuilder.endObject(); } - private static void addSemanticTextInferenceResults(XContentBuilder sourceBuilder, List semanticTextInferenceResults) - throws IOException { - for (var field : semanticTextInferenceResults) { - sourceBuilder.field(field.fieldName()); - sourceBuilder.value(field); + private static void addSemanticTextInferenceResults( + boolean useLegacyFormat, + XContentBuilder sourceBuilder, + List semanticTextInferenceResults + ) throws IOException { + if (useLegacyFormat) { + for (var field : semanticTextInferenceResults) { + sourceBuilder.field(field.fieldName()); + sourceBuilder.value(field); + } + } else { + // Use a linked hash map to maintain insertion-order iteration over the inference fields + Map inferenceMetadataFields = new LinkedHashMap<>(); + for (var field : semanticTextInferenceResults) { + inferenceMetadataFields.put(field.fieldName(), field); + } + sourceBuilder.field(InferenceMetadataFieldsMapper.NAME, inferenceMetadataFields); } } @@ -1119,6 +934,19 @@ private static Query generateNestedTermSparseVectorQuery(NestedLookup nestedLook ); } + private static SourceToParse semanticTextInferenceSource(boolean 
useLegacyFormat, CheckedConsumer build) + throws IOException { + return source(b -> { + if (useLegacyFormat == false) { + b.startObject(InferenceMetadataFieldsMapper.NAME); + } + build.accept(b); + if (useLegacyFormat == false) { + b.endObject(); + } + }); + } + private static void assertChildLeafNestedDocument( LeafNestedDocuments leaf, int advanceToDoc, @@ -1143,68 +971,4 @@ private static void assertSparseFeatures(LuceneDocument doc, String fieldName, i } assertThat(count, equalTo(expectedCount)); } - - private Map toSourceMap(String source) throws IOException { - try (XContentParser parser = createParser(JsonXContent.jsonXContent, source)) { - return parser.map(); - } - } - - private static Object getMapValue(Map map, String key) { - // Split the path on unescaped "." chars and then unescape the escaped "." chars - final String[] pathElements = Arrays.stream(key.split("(? k.replace("\\.", ".")).toArray(String[]::new); - - Object value = null; - Object nextLayer = map; - for (int i = 0; i < pathElements.length; i++) { - if (nextLayer instanceof Map nextMap) { - value = nextMap.get(pathElements[i]); - } else if (nextLayer instanceof List nextList) { - final String pathElement = pathElements[i]; - List values = nextList.stream().flatMap(v -> { - Stream.Builder streamBuilder = Stream.builder(); - if (v instanceof List innerList) { - traverseList(innerList, streamBuilder); - } else { - streamBuilder.add(v); - } - return streamBuilder.build(); - }).filter(v -> v instanceof Map).map(v -> ((Map) v).get(pathElement)).filter(Objects::nonNull).toList(); - - if (values.isEmpty()) { - return null; - } else if (values.size() > 1) { - throw new AssertionError("List " + nextList + " contains multiple values for [" + pathElement + "]"); - } else { - value = values.get(0); - } - } else if (nextLayer == null) { - break; - } else { - throw new AssertionError( - "Path [" - + String.join(".", Arrays.copyOfRange(pathElements, 0, i)) - + "] has value [" - + value - + "] of type [" - + value.getClass().getSimpleName() - + "], which cannot be traversed into further" - ); - } - - nextLayer = value; - } - - return value; - } - - private static void traverseList(List list, Stream.Builder streamBuilder) { - for (Object value : list) { - if (value instanceof List innerList) { - traverseList(innerList, streamBuilder); - } else { - streamBuilder.add(value); - } - } - } } diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldTests.java index dcdd9b3d42341..29ca71d38e1b2 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldTests.java @@ -7,9 +7,10 @@ package org.elasticsearch.xpack.inference.mapper; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.xcontent.XContentHelper; -import org.elasticsearch.core.Tuple; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.inference.ChunkedInference; import org.elasticsearch.inference.Model; @@ -27,18 +28,31 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; +import java.util.ListIterator; import java.util.Map; import 
java.util.function.Predicate;
+
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
-import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunks;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.toSemanticTextFieldChunk;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 
 public class SemanticTextFieldTests extends AbstractXContentTestCase<SemanticTextField> {
     private static final String NAME = "field";
 
+    private final boolean useLegacyFormat;
+
+    public SemanticTextFieldTests(boolean useLegacyFormat) {
+        this.useLegacyFormat = useLegacyFormat;
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        return List.of(new Object[] { true }, new Object[] { false });
+    }
+
     @Override
     protected Predicate<String> getRandomFieldsExcludeFilter() {
         return n -> n.endsWith(CHUNKED_EMBEDDINGS_FIELD);
@@ -46,39 +60,45 @@ protected Predicate<String> getRandomFieldsExcludeFilter() {
 
     @Override
     protected void assertEqualInstances(SemanticTextField expectedInstance, SemanticTextField newInstance) {
+        assertThat(newInstance.useLegacyFormat(), equalTo(expectedInstance.useLegacyFormat()));
         assertThat(newInstance.fieldName(), equalTo(expectedInstance.fieldName()));
         assertThat(newInstance.originalValues(), equalTo(expectedInstance.originalValues()));
         assertThat(newInstance.inference().modelSettings(), equalTo(expectedInstance.inference().modelSettings()));
         assertThat(newInstance.inference().chunks().size(), equalTo(expectedInstance.inference().chunks().size()));
         SemanticTextField.ModelSettings modelSettings = newInstance.inference().modelSettings();
-        for (int i = 0; i < newInstance.inference().chunks().size(); i++) {
-            assertThat(newInstance.inference().chunks().get(i).text(), equalTo(expectedInstance.inference().chunks().get(i).text()));
-            switch (modelSettings.taskType()) {
-                case TEXT_EMBEDDING -> {
-                    double[] expectedVector = parseDenseVector(
-                        expectedInstance.inference().chunks().get(i).rawEmbeddings(),
-                        modelSettings.dimensions(),
-                        expectedInstance.contentType()
-                    );
-                    double[] newVector = parseDenseVector(
-                        newInstance.inference().chunks().get(i).rawEmbeddings(),
-                        modelSettings.dimensions(),
-                        newInstance.contentType()
-                    );
-                    assertArrayEquals(expectedVector, newVector, 0.0000001f);
+        for (var entry : newInstance.inference().chunks().entrySet()) {
+            var expectedChunks = expectedInstance.inference().chunks().get(entry.getKey());
+            assertNotNull(expectedChunks);
+            assertThat(entry.getValue().size(), equalTo(expectedChunks.size()));
+            for (int i = 0; i < entry.getValue().size(); i++) {
+                var actualChunk = entry.getValue().get(i);
+                assertThat(actualChunk.text(), equalTo(expectedChunks.get(i).text()));
+                assertThat(actualChunk.startOffset(), equalTo(expectedChunks.get(i).startOffset()));
+                assertThat(actualChunk.endOffset(), equalTo(expectedChunks.get(i).endOffset()));
+                switch (modelSettings.taskType()) {
+                    case TEXT_EMBEDDING -> {
+                        double[] expectedVector = parseDenseVector(
+                            expectedChunks.get(i).rawEmbeddings(),
+                            modelSettings.dimensions(),
+                            expectedInstance.contentType()
+                        );
+                        double[] newVector = parseDenseVector(
+                            actualChunk.rawEmbeddings(),
+                            modelSettings.dimensions(),
+                            newInstance.contentType()
+                        );
+                        assertArrayEquals(expectedVector, newVector, 0.0000001f);
+                    }
+                    case SPARSE_EMBEDDING -> {
+                        List<WeightedToken> expectedTokens = parseWeightedTokens(
+                            expectedChunks.get(i).rawEmbeddings(),
+                            expectedInstance.contentType()
+                        );
+                        List<WeightedToken> newTokens = 
parseWeightedTokens(actualChunk.rawEmbeddings(), newInstance.contentType()); + assertThat(newTokens, equalTo(expectedTokens)); + } + default -> throw new AssertionError("Invalid task type " + modelSettings.taskType()); } - case SPARSE_EMBEDDING -> { - List expectedTokens = parseWeightedTokens( - expectedInstance.inference().chunks().get(i).rawEmbeddings(), - expectedInstance.contentType() - ); - List newTokens = parseWeightedTokens( - newInstance.inference().chunks().get(i).rawEmbeddings(), - newInstance.contentType() - ); - assertThat(newTokens, equalTo(expectedTokens)); - } - default -> throw new AssertionError("Invalid task type " + modelSettings.taskType()); } } } @@ -87,7 +107,13 @@ protected void assertEqualInstances(SemanticTextField expectedInstance, Semantic protected SemanticTextField createTestInstance() { List rawValues = randomList(1, 5, () -> randomSemanticTextInput().toString()); try { // try catch required for override - return randomSemanticText(NAME, TestModel.createRandomInstance(), rawValues, randomFrom(XContentType.values())); + return randomSemanticText( + useLegacyFormat, + NAME, + TestModel.createRandomInstance(), + rawValues, + randomFrom(XContentType.values()) + ); } catch (IOException e) { fail("Failed to create random SemanticTextField instance"); } @@ -96,12 +122,12 @@ protected SemanticTextField createTestInstance() { @Override protected SemanticTextField doParseInstance(XContentParser parser) throws IOException { - return SemanticTextField.parse(parser, new Tuple<>(NAME, parser.contentType())); + return SemanticTextField.parse(parser, new SemanticTextField.ParserContext(useLegacyFormat, NAME, parser.contentType())); } @Override protected boolean supportsUnknownFields() { - return true; + return false; } public void testModelSettingsValidation() { @@ -185,30 +211,60 @@ public static ChunkedInferenceEmbeddingSparse randomChunkedInferenceEmbeddingSpa return new ChunkedInferenceEmbeddingSparse(chunks); } - public static SemanticTextField randomSemanticText(String fieldName, Model model, List inputs, XContentType contentType) - throws IOException { + public static SemanticTextField randomSemanticText( + boolean useLegacyFormat, + String fieldName, + Model model, + List inputs, + XContentType contentType + ) throws IOException { ChunkedInference results = switch (model.getTaskType()) { case TEXT_EMBEDDING -> randomChunkedInferenceEmbeddingFloat(model, inputs); case SPARSE_EMBEDDING -> randomChunkedInferenceEmbeddingSparse(inputs); default -> throw new AssertionError("invalid task type: " + model.getTaskType().name()); }; - return semanticTextFieldFromChunkedInferenceResults(fieldName, model, inputs, results, contentType); + return semanticTextFieldFromChunkedInferenceResults(useLegacyFormat, fieldName, model, inputs, results, contentType); } public static SemanticTextField semanticTextFieldFromChunkedInferenceResults( + boolean useLegacyFormat, String fieldName, Model model, List inputs, ChunkedInference results, XContentType contentType ) throws IOException { + + // In this test framework, we don't perform "real" chunking; each input generates one chunk. Thus, we can assume there is a + // one-to-one relationship between inputs and chunks. Iterate over the inputs and chunks to match each input with its + // corresponding chunk. 
+ final List chunks = new ArrayList<>(inputs.size()); + int offsetAdjustment = 0; + Iterator inputsIt = inputs.iterator(); + Iterator chunkIt = results.chunksAsMatchedTextAndByteReference(contentType.xContent()); + while (inputsIt.hasNext() && chunkIt.hasNext()) { + String input = inputsIt.next(); + var chunk = chunkIt.next(); + chunks.add(toSemanticTextFieldChunk(input, offsetAdjustment, chunk, useLegacyFormat)); + + // When using the inference metadata fields format, all the input values are concatenated so that the + // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment + // to apply to account for this. + offsetAdjustment = input.length() + 1; // Add one for separator char length + } + + if (inputsIt.hasNext() || chunkIt.hasNext()) { + throw new IllegalArgumentException("Input list size and chunk count do not match"); + } + return new SemanticTextField( + useLegacyFormat, fieldName, - inputs, + useLegacyFormat ? inputs : null, new SemanticTextField.InferenceResult( model.getInferenceEntityId(), new SemanticTextField.ModelSettings(model), - toSemanticTextFieldChunks(List.of(results), contentType) + Map.of(fieldName, chunks) ), contentType ); @@ -232,37 +288,53 @@ public static Object randomSemanticTextInput() { } } - public static ChunkedInference toChunkedResult(SemanticTextField field) throws IOException { + public static ChunkedInference toChunkedResult( + boolean useLegacyFormat, + Map> matchedTextMap, + SemanticTextField field + ) { switch (field.inference().modelSettings().taskType()) { case SPARSE_EMBEDDING -> { List chunks = new ArrayList<>(); - for (var chunk : field.inference().chunks()) { - var tokens = parseWeightedTokens(chunk.rawEmbeddings(), field.contentType()); - chunks.add( - new ChunkedInferenceEmbeddingSparse.SparseEmbeddingChunk( - tokens, - chunk.text(), - new ChunkedInference.TextOffset(0, chunk.text().length()) - ) - ); + for (var entry : field.inference().chunks().entrySet()) { + String entryField = entry.getKey(); + List entryChunks = entry.getValue(); + List entryFieldMatchedText = validateAndGetMatchedTextForField(matchedTextMap, entryField, entryChunks.size()); + + ListIterator matchedTextIt = entryFieldMatchedText.listIterator(); + for (var chunk : entryChunks) { + String matchedText = matchedTextIt.next(); + ChunkedInference.TextOffset offset = createOffset(useLegacyFormat, chunk, matchedText); + var tokens = parseWeightedTokens(chunk.rawEmbeddings(), field.contentType()); + chunks.add(new ChunkedInferenceEmbeddingSparse.SparseEmbeddingChunk(tokens, matchedText, offset)); + } } return new ChunkedInferenceEmbeddingSparse(chunks); } case TEXT_EMBEDDING -> { List chunks = new ArrayList<>(); - for (var chunk : field.inference().chunks()) { - double[] values = parseDenseVector( - chunk.rawEmbeddings(), - field.inference().modelSettings().dimensions(), - field.contentType() - ); - chunks.add( - new ChunkedInferenceEmbeddingFloat.FloatEmbeddingChunk( - FloatConversionUtils.floatArrayOf(values), - chunk.text(), - new ChunkedInference.TextOffset(0, chunk.text().length()) - ) - ); + for (var entry : field.inference().chunks().entrySet()) { + String entryField = entry.getKey(); + List entryChunks = entry.getValue(); + List entryFieldMatchedText = validateAndGetMatchedTextForField(matchedTextMap, entryField, entryChunks.size()); + + ListIterator matchedTextIt = entryFieldMatchedText.listIterator(); + for (var chunk : entryChunks) { + String matchedText = matchedTextIt.next(); + ChunkedInference.TextOffset offset = 
+                        double[] values = parseDenseVector(
+                            chunk.rawEmbeddings(),
+                            field.inference().modelSettings().dimensions(),
+                            field.contentType()
+                        );
+                        chunks.add(
+                            new ChunkedInferenceEmbeddingFloat.FloatEmbeddingChunk(
+                                FloatConversionUtils.floatArrayOf(values),
+                                matchedText,
+                                offset
+                            )
+                        );
+                    }
                 }
                 return new ChunkedInferenceEmbeddingFloat(chunks);
             }
@@ -270,6 +342,38 @@ public static ChunkedInference toChunkedResult(SemanticTextField field) throws I
         }
     }
 
+    private static List<String> validateAndGetMatchedTextForField(
+        Map<String, List<String>> matchedTextMap,
+        String fieldName,
+        int chunkCount
+    ) {
+        List<String> fieldMatchedText = matchedTextMap.get(fieldName);
+        if (fieldMatchedText == null) {
+            throw new IllegalStateException("No matched text list exists for field [" + fieldName + "]");
+        } else if (fieldMatchedText.size() != chunkCount) {
+            throw new IllegalStateException("Matched text list size does not equal chunk count for field [" + fieldName + "]");
+        }
+
+        return fieldMatchedText;
+    }
+
+    /**
+     * Create a {@link ChunkedInference.TextOffset} instance with valid offset values. When using the legacy semantic text format, the
+     * offset values are not written to {@link SemanticTextField.Chunk}, so we cannot read them from there. Instead, use the knowledge that
+     * the matched text corresponds to one complete input value (i.e. one input value -> one chunk) to calculate the offset values.
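+     * (Illustrative example: under the legacy format, a matched text of "mountain lake" yields offsets [0, 13);
+     * under the new format, the chunk's stored start and end offsets are returned as-is.)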
+     *
+     * @param useLegacyFormat Whether the old format should be used
+     * @param chunk The chunk to get/calculate offset values for
+     * @param matchedText The matched text to calculate offset values for
+     * @return A {@link ChunkedInference.TextOffset} instance with valid offset values
+     */
+    private static ChunkedInference.TextOffset createOffset(boolean useLegacyFormat, SemanticTextField.Chunk chunk, String matchedText) {
+        final int startOffset = useLegacyFormat ? 0 : chunk.startOffset();
+        final int endOffset = useLegacyFormat ? matchedText.length() : chunk.endOffset();
+
+        return new ChunkedInference.TextOffset(startOffset, endOffset);
+    }
+
     private static double[] parseDenseVector(BytesReference value, int numDims, XContentType contentType) {
         try (XContentParser parser = XContentHelper.createParserNotCompressed(XContentParserConfiguration.EMPTY, value, contentType)) {
             parser.nextToken();
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtilsTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtilsTests.java
new file mode 100644
index 0000000000000..14304233906ca
--- /dev/null
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextUtilsTests.java
@@ -0,0 +1,351 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xcontent.json.JsonXContent;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Stream;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class SemanticTextUtilsTests extends ESTestCase {
+    public void testInsertValueMapTraversal() throws IOException {
+        {
+            XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("test", "value").endObject();
+
+            Map<String, Object> map = toSourceMap(Strings.toString(builder));
+            SemanticTextUtils.insertValue("test", map, "value2");
+            assertThat(getMapValue(map, "test"), equalTo("value2"));
+            SemanticTextUtils.insertValue("something.else", map, "something_else_value");
+            assertThat(getMapValue(map, "something\\.else"), equalTo("something_else_value"));
+        }
+        {
+            XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
+            builder.startObject("path1").startObject("path2").field("test", "value").endObject().endObject();
+            builder.endObject();
+
+            Map<String, Object> map = toSourceMap(Strings.toString(builder));
+            SemanticTextUtils.insertValue("path1.path2.test", map, "value2");
+            assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2"));
+            SemanticTextUtils.insertValue("path1.path2.test_me", map, "test_me_value");
+            assertThat(getMapValue(map, "path1.path2.test_me"), equalTo("test_me_value"));
+            SemanticTextUtils.insertValue("path1.non_path2.test", map, "test_value");
+            assertThat(getMapValue(map, "path1.non_path2\\.test"), equalTo("test_value"));
+
+            SemanticTextUtils.insertValue("path1.path2", map, Map.of("path3", "bar"));
+            assertThat(getMapValue(map, "path1.path2"), equalTo(Map.of("path3", "bar")));
+
+            SemanticTextUtils.insertValue("path1", map, "baz");
+            assertThat(getMapValue(map, "path1"), equalTo("baz"));
+
+            SemanticTextUtils.insertValue("path3.path4", map, Map.of("test", "foo"));
+            assertThat(getMapValue(map, "path3\\.path4"), equalTo(Map.of("test", "foo")));
+        }
+        {
+            XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
+            builder.startObject("path1").array("test", "value1", "value2").endObject();
+            builder.endObject();
+            Map<String, Object> map = toSourceMap(Strings.toString(builder));
+
+            SemanticTextUtils.insertValue("path1.test", map, List.of("value3", "value4", "value5"));
+            assertThat(getMapValue(map, "path1.test"), equalTo(List.of("value3", "value4", "value5")));
+
+            SemanticTextUtils.insertValue("path2.test", map, List.of("value6", "value7", "value8"));
+            assertThat(getMapValue(map, "path2\\.test"), equalTo(List.of("value6", "value7", "value8")));
+        }
+    }
+
+    public void testInsertValueListTraversal() throws IOException {
+        {
+            XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
+            {
+                builder.startObject("path1");
+                {
+                    builder.startArray("path2");
+                    builder.startObject().field("test", "value1").endObject();
+                    builder.endArray();
+                }
+                builder.endObject();
+            }
+            {
+                builder.startObject("path3");
+                {
+                    builder.startArray("path4");
+                    builder.startObject().field("test", "value1").endObject();
+                    builder.endArray();
+                }
+                builder.endObject();
+            }
+            builder.endObject();
+            Map<String, Object> map = toSourceMap(Strings.toString(builder));
+
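+            // (Illustrative: insertValue is expected to traverse into the array under "path1.path2" and update the
+            // single object element inside it, which the assertions below verify.)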
SemanticTextUtils.insertValue("path1.path2.test", map, "value2"); + assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2")); + SemanticTextUtils.insertValue("path1.path2.test2", map, "value3"); + assertThat(getMapValue(map, "path1.path2.test2"), equalTo("value3")); + assertThat(getMapValue(map, "path1.path2"), equalTo(List.of(Map.of("test", "value2", "test2", "value3")))); + + SemanticTextUtils.insertValue("path3.path4.test", map, "value4"); + assertThat(getMapValue(map, "path3.path4.test"), equalTo("value4")); + } + { + XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); + { + builder.startObject("path1"); + { + builder.startArray("path2"); + builder.startArray(); + builder.startObject().field("test", "value1").endObject(); + builder.endArray(); + builder.endArray(); + } + builder.endObject(); + } + builder.endObject(); + Map map = toSourceMap(Strings.toString(builder)); + + SemanticTextUtils.insertValue("path1.path2.test", map, "value2"); + assertThat(getMapValue(map, "path1.path2.test"), equalTo("value2")); + SemanticTextUtils.insertValue("path1.path2.test2", map, "value3"); + assertThat(getMapValue(map, "path1.path2.test2"), equalTo("value3")); + assertThat(getMapValue(map, "path1.path2"), equalTo(List.of(List.of(Map.of("test", "value2", "test2", "value3"))))); + } + } + + public void testInsertValueFieldsWithDots() throws IOException { + { + XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("xxx.yyy", "value1").endObject(); + Map map = toSourceMap(Strings.toString(builder)); + + SemanticTextUtils.insertValue("xxx.yyy", map, "value2"); + assertThat(getMapValue(map, "xxx\\.yyy"), equalTo("value2")); + + SemanticTextUtils.insertValue("xxx", map, "value3"); + assertThat(getMapValue(map, "xxx"), equalTo("value3")); + } + { + XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); + { + builder.startObject("path1.path2"); + { + builder.startObject("path3.path4"); + builder.field("test", "value1"); + builder.endObject(); + } + builder.endObject(); + } + builder.endObject(); + Map map = toSourceMap(Strings.toString(builder)); + + SemanticTextUtils.insertValue("path1.path2.path3.path4.test", map, "value2"); + assertThat(getMapValue(map, "path1\\.path2.path3\\.path4.test"), equalTo("value2")); + + SemanticTextUtils.insertValue("path1.path2.path3.path4.test2", map, "value3"); + assertThat(getMapValue(map, "path1\\.path2.path3\\.path4.test2"), equalTo("value3")); + assertThat(getMapValue(map, "path1\\.path2.path3\\.path4"), equalTo(Map.of("test", "value2", "test2", "value3"))); + } + } + + public void testInsertValueAmbiguousPath() throws IOException { + // Mixed dotted object notation + { + XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); + { + builder.startObject("path1.path2"); + { + builder.startObject("path3"); + builder.field("test1", "value1"); + builder.endObject(); + } + builder.endObject(); + } + { + builder.startObject("path1"); + { + builder.startObject("path2.path3"); + builder.field("test2", "value2"); + builder.endObject(); + } + builder.endObject(); + } + builder.endObject(); + Map map = toSourceMap(Strings.toString(builder)); + final Map originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder))); + + IllegalArgumentException ex = assertThrows( + IllegalArgumentException.class, + () -> SemanticTextUtils.insertValue("path1.path2.path3.test1", map, "value3") + ); + assertThat( + ex.getMessage(), + equalTo("Path [path1.path2.path3.test1] could be inserted in 2 distinct 
+            );
+
+            ex = assertThrows(
+                IllegalArgumentException.class,
+                () -> SemanticTextUtils.insertValue("path1.path2.path3.test3", map, "value4")
+            );
+            assertThat(
+                ex.getMessage(),
+                equalTo("Path [path1.path2.path3.test3] could be inserted in 2 distinct ways, it is ambiguous which one to use")
+            );
+
+            assertThat(map, equalTo(originalMap));
+        }
+
+        // traversal through lists
+        {
+            XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
+            {
+                builder.startObject("path1.path2");
+                {
+                    builder.startArray("path3");
+                    builder.startObject().field("test1", "value1").endObject();
+                    builder.endArray();
+                }
+                builder.endObject();
+            }
+            {
+                builder.startObject("path1");
+                {
+                    builder.startArray("path2.path3");
+                    builder.startObject().field("test2", "value2").endObject();
+                    builder.endArray();
+                }
+                builder.endObject();
+            }
+            builder.endObject();
+            Map<String, Object> map = toSourceMap(Strings.toString(builder));
+            final Map<String, Object> originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder)));
+
+            IllegalArgumentException ex = assertThrows(
+                IllegalArgumentException.class,
+                () -> SemanticTextUtils.insertValue("path1.path2.path3.test1", map, "value3")
+            );
+            assertThat(
+                ex.getMessage(),
+                equalTo("Path [path1.path2.path3.test1] could be inserted in 2 distinct ways, it is ambiguous which one to use")
+            );
+
+            ex = assertThrows(
+                IllegalArgumentException.class,
+                () -> SemanticTextUtils.insertValue("path1.path2.path3.test3", map, "value4")
+            );
+            assertThat(
+                ex.getMessage(),
+                equalTo("Path [path1.path2.path3.test3] could be inserted in 2 distinct ways, it is ambiguous which one to use")
+            );
+
+            assertThat(map, equalTo(originalMap));
+        }
+    }
+
+    public void testInsertValueCannotTraversePath() throws IOException {
+        XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
+        {
+            builder.startObject("path1");
+            {
+                builder.startArray("path2");
+                builder.startArray();
+                builder.startObject().field("test", "value1").endObject();
+                builder.endArray();
+                builder.endArray();
+            }
+            builder.endObject();
+        }
+        builder.endObject();
+        Map<String, Object> map = toSourceMap(Strings.toString(builder));
+        final Map<String, Object> originalMap = Collections.unmodifiableMap(toSourceMap(Strings.toString(builder)));
+
+        IllegalArgumentException ex = assertThrows(
+            IllegalArgumentException.class,
+            () -> SemanticTextUtils.insertValue("path1.path2.test.test2", map, "value2")
+        );
+        assertThat(
+            ex.getMessage(),
+            equalTo("Path [path1.path2.test] has value [value1] of type [String], which cannot be traversed into further")
+        );
+
+        assertThat(map, equalTo(originalMap));
+    }
+
+    private Map<String, Object> toSourceMap(String source) throws IOException {
+        try (XContentParser parser = createParser(JsonXContent.jsonXContent, source)) {
+            return parser.map();
+        }
+    }
+
+    private static Object getMapValue(Map<String, Object> map, String key) {
+        // Split the path on unescaped "." chars and then unescape the escaped "." chars
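+        // (Illustrative example: a key like "a\\.b.c" splits into ["a\\.b", "c"], which then unescapes to ["a.b", "c"].)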
k.replace("\\.", ".")).toArray(String[]::new); + + Object value = null; + Object nextLayer = map; + for (int i = 0; i < pathElements.length; i++) { + if (nextLayer instanceof Map nextMap) { + value = nextMap.get(pathElements[i]); + } else if (nextLayer instanceof List nextList) { + final String pathElement = pathElements[i]; + List values = nextList.stream().flatMap(v -> { + Stream.Builder streamBuilder = Stream.builder(); + if (v instanceof List innerList) { + traverseList(innerList, streamBuilder); + } else { + streamBuilder.add(v); + } + return streamBuilder.build(); + }).filter(v -> v instanceof Map).map(v -> ((Map) v).get(pathElement)).filter(Objects::nonNull).toList(); + + if (values.isEmpty()) { + return null; + } else if (values.size() > 1) { + throw new AssertionError("List " + nextList + " contains multiple values for [" + pathElement + "]"); + } else { + value = values.get(0); + } + } else if (nextLayer == null) { + break; + } else { + throw new AssertionError( + "Path [" + + String.join(".", Arrays.copyOfRange(pathElements, 0, i)) + + "] has value [" + + value + + "] of type [" + + value.getClass().getSimpleName() + + "], which cannot be traversed into further" + ); + } + + nextLayer = value; + } + + return value; + } + + private static void traverseList(List list, Stream.Builder streamBuilder) { + for (Object value : list) { + if (value instanceof List innerList) { + traverseList(innerList, streamBuilder); + } else { + streamBuilder.add(value); + } + } + } +} diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/queries/SemanticQueryBuilderTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/queries/SemanticQueryBuilderTests.java index 59e610492db78..5bdd9902b6e34 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/queries/SemanticQueryBuilderTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/queries/SemanticQueryBuilderTests.java @@ -7,6 +7,8 @@ package org.elasticsearch.xpack.inference.queries; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; @@ -27,6 +29,7 @@ import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.SourceToParse; @@ -41,7 +44,6 @@ import org.elasticsearch.inference.TaskType; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.AbstractQueryTestCase; -import org.elasticsearch.test.index.IndexVersionUtils; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentType; import org.elasticsearch.xcontent.json.JsonXContent; @@ -65,11 +67,11 @@ import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Map; import static org.apache.lucene.search.BooleanClause.Occur.FILTER; import static org.apache.lucene.search.BooleanClause.Occur.MUST; import static org.apache.lucene.search.BooleanClause.Occur.SHOULD; -import static org.elasticsearch.index.IndexVersions.NEW_SPARSE_VECTOR; import static 
 import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfig.DEFAULT_RESULTS_FIELD;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -86,6 +88,7 @@ public class SemanticQueryBuilderTests extends AbstractQueryTestCase
+    private final boolean useLegacyFormat;
+
+    public SemanticQueryBuilderTests(boolean useLegacyFormat) {
+        this.useLegacyFormat = useLegacyFormat;
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        return List.of(new Object[] { true }, new Object[] { false });
+    }
+
     @BeforeClass
     public static void setInferenceResultType() {
         // These are class variables because they are used when initializing additional mappings, which happens once per test suite run in
@@ -121,12 +133,10 @@ protected Collection<Class<? extends Plugin>> getPlugins() {
 
     @Override
     protected Settings createTestIndexSettings() {
-        // Randomize index version within compatible range
-        // we have to prefer CURRENT since with the range of versions we support it's rather unlikely to get the current actually.
-        IndexVersion indexVersionCreated = randomBoolean()
-            ? IndexVersion.current()
-            : IndexVersionUtils.randomVersionBetween(random(), NEW_SPARSE_VECTOR, IndexVersion.current());
-        return Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, indexVersionCreated).build();
+        return Settings.builder()
+            .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
+            .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat)
+            .build();
     }
@@ -148,7 +158,11 @@ protected void initializeAdditionalMappings(MapperService mapperService) throws
     private void applyRandomInferenceResults(MapperService mapperService) throws IOException {
         // Parse random inference results (or no inference results) to set up the dynamic inference result mappings under the semantic text
        // field
-        SourceToParse sourceToParse = buildSemanticTextFieldWithInferenceResults(inferenceResultType, denseVectorElementType);
+        SourceToParse sourceToParse = buildSemanticTextFieldWithInferenceResults(
+            inferenceResultType,
+            denseVectorElementType,
+            useLegacyFormat
+        );
         if (sourceToParse != null) {
             ParsedDocument parsedDocument = mapperService.documentMapper().parse(sourceToParse);
             mapperService.merge(
@@ -198,7 +212,7 @@ private void assertSparseEmbeddingLuceneQuery(Query query) {
         Query innerQuery = assertOuterBooleanQuery(query);
         assertThat(innerQuery, instanceOf(SparseVectorQueryWrapper.class));
         var sparseQuery = (SparseVectorQueryWrapper) innerQuery;
-        assertThat(((SparseVectorQueryWrapper) innerQuery).getTermsQuery(), instanceOf(BooleanQuery.class));
+        assertThat(sparseQuery.getTermsQuery(), instanceOf(BooleanQuery.class));
 
         BooleanQuery innerBooleanQuery = (BooleanQuery) sparseQuery.getTermsQuery();
         assertThat(innerBooleanQuery.clauses().size(), equalTo(queryTokenCount));
@@ -334,7 +348,8 @@ public void testSerializingQueryWhenNoInferenceId() throws IOException {
 
     private static SourceToParse buildSemanticTextFieldWithInferenceResults(
         InferenceResultType inferenceResultType,
-        DenseVectorFieldMapper.ElementType denseVectorElementType
+        DenseVectorFieldMapper.ElementType denseVectorElementType,
+        boolean useLegacyFormat
     ) throws IOException {
         SemanticTextField.ModelSettings modelSettings = switch (inferenceResultType) {
             case NONE -> null;
@@ -350,15 +365,21 @@ private static SourceToParse buildSemanticTextFieldWithInferenceResults(
         SourceToParse sourceToParse = null;
         if (modelSettings != null) {
             SemanticTextField semanticTextField = new SemanticTextField(
+                useLegacyFormat,
                 SEMANTIC_TEXT_FIELD,
-                List.of(),
-                new SemanticTextField.InferenceResult(INFERENCE_ID, modelSettings, List.of()),
+                null,
+                new
SemanticTextField.InferenceResult(INFERENCE_ID, modelSettings, Map.of(SEMANTIC_TEXT_FIELD, List.of())), XContentType.JSON ); XContentBuilder builder = JsonXContent.contentBuilder().startObject(); - builder.field(semanticTextField.fieldName()); - builder.value(semanticTextField); + if (useLegacyFormat == false) { + builder.startObject(InferenceMetadataFieldsMapper.NAME); + } + builder.field(semanticTextField.fieldName(), semanticTextField); + if (useLegacyFormat == false) { + builder.endObject(); + } builder.endObject(); sourceToParse = new SourceToParse("test", BytesReference.bytes(builder), XContentType.JSON); } diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json new file mode 100644 index 0000000000000..2ae09697d55af --- /dev/null +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json @@ -0,0 +1,4310 @@ +{ + "_inference_fields": { + "dense_field": { + "inference": { + "inference_id": ".multilingual-e5-small-elasticsearch", + "model_settings": { + "task_type": "text_embedding", + "dimensions": 384, + "similarity": "cosine", + "element_type": "float" + }, + "chunks": { + "field": [ + { + "start_offset": 0, + "end_offset": 1329, + "embeddings": [ + 0.04979738, + -0.049024884, + -0.06267286, + -0.06284121, + 0.031987894, + -0.08689449, + 0.044664543, + 0.076699525, + 0.06471937, + 0.028753767, + 0.02369647, + 0.011940286, + 0.041063324, + -0.0031249018, + -0.012605156, + 0.020917466, + 0.0698649, + -0.07892161, + -0.010968826, + -0.060116883, + 0.012380837, + -0.022275316, + -0.02007232, + 0.053651124, + 0.045564346, + 0.06287834, + -0.026206115, + 0.034378637, + 0.028786598, + -0.07342769, + -0.05178595, + -0.03394133, + 0.06494073, + -0.07682645, + 0.039480515, + 8.8730786E-4, + -0.035883103, + -0.02245836, + 0.051104713, + -0.02161596, + -0.0014411546, + 0.011840296, + 0.044061452, + 0.018550612, + 0.07816852, + 0.023765374, + -0.04192663, + 0.056223065, + -0.029935915, + -0.039640833, + -0.061091922, + 0.048074532, + 0.03252561, + 0.07752945, + 0.0374488, + -0.0938137, + -0.06071223, + -0.053990547, + -0.06630911, + 0.040193927, + 0.038531914, + -0.023115646, + -0.0046846615, + 0.025255106, + 0.074686274, + 0.10130572, + 0.06328507, + 0.017575556, + -0.040289026, + -0.013285351, + -0.06927493, + 0.08576633, + -0.003492294, + -0.041360997, + 0.036476493, + 0.04270745, + 0.060671005, + -0.0651591, + 0.014901469, + -0.04655241, + -0.006525806, + -0.037813406, + -0.02792913, + 0.0472146, + -0.07142533, + 0.06478618, + 0.0716035, + -0.04885643, + 0.073330306, + -0.055672232, + 0.057761118, + 0.07276108, + -0.076485656, + -0.06970012, + -0.0692586, + -0.051378023, + -0.06273683, + 0.03469511, + 0.05773398, + -0.08031594, + 0.10501066, + 7.310874E-4, + 0.050745558, + -0.016756695, + -0.031716295, + 0.0050844094, + 0.031707063, + -0.039744828, + 0.05837439, + -0.09262242, + -0.04054004, + -0.0075583286, + 0.061934657, + 0.035783943, + -0.055616625, + -0.047291458, + -0.027218537, + -0.011617415, + 0.026992036, + -0.03259098, + 0.08588563, + -0.015476044, + -0.04406553, + -0.058256716, + -0.049162734, + -0.018606737, + 0.02703335, + 0.023426747, + 0.028659008, + 0.014869456, + 0.04368826, + 0.03709602, + 0.0059531354, + 0.012405994, + 0.023295961, + 0.09050855, + -0.025719937, + -0.038713705, + 0.02654418, + -0.07065918, + -0.04294843, + 0.050370634, + -0.0033409365, + 
0.052235987, + 0.07693816, + 0.043221552, + 0.07534102, + -0.048658077, + 0.06533618, + -0.016787754, + 0.034524675, + -0.0312765, + 0.05486932, + 0.06286382, + 0.03278902, + -0.06772777, + -0.087687664, + -0.0665437, + 0.032016467, + 0.066101246, + -0.11844821, + -0.032777846, + -0.053238686, + -0.015841002, + -0.067591116, + -0.048692815, + -0.013267198, + 0.09390532, + -0.029956369, + -0.021315884, + -0.03857401, + 0.03929155, + -0.023058, + 0.051734913, + -0.023478175, + 0.035602726, + -0.08242782, + 0.058339056, + 0.045796614, + 0.05448777, + -0.047254823, + 0.020266606, + -0.08056925, + 0.0015524789, + -0.041604258, + 0.00718068, + -0.044556983, + 0.02106678, + 0.04749506, + -0.01840031, + 0.023407241, + 0.070747316, + -0.04295862, + -0.07703961, + -0.0660327, + 0.013576343, + -0.023668775, + 0.056404322, + 0.09587012, + 0.05701044, + -0.036240827, + -0.004225128, + 0.0067939283, + 0.035346694, + 0.026707595, + 0.017638108, + -0.032440145, + 0.04708931, + 0.012399647, + 0.07325736, + 0.027942428, + -0.08172854, + -0.07065871, + 0.033890083, + -0.033598673, + -0.08178952, + 0.028348992, + 0.04411821, + -0.044644725, + 0.03074351, + 0.0935692, + -0.04762361, + 0.051226508, + -0.08009367, + -0.03847554, + 0.016323369, + 0.038776945, + -0.059975337, + -0.057062503, + 0.010849399, + -0.030187564, + -0.026308322, + -0.067967005, + -0.079719126, + -0.08646553, + -0.09048591, + -0.018597756, + 0.0047154897, + 0.058588482, + -0.09175631, + -0.08307076, + -0.035472285, + 0.009541795, + -0.026162423, + 0.03335252, + 0.018381111, + -0.015803808, + 0.021074254, + -0.010327698, + 0.025227644, + 0.06197503, + -0.059137702, + -0.018673804, + 0.00707259, + -0.019355131, + 0.026796991, + 0.025893785, + 0.0685412, + -0.06301929, + 0.003187423, + 0.029026637, + -0.019066911, + 0.09354283, + 0.1061943, + 0.053696748, + -0.0016658951, + -0.0030081598, + -0.028984388, + -0.037768397, + -0.035445668, + -0.026019065, + 0.028805656, + 0.021448314, + -0.059069104, + -0.06226507, + -0.05508101, + 0.022365203, + 0.09221683, + -0.07698258, + -0.055819187, + 0.061300304, + 0.05965072, + 0.029480126, + 0.057750076, + 0.05180143, + -0.0159732, + -0.0823228, + 0.09240897, + -0.08318623, + 0.002020457, + 0.010953976, + -0.09685372, + 0.05271347, + -0.04232834, + 0.061398283, + 0.044973806, + -0.02088832, + 0.044399235, + -0.014687839, + 0.06304118, + -0.022936989, + -0.033005796, + 0.074231274, + 0.023939423, + -0.087914266, + 0.036014125, + 0.0062753465, + -0.03355067, + 0.036039222, + 0.012712498, + 0.057161637, + 0.05654562, + -0.018600527, + -0.035825036, + 0.06950757, + 0.05828935, + 3.8511172E-4, + -0.008722925, + -0.0522819, + -0.10943554, + -0.033790745, + -0.03357093, + -0.031342223, + -0.07834354, + 0.032603115, + 0.026984481, + -0.02969966, + -0.048259087, + -0.012163297, + 0.007793295, + 0.05574152, + -0.022157356, + -0.03623348, + 0.037055306, + -0.033247784, + -0.0070533184, + -0.057643052, + 0.08567554, + -0.07278431, + -0.06556353, + 0.0308075, + 0.052940007, + -0.0566871, + 0.0287218, + -0.06409354, + -0.0627855, + 0.06254832, + -0.027221028, + -0.049813032, + 0.03935744, + 0.07234624, + -0.09398941, + 0.011342199, + 0.028675176, + -0.022932779, + 0.009481765, + -0.022316003, + -0.015413267, + 0.039174553, + 0.061736017, + -0.04229645, + -0.052905895, + 0.018588098, + 0.070939854, + 0.0748456, + 0.08648295, + -0.036223643, + 0.008473833, + 0.053857446, + -0.07680301, + 0.0785199, + 0.03982, + -0.039509695, + 0.03373825, + -0.063460656, + -0.038993217, + -0.073084034, + 0.062789686, + -0.081148736, + 
-0.035036374, + 0.0754924, + 0.087299235, + 0.04096056, + 0.027776068 + ] + }, + { + "start_offset": 1281, + "end_offset": 2685, + "embeddings": [ + 0.035266396, + -0.044093177, + -0.04158629, + -0.045926083, + 0.06521479, + -0.050932676, + 0.03961649, + 0.037828345, + 0.025232289, + 0.029732272, + 0.034696255, + -8.805868E-4, + 0.053202488, + -0.0047244085, + -0.037418325, + 0.0770543, + 0.105328426, + -0.036611717, + -0.039531372, + -0.082817726, + 0.021342339, + -0.01843601, + -0.042259317, + 0.06317797, + 0.036926534, + 0.069380246, + -0.059219223, + 0.043066744, + -0.006286799, + -0.06797077, + -0.042236328, + -0.036919896, + 0.034179892, + -0.026980922, + 0.051384695, + 0.03826208, + -0.012975077, + -0.025295, + 0.015923942, + -0.027602347, + -0.022515642, + -5.98229E-4, + 0.06122002, + 0.050380763, + 0.04684541, + 0.08975921, + -0.03755087, + 0.046912387, + -0.038697798, + -0.06988436, + -0.05219296, + 0.041337684, + 0.023435602, + 0.023100449, + 0.0352068, + -0.060556572, + -0.042356305, + -0.04503575, + -0.07377149, + 0.084542595, + 0.028644886, + -0.024366854, + -0.009185593, + 0.01255741, + 0.06999743, + 0.09439326, + 0.03800093, + -0.008208419, + -0.09673358, + 0.0023752274, + -0.07626475, + 0.098563485, + -0.012569254, + -0.08954541, + -0.010908005, + 0.016228944, + 0.05984263, + -0.051004995, + 0.024147974, + -0.050623365, + -0.01668758, + -0.007899899, + -0.029833568, + 0.034388572, + -0.03505155, + 0.08271141, + 0.08569518, + -0.053716324, + 0.06806682, + -0.067159526, + 0.043537326, + 0.09806787, + -0.041304354, + -0.05103136, + -0.109280586, + -0.06120091, + -0.09363793, + 0.032154918, + 0.12145496, + -0.049101993, + 0.07359592, + -0.010511772, + 0.074003994, + -0.013990566, + -0.026140982, + 0.052602872, + 0.09067435, + -0.070553906, + 0.057253607, + -0.048433788, + -0.024026526, + 0.018851176, + 0.04090621, + 0.058670815, + -0.08735305, + -0.022817774, + -0.042838365, + -0.016657954, + 0.03224679, + -0.01952135, + 0.016957905, + -2.0869492E-4, + -0.0039428347, + -0.05186959, + -0.062616155, + -0.056938402, + 0.00882266, + 0.055156156, + 0.03221514, + 0.026071686, + 0.073993444, + 0.060973227, + 0.040219847, + 0.030080495, + 0.074190594, + 0.10667069, + -0.035753082, + -0.031658202, + 0.024792355, + -0.056956623, + -0.04320206, + 0.042175233, + -0.04459597, + 0.063075, + 0.03682348, + 0.087945856, + 0.060606126, + -0.02543529, + 0.101843245, + -0.02052844, + 0.065993346, + -0.01580399, + 0.01996002, + 0.025750767, + 0.044288505, + -0.055157375, + -0.0834102, + -0.07820265, + 0.01860491, + 0.052071907, + -0.082538106, + -0.06682723, + -0.031070147, + -5.8769673E-4, + -0.05546835, + -0.041754596, + 0.007750717, + 0.06550786, + -0.024858464, + -0.018027157, + -0.070528544, + 0.04311053, + -0.04646167, + 0.038272627, + -0.023141516, + 0.035724208, + -0.044601943, + 0.031177005, + 0.060686704, + -0.008791896, + -0.045239996, + -0.0015549486, + -0.023560282, + -0.02124949, + -0.028758224, + -0.01994061, + -0.031099308, + 0.033113, + 0.04315839, + -0.014818203, + -0.016493127, + 0.03928858, + -0.049371842, + -0.057269108, + -0.07144285, + 0.045010682, + -0.02822895, + 0.026698994, + 0.08181065, + 0.0497983, + -0.0033907534, + -0.023786934, + 0.013289109, + 0.011108559, + 0.075379916, + 0.012320797, + -0.045297462, + 0.09245994, + -0.027429234, + 0.058199212, + 0.06857553, + -0.0705278, + -0.055046707, + 0.025127407, + -0.044880733, + -0.07819047, + -0.016903652, + 0.031777192, + -0.027202426, + 0.033661053, + 0.082595035, + -0.010536667, + 0.067396104, + -0.048291907, + 
-0.038250096, + 0.009253138, + 0.040732533, + -0.06330689, + -0.074753396, + 0.04644269, + -0.029993957, + -0.033248927, + -0.053877644, + -0.098819815, + -0.0260293, + -0.030682972, + -0.034318104, + -0.014064486, + -0.020334287, + -0.12791014, + -0.017047742, + -0.052973263, + 0.017977173, + -0.04006773, + 0.066867575, + -0.07052264, + -0.02385362, + 0.028173303, + -0.07004571, + 0.053027462, + 0.039910827, + -0.026693301, + -0.07183149, + -0.073637374, + 0.008942395, + 0.012631494, + 0.040236488, + 0.07312884, + -0.1052349, + 0.013788912, + 0.05933606, + -0.012417836, + 0.07844875, + 0.035665687, + 0.0692123, + 0.011978119, + 0.0032255524, + -0.02082568, + -0.027911682, + -0.008114962, + -0.100171834, + 0.012006536, + 0.027355125, + -0.069779284, + -0.06982269, + -0.02499225, + 0.06460924, + 0.10172508, + -0.036987256, + -0.027838582, + 0.06524349, + 0.03478602, + 0.047589943, + 0.0034753575, + 0.035028856, + 0.03955437, + -0.056392808, + 0.097454645, + -0.067250304, + -0.016183723, + -0.010761581, + -0.046665948, + 0.052830804, + -0.06562526, + 0.0143448245, + 0.035826858, + -0.030075911, + 0.074224986, + -0.01484229, + 0.047223467, + -0.05010028, + -0.08323114, + 0.024850823, + 0.0035780836, + -0.04660368, + 0.012318496, + 0.035511326, + -0.006625753, + 0.023968346, + 0.04152267, + 0.066447295, + 0.031807587, + -0.026121954, + -0.06298641, + 0.09144068, + 0.07982457, + -0.047639504, + -0.011746696, + -0.03417992, + -0.066457696, + -0.015668094, + -0.036196046, + -0.0029406173, + -0.054462895, + 0.0029062356, + 0.019851439, + 0.0064928187, + -0.06603669, + 0.016133538, + 0.0441623, + -0.013663719, + -0.027901169, + -0.05862742, + 0.035473794, + -0.080742985, + -0.012147599, + -0.06269955, + 0.045475967, + -0.07024215, + -0.09113673, + 0.018147662, + 0.037072584, + -0.011495025, + 0.049087547, + 0.00970628, + -0.043941073, + 0.052213665, + -0.027107846, + -0.05408287, + 0.04391075, + 0.05903725, + -0.11579457, + 0.0179941, + 0.023727184, + -0.027765218, + 0.058974497, + -0.041185096, + -0.06411593, + 0.05297974, + 0.014402285, + -0.07491701, + -0.046273973, + 0.025595015, + 0.072552234, + 0.07913544, + 0.05780724, + 0.010108354, + -0.032680638, + 0.07236567, + -0.059348762, + 0.07916222, + 0.06330368, + -0.040674247, + 0.014580703, + -0.056963094, + -0.05973973, + -0.028593862, + 0.054875106, + -0.083951905, + -0.030538274, + 0.04507664, + 0.057579767, + 0.047284584, + 0.029037142 + ] + }, + { + "start_offset": 2660, + "end_offset": 3932, + "embeddings": [ + 0.060263444, + -0.011627793, + -0.07406454, + -0.061137985, + 0.035276245, + -0.06492958, + 0.036304567, + 0.03849267, + 0.032589767, + 0.034697585, + 0.055276874, + 0.0067610983, + 0.07107068, + -0.028453767, + -0.023335157, + 0.066190325, + 0.09514554, + -0.031573914, + -0.036566608, + -0.03254594, + 0.01258663, + -0.008238347, + -0.024652604, + 0.058704935, + 0.029146092, + 0.0538354, + -0.033388253, + 0.035337757, + 0.048961233, + -0.06575967, + -0.060514227, + -0.054762013, + 0.049676932, + -0.062150035, + -0.019077798, + 0.018297857, + -0.043477535, + -0.06992983, + 0.041489013, + -0.06091549, + 0.00857616, + 0.0013787356, + 0.059843466, + 0.065656655, + 0.07694915, + 0.07400389, + -0.008740612, + 0.02598118, + -0.04293424, + -0.029819168, + -0.057130232, + 0.08674767, + 0.0020843677, + 0.094413035, + 0.026790254, + -0.07488432, + -0.06260386, + -0.059874497, + -0.022945922, + 0.07328087, + 0.0012629362, + -0.014891515, + -0.017552191, + 0.04158861, + 0.074740976, + 0.13079657, + 0.03465537, + 0.033060353, + -0.071494736, + 
-0.042101286, + -0.09333479, + 0.075504355, + -0.048976846, + -0.07538883, + 0.016815975, + 0.014265034, + 0.04265424, + -0.055298902, + 0.021028202, + -0.043243185, + -0.035213232, + -0.03872826, + -0.03735794, + -0.009753857, + -0.06591585, + 0.06382551, + 0.070999734, + -0.07432682, + 0.051665448, + -0.06200163, + 0.035289973, + 0.052576542, + -0.08547946, + -0.051438782, + -0.06883237, + -0.04034897, + -0.1139505, + 0.029103009, + 0.056813173, + -0.033878006, + 0.065993756, + 0.0012909115, + 0.030890198, + -0.026131464, + -0.042535, + 0.044831734, + 0.075214975, + -0.045039084, + 0.056481812, + -0.052748743, + -0.042459268, + 0.016207209, + 0.032704834, + 0.04342557, + -0.031859122, + -0.037544478, + -0.023973966, + -0.056660555, + 0.03458018, + -0.043174002, + 0.07610799, + -0.040468093, + -0.041871496, + -0.04984353, + -0.040546015, + -0.019524354, + 0.04170828, + 0.020450952, + 0.0404415, + 0.03985574, + 0.032101743, + 0.05156037, + 0.048545454, + 0.03334057, + 0.025009904, + 0.090053804, + -0.030840183, + 0.0017696177, + 0.01567415, + -0.04152217, + -0.031758398, + 0.020865917, + -0.05755524, + 0.04980784, + 0.050742626, + 0.07122176, + 0.06281647, + -0.012783542, + 0.08377948, + -0.029796185, + 0.017047247, + 0.011766123, + 0.03557249, + 0.019037597, + 0.028088165, + -0.07208148, + -0.08005564, + -0.057871744, + 0.0153855365, + 0.054635677, + -0.05614729, + -0.031374976, + -0.06079491, + -0.041638877, + -0.055767294, + -0.048497472, + -0.007389678, + 0.012500725, + 0.02392964, + -0.03444656, + -0.032773327, + 0.050030876, + -0.062147807, + 0.03894452, + 0.005381243, + 0.005100098, + -0.082184665, + 0.01259893, + 0.06914528, + 0.0502573, + -0.014370648, + -0.039859537, + -0.06393138, + -0.061919075, + -0.014192415, + -0.032273103, + -0.0464307, + -7.1235467E-4, + 0.051684704, + -0.006423554, + 0.0010265269, + 0.057130195, + -0.044715635, + -0.08753112, + -0.060454912, + 0.04602993, + -0.009173136, + 0.030031096, + 0.05415974, + 0.040149722, + -0.030073693, + -0.0026639393, + 0.06262825, + 0.0073858122, + 0.07543514, + 0.013202129, + -0.055555925, + 0.076006316, + 0.0069068773, + 0.037352845, + 0.05844025, + -0.087049164, + -0.0934209, + 0.021478496, + -0.06904104, + -0.035960656, + 0.012564326, + 0.08203622, + -0.0589588, + 0.038763568, + 0.059626605, + -0.0015563822, + 0.056733213, + -0.06597729, + -0.0487247, + 0.030533105, + 0.059536766, + -0.043689486, + -0.044405177, + 0.039805703, + -0.033027582, + -0.034072082, + -0.080049135, + -0.08942587, + 0.019459073, + -0.044563998, + -0.06931994, + 0.021550108, + 0.022951653, + -0.051044974, + -0.03676219, + -0.050016202, + 0.03538716, + -0.06436871, + 0.09116231, + -0.03250418, + -0.008333591, + 0.02689493, + -0.023252478, + 0.04825159, + 0.07439804, + -0.08796822, + -0.04385184, + -0.05042988, + -0.056784004, + 0.057135444, + 0.055787697, + 0.056427166, + -0.09837734, + -0.0036608325, + 0.013839507, + -0.020212527, + 0.09865649, + 0.080563836, + 0.07525103, + 0.033415828, + -0.02267602, + -0.067864396, + -0.05965757, + -0.010466497, + -0.047837727, + 0.017926434, + 0.032667693, + -0.069811225, + -0.011690649, + -0.044193, + 0.023269301, + 0.07142345, + -0.0031622057, + -0.0047916253, + 0.07077121, + 0.03767678, + 0.03410683, + 0.036370695, + 0.01696176, + -0.026317174, + -0.008320507, + 0.09212631, + -0.07694487, + -0.034243643, + 0.0110022295, + -0.060418822, + 0.07019466, + -0.051362276, + 0.078166254, + 0.055226926, + -0.04018289, + 0.063233584, + -0.032110535, + 0.08297619, + -0.009597479, + -0.057851054, + 0.042411964, + 
0.01997834, + -0.07460758, + 0.061238132, + 0.0050869896, + 0.023704918, + 0.03991232, + 0.07121017, + 0.067201145, + 0.04065065, + -0.05990329, + -0.04676335, + 0.08255157, + 0.039478876, + -0.05370604, + -0.015417656, + -0.061638564, + -0.113423236, + -0.020872636, + -0.06506326, + -0.019086778, + -0.07550901, + 0.023448454, + 0.031439524, + -0.018936215, + -0.061786037, + 0.06917624, + -0.016625067, + 0.04495578, + -0.05168137, + -0.06433023, + 0.019382514, + -0.030735377, + 0.010870069, + -0.05917494, + 0.033261493, + -0.04571641, + -0.078268915, + 0.03133073, + 0.04491661, + -0.036725685, + 0.05521663, + -0.02092035, + -0.04205282, + 0.035851613, + -0.0015220186, + -0.02102678, + 0.054027468, + 0.07405003, + -0.09111273, + 0.005834604, + 0.053133536, + -0.018385805, + 0.024131889, + -0.04136735, + -0.060419146, + 0.006526669, + 0.046679422, + -0.07396608, + -0.031180743, + 0.032524955, + 0.05950253, + 0.08502798, + 0.05705178, + 0.041140076, + 0.015673824, + 0.052156717, + 0.008876251, + 0.05783481, + 0.06875354, + -0.01914275, + 0.019451428, + 0.0017306518, + -0.09160311, + -0.06650555, + 0.06903168, + -0.11052152, + -0.08185994, + 0.0152816, + 0.056960557, + 0.06667231, + 0.042444445 + ] + }, + { + "start_offset": 3811, + "end_offset": 5053, + "embeddings": [ + 0.08132793, + -0.047893565, + -0.038560215, + -0.03994145, + 0.0558572, + -0.03973998, + 0.020470386, + 0.058355197, + 0.01980108, + 0.03896921, + 0.04879353, + -0.0074668517, + 0.05397047, + -0.010254351, + -0.042885937, + 0.08040558, + 0.091155075, + -0.052957732, + -0.035930026, + -0.03653066, + 0.013761402, + -0.018923452, + -0.04685841, + 0.04731581, + 0.027308341, + 0.020014657, + -0.04545417, + 0.028795317, + 0.04793647, + -0.0704067, + -0.042252712, + -0.05682541, + 0.066968046, + -0.09382263, + 0.02506045, + 0.019845745, + -0.015298284, + -0.044756494, + 0.032255, + -0.03357616, + -0.01634103, + 0.012012115, + 0.05378444, + 0.036496706, + 0.06764162, + 0.08833494, + -0.021727582, + 0.0363613, + -0.08750663, + -0.006557421, + -0.037404615, + 0.083952226, + -0.005245814, + 0.06731529, + 0.027517168, + -0.069114335, + -0.06600843, + -0.055819273, + -0.09175115, + 0.0908832, + 0.045391496, + -0.03755004, + 0.0018628142, + 0.015974216, + 0.034663454, + 0.07421443, + 0.045072228, + -0.0134752095, + -0.053152926, + -0.011296686, + -0.052672, + 0.064373136, + -0.009546203, + -0.08377613, + -0.0018304663, + 0.023774406, + 0.029625371, + -0.07841949, + 0.025992012, + -0.034211818, + -0.04341797, + -0.074051395, + -0.022789141, + -0.014875852, + -0.050796572, + 0.08730017, + 0.09586879, + -0.06974203, + 0.048677806, + -0.04995857, + 0.038378827, + 0.06020236, + -0.060032416, + -0.05082279, + -0.08157444, + -0.05524207, + -0.09547329, + 0.061129954, + 0.07330997, + -0.060067892, + 0.08218149, + -0.011082627, + 0.041907076, + -0.0016668623, + -0.020462176, + 0.0074657737, + 0.04153701, + -0.053815063, + 0.08984907, + -0.04856424, + -0.019359102, + 0.025180845, + 0.0580883, + 0.051315922, + -0.07716719, + -0.06010258, + -0.024739653, + -0.020786842, + 0.021310974, + -0.049855735, + 0.058490653, + -0.019344086, + -0.064905055, + -0.043594714, + -0.0414785, + -0.026626132, + 0.010384775, + 0.035636406, + 0.023757294, + 0.02353357, + 0.038512193, + 0.043469686, + 0.025641369, + 0.06005725, + 0.033108205, + 0.093584485, + -0.008513592, + 0.001993488, + 0.0266426, + -0.0135798985, + -0.058448963, + 0.030007407, + -0.03873391, + 0.012962885, + 0.03407742, + 0.052897573, + 0.048484456, + -0.0037075893, + 0.10519477, + -0.05359505, + 
0.062401634, + -0.02432665, + 0.006226394, + 0.027923357, + 0.0724623, + -0.050624184, + -0.08479024, + -0.08688512, + 0.032354686, + 0.06821751, + -0.077089824, + -0.0014580968, + -0.04177363, + -0.027564395, + -0.0448798, + -0.042052064, + -0.009614605, + 0.07208001, + 7.672266E-4, + -0.075805336, + -0.05364635, + 0.06561775, + -0.032068495, + 0.04494038, + -0.044013828, + -0.0190166, + -0.022102332, + 0.034658328, + 0.050540138, + -0.01942592, + -0.020942092, + -0.02782304, + -0.065396436, + -0.04059357, + -0.049896274, + -0.0376796, + -0.043743063, + 0.040360678, + 0.07515184, + -0.018274747, + -0.009190847, + 0.055620983, + -0.041216724, + -0.073044226, + -0.05465287, + 0.010405976, + -0.013486699, + 0.02830836, + 0.06836122, + 0.020561688, + -0.01688864, + -0.020571496, + 0.04652389, + 0.020004654, + 0.060006775, + 0.00938477, + -0.05559232, + 0.08781834, + -0.025533192, + 0.052398734, + 0.057509296, + -0.09851155, + -0.09180138, + 0.038183447, + -0.06369883, + -0.054243114, + 0.020855743, + 0.10808265, + -0.04326038, + 0.023134552, + 0.088371366, + -0.03126334, + 0.044376496, + -0.07867371, + -0.03890121, + 0.051151622, + 0.037706945, + -0.03370568, + -0.008004474, + 0.041355547, + -0.023588097, + -0.026358435, + -0.04786497, + -0.108022444, + -0.04574715, + -0.03736998, + -0.048178125, + 0.034921553, + 0.06676284, + -0.060398124, + -0.024748335, + -0.02818482, + 0.02239888, + -0.07246388, + 0.04970122, + -0.010178895, + -0.010817003, + 0.05318733, + -0.050516233, + 0.04490196, + 0.057144474, + -0.031509876, + -0.06828971, + -0.057091262, + -0.041589297, + 0.034988903, + 0.05772322, + 0.08349064, + -0.07048785, + 0.02914558, + 0.037508357, + -0.018101186, + 0.09606959, + 0.09399272, + 0.033781327, + 0.026298832, + -0.007974394, + -0.04828518, + -0.030074345, + -0.008707313, + -0.06095452, + 0.0052815387, + 0.053281322, + -0.07403459, + -0.04375484, + -0.0024250182, + 0.030269688, + 0.08677468, + -0.044580005, + -0.023698311, + 0.09059957, + 0.03502518, + 0.039508294, + 0.03801833, + 0.051657647, + -0.023771202, + -0.021416105, + 0.08418382, + -0.07468558, + -0.022965085, + -0.037451513, + -0.070336066, + 0.07278321, + -0.06958301, + 0.061745293, + 0.034864236, + -0.05098527, + 0.075577505, + -0.01925352, + 0.028659336, + -0.01881169, + -0.09233528, + 0.052659664, + 0.046592344, + -0.08144535, + 0.04045172, + 0.021832049, + 0.01539719, + 0.036698546, + 0.048459183, + 0.0750458, + 0.03523083, + -0.093105264, + -0.042830218, + 0.08817936, + 0.05500005, + -0.03145603, + 0.002137886, + -0.09369107, + -0.0859627, + -0.00988302, + -0.03224872, + 0.009135905, + -0.07538188, + 0.01729995, + 0.05211995, + -0.028220842, + -0.09644254, + 0.08197546, + 0.021641405, + 0.044149674, + -0.02265579, + -0.03705849, + 0.0066629667, + -0.038971607, + 0.0077898037, + -0.07302501, + 0.050258975, + -0.031734023, + -0.05120743, + 0.006855154, + 0.03317757, + -0.054895062, + 0.020226864, + -0.028702717, + -0.054496907, + 0.03333692, + -0.01552826, + -0.024065949, + 0.034094118, + 0.06990785, + -0.11025783, + -0.022972278, + 0.094185725, + -0.034931783, + 0.045400895, + 0.0029167728, + -0.040711746, + 0.0069749537, + 0.02316794, + -0.07623587, + -0.032300122, + 0.040407263, + 0.056106865, + 0.084427394, + 0.09241687, + -0.014235544, + -9.3176577E-4, + 0.056472927, + -0.066110075, + 0.07017728, + 0.06319923, + -0.026196225, + 0.013847319, + -0.047189496, + -0.034471143, + -0.035234082, + 0.015169919, + -0.06258794, + -0.044817522, + 0.052238535, + 0.052592035, + 0.024454227, + 0.04652183 + ] + }, + { + 
"start_offset": 5013, + "end_offset": 6270, + "embeddings": [ + 0.050837185, + -0.058507636, + -0.08578978, + -0.07158996, + 0.062322024, + -0.06394126, + 0.033397503, + 0.066029586, + 0.059980292, + 0.014527764, + 0.027411256, + -0.019332865, + 0.09169677, + -0.028353753, + -0.024152989, + 0.026958432, + 0.06263654, + -0.057214282, + -0.01730705, + -0.06580778, + 0.012587115, + -0.0013240383, + -0.034304086, + 0.07279054, + 0.03153362, + 0.022333346, + -0.019766338, + 0.01765917, + 0.018127792, + -0.031060342, + -0.035549946, + -0.055531062, + 0.020338904, + -0.102598086, + 0.01697388, + 0.01325798, + -0.05225683, + -0.028536074, + 0.018755725, + -0.03648683, + 0.0047455817, + 0.007937342, + 0.05206842, + 0.07168695, + 0.08550893, + 0.0469701, + -0.053452007, + 0.050660927, + -0.028207462, + -0.038872562, + -0.044887412, + 0.0740998, + -0.013441051, + 0.07634305, + 0.0055091325, + -0.11408244, + -0.06909077, + -0.07962894, + -0.066142306, + 0.07568293, + 0.0025674207, + -0.080196865, + -0.006201128, + 0.00818501, + 0.07924847, + 0.10414052, + 0.042439207, + 0.035281047, + -0.040974326, + -0.04297422, + -0.024786443, + 0.06963027, + -0.016090378, + -0.077486746, + 0.013267866, + 0.0382188, + 0.075773925, + -0.045972046, + 0.021897435, + -0.057650458, + -0.026901621, + -0.047625203, + 0.0012063365, + 0.025827816, + -0.023581855, + 0.059192963, + 0.06759525, + -0.06503824, + 0.051352326, + -0.04751885, + 0.06295226, + 0.03710186, + -0.05161417, + -0.049769994, + -0.08769117, + -0.045511324, + -0.051784497, + 0.056573063, + 0.040720508, + -0.035331022, + 0.073139556, + -8.214206E-4, + 0.037490595, + -0.0021819966, + -0.024999384, + 0.019722067, + 0.024325203, + -0.044025563, + 0.06545914, + -0.019343818, + -0.0023573453, + 0.0018968938, + 0.06038538, + 0.02333629, + -0.06574865, + -0.027746813, + -0.025081333, + -0.014503653, + 0.02887482, + -0.034452263, + 0.07113403, + -0.03859757, + -0.06710839, + -0.0383765, + -0.06811556, + 0.0061613885, + 0.034110006, + 0.05640678, + 0.06142383, + 0.009073967, + 0.043047454, + 0.03466423, + 0.027530612, + 0.032211494, + 0.053615883, + 0.07377551, + -0.01758648, + -0.02144349, + 0.03956204, + -0.031308886, + -0.062522896, + 0.07004273, + -0.041059777, + 0.03381151, + 0.096379966, + 0.059807573, + 0.076913215, + 7.038924E-4, + 0.081829004, + -0.06641827, + 0.044492118, + -0.036664132, + 0.08141791, + 0.039923627, + 0.079390235, + -0.05483655, + -0.092164926, + -0.07556358, + 0.024775334, + 0.039525755, + -0.052411165, + -0.044712305, + -0.1251298, + 0.019936236, + -0.05971529, + -0.071407795, + -0.013429681, + 0.045429856, + 9.2904486E-7, + -0.011094936, + -0.053897448, + 0.05120436, + -0.051203646, + 0.05109921, + -3.9564449E-4, + -0.0018849113, + -0.04667166, + 0.051931337, + 0.07190472, + 0.03911436, + 0.0045251944, + -0.048008155, + -0.03397076, + -0.028034845, + -0.048654392, + -0.02667819, + -0.04844982, + 0.04652294, + 0.08667334, + -0.03595206, + 0.0059883194, + 0.04574355, + -0.049042065, + -0.0949724, + -0.0883229, + 0.022961965, + 0.0010751986, + 0.034764428, + 0.07906372, + 0.063135885, + 0.011506904, + -0.01975833, + 0.036684997, + 0.060913093, + 0.045704674, + 0.007864406, + -0.10908467, + 0.05677562, + -0.011089532, + 0.038626347, + 0.009512805, + -0.064039044, + -0.072748266, + 0.077210315, + -0.038597148, + -0.035940252, + 0.028666161, + 0.07342884, + -0.05140841, + 0.03324692, + 0.087146066, + -0.063568234, + 0.046904817, + -0.101345256, + -0.089092165, + 0.020936692, + 0.03865168, + -0.05066454, + -0.020703398, + 0.037939124, + 
-0.069670096, + -0.04573288, + -0.042975515, + -0.08133061, + -0.04999254, + -0.07754444, + -0.015807157, + 0.005468936, + 0.058917798, + -0.047519706, + -0.011129669, + -0.023593048, + 0.017224371, + -0.08876406, + 0.021552147, + -0.0042216736, + 3.2073245E-4, + 0.020970272, + -0.018367162, + 0.05507523, + 0.049186505, + -0.053686555, + -0.05892317, + -0.04681065, + -0.0346258, + 0.025476422, + 0.018746119, + 0.07847266, + -0.061995696, + 0.054043338, + 0.05290739, + -0.03922319, + 0.09967812, + 0.11260788, + 0.079831325, + 0.038233027, + -0.007090767, + -0.025567437, + -0.059230927, + -0.0053755366, + -0.05934471, + 0.019243969, + 0.028365586, + -0.092337005, + -0.042283885, + -0.02478212, + 0.036973756, + 0.06046009, + -0.08319817, + -0.03466979, + 0.0052572396, + 0.03651634, + 0.0098519325, + 0.054537416, + 0.106752776, + -0.03245272, + -0.021710223, + 0.067718424, + -0.0716523, + -0.0467586, + 0.04351528, + -0.06902318, + 0.0840498, + -0.06641164, + 0.049778968, + 0.068722665, + 0.006945258, + 0.052571226, + -0.018321687, + 0.08851911, + -0.06484523, + -0.05621622, + 0.0138798375, + 0.062657684, + -0.044570502, + 0.04102728, + 0.018748704, + -0.00942585, + 0.031132046, + 0.028199397, + 0.04842188, + 0.05593715, + -0.059101623, + -0.06402159, + 0.098776296, + 0.02233127, + -0.026724212, + -0.0065241847, + -0.04349072, + -0.034313653, + 0.0035007112, + -0.05192231, + -0.038924325, + -0.06474185, + 0.015219527, + 0.015206849, + -0.006182916, + -0.047223445, + 0.03093224, + 0.0028494631, + 0.029578412, + -0.03084317, + -0.064933576, + 0.04518858, + -0.039695684, + 0.00936517, + -0.057235852, + 0.07411994, + -0.03560979, + -0.058608506, + 0.011952328, + 0.038545735, + -0.0027342425, + 0.034396514, + -0.05941442, + -0.059142824, + 0.07352255, + -0.043796647, + -0.02323201, + 0.021158574, + 0.04281619, + -0.06509553, + 0.025277078, + 0.028309572, + -0.025768865, + 0.017667482, + -0.054695044, + -0.0071169212, + 0.024850225, + 0.045802698, + -0.06463908, + -0.06887592, + 0.015381043, + 0.07519754, + 0.057192106, + 0.04958389, + -0.0055669746, + 0.011448934, + 0.03116414, + -0.047596138, + 0.0854336, + 0.04283707, + -0.0740198, + 0.012606065, + -0.06125597, + -0.051641334, + -0.08642954, + 0.051201824, + -0.06496548, + -0.052257292, + 0.042111978, + 0.06265747, + 0.020205691, + 0.030658716 + ] + }, + { + "start_offset": 6143, + "end_offset": 7446, + "embeddings": [ + 0.0424085, + -0.034002542, + -0.03464202, + -0.050363787, + 0.07952863, + -0.06934173, + 0.032258246, + 0.0323823, + 0.058361948, + 0.024646914, + 0.033364307, + 0.014893917, + 0.082809135, + -0.029873388, + -0.029152617, + 0.04554002, + 0.0795821, + -0.036626082, + -0.0474332, + -0.07305637, + 0.013581792, + -0.004326934, + -0.014103911, + 0.034649894, + -0.0026006806, + 0.02861443, + -0.04941399, + 0.04220857, + 0.03800667, + -0.08277502, + 0.0030204614, + -0.053834133, + 0.056124337, + -0.049811907, + 0.039426923, + 0.020071387, + -0.058887776, + -0.028534504, + 0.017018566, + -0.058147434, + -0.004793465, + 0.044247996, + 0.09460399, + 0.015196105, + 0.06281946, + 0.044713628, + -0.060649756, + 0.027246455, + -0.076060586, + -0.049838327, + -0.08404265, + 0.029550698, + -0.03708172, + 0.07957659, + 0.005638496, + -0.06591597, + -0.06454032, + -0.031200824, + -0.08628952, + 0.063782215, + 0.07779158, + -0.030862262, + -5.435849E-4, + 0.019658469, + 0.057832543, + 0.07795239, + 0.0381484, + -7.929322E-4, + -0.0592228, + -0.005782202, + -0.030597664, + 0.087376595, + -0.010526408, + -0.048925165, + -0.02034168, + 0.03517407, + 
0.11462333, + -0.045529578, + 0.03299401, + -0.037767082, + -0.042070027, + -0.058737356, + -0.024921589, + 0.034654282, + -0.055172887, + 0.06289939, + 0.020921186, + -0.05699275, + 0.09581658, + -0.06115032, + 0.08512388, + 0.054141954, + -0.0934276, + -0.105145365, + -0.08745115, + -0.06042352, + -0.07095655, + 0.055074938, + 0.0759865, + -0.0045393603, + 0.06166128, + -0.0054426217, + -0.0013491446, + 0.020781914, + -0.013829525, + 0.012210793, + 0.0570243, + -0.026055835, + 0.050172452, + -0.0491802, + -0.03582268, + 0.0012494406, + 0.040490862, + 0.040501244, + -0.098037206, + -0.039755426, + -0.022896642, + 0.003485195, + 0.016366435, + -0.026002685, + 0.06318523, + -0.050691966, + -0.09513729, + -0.064722195, + -0.06132966, + -0.020495446, + 0.014939301, + 0.054761756, + 0.028909337, + -0.0023375573, + 0.042052656, + 0.022837669, + 0.0230999, + 0.03036407, + 0.018764673, + 0.072496034, + -0.036595833, + -0.036863085, + 0.028396215, + -0.091672495, + -0.08657466, + 0.047359336, + -0.055880774, + 0.0070424355, + 0.069609754, + 0.043904763, + 0.07389961, + -0.0059867557, + 0.116695836, + -0.03913718, + 0.036678135, + -0.010901363, + 0.08819442, + 0.03855831, + 0.07974421, + -0.051924232, + -0.10385839, + -0.033763383, + 0.019493395, + 0.050680365, + -0.058339395, + -0.02083137, + -0.08609875, + 0.017414644, + -0.063257225, + -0.056500446, + 0.023052368, + 0.04622413, + -0.018110551, + -0.007981176, + -0.024779806, + 0.0448911, + -0.08686634, + 0.06575812, + -0.04816167, + 0.049937073, + -0.04870519, + 0.078450456, + 0.06596584, + 0.026573703, + -0.054720048, + -0.016695132, + -0.06281992, + -0.033874605, + -0.034129698, + -0.018373003, + -0.050729766, + 0.037208032, + 0.08663066, + 0.0057553193, + 0.018936101, + 0.0683749, + -0.019277481, + -0.111216776, + -0.08299779, + 0.064380944, + -0.023994485, + 0.02228393, + 0.037532013, + 0.027998803, + 0.010780377, + -0.02866339, + 0.035218086, + 0.040947795, + 0.047251962, + 0.022822948, + -0.04361859, + 0.03929657, + -0.02838609, + 0.06326206, + 0.061787914, + -0.06487332, + -0.05326772, + 0.08467877, + -0.037987698, + -0.030701924, + 0.03693124, + 0.079549454, + -0.06695752, + 0.038511194, + 0.059876252, + -0.04255189, + 0.04926685, + -0.06254431, + -0.056073554, + 0.0059021385, + 0.06375891, + -0.028473105, + -0.020516206, + 0.053688798, + -0.0505003, + -0.013776076, + -0.056746498, + -0.074674286, + -0.036429465, + -0.078277834, + -0.033130404, + 0.026524864, + 0.010027121, + -0.052846454, + -0.03245234, + -0.0045730877, + 0.06279463, + -0.09209112, + 0.030202646, + -0.027974173, + -0.018735087, + 0.0051772078, + -0.034461137, + 0.031503055, + 0.024202514, + -0.0384219, + -0.028417397, + -0.0141932685, + -0.01493018, + 0.05603126, + 0.032856, + 0.0636288, + -0.08880921, + 0.0027978476, + 0.07799859, + -0.0328014, + 0.1109901, + 0.103224635, + 0.021524789, + 0.06495574, + 0.008971255, + -0.025503872, + -0.05471651, + -0.037969336, + -0.052947987, + 0.025896605, + 0.040142477, + -0.04655958, + -0.037604652, + -0.04057517, + 0.024616593, + 0.10586181, + -0.018084457, + -0.045486886, + 0.043346837, + 0.040528644, + 0.07145432, + 0.06723152, + 0.0444014, + 0.039035454, + -0.01685273, + 0.09862476, + -0.04053366, + -0.011219273, + 9.4339694E-4, + -0.04893209, + 0.08255836, + -0.06254635, + 0.0643953, + 0.057366677, + -0.035574544, + 0.05627519, + -0.053370558, + 0.07825556, + -0.0464488, + -0.06944344, + 0.06384285, + 0.022012226, + -0.059294943, + 0.015924655, + 0.015040029, + -0.024862552, + 0.0372234, + 0.07461155, + 0.037966266, + 
0.05571149, (...)
+            ]
+          },
+          {
+            "start_offset": 7274,
+            "end_offset": 8428,
+            "embeddings": [
+              (...)
+            ]
+          },
+          {
+            "start_offset": 8427,
+            "end_offset": 9687,
+            "embeddings": [
+              (...)
+            ]
+          },
+          {
+            "start_offset": 9554,
+            "end_offset": 10460,
+            "embeddings": [
+              (...)
+            ]
+          },
+          {
+            "start_offset": 10459,
+            "end_offset": 11696,
+            "embeddings": [
+              (...)
+            ]
+          },
+          {
+            "start_offset": 11635,
+            "end_offset": 12404,
+            "embeddings": [
+              (...)
+            ]
+          }
+        ]
+      }
+    }
+  },
+  "field": "CHAPTER 1\n\n\n\nLOOMINGS\n\n\n\nCall me Ishmael. Some years ago--never mind how long precisely--having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen, and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people's hats off--then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost all men in their degree, some time or other, cherish very nearly the same feelings towards the ocean with me.\n\nThere now is your insular city of the Manhattoes, belted round by wharves as Indian isles by coral reefs--commerce surrounds it with her surf. Right and left, the streets take you waterward. 
Its extreme down-town is the battery, where that noble mole is washed by waves, and cooled by breezes, which a few hours previous were out of sight of land. Look at the crowds of water-gazers there.\n\nCircumambulate the city of a dreamy Sabbath afternoon. Go from Corlears Hook to Coenties Slip, and from thence, by Whitehall, northward. What do you see?--Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries. Some leaning against the spiles; some seated upon the pier-heads; some looking over the bulwarks of ships from China; some high aloft in the rigging, as if striving to get a still better seaward peep. But these are all landsmen; of week days pent up in lath and plaster--tied to counters, nailed to benches, clinched to desks. How then is this? Are the green fields gone? What do they here?\n\nBut look! here come more crowds, pacing straight for the water, and seemingly bound for a dive. Strange! Nothing will content them but the extremest limit of the land; loitering under the shady lee of yonder warehouses will not suffice. No. They must get just as nigh the water as they possibly can without falling in. And there they stand--miles of them--leagues. Inlanders all, they come from lanes and alleys, streets and avenues--north, east, south, and west. Yet here they all unite. Tell me, does the magnetic virtue of the needles of the compasses of all those ships attract them thither?\n\nOnce more. Say, you are in the country; in some high land of lakes. Take almost any path you please, and ten to one it carries you down in a dale, and leaves you there by a pool in the stream. There is magic in it. Let the most absentminded of men be plunged in his deepest reveries--stand that man on his legs, set his feet a-going, and he will infallibly lead you to water, if water there be in all that region. Should you ever be athirst in the great American desert, try this experiment, if your caravan happen to be supplied with a metaphysical professor. Yes, as every one knows, meditation and water are wedded for ever.\n\nBut here is an artist. He desires to paint you the dreamiest, shadiest, quietest, most enchanting bit of romantic landscape in all the valley of the Saco. What is the chief element he employs? There stand his trees, each with a hollow trunk, as if a hermit and a crucifix were within; and here sleeps his meadow, and there sleep his cattle; and up from yonder cottage goes a sleepy smoke. Deep into distant woodlands winds a mazy way, reaching to overlapping spurs of mountains bathed in their hill-side blue. But though the picture lies thus tranced, and though this pine-tree shakes down its sighs like leaves upon this shepherd's head, yet all were vain, unless the shepherd's eye were fixed upon the magic stream before him. Go visit the Prairies in June, when for scores on scores of miles you wade knee-deep among Tiger-lilies--what is the one charm wanting?--Water--there is not a drop of water there! Were Niagara but a cataract of sand, would you travel your thousand miles to see it? Why did the poor poet of Tennessee, upon suddenly receiving two handfuls of silver, deliberate whether to buy him a coat, which he sadly needed, or invest his money in a pedestrian trip to Rockaway Beach? Why is almost every robust healthy boy with a robust healthy soul in him, at some time or other crazy to go to sea? 
Why upon your first voyage as a passenger, did you yourself feel such a mystical vibration, when first told that you and your ship were now out of sight of land? Why did the old Persians hold the sea holy? Why did the Greeks give it a separate deity, and own brother of Jove? Surely all this is not without meaning. And still deeper the meaning of that story of Narcissus, who because he could not grasp the tormenting, mild image he saw in the fountain, plunged into it and was drowned. But that same image, we ourselves see in all rivers and oceans. It is the image of the ungraspable phantom of life; and this is the key to it all.\n\nNow, when I say that I am in the habit of going to sea whenever I begin to grow hazy about the eyes, and begin to be over conscious of my lungs, I do not mean to have it inferred that I ever go to sea as a passenger. For to go as a passenger you must needs have a purse, and a purse is but a rag unless you have something in it. Besides, passengers get sea-sick--grow quarrelsome--don't sleep of nights--do not enjoy themselves much, as a general thing;--no, I never go as a passenger; nor, though I am something of a salt, do I ever go to sea as a Commodore, or a Captain, or a Cook. I abandon the glory and distinction of such offices to those who like them. For my part, I abominate all honorable respectable toils, trials, and tribulations of every kind whatsoever. It is quite as much as I can do to take care of myself, without taking care of ships, barques, brigs, schooners, and what not. And as for going as cook,--though I confess there is considerable glory in that, a cook being a sort of officer on ship-board--yet, somehow, I never fancied broiling fowls;--though once broiled, judiciously buttered, and judgmatically salted and peppered, there is no one who will speak more respectfully, not to say reverentially, of a broiled fowl than I will. It is out of the idolatrous dotings of the old Egyptians upon broiled ibis and roasted river horse, that you see the mummies of those creatures in their huge bake-houses the pyramids.\n\nNo, when I go to sea, I go as a simple sailor, right before the mast, plumb down into the forecastle, aloft there to the royal mast-head. True, they rather order me about some, and make me jump from spar to spar, like a grasshopper in a May meadow. And at first, this sort of thing is unpleasant enough. It touches one's sense of honor, particularly if you come of an old established family in the land, the Van Rensselaers, or Randolphs, or Hardicanutes. And more than all, if just previous to putting your hand into the tar-pot, you have been lording it as a country schoolmaster, making the tallest boys stand in awe of you. The transition is a keen one, I assure you, from a schoolmaster to a sailor, and requires a strong decoction of Seneca and the Stoics to enable you to grin and bear it. But even this wears off in time.\n\nWhat of it, if some old hunks of a sea-captain orders me to get a broom and sweep down the decks? What does that indignity amount to, weighed, I mean, in the scales of the New Testament? Do you think the archangel Gabriel thinks anything the less of me, because I promptly and respectfully obey that old hunks in that particular instance? Who ain't a slave? Tell me that. 
Well, then, however the old sea-captains may order me about--however they may thump and punch me about, I have the satisfaction of knowing that it is all right; that everybody else is one way or other served in much the same way--either in a physical or metaphysical point of view, that is; and so the universal thump is passed round, and all hands should rub each other's shoulder-blades, and be content.\n\nAgain, I always go to sea as a sailor, because they make a point of paying me for my trouble, whereas they never pay passengers a single penny that I ever heard of. On the contrary, passengers themselves must pay. And there is all the difference in the world between paying and being paid. The act of paying is perhaps the most uncomfortable infliction that the two orchard thieves entailed upon us. But being paid,--what will compare with it? The urbane activity with which a man receives money is really marvellous, considering that we so earnestly believe money to be the root of all earthly ills, and that on no account can a monied man enter heaven. Ah! how cheerfully we consign ourselves to perdition!\n\nFinally, I always go to sea as a sailor, because of the wholesome exercise and pure air of the forecastle deck. For as in this world, head winds are far more prevalent than winds from astern (that is, if you never violate the Pythagorean maxim), so for the most part the Commodore on the quarter-deck gets his atmosphere at second hand from the sailors on the forecastle. He thinks he breathes it first; but not so. In much the same way do the commonalty lead their leaders in many other things, at the same time that the leaders little suspect it. But wherefore it was that after having repeatedly smelt the sea as a merchant sailor, I should now take it into my head to go on a whaling voyage; this the invisible police officer of the Fates, who has the constant surveillance of me, and secretly dogs me, and influences me in some unaccountable way--he can better answer than any one else. And, doubtless, my going on this whaling voyage, formed part of the grand programme of Providence that was drawn up a long time ago. It came in as a sort of brief interlude and solo between more extensive performances. I take it that this part of the bill must have run something like this:\n\n\n\n\"grand contested election for the presidency of the united states.\n\n\"whaling voyage by one ishmael.\n\n\"bloody battle in afghanistan.\"\n\n\n\nThough I cannot tell why it was exactly that those stage managers, the Fates, put me down for this shabby part of a whaling voyage, when others were set down for magnificent parts in high tragedies, and short and easy parts in genteel comedies, and jolly parts in farces--though I cannot tell why this was exactly; yet, now that I recall all the circumstances, I think I can see a little into the springs and motives which being cunningly presented to me under various disguises, induced me to set about performing the part I did, besides cajoling me into the delusion that it was a choice resulting from my own unbiased freewill and discriminating judgment.\n\nChief among these motives was the overwhelming idea of the great whale himself. Such a portentous and mysterious monster roused all my curiosity. Then the wild and distant seas where he rolled his island bulk; the undeliverable, nameless perils of the whale; these, with all the attending marvels of a thousand Patagonian sights and sounds, helped to sway me to my wish. 
With other men, perhaps, such things would not have been inducements; but as for me, I am tormented with an everlasting itch for things remote. I love to sail forbidden seas, and land on barbarous coasts. Not ignoring what is good, I am quick to perceive a horror, and could still be social with it--would they let me--since it is but well to be on friendly terms with all the inmates of the place one lodges in.\n\nBy reason of these things, then, the whaling voyage was welcome; the great flood-gates of the wonder-world swung open, and in the wild conceits that swayed me to my purpose, two and two there floated into my inmost soul, endless processions of the whale, and, mid most of them all, one grand hooded phantom, like a snow hill in the air.\n\nCopyright © 1967 by Bantam Books\n\nPublisher\nModern Library\n\nCategories\nClassic Fiction\nLiterary Fiction\nFiction\nClassics\n\n\nAbout Moby-Dick\n\nAbout Herman Melville"
+}
\ No newline at end of file
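The fixture above ends with the new chunk layout: instead of repeating the chunk text, each chunk stores `start_offset`/`end_offset` positions into the parent `field` string. A minimal sketch of how such offsets can be resolved back to text, assuming half-open character ranges (the helper below is illustrative and not part of this change):

[source,python]
------------------------------------------------------------
# Minimal sketch: resolve offset-based chunks back to their text,
# assuming offsets are half-open character ranges into the field value.

def chunk_text(field_text: str, start_offset: int, end_offset: int) -> str:
    """Slice the parent field's text using a chunk's stored offsets."""
    return field_text[start_offset:end_offset]

# Offsets taken from the fixture above. Note that consecutive chunks may
# overlap slightly (e.g. 8427 < 8428), as expected with windowed chunking.
offsets = [(7274, 8428), (8427, 9687), (9554, 10460), (10459, 11696), (11635, 12404)]
------------------------------------------------------------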
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml
index 882f1df03e926..853fbc583ed9e 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml
@@ -7,6 +7,11 @@ setup:
       indices.create:
         index: test-index
         body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
           mappings:
             properties:
               sparse_field:
@@ -18,7 +23,6 @@ setup:
 ---
 "Indexes sparse vector document":
-
   # Checks mapping is not updated until first doc arrives
   - do:
       indices.get_mapping:
@@ -33,25 +37,28 @@ setup:
         index: test-index
         id: doc_1
         body:
-          sparse_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          sparse_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.sparse_field:
             inference:
               inference_id: sparse-inference-id
               model_settings:
                 task_type: sparse_embedding
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings:
-                    feature_0: 1.0
-                    feature_1: 2.0
-                    feature_2: 3.0
-                    feature_3: 4.0
-                - text: "He's free to go around"
-                  embeddings:
-                    feature_4: 0.1
-                    feature_5: 0.2
-                    feature_6: 0.3
-                    feature_7: 0.4
+                sparse_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings:
+                      feature_0: 1.0
+                      feature_1: 2.0
+                      feature_2: 3.0
+                      feature_3: 4.0
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings:
+                      feature_4: 0.1
+                      feature_5: 0.2
+                      feature_6: 0.3
+                      feature_7: 0.4
 
   # Checks mapping is updated when first doc arrives
   - do:
@@ -65,7 +72,6 @@ setup:
 ---
 "Field caps with sparse embedding":
-
   - requires:
       cluster_features: "gte_v8.16.0"
       reason: field_caps support for semantic_text added in 8.16.0
@@ -95,25 +101,28 @@ setup:
         index: test-index
         id: doc_1
         body:
-          sparse_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          sparse_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.sparse_field:
             inference:
               inference_id: sparse-inference-id
               model_settings:
                 task_type: sparse_embedding
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings:
-                    feature_0: 1.0
-                    feature_1: 2.0
-                    feature_2: 3.0
-                    feature_3: 4.0
-                - text: "He's free to go around"
-                  embeddings:
-                    feature_4: 0.1
-                    feature_5: 0.2
-                    feature_6: 0.3
-                    feature_7: 0.4
+                sparse_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings:
+                      feature_0: 1.0
+                      feature_1: 2.0
+                      feature_2: 3.0
+                      feature_3: 4.0
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings:
+                      feature_4: 0.1
+                      feature_5: 0.2
+                      feature_6: 0.3
+                      feature_7: 0.4
         refresh: true
 
   - do:
@@ -141,7 +150,6 @@ setup:
 ---
 "Indexes dense vector document":
-
   # Checks mapping is not updated until first doc arrives
   - do:
       indices.get_mapping:
@@ -156,8 +164,8 @@ setup:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          dense_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -166,10 +174,13 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
-                - text: "He's free to go around"
-                  embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
 
   # Checks mapping is updated when first doc arrives
   - do:
@@ -183,7 +194,6 @@ setup:
 ---
 "Field caps with text embedding":
-
   - requires:
       cluster_features: "gte_v8.16.0"
       reason: field_caps support for semantic_text added in 8.16.0
@@ -213,8 +223,8 @@ setup:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          dense_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -223,10 +233,13 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ]
-                - text: "He's free to go around"
-                  embeddings: [ 0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896 ]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
         refresh: true
 
   - do:
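With `index.mapping.semantic_text.use_legacy_format: false`, the tests above send the raw text on the `semantic_text` field itself and carry inference results in the `_inference_fields` metadata, keyed by field name and then by offset-based chunks. For reference, the same request body written out as plain JSON; this is a sketch of what the YAML builds, with values copied from the test, not an additional API:

[source,python]
------------------------------------------------------------
import json

# Sketch of the indexing request body used by the non-legacy tests above.
doc = {
    "sparse_field": "these are not the droids you're looking for. He's free to go around",
    "_inference_fields.sparse_field": {
        "inference": {
            "inference_id": "sparse-inference-id",
            "model_settings": {"task_type": "sparse_embedding"},
            "chunks": {
                "sparse_field": [
                    {
                        "start_offset": 0,
                        "end_offset": 44,
                        "embeddings": {"feature_0": 1.0, "feature_1": 2.0,
                                       "feature_2": 3.0, "feature_3": 4.0},
                    },
                    {
                        "start_offset": 44,
                        "end_offset": 67,
                        "embeddings": {"feature_4": 0.1, "feature_5": 0.2,
                                       "feature_6": 0.3, "feature_7": 0.4},
                    },
                ]
            },
        }
    },
}
print(json.dumps(doc, indent=2))
------------------------------------------------------------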
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml
new file mode 100644
index 0000000000000..6984c0f67053d
--- /dev/null
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml
@@ -0,0 +1,254 @@
+setup:
+  - requires:
+      cluster_features: "gte_v8.15.0"
+      reason: semantic_text introduced in 8.15.0
+
+  - do:
+      indices.create:
+        index: test-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: true
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              dense_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+
+---
+"Indexes sparse vector document":
+  # Checks mapping is not updated until first doc arrives
+  - do:
+      indices.get_mapping:
+        index: test-index
+
+  - match: { "test-index.mappings.properties.sparse_field.type": semantic_text }
+  - match: { "test-index.mappings.properties.sparse_field.inference_id": sparse-inference-id }
+  - length: { "test-index.mappings.properties.sparse_field": 2 }
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field:
+            text: "these are not the droids you're looking for. He's free to go around"
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                - text: "these are not the droids you're looking for"
+                  embeddings:
+                    feature_0: 1.0
+                    feature_1: 2.0
+                    feature_2: 3.0
+                    feature_3: 4.0
+                - text: "He's free to go around"
+                  embeddings:
+                    feature_4: 0.1
+                    feature_5: 0.2
+                    feature_6: 0.3
+                    feature_7: 0.4
+
+  # Checks mapping is updated when first doc arrives
+  - do:
+      indices.get_mapping:
+        index: test-index
+
+  - match: { "test-index.mappings.properties.sparse_field.type": semantic_text }
+  - match: { "test-index.mappings.properties.sparse_field.inference_id": sparse-inference-id }
+  - match: { "test-index.mappings.properties.sparse_field.model_settings.task_type": sparse_embedding }
+  - length: { "test-index.mappings.properties.sparse_field": 3 }
+
+---
+"Field caps with sparse embedding":
+  - requires:
+      cluster_features: "gte_v8.16.0"
+      reason: field_caps support for semantic_text added in 8.16.0
+
+  - do:
+      field_caps:
+        include_empty_fields: true
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - exists: fields.sparse_field
+  - exists: fields.dense_field
+
+  - do:
+      field_caps:
+        include_empty_fields: false
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - not_exists: fields.sparse_field
+  - not_exists: fields.dense_field
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field:
+            text: "these are not the droids you're looking for. He's free to go around"
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                - text: "these are not the droids you're looking for"
+                  embeddings:
+                    feature_0: 1.0
+                    feature_1: 2.0
+                    feature_2: 3.0
+                    feature_3: 4.0
+                - text: "He's free to go around"
+                  embeddings:
+                    feature_4: 0.1
+                    feature_5: 0.2
+                    feature_6: 0.3
+                    feature_7: 0.4
+        refresh: true
+
+  - do:
+      field_caps:
+        include_empty_fields: true
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - exists: fields.sparse_field
+  - exists: fields.dense_field
+  - match: { fields.sparse_field.semantic_text.searchable: true }
+  - match: { fields.dense_field.semantic_text.searchable: true }
+
+  - do:
+      field_caps:
+        include_empty_fields: false
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - exists: fields.sparse_field
+  - not_exists: fields.dense_field
+  - match: { fields.sparse_field.semantic_text.searchable: true }
+
+---
+"Indexes dense vector document":
+  # Checks mapping is not updated until first doc arrives
+  - do:
+      indices.get_mapping:
+        index: test-index
+
+  - match: { "test-index.mappings.properties.dense_field.type": semantic_text }
+  - match: { "test-index.mappings.properties.dense_field.inference_id": dense-inference-id }
+  - length: { "test-index.mappings.properties.dense_field": 2 }
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          dense_field:
+            text: "these are not the droids you're looking for. He's free to go around"
+            inference:
+              inference_id: dense-inference-id
+              model_settings:
+                task_type: text_embedding
+                dimensions: 4
+                similarity: cosine
+                element_type: float
+              chunks:
+                - text: "these are not the droids you're looking for"
+                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                - text: "He's free to go around"
+                  embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
+
+  # Checks mapping is updated when first doc arrives
+  - do:
+      indices.get_mapping:
+        index: test-index
+
+  - match: { "test-index.mappings.properties.dense_field.type": semantic_text }
+  - match: { "test-index.mappings.properties.dense_field.inference_id": dense-inference-id }
+  - match: { "test-index.mappings.properties.dense_field.model_settings.task_type": text_embedding }
+  - length: { "test-index.mappings.properties.dense_field": 3 }
+
+---
+"Field caps with text embedding":
+  - requires:
+      cluster_features: "gte_v8.16.0"
+      reason: field_caps support for semantic_text added in 8.16.0
+
+  - do:
+      field_caps:
+        include_empty_fields: true
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - exists: fields.sparse_field
+  - exists: fields.dense_field
+
+  - do:
+      field_caps:
+        include_empty_fields: false
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - not_exists: fields.sparse_field
+  - not_exists: fields.dense_field
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          dense_field:
+            text: "these are not the droids you're looking for. He's free to go around"
+            inference:
+              inference_id: dense-inference-id
+              model_settings:
+                task_type: text_embedding
+                dimensions: 4
+                similarity: cosine
+                element_type: float
+              chunks:
+                - text: "these are not the droids you're looking for"
+                  embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ]
+                - text: "He's free to go around"
+                  embeddings: [ 0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896 ]
+        refresh: true
+
+  - do:
+      field_caps:
+        include_empty_fields: true
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - exists: fields.sparse_field
+  - exists: fields.dense_field
+  - match: { fields.sparse_field.semantic_text.searchable: true }
+  - match: { fields.dense_field.semantic_text.searchable: true }
+
+  - do:
+      field_caps:
+        include_empty_fields: false
+        index: test-index
+        fields: "*"
+
+  - match: { indices: [ "test-index" ] }
+  - not_exists: fields.sparse_field
+  - exists: fields.dense_field
+  - match: { fields.dense_field.semantic_text.searchable: true }
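The BWC file above keeps the legacy layout, where each chunk repeats its own text. One way to see how the two formats relate is to derive offsets from legacy chunks by locating each chunk's text inside the field value; this converter is purely illustrative and assumes every chunk's text occurs verbatim in the field:

[source,python]
------------------------------------------------------------
# Illustrative only: derive offset-based chunks from legacy text-based
# chunks by searching for each chunk's text in the field value.

def legacy_chunks_to_offsets(field_text: str, legacy_chunks: list[dict]) -> list[dict]:
    converted = []
    cursor = 0
    for chunk in legacy_chunks:
        start = field_text.index(chunk["text"], cursor)  # raises ValueError if absent
        end = start + len(chunk["text"])
        converted.append({
            "start_offset": start,
            "end_offset": end,
            "embeddings": chunk["embeddings"],
        })
        cursor = start  # allow overlapping chunk windows
    return converted

text = "these are not the droids you're looking for. He's free to go around"
legacy = [
    {"text": "these are not the droids you're looking for", "embeddings": {"feature_0": 1.0}},
    {"text": "He's free to go around", "embeddings": {"feature_4": 0.1}},
]
print(legacy_chunks_to_offsets(text, legacy))
# The derived boundaries (0-43 and 46-68 here) need not match the 0/44 and
# 44/67 values used by the new-format tests; those offsets are fixtures.
------------------------------------------------------------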
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping.yml
index 3d46c3b23d7e3..39f3a1641c18c 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping.yml
@@ -7,6 +7,11 @@ setup:
       indices.create:
         index: test-index
         body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
           mappings:
             properties:
               sparse_field:
@@ -22,27 +27,30 @@ setup:
         index: test-index
         id: doc_1
         body:
-          sparse_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          sparse_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.sparse_field:
             inference:
               inference_id: sparse-inference-id
               model_settings:
                 task_type: sparse_embedding
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings:
-                    feature_0: 1.0
-                    feature_1: 2.0
-                    feature_2: 3.0
-                    feature_3: 4.0
-                - text: "He's free to go around"
-                  embeddings:
-                    feature_4: 0.1
-                    feature_5: 0.2
-                    feature_6: 0.3
-                    feature_7: 0.4
+                sparse_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings:
+                      feature_0: 1.0
+                      feature_1: 2.0
+                      feature_2: 3.0
+                      feature_3: 4.0
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings:
+                      feature_4: 0.1
+                      feature_5: 0.2
+                      feature_6: 0.3
+                      feature_7: 0.4
-          dense_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          dense_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -51,23 +59,25 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
-                - text: "He's free to go around"
-                  embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896]
 
 ---
 "Fails for non-compatible dimensions":
-
   - do:
       catch: /Incompatible model settings for field \[dense_field\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "other text"
+          dense_field: "other text"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -76,20 +86,21 @@ setup:
                 task_type: text_embedding
                 dimensions: 5
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "other text"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416, 0.053438711911439896]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 10
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416, 0.053438711911439896]
 
 ---
 "Fails for non-compatible inference id":
-
   - do:
       catch: /The configured inference_id \[a-different-inference-id\] for field \[dense_field\] doesn't match the inference_id \[dense-inference-id\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "other text"
+          dense_field: "other text"
+          _inference_fields.dense_field:
             inference:
               inference_id: a-different-inference-id
               model_settings:
@@ -98,20 +109,21 @@ setup:
                 task_type: text_embedding
                 dimensions: 4
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "other text"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 10
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
 
 ---
 "Fails for non-compatible similarity":
-
   - do:
       catch: /Incompatible model settings for field \[dense_field\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "other text"
+          dense_field: "other text"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -120,20 +132,21 @@ setup:
                 similarity: dot_product
                 element_type: float
               chunks:
-                - text: "other text"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 10
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
 
 ---
 "Fails for non-compatible element type":
-
   - do:
       catch: /Incompatible model settings for field \[dense_field\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "other text"
+          dense_field: "other text"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -142,43 +155,45 @@ setup:
                 similarity: cosine
                 element_type: byte
               chunks:
-                - text: "other text"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 10
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
 
 ---
 "Fails for non-compatible task type for dense vectors":
-
   - do:
       catch: /Incompatible model settings for field \[dense_field\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "other text"
+          dense_field: "other text"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
                 task_type: sparse_embedding
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings:
-                    feature_0: 1.0
-                    feature_1: 2.0
-                    feature_2: 3.0
-                    feature_3: 4.0
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 10
+                    embeddings:
+                      feature_0: 1.0
+                      feature_1: 2.0
+                      feature_2: 3.0
+                      feature_3: 4.0
 
 ---
 "Fails for non-compatible task type for sparse vectors":
-
   - do:
       catch: /Incompatible model settings for field \[sparse_field\].+/
       index:
         index: test-index
         id: doc_2
         body:
-          sparse_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          sparse_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.sparse_field:
             inference:
               inference_id: sparse-inference-id
               model_settings:
@@ -187,20 +202,21 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "these are not the droids you're looking for"
-                  embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
+                sparse_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416]
 
 ---
 "Fails for missing dense vector inference results in chunks":
-
   - do:
-      catch: /failed to parse field \[dense_field\] of type \[semantic_text\]/
+      catch: /failed to parse field \[_inference_fields\] of type \[_inference_fields\]/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          dense_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -209,37 +225,39 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - text: "these are not the droids you're looking for"
+                dense_field:
+                  - start_offset: 0
+                    end_offset: 44
 
 ---
 "Fails for missing sparse vector inference results in chunks":
-
   - do:
-      catch: /failed to parse field \[sparse_field\] of type \[semantic_text\]/
+      catch: /failed to parse field \[_inference_fields\] of type \[_inference_fields\]/
       index:
         index: test-index
        id: doc_2
         body:
-          sparse_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          sparse_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.sparse_field:
             inference:
               inference_id: sparse-inference-id
               model_settings:
                 task_type: sparse_embedding
               chunks:
-                - text: "these are not the droids you're looking for"
+                sparse_field:
+                  - start_offset: 0
+                    end_offset: 44
 
 ---
-"Fails for missing text in chunks":
-
+"Fails for missing offsets in chunks":
   - do:
-      catch: /failed to parse field \[dense_field\] of type \[semantic_text\]/
+      catch: /failed to parse field \[_inference_fields\] of type \[_inference_fields\]/
       index:
         index: test-index
         id: doc_2
         body:
-          dense_field:
-            text: "these are not the droids you're looking for. He's free to go around"
+          dense_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.dense_field:
             inference:
               inference_id: dense-inference-id
               model_settings:
@@ -248,5 +266,6 @@ setup:
                 similarity: cosine
                 element_type: float
               chunks:
-                - embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ]
+                dense_field:
+                  - embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ]
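The failure cases above all reduce to one invariant: once a `semantic_text` field has recorded model settings, later documents must agree on task type, dimensions, similarity, and element type. A sketch of that comparison in Python; the real check lives in the Elasticsearch inference plugin, and the function and message below are illustrative:

[source,python]
------------------------------------------------------------
# Illustrative sketch of the invariant exercised by the failure tests:
# previously recorded model settings must match those sent with a new doc.
COMPARED_KEYS = ("task_type", "dimensions", "similarity", "element_type")


def check_model_settings(field: str, existing: dict, incoming: dict) -> None:
    mismatches = [k for k in COMPARED_KEYS if existing.get(k) != incoming.get(k)]
    if mismatches:
        raise ValueError(
            f"Incompatible model settings for field [{field}]: "
            f"differs on {', '.join(mismatches)}"
        )


# Mirrors "Fails for non-compatible dimensions": 4 vs 5 dimensions.
check_model_settings(
    "dense_field",
    {"task_type": "text_embedding", "dimensions": 4,
     "similarity": "cosine", "element_type": "float"},
    {"task_type": "text_embedding", "dimensions": 5,
     "similarity": "cosine", "element_type": "float"},
)  # raises ValueError
------------------------------------------------------------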
He's free to go around" + _inference_fields.dense_field: inference: inference_id: dense-inference-id model_settings: @@ -209,37 +225,39 @@ setup: similarity: cosine element_type: float chunks: - - text: "these are not the droids you're looking for" + dense_field: + - start_offset: 0 + end_offset: 44 --- "Fails for missing sparse vector inference results in chunks": - - do: - catch: /failed to parse field \[sparse_field\] of type \[semantic_text\]/ + catch: /failed to parse field \[_inference_fields\] of type \[_inference_fields\]/ index: index: test-index id: doc_2 body: - sparse_field: - text: "these are not the droids you're looking for. He's free to go around" + sparse_field: "these are not the droids you're looking for. He's free to go around" + _inference_fields.sparse_field: inference: inference_id: sparse-inference-id model_settings: task_type: sparse_embedding chunks: - - text: "these are not the droids you're looking for" + sparse_field: + - start_offset: 0 + end_offset: 44 --- -"Fails for missing text in chunks": - +"Fails for missing offsets in chunks": - do: - catch: /failed to parse field \[dense_field\] of type \[semantic_text\]/ + catch: /failed to parse field \[_inference_fields\] of type \[_inference_fields\]/ index: index: test-index id: doc_2 body: - dense_field: - text: "these are not the droids you're looking for. He's free to go around" + dense_field: "these are not the droids you're looking for. He's free to go around" + _inference_fields.dense_field: inference: inference_id: dense-inference-id model_settings: @@ -248,5 +266,6 @@ setup: similarity: cosine element_type: float chunks: - - embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ] + dense_field: + - embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ] diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping_bwc.yml new file mode 100644 index 0000000000000..ffdb2a24c3dcf --- /dev/null +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/20_semantic_text_field_mapping_incompatible_field_mapping_bwc.yml @@ -0,0 +1,248 @@ +setup: + - requires: + cluster_features: "gte_v8.15.0" + reason: semantic_text introduced in 8.15.0 + + - do: + indices.create: + index: test-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + + # Indexes a doc with inference results to update mappings + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: + text: "these are not the droids you're looking for. He's free to go around" + inference: + inference_id: sparse-inference-id + model_settings: + task_type: sparse_embedding + chunks: + - text: "these are not the droids you're looking for" + embeddings: + feature_0: 1.0 + feature_1: 2.0 + feature_2: 3.0 + feature_3: 4.0 + - text: "He's free to go around" + embeddings: + feature_4: 0.1 + feature_5: 0.2 + feature_6: 0.3 + feature_7: 0.4 + dense_field: + text: "these are not the droids you're looking for. 
He's free to go around" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: float + chunks: + - text: "these are not the droids you're looking for" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416] + - text: "He's free to go around" + embeddings: [0.00641461368650198, -0.0016253676731139421, -0.05126338079571724, 0.053438711911439896] + + +--- +"Fails for non-compatible dimensions": + - do: + catch: /Incompatible model settings for field \[dense_field\].+/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "other text" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 5 + similarity: cosine + element_type: float + chunks: + - text: "other text" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416, 0.053438711911439896] + +--- +"Fails for non-compatible inference id": + - do: + catch: /The configured inference_id \[a-different-inference-id\] for field \[dense_field\] doesn't match the inference_id \[dense-inference-id\].+/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "other text" + inference: + inference_id: a-different-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: float + chunks: + - text: "other text" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416] + +--- +"Fails for non-compatible similarity": + - do: + catch: /Incompatible model settings for field \[dense_field\].+/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "other text" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: dot_product + element_type: float + chunks: + - text: "other text" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416] + +--- +"Fails for non-compatible element type": + - do: + catch: /Incompatible model settings for field \[dense_field\].+/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "other text" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: byte + chunks: + - text: "other text" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416] + +--- +"Fails for non-compatible task type for dense vectors": + - do: + catch: /Incompatible model settings for field \[dense_field\].+/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "other text" + inference: + inference_id: dense-inference-id + model_settings: + task_type: sparse_embedding + chunks: + - text: "these are not the droids you're looking for" + embeddings: + feature_0: 1.0 + feature_1: 2.0 + feature_2: 3.0 + feature_3: 4.0 + +--- +"Fails for non-compatible task type for sparse vectors": + - do: + catch: /Incompatible model settings for field \[sparse_field\].+/ + index: + index: test-index + id: doc_2 + body: + sparse_field: + text: "these are not the droids you're looking for. 
He's free to go around" + inference: + inference_id: sparse-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: float + chunks: + - text: "these are not the droids you're looking for" + embeddings: [0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416] + +--- +"Fails for missing dense vector inference results in chunks": + - do: + catch: /failed to parse field \[dense_field\] of type \[semantic_text\]/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "these are not the droids you're looking for. He's free to go around" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: float + chunks: + - text: "these are not the droids you're looking for" + +--- +"Fails for missing sparse vector inference results in chunks": + - do: + catch: /failed to parse field \[sparse_field\] of type \[semantic_text\]/ + index: + index: test-index + id: doc_2 + body: + sparse_field: + text: "these are not the droids you're looking for. He's free to go around" + inference: + inference_id: sparse-inference-id + model_settings: + task_type: sparse_embedding + chunks: + - text: "these are not the droids you're looking for" + +--- +"Fails for missing text in chunks": + - do: + catch: /failed to parse field \[dense_field\] of type \[semantic_text\]/ + index: + index: test-index + id: doc_2 + body: + dense_field: + text: "these are not the droids you're looking for. He's free to go around" + inference: + inference_id: dense-inference-id + model_settings: + task_type: text_embedding + dimensions: 4 + similarity: cosine + element_type: float + chunks: + - embeddings: [ 0.04673296958208084, -0.03237321600317955, -0.02543032355606556, 0.056035321205854416 ] + diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml index 534e4831c4a0a..1adf0d58de8e3 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml @@ -39,6 +39,11 @@ setup: indices.create: index: test-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -60,19 +65,30 @@ setup: sparse_field: "inference test" dense_field: "another inference test" non_inference_field: "non inference test" + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.non_inference_field: "non inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { 
hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } --- "Calculates sparse embedding and text embedding results for new documents with integer value": @@ -83,18 +99,30 @@ setup: body: sparse_field: 75 dense_field: 100 + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "75" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "75" } - - match: { _source.dense_field.text: "100" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "100" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 2 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 3 } --- "Calculates sparse embedding and text embedding results for new documents with boolean value": @@ -105,18 +133,30 @@ setup: body: sparse_field: true dense_field: false + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "true" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "true" } - - match: { _source.dense_field.text: "false" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "false" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { 
hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 4 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 5 } --- "Calculates sparse embedding and text embedding results for new documents with a collection of mixed data types": @@ -127,39 +167,48 @@ setup: body: sparse_field: [false, 75, "inference test", 13.49] dense_field: [true, 49.99, "another inference test", 5654] + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } - - length: { _source.sparse_field.text: 4 } - - match: { _source.sparse_field.text.0: "false" } - - match: { _source.sparse_field.text.1: "75" } - - match: { _source.sparse_field.text.2: "inference test" } - - match: { _source.sparse_field.text.3: "13.49" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - exists: _source.sparse_field.inference.chunks.1.embeddings - - exists: _source.sparse_field.inference.chunks.2.embeddings - - exists: _source.sparse_field.inference.chunks.3.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "false" } - - match: { _source.sparse_field.inference.chunks.1.text: "75" } - - match: { _source.sparse_field.inference.chunks.2.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.3.text: "13.49" } - - - length: { _source.dense_field.text: 4 } - - match: { _source.dense_field.text.0: "true" } - - match: { _source.dense_field.text.1: "49.99" } - - match: { _source.dense_field.text.2: "another inference test" } - - match: { _source.dense_field.text.3: "5654" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - exists: _source.dense_field.inference.chunks.1.embeddings - - exists: _source.dense_field.inference.chunks.2.embeddings - - exists: _source.dense_field.inference.chunks.3.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "true" } - - match: { _source.dense_field.inference.chunks.1.text: "49.99" } - - match: { _source.dense_field.inference.chunks.2.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.3.text: "5654" } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 4 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 5 } + - exists: 
hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 6 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 8 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.2.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.2.start_offset: 9 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.2.end_offset: 23 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.3.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.3.start_offset: 24 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.3.end_offset: 29 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 4 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 4 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.start_offset: 5 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.end_offset: 10 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.2.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.2.start_offset: 11 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.2.end_offset: 33 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.3.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.3.start_offset: 34 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.3.end_offset: 38 } --- "Inference fields do not create new mappings": @@ -197,6 +246,7 @@ setup: search: index: test-index body: + fields: [ _inference_fields ] query: nested: path: sparse_field.inference.chunks @@ -208,37 +258,24 @@ setup: - match: { hits.total.value: 2 } - match: { hits.total.relation: eq } - - length: { hits.hits.0._source.sparse_field.inference.chunks: 2 } - - length: { hits.hits.1._source.sparse_field.inference.chunks: 2 } - - # Search with inner hits - - do: - search: - _source: false - index: test-index - body: - query: - nested: - path: sparse_field.inference.chunks - inner_hits: - _source: false - fields: [ sparse_field.inference.chunks.text ] - query: - sparse_vector: - field: sparse_field.inference.chunks.embeddings - inference_id: sparse-inference-id - query: "you know, for testing" - - - match: { hits.total.value: 2 } - - match: { hits.total.relation: eq } - - match: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.total.value: 2 } - - match: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.total.relation: eq } - - - 
length: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.hits.0.fields.sparse_field\.inference\.chunks.0.text: 1 } - - length: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.hits.1.fields.sparse_field\.inference\.chunks.0.text: 1 } - - length: { hits.hits.1.inner_hits.sparse_field\.inference\.chunks.hits.hits.0.fields.sparse_field\.inference\.chunks.0.text: 1 } - - length: { hits.hits.1.inner_hits.sparse_field\.inference\.chunks.hits.hits.1.fields.sparse_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 21 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 22 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 37 } + + - length: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 } + - exists: hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 15 } + - exists: hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings + - match: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 16 } + - match: { hits.hits.1._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 } --- "Dense vector results are indexed as nested chunks and searchable": @@ -256,6 +293,7 @@ setup: search: index: test-index body: + fields: [ _inference_fields ] query: nested: path: dense_field.inference.chunks @@ -269,38 +307,24 @@ setup: - match: { hits.total.value: 2 } - match: { hits.total.relation: eq } - - length: { hits.hits.0._source.dense_field.inference.chunks: 2 } - - length: { hits.hits.1._source.dense_field.inference.chunks: 2 } - - # Search with inner hits - - do: - search: - _source: false - index: test-index - body: - query: - nested: - path: dense_field.inference.chunks - inner_hits: - _source: false - fields: [ dense_field.inference.chunks.text ] - query: - knn: - field: dense_field.inference.chunks.embeddings - query_vector_builder: - text_embedding: - model_id: dense-inference-id - model_text: "you know, for testing" - - - match: { hits.total.value: 2 } - - match: { hits.total.relation: eq } - - match: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.total.value: 2 } - - match: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.total.relation: eq } - - length: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.hits.0.fields.dense_field\.inference\.chunks.0.text: 1 } - - length: { 
hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.hits.1.fields.dense_field\.inference\.chunks.0.text: 1 } - - length: { hits.hits.1.inner_hits.dense_field\.inference\.chunks.hits.hits.0.fields.dense_field\.inference\.chunks.0.text: 1 } - - length: { hits.hits.1.inner_hits.dense_field\.inference\.chunks.hits.hits.1.fields.dense_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 2 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 21 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.start_offset: 22 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.1.end_offset: 37 } + + - length: { hits.hits.1._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field: 2 } + - exists: hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 15 } + - exists: hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.1.embeddings + - match: { hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.1.start_offset: 16 } + - match: { hits.hits.1._source._inference_fields.dense_field.inference.chunks.dense_field.1.end_offset: 35 } --- "Reindex works for semantic_text fields": @@ -312,22 +336,17 @@ setup: sparse_field: "inference test" dense_field: "another inference test" non_inference_field: "non inference test" - - - do: - get: - index: test-index - id: doc_1 - - - set: { _source.sparse_field.inference.chunks.0.embeddings: sparse_field_embedding } - - set: { _source.dense_field.inference.chunks.0.embeddings: dense_field_embedding } - - - do: - indices.refresh: { } + refresh: true - do: indices.create: index: destination-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -347,19 +366,42 @@ setup: index: test-index dest: index: destination-index + + - do: + indices.refresh: { } + - do: get: index: destination-index id: doc_1 - - match: { _source.sparse_field.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.embeddings: $sparse_field_embedding } - - match: { _source.dense_field.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.embeddings: $dense_field_embedding } + - match: { _source.sparse_field: "inference test" } + - match: { _source.dense_field: "another inference test" } - match: { _source.non_inference_field: "non inference test" } + - do: + search: + index: destination-index + body: + fields: [ 
_inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } + --- "Fails for non-existent inference": - do: @@ -399,6 +441,11 @@ setup: indices.create: index: test-copy-to-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -419,20 +466,32 @@ setup: source_field: "copy_to inference test" sparse_field: "inference test" another_source_field: "another copy_to inference test" + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "inference test" } - - length: { _source.sparse_field.inference.chunks: 3 } - - match: { _source.sparse_field.inference.chunks.0.text: "another copy_to inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.1.text: "copy_to inference test" } - - exists: _source.sparse_field.inference.chunks.1.embeddings - - match: { _source.sparse_field.inference.chunks.2.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.2.embeddings + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 3 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.end_offset: 22 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.another_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.another_source_field.0.embeddings + - match: { 
hits.hits.0._source._inference_fields.sparse_field.inference.chunks.another_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.another_source_field.0.end_offset: 30 } --- "Calculates embeddings for bulk operations - index": @@ -441,6 +500,7 @@ setup: body: - '{"index": {"_index": "test-index", "_id": "doc_1"}}' - '{"sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test"}' + refresh: true - match: { errors: false } - match: { items.0.index.result: "created" } @@ -450,21 +510,44 @@ setup: index: test-index id: doc_1 - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.sparse_field: "inference test" } + - match: { _source.dense_field: "another inference test" } - match: { _source.non_inference_field: "non inference test" } + - do: + search: + index: test-index + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } + --- "Update by query picks up new semantic_text fields": - - do: indices.create: index: mapping-update-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: dynamic: false properties: @@ -502,22 +585,43 @@ setup: - match: { updated: 1 } + - do: + indices.refresh: { } + - do: get: index: mapping-update-index id: doc_1 - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.sparse_field: "inference test" } + - match: { _source.dense_field: "another inference test" } - match: { _source.non_inference_field: "non inference test" } + - do: + search: + index: mapping-update-index + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 
} + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } + --- "Update by query works for scripts": - - do: index: index: test-index @@ -528,8 +632,26 @@ setup: dense_field: "another inference test" non_inference_field: "non inference test" + - do: + search: + index: test-index + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } + - do: update_by_query: + refresh: true wait_for_completion: true index: test-index body: { "script": "ctx._source.sparse_field = 'updated inference test'; ctx._source.dense_field = 'another updated inference test'" } @@ -541,12 +663,27 @@ setup: index: test-index id: doc_1 - - match: { _source.sparse_field.text: "updated inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.dense_field.text: "another updated inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.sparse_field: "updated inference test" } + - match: { _source.dense_field: "another updated inference test" } + + - do: + search: + index: test-index + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + # We can't directly check that the embeddings are different since there isn't a "does not match" assertion in the + # YAML test framework. Check that the start and end offsets change as expected as a proxy. 
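+      # As a concrete illustration from this test's own data: the script rewrites sparse_field from
+      # "inference test" (one chunk, offsets 0-14) to "updated inference test" (offsets 0-22), and
+      # dense_field from "another inference test" (0-22) to "another updated inference test" (0-30),
+      # so the changed end_offsets asserted below indicate that the chunks were re-embedded.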
+ - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 22 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 } --- "Calculates embeddings using the default ELSER 2 endpoint": @@ -562,6 +699,11 @@ setup: indices.create: index: test-elser-2-default-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -573,15 +715,24 @@ setup: id: doc_1 body: sparse_field: "inference test" + refresh: true - do: - get: + search: index: test-elser-2-default-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } --- "Can be used inside an object field": @@ -593,6 +744,11 @@ setup: indices.create: index: test-in-object-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: level_1: @@ -612,18 +768,38 @@ setup: level_1: sparse_field: "inference test" dense_field: "another inference test" + refresh: true - do: get: index: test-in-object-index id: doc_1 - - match: { _source.level_1.sparse_field.text: "inference test" } - - exists: _source.level_1.sparse_field.inference.chunks.0.embeddings - - match: { _source.level_1.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.level_1.dense_field.text: "another inference test" } - - exists: _source.level_1.dense_field.inference.chunks.0.embeddings - - match: { _source.level_1.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.level_1.sparse_field: "inference test" } + - match: { _source.level_1.dense_field: "another inference test" } + + - do: + search: + index: test-in-object-index + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.level_1\\.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.level_1\\.sparse_field.inference.chunks.level_1\\.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.level_1\\.sparse_field.inference.chunks.level_1\\.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.level_1\\.sparse_field.inference.chunks.level_1\\.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.level_1\\.sparse_field.inference.chunks.level_1\\.sparse_field.0.end_offset: 14 } + + - length: { 
hits.hits.0._source._inference_fields.level_1\\.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.level_1\\.dense_field.inference.chunks.level_1\\.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.level_1\\.dense_field.inference.chunks.level_1\\.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.level_1\\.dense_field.inference.chunks.level_1\\.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.level_1\\.dense_field.inference.chunks.level_1\\.dense_field.0.end_offset: 22 } --- "Deletes on bulk operation": @@ -652,8 +828,8 @@ setup: - match: { hits.total.value: 2 } - match: { hits.total.relation: eq } - - match: { hits.hits.0._source.dense_field.text: ["you know, for testing", "now with chunks"] } - - match: { hits.hits.1._source.dense_field.text: ["some more tests", "that include chunks"] } + - match: { hits.hits.0._source.dense_field: ["you know, for testing", "now with chunks"] } + - match: { hits.hits.1._source.dense_field: ["some more tests", "that include chunks"] } - do: bulk: @@ -675,4 +851,4 @@ setup: - match: { hits.total.value: 1 } - match: { hits.total.relation: eq } - - match: { hits.hits.0._source.dense_field.text: "updated text" } + - match: { hits.hits.0._source.dense_field: "updated text" } diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml new file mode 100644 index 0000000000000..07341273151bc --- /dev/null +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml @@ -0,0 +1,652 @@ +setup: + - requires: + cluster_features: "gte_v8.15.0" + reason: semantic_text introduced in 8.15.0 + + - do: + inference.put: + task_type: sparse_embedding + inference_id: sparse-inference-id + body: > + { + "service": "test_service", + "service_settings": { + "model": "my_model", + "api_key": "abc64" + }, + "task_settings": { + } + } + + - do: + inference.put: + task_type: text_embedding + inference_id: dense-inference-id + body: > + { + "service": "text_embedding_test_service", + "service_settings": { + "model": "my_model", + "dimensions": 10, + "similarity": "cosine", + "api_key": "abc64" + }, + "task_settings": { + } + } + + - do: + indices.create: + index: test-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + non_inference_field: + type: text + +--- +"Calculates sparse embedding and text embedding results for new documents": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.dense_field.text: "another inference test" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.non_inference_field: "non inference test" } + 
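+  # Note on the legacy format exercised by this file: because the index sets
+  # index.mapping.semantic_text.use_legacy_format to true, chunked inference results are stored inline
+  # in _source under <field>.inference.chunks, with each chunk carrying its own text and embeddings,
+  # rather than in the _inference_fields metadata field with start_offset/end_offset chunk boundaries
+  # used by the new format.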
+--- +"Calculates sparse embedding and text embedding results for new documents with integer value": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: 75 + dense_field: 100 + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "75" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "75" } + - match: { _source.dense_field.text: "100" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "100" } + +--- +"Calculates sparse embedding and text embedding results for new documents with boolean value": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: true + dense_field: false + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "true" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "true" } + - match: { _source.dense_field.text: "false" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "false" } + +--- +"Calculates sparse embedding and text embedding results for new documents with a collection of mixed data types": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: [false, 75, "inference test", 13.49] + dense_field: [true, 49.99, "another inference test", 5654] + + - do: + get: + index: test-index + id: doc_1 + + - length: { _source.sparse_field.text: 4 } + - match: { _source.sparse_field.text.0: "false" } + - match: { _source.sparse_field.text.1: "75" } + - match: { _source.sparse_field.text.2: "inference test" } + - match: { _source.sparse_field.text.3: "13.49" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - exists: _source.sparse_field.inference.chunks.1.embeddings + - exists: _source.sparse_field.inference.chunks.2.embeddings + - exists: _source.sparse_field.inference.chunks.3.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "false" } + - match: { _source.sparse_field.inference.chunks.1.text: "75" } + - match: { _source.sparse_field.inference.chunks.2.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.3.text: "13.49" } + + - length: { _source.dense_field.text: 4 } + - match: { _source.dense_field.text.0: "true" } + - match: { _source.dense_field.text.1: "49.99" } + - match: { _source.dense_field.text.2: "another inference test" } + - match: { _source.dense_field.text.3: "5654" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - exists: _source.dense_field.inference.chunks.1.embeddings + - exists: _source.dense_field.inference.chunks.2.embeddings + - exists: _source.dense_field.inference.chunks.3.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "true" } + - match: { _source.dense_field.inference.chunks.1.text: "49.99" } + - match: { _source.dense_field.inference.chunks.2.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.3.text: "5654" } + +--- +"Sparse vector results are indexed as nested chunks and searchable": + - do: + bulk: + index: test-index + refresh: true + body: | + {"index":{}} + {"sparse_field": ["you know, for testing", "now with chunks"]} + {"index":{}} + {"sparse_field": ["some more tests", "that include chunks"]} + + - do: + search: + index: test-index + body: + query: + nested: + path: sparse_field.inference.chunks + 
query: + sparse_vector: + field: sparse_field.inference.chunks.embeddings + inference_id: sparse-inference-id + query: "you know, for testing" + + - match: { hits.total.value: 2 } + - match: { hits.total.relation: eq } + - length: { hits.hits.0._source.sparse_field.inference.chunks: 2 } + - length: { hits.hits.1._source.sparse_field.inference.chunks: 2 } + + # Search with inner hits + - do: + search: + _source: false + index: test-index + body: + query: + nested: + path: sparse_field.inference.chunks + inner_hits: + _source: false + fields: [ sparse_field.inference.chunks.text ] + query: + sparse_vector: + field: sparse_field.inference.chunks.embeddings + inference_id: sparse-inference-id + query: "you know, for testing" + + - match: { hits.total.value: 2 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.total.value: 2 } + - match: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.total.relation: eq } + + - length: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.hits.0.fields.sparse_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.0.inner_hits.sparse_field\.inference\.chunks.hits.hits.1.fields.sparse_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.1.inner_hits.sparse_field\.inference\.chunks.hits.hits.0.fields.sparse_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.1.inner_hits.sparse_field\.inference\.chunks.hits.hits.1.fields.sparse_field\.inference\.chunks.0.text: 1 } + +--- +"Dense vector results are indexed as nested chunks and searchable": + - do: + bulk: + index: test-index + refresh: true + body: | + {"index":{}} + {"dense_field": ["you know, for testing", "now with chunks"]} + {"index":{}} + {"dense_field": ["some more tests", "that include chunks"]} + + - do: + search: + index: test-index + body: + query: + nested: + path: dense_field.inference.chunks + query: + knn: + field: dense_field.inference.chunks.embeddings + query_vector_builder: + text_embedding: + model_id: dense-inference-id + model_text: "you know, for testing" + + - match: { hits.total.value: 2 } + - match: { hits.total.relation: eq } + - length: { hits.hits.0._source.dense_field.inference.chunks: 2 } + - length: { hits.hits.1._source.dense_field.inference.chunks: 2 } + + # Search with inner hits + - do: + search: + _source: false + index: test-index + body: + query: + nested: + path: dense_field.inference.chunks + inner_hits: + _source: false + fields: [ dense_field.inference.chunks.text ] + query: + knn: + field: dense_field.inference.chunks.embeddings + query_vector_builder: + text_embedding: + model_id: dense-inference-id + model_text: "you know, for testing" + + - match: { hits.total.value: 2 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.total.value: 2 } + - match: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.total.relation: eq } + + - length: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.hits.0.fields.dense_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.0.inner_hits.dense_field\.inference\.chunks.hits.hits.1.fields.dense_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.1.inner_hits.dense_field\.inference\.chunks.hits.hits.0.fields.dense_field\.inference\.chunks.0.text: 1 } + - length: { hits.hits.1.inner_hits.dense_field\.inference\.chunks.hits.hits.1.fields.dense_field\.inference\.chunks.0.text: 1 } + +--- +"Reindex works for semantic_text fields": + - do: + 
index: + index: test-index + id: doc_1 + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + get: + index: test-index + id: doc_1 + + - set: { _source.sparse_field.inference.chunks.0.embeddings: sparse_field_embedding } + - set: { _source.dense_field.inference.chunks.0.embeddings: dense_field_embedding } + + - do: + indices.refresh: { } + + - do: + indices.create: + index: destination-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + non_inference_field: + type: text + + - do: + reindex: + wait_for_completion: true + body: + source: + index: test-index + dest: + index: destination-index + - do: + get: + index: destination-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.0.embeddings: $sparse_field_embedding } + - match: { _source.dense_field.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.0.embeddings: $dense_field_embedding } + - match: { _source.non_inference_field: "non inference test" } + +--- +"semantic_text copy_to calculates embeddings for source fields": + - do: + indices.create: + index: test-copy-to-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + source_field: + type: text + copy_to: sparse_field + another_source_field: + type: text + copy_to: sparse_field + + - do: + index: + index: test-copy-to-index + id: doc_1 + body: + source_field: "copy_to inference test" + sparse_field: "inference test" + another_source_field: "another copy_to inference test" + + - do: + get: + index: test-copy-to-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - length: { _source.sparse_field.inference.chunks: 3 } + - match: { _source.sparse_field.inference.chunks.0.text: "another copy_to inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.1.text: "copy_to inference test" } + - exists: _source.sparse_field.inference.chunks.1.embeddings + - match: { _source.sparse_field.inference.chunks.2.text: "inference test" } + - exists: _source.sparse_field.inference.chunks.2.embeddings + +--- +"Calculates embeddings for bulk operations - index": + - do: + bulk: + body: + - '{"index": {"_index": "test-index", "_id": "doc_1"}}' + - '{"sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test"}' + + - match: { errors: false } + - match: { items.0.index.result: "created" } + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.dense_field.text: "another inference test" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { 
_source.non_inference_field: "non inference test" } + +--- +"Update by query picks up new semantic_text fields": + - do: + indices.create: + index: mapping-update-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + dynamic: false + properties: + non_inference_field: + type: text + + - do: + index: + index: mapping-update-index + id: doc_1 + refresh: true + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + indices.put_mapping: + index: mapping-update-index + body: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + non_inference_field: + type: text + + - do: + update_by_query: + wait_for_completion: true + index: mapping-update-index + + - match: { updated: 1 } + + - do: + get: + index: mapping-update-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.dense_field.text: "another inference test" } + - exists: _source.dense_field.inference.chunks.0.embeddings + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.non_inference_field: "non inference test" } + +--- +"Update by query works for scripts": + - do: + index: + index: test-index + id: doc_1 + refresh: true + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + update_by_query: + wait_for_completion: true + index: test-index + body: { "script": "ctx._source.sparse_field = 'updated inference test'; ctx._source.dense_field = 'another updated inference test'" } + + - match: { updated: 1 } + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "updated inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.dense_field.text: "another updated inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } + - exists: _source.dense_field.inference.chunks.0.embeddings + +--- +"Calculates embeddings using the default ELSER 2 endpoint": + - requires: + reason: "default ELSER 2 inference ID is enabled via a capability" + test_runner_features: [capabilities] + capabilities: + - method: GET + path: /_inference + capabilities: [default_elser_2] + + - do: + indices.create: + index: test-elser-2-default-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + + - do: + index: + index: test-elser-2-default-index + id: doc_1 + body: + sparse_field: "inference test" + + - do: + get: + index: test-elser-2-default-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - exists: _source.sparse_field.inference.chunks.0.embeddings + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + +--- +"Can be used inside an object field": + - requires: + cluster_features: "semantic_text.in_object_field_fix" + reason: object field fix added in 8.16.0 & 8.15.4 + + - do: + indices.create: + index: test-in-object-index + body: + settings: + index: + mapping: 
+ semantic_text: + use_legacy_format: true + mappings: + properties: + level_1: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + + - do: + index: + index: test-in-object-index + id: doc_1 + body: + level_1: + sparse_field: "inference test" + dense_field: "another inference test" + + - do: + get: + index: test-in-object-index + id: doc_1 + + - match: { _source.level_1.sparse_field.text: "inference test" } + - exists: _source.level_1.sparse_field.inference.chunks.0.embeddings + - match: { _source.level_1.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.level_1.dense_field.text: "another inference test" } + - exists: _source.level_1.dense_field.inference.chunks.0.embeddings + - match: { _source.level_1.dense_field.inference.chunks.0.text: "another inference test" } + +--- +"Deletes on bulk operation": + - requires: + cluster_features: semantic_text.delete_fix + reason: Delete operations are properly applied when subsequent operations include a semantic text field. + + - do: + bulk: + index: test-index + refresh: true + body: | + {"index":{"_id": "1"}} + {"dense_field": ["you know, for testing", "now with chunks"]} + {"index":{"_id": "2"}} + {"dense_field": ["some more tests", "that include chunks"]} + + - do: + search: + index: test-index + body: + query: + semantic: + field: dense_field + query: "you know, for testing" + + - match: { hits.total.value: 2 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.dense_field.text: ["you know, for testing", "now with chunks"] } + - match: { hits.hits.1._source.dense_field.text: ["some more tests", "that include chunks"] } + + - do: + bulk: + index: test-index + refresh: true + body: | + {"delete":{ "_id": "2"}} + {"update":{"_id": "1"}} + {"doc":{"dense_field": "updated text"}} + + - do: + search: + index: test-index + body: + query: + semantic: + field: dense_field + query: "you know, for testing" + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source.dense_field.text: "updated text" } diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/40_semantic_text_query.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/40_semantic_text_query.yml index 3d3790d879ef1..2f3bcfae600e7 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/40_semantic_text_query.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/40_semantic_text_query.yml @@ -120,7 +120,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 3.7837332e17, error: 1e10 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Numeric query using a sparse embedding model": @@ -146,7 +145,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Boolean query using a sparse embedding model": @@ -172,7 +170,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - - length: { hits.hits.0._source.inference_field.inference.chunks: 1 } --- "Query using a sparse embedding model via a search inference ID": @@ -217,7 +214,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { 
hits.hits.0._score: { value: 3.7837332e17, error: 1e10 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Query using a dense embedding model": @@ -248,7 +244,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 1.0, error: 0.0001 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Numeric query using a dense embedding model": @@ -274,7 +269,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Boolean query using a dense embedding model": @@ -300,7 +294,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - - length: { hits.hits.0._source.inference_field.inference.chunks: 1 } --- "Query using a dense embedding model that uses byte embeddings": @@ -361,7 +354,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 1.0, error: 0.0001 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Query using a dense embedding model via a search inference ID": @@ -406,7 +398,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 1.0, error: 0.0001 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Apply boost and query name": @@ -439,7 +430,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 3.783733e19, error: 1e13 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } - match: { hits.hits.0.matched_queries: [ "i-like-naming-my-queries" ] } --- @@ -476,7 +466,6 @@ setup: - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "doc_1" } - close_to: { hits.hits.0._score: { value: 3.7837332e17, error: 1e10 } } - - length: { hits.hits.0._source.inference_field.inference.chunks: 2 } --- "Query the wrong field type": diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml index 294761608ee81..660d3e37f4242 100644 --- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update.yml @@ -39,6 +39,11 @@ setup: indices.create: index: test-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -52,6 +57,9 @@ setup: --- "Updating non semantic_text fields does not recalculate embeddings": + - skip: + features: [ "headers" ] + - do: index: index: test-index @@ -60,14 +68,23 @@ setup: sparse_field: "inference test" dense_field: "another inference test" non_inference_field: "non inference test" + refresh: true - do: - get: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - set: { _source.sparse_field.inference.chunks.0.embeddings: sparse_field_embedding } - - set: { _source.dense_field.inference.chunks.0.embeddings: 
dense_field_embedding } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - set: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings: sparse_field_embedding } + - set: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings: dense_field_embedding } - do: update: @@ -76,19 +93,23 @@ setup: body: doc: non_inference_field: "another non inference test" + refresh: true - do: - get: + headers: + # Force JSON content type so that we use a parser that interprets the embeddings as doubles + Content-Type: application/json + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.embeddings: $sparse_field_embedding } - - match: { _source.dense_field.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.embeddings: $dense_field_embedding } - - match: { _source.non_inference_field: "another non inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings: $sparse_field_embedding } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings: $dense_field_embedding } --- "Updating semantic_text fields recalculates embeddings": @@ -100,17 +121,22 @@ setup: sparse_field: "inference test" dense_field: "another inference test" non_inference_field: "non inference test" + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.non_inference_field: "non inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } - do: bulk: @@ -118,17 +144,24 @@ setup: body: - '{"update": {"_id": "doc_1"}}' - '{"doc":{"sparse_field": "I am a test", "dense_field": "I am a teapot"}}' + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "I am a test" } - - match: { _source.sparse_field.inference.chunks.0.text: "I am a test" } - - match: { _source.dense_field.text: "I am a teapot" } - - match: { _source.dense_field.inference.chunks.0.text: "I am a teapot" } - - match: { _source.non_inference_field: "non inference test" } + # We can't directly check that the embeddings are different since there isn't a "does not match" assertion in 
the + # YAML test framework. Check that the start and end offsets change as expected as a proxy. + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 11 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 13 } - do: update: @@ -138,17 +171,22 @@ setup: doc: sparse_field: "updated inference test" dense_field: "another updated inference test" + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "updated inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" } - - match: { _source.dense_field.text: "another updated inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } - - match: { _source.non_inference_field: "non inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 22 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 } - do: bulk: @@ -156,17 +194,22 @@ setup: body: - '{"update": {"_id": "doc_1"}}' - '{"doc":{"sparse_field": "bulk inference test", "dense_field": "bulk updated inference test"}}' + refresh: true - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "bulk inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "bulk inference test" } - - match: { _source.dense_field.text: "bulk updated inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "bulk updated inference test" } - - match: { _source.non_inference_field: "non inference test" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 19 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 27 } --- "Update logic handles source fields in object fields": @@ -174,6 +217,11 @@ setup: indices.create: index: test-copy-to-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -198,18 +246,22 @@ setup: body: sparse_field: "sparse data 1" dense_field: "dense data 1" + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "sparse data 1" } - - 
length: { _source.sparse_field.inference.chunks: 1 } - - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 1" } - - match: { _source.dense_field.text: "dense data 1" } - - length: { _source.dense_field.inference.chunks: 1 } - - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field - do: bulk: @@ -217,28 +269,38 @@ setup: body: - '{"update": {"_id": "doc_1"}}' - > - { - "doc": { - "sparse_field": "sparse data 1", - "object_source.sparse_field": "sparse data 2", - "dense_field": "dense data 1", - "object_source.dense_field": "dense data 2" - } - } + { + "doc": { + "object_source.sparse_field": "sparse data two", + "object_source.dense_field": "dense data two" + } + } + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "sparse data 1" } - - length: { _source.sparse_field.inference.chunks: 2 } - - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 2" } - - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 1" } - - match: { _source.dense_field.text: "dense data 1" } - - length: { _source.dense_field.inference.chunks: 2 } - - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" } - - match: { _source.dense_field.inference.chunks.1.text: "dense data 2" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.end_offset: 15 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.end_offset: 14 } - do: update: @@ -247,25 +309,35 @@ setup: body: doc: { - "sparse_field": "sparse data 1", - "object_source.sparse_field": "sparse data 3", - "dense_field": "dense data 1", - "object_source.dense_field": "dense data 3" + "object_source.sparse_field": "sparse data three", + "object_source.dense_field": "dense data 
three" } + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} - - match: { _source.sparse_field.text: "sparse data 1" } - - length: { _source.sparse_field.inference.chunks: 2 } - - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 3" } - - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 1" } - - match: { _source.dense_field.text: "dense data 1" } - - length: { _source.dense_field.inference.chunks: 2 } - - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" } - - match: { _source.dense_field.inference.chunks.1.text: "dense data 3" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.object_source\\.sparse_field.0.end_offset: 17 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.object_source\\.dense_field.0.end_offset: 16 } --- "Updates fail when the updated value is invalid": @@ -278,17 +350,6 @@ setup: dense_field: "another inference test" non_inference_field: "non inference test" - - do: - get: - index: test-index - id: doc_1 - - - match: { _source.sparse_field.text: "inference test" } - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.non_inference_field: "non inference test" } - - do: bulk: index: test-index @@ -317,6 +378,11 @@ setup: indices.create: index: test-copy-to-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -338,42 +404,68 @@ setup: id: doc_1 body: sparse_field: "sparse data 1" - sparse_source_field: "sparse data 2" + sparse_source_field: "sparse data two" dense_field: "dense data 1" - dense_source_field: "dense data 2" + dense_source_field: "dense data two" + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field + - 
length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 15 } - - length: { _source.sparse_field.inference.chunks: 2 } - - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 2" } - - exists: _source.sparse_field.inference.chunks.1.embeddings - - length: { _source.dense_field.inference.chunks: 2 } - - match: { _source.dense_field.inference.chunks.1.text: "dense data 2" } - - exists: _source.dense_field.inference.chunks.1.embeddings + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 14 } - do: update: index: test-copy-to-index id: doc_1 body: - doc: { "sparse_source_field": "sparse data 3", "dense_source_field": "dense data 3" } + doc: { "sparse_source_field": "sparse data three", "dense_source_field": "dense data three" } + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 17 } - - match: { _source.sparse_field.text: "sparse data 1" } - - length: { _source.sparse_field.inference.chunks: 2 } - - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 3" } - - exists: _source.sparse_field.inference.chunks.1.embeddings - - match: { _source.dense_field.text: "dense data 1" } - - length: { _source.dense_field.inference.chunks: 2 } - - match: { _source.dense_field.inference.chunks.1.text: "dense data 3" } - - exists: _source.dense_field.inference.chunks.1.embeddings + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 2 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings + - match: { 
hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 16 } --- "Partial updates work when using the update API and the semantic_text field's original value is null": @@ -381,6 +473,11 @@ setup: indices.create: index: test-copy-to-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -402,83 +499,142 @@ setup: index: test-copy-to-index id: doc_1 body: - sparse_source_field: "sparse data 2" - dense_source_field: "dense data 2" + sparse_source_field: "sparse data two" + dense_source_field: "dense data two" + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 15 } - - match: { _source.sparse_field.text: null } - - length: { _source.sparse_field.inference.chunks: 1 } - - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 2" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.dense_field.text: null } - - length: { _source.dense_field.inference.chunks: 1 } - - match: { _source.dense_field.inference.chunks.0.text: "dense data 2" } - - exists: _source.dense_field.inference.chunks.0.embeddings + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 14 } - do: update: index: test-copy-to-index id: doc_1 body: - doc: { "sparse_source_field": "sparse data 3", "dense_source_field": "dense data 3" } + doc: { "sparse_source_field": "sparse data three", "dense_source_field": "dense data three" } + refresh: true - do: - get: + search: index: test-copy-to-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: {} + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } - - match: { _source.sparse_field.text: null } - - length: { _source.sparse_field.inference.chunks: 1 } - - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 3" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.dense_field.text: null } - - length: { _source.dense_field.inference.chunks: 1 } - - match: { _source.dense_field.inference.chunks.0.text: "dense data 3" } - - exists: _source.dense_field.inference.chunks.0.embeddings + - length: { 
hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_source_field.0.end_offset: 17 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_source_field.0.end_offset: 16 } --- -"Updates with script are not allowed": +"Updates with script via bulk API are not allowed": - do: bulk: index: test-index body: - '{"index": {"_id": "doc_1"}}' - '{"doc":{"sparse_field": "I am a test", "dense_field": "I am a teapot"}}' + refresh: true + + - match: { errors: false } - do: bulk: index: test-index body: - '{"update": {"_id": "doc_1"}}' - - '{"script": "ctx._source.new_field = \"hello\"", "scripted_upsert": true}' + - '{"script": "ctx._source.sparse_field = \"sparse data two\"", "scripted_upsert": true}' + refresh: true - match: { errors: true } - match: { items.0.update.status: 400 } - match: { items.0.update.error.reason: "Cannot apply update with a script on indices that contain [semantic_text] field(s)" } +--- +"Updates with script works when using the update API": + - do: + bulk: + index: test-index + body: + - '{"index": {"_id": "doc_1"}}' + - '{"doc":{"sparse_field": "I am a test", "dense_field": "I am a teapot"}}' + refresh: true + + - match: { errors: false } + - do: - catch: bad_request update: index: test-index id: doc_1 body: script: - source: "ctx._source.new_field = \"hello\"" + source: "ctx._source.sparse_field = \"sparse data two\"; ctx._source.dense_field = \"dense data two\"" lang: "painless" + refresh: true - - match: { error.type: "status_exception" } - - match: { error.reason: "Cannot apply update with a script on indices that contain inference field(s)" } + - do: + search: + index: test-index + body: + fields: [ _inference_fields ] + query: + match_all: { } + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 15 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 14 } --- -"semantic_text 
copy_to needs values for every source field for bulk updates": +"semantic_text copy_to does not need values for every source field for bulk updates": - do: indices.create: index: test-copy-to-index body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: false mappings: properties: sparse_field: @@ -491,7 +647,6 @@ setup: type: text copy_to: sparse_field - # Not every source field needed on creation - do: index: index: test-copy-to-index @@ -500,15 +655,35 @@ setup: source_field: "a single source field provided" sparse_field: "inference test" - # Every source field needed on bulk updates - do: bulk: body: - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}' - - '{"doc": {"source_field": "a single source field is kept as provided via bulk", "sparse_field": "updated inference test" }}' + - '{"doc": {"source_field": "updated source field value"}}' + refresh: true - - match: { items.0.update.status: 400 } - - match: { items.0.update.error.reason: "Field [another_source_field] must be specified on an update request to calculate inference for field [sparse_field]" } + - match: { errors: false } + + - do: + search: + index: test-copy-to-index + body: + fields: [ _inference_fields ] + query: + match_all: { } + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 2 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.source_field.0.end_offset: 26 } --- "Calculates embeddings for bulk operations - update": @@ -526,22 +701,33 @@ setup: body: - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - '{"doc": { "sparse_field": "updated inference test", "dense_field": "another updated inference test", "non_inference_field": "updated non inference test" }}' + refresh: true - match: { errors: false } - match: { items.0.update.result: "updated" } - do: - get: + search: index: test-index - id: doc_1 + body: + fields: [ _inference_fields ] + query: + match_all: { } + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 22 } - - match: { _source.sparse_field.text: "updated inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { 
_source.sparse_field.inference.chunks.0.text: "updated inference test" } - - match: { _source.dense_field.text: "another updated inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } - - match: { _source.non_inference_field: "updated non inference test" } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 } # Script update not supported - do: @@ -572,97 +758,145 @@ setup: body: - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}' + refresh: true - match: { errors: false } - match: { items.0.update.result: "created" } - do: - get: + search: index: test-index - id: doc_1 - - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.non_inference_field: "non inference test" } - - - do: - bulk: body: - - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - - '{"doc": { "sparse_field": "updated inference test", "dense_field": "another updated inference test", "non_inference_field": "updated non inference test" }, "doc_as_upsert": true}' + fields: [ _inference_fields ] + query: + match_all: { } - - match: { errors: false } - - match: { items.0.update.result: "updated" } + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } - - do: - get: - index: test-index - id: doc_1 - - - match: { _source.sparse_field.text: "updated inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" } - - match: { _source.dense_field.text: "another updated inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } - - match: { _source.non_inference_field: "updated non inference test" } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } ---- -"Bypass inference on bulk update operation": - - requires: - cluster_features: semantic_text.single_field_update_fix - reason: Standalone semantic text fields are now optional in a bulk 
update operation + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } - # Update as upsert - do: bulk: body: - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}' - - - match: { errors: false } - - match: { items.0.update.result: "created" } - - - do: - bulk: - body: - - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}' + - '{"doc": { "sparse_field": "updated inference test", "dense_field": "another updated inference test", "non_inference_field": "updated non inference test" }, "doc_as_upsert": true}' + refresh: true - match: { errors: false } - match: { items.0.update.result: "updated" } - do: - get: + search: index: test-index - id: doc_1 - - - match: { _source.sparse_field.text: "inference test" } - - exists: _source.sparse_field.inference.chunks.0.embeddings - - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } - - match: { _source.dense_field.text: "another inference test" } - - exists: _source.dense_field.inference.chunks.0.embeddings - - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } - - match: { _source.non_inference_field: "another value" } - - - do: - bulk: body: - - '{"update": {"_index": "test-index", "_id": "doc_1"}}' - - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}' - - - match: { errors: false } - - match: { items.0.update.result: "updated" } - - - do: - get: - index: test-index - id: doc_1 - - - match: { _source.sparse_field: null } - - match: { _source.dense_field: null } - - match: { _source.non_inference_field: "updated value" } + fields: [ _inference_fields ] + query: + match_all: { } + + - match: { hits.total.value: 1 } + - match: { hits.total.relation: eq } + + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } + - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 22 } + + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } + - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } + - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } + - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 30 } + +# TODO: Uncomment this test once we implement a fix +#--- +#"Bypass 
inference on bulk update operation": +# # Update as upsert +# - do: +# bulk: +# body: +# - '{"update": {"_index": "test-index", "_id": "doc_1"}}' +# - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}' +# +# - match: { errors: false } +# - match: { items.0.update.result: "created" } +# +# - do: +# bulk: +# body: +# - '{"update": {"_index": "test-index", "_id": "doc_1"}}' +# - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}' +# refresh: true +# +# - match: { errors: false } +# - match: { items.0.update.result: "updated" } +# +# - do: +# search: +# index: test-index +# body: +# fields: [ _inference_fields ] +# query: +# match_all: { } +# +# - match: { hits.total.value: 1 } +# - match: { hits.total.relation: eq } +# +# - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } +# - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 1 } +# - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings +# - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 } +# - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 } +# +# - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } +# - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 1 } +# - exists: hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.embeddings +# - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.start_offset: 0 } +# - match: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field.0.end_offset: 22 } +# +# - match: { hits.hits.0._source.sparse_field: "inference test" } +# - match: { hits.hits.0._source.dense_field: "another inference test" } +# - match: { hits.hits.0._source.non_inference_field: "another value" } +# +# - do: +# bulk: +# body: +# - '{"update": {"_index": "test-index", "_id": "doc_1"}}' +# - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}' +# refresh: true +# +# - match: { errors: false } +# - match: { items.0.update.result: "updated" } +# +# - do: +# search: +# index: test-index +# body: +# fields: [ _inference_fields ] +# query: +# match_all: { } +# +# - match: { hits.total.value: 1 } +# - match: { hits.total.relation: eq } +# +# # TODO: BUG! 
Setting sparse_field & dense_field to null does not clear _inference_fields +# - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 } +# - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 0 } +# +# - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks: 1 } +# - length: { hits.hits.0._source._inference_fields.dense_field.inference.chunks.dense_field: 0 } +# +# - not_exists: hits.hits.0._source.sparse_field +# - not_exists: hits.hits.0._source.dense_field +# - match: { hits.hits.0._source.non_inference_field: "updated value" } diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml new file mode 100644 index 0000000000000..6b494d531b2d1 --- /dev/null +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/60_semantic_text_inference_update_bwc.yml @@ -0,0 +1,651 @@ +setup: + - requires: + cluster_features: "gte_v8.15.0" + reason: semantic_text introduced in 8.15.0 + + - do: + inference.put: + task_type: sparse_embedding + inference_id: sparse-inference-id + body: > + { + "service": "test_service", + "service_settings": { + "model": "my_model", + "api_key": "abc64" + }, + "task_settings": { + } + } + + - do: + inference.put: + task_type: text_embedding + inference_id: dense-inference-id + body: > + { + "service": "text_embedding_test_service", + "service_settings": { + "model": "my_model", + "dimensions": 10, + "similarity": "cosine", + "api_key": "abc64" + }, + "task_settings": { + } + } + + - do: + indices.create: + index: test-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + non_inference_field: + type: text + +--- +"Updating non semantic_text fields does not recalculate embeddings": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + get: + index: test-index + id: doc_1 + + - set: { _source.sparse_field.inference.chunks.0.embeddings: sparse_field_embedding } + - set: { _source.dense_field.inference.chunks.0.embeddings: dense_field_embedding } + + - do: + update: + index: test-index + id: doc_1 + body: + doc: + non_inference_field: "another non inference test" + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.0.embeddings: $sparse_field_embedding } + - match: { _source.dense_field.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.0.embeddings: $dense_field_embedding } + - match: { _source.non_inference_field: "another non inference test" } + +--- +"Updating semantic_text fields recalculates embeddings": + - do: + index: + index: test-index + id: doc_1 + body: + sparse_field: "inference test" + dense_field: "another inference test" + non_inference_field: "non inference test" + + - do: + get: + index: test-index + id: doc_1 + 
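+  # In the legacy format the field's _source carries both the original text and
+  # the computed inference chunks, so a plain get can assert on both directly.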
+ - match: { _source.sparse_field.text: "inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "inference test" } + - match: { _source.dense_field.text: "another inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "another inference test" } + - match: { _source.non_inference_field: "non inference test" } + + - do: + bulk: + index: test-index + body: + - '{"update": {"_id": "doc_1"}}' + - '{"doc":{"sparse_field": "I am a test", "dense_field": "I am a teapot"}}' + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "I am a test" } + - match: { _source.sparse_field.inference.chunks.0.text: "I am a test" } + - match: { _source.dense_field.text: "I am a teapot" } + - match: { _source.dense_field.inference.chunks.0.text: "I am a teapot" } + - match: { _source.non_inference_field: "non inference test" } + + - do: + update: + index: test-index + id: doc_1 + body: + doc: + sparse_field: "updated inference test" + dense_field: "another updated inference test" + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "updated inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" } + - match: { _source.dense_field.text: "another updated inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" } + - match: { _source.non_inference_field: "non inference test" } + + - do: + bulk: + index: test-index + body: + - '{"update": {"_id": "doc_1"}}' + - '{"doc":{"sparse_field": "bulk inference test", "dense_field": "bulk updated inference test"}}' + + - do: + get: + index: test-index + id: doc_1 + + - match: { _source.sparse_field.text: "bulk inference test" } + - match: { _source.sparse_field.inference.chunks.0.text: "bulk inference test" } + - match: { _source.dense_field.text: "bulk updated inference test" } + - match: { _source.dense_field.inference.chunks.0.text: "bulk updated inference test" } + - match: { _source.non_inference_field: "non inference test" } + +--- +"Update logic handles source fields in object fields": + - do: + indices.create: + index: test-copy-to-index + body: + settings: + index: + mapping: + semantic_text: + use_legacy_format: true + mappings: + properties: + sparse_field: + type: semantic_text + inference_id: sparse-inference-id + dense_field: + type: semantic_text + inference_id: dense-inference-id + object_source: + properties: + sparse_field: + type: text + copy_to: sparse_field + dense_field: + type: text + copy_to: dense_field + + - do: + index: + index: test-copy-to-index + id: doc_1 + body: + sparse_field: "sparse data 1" + dense_field: "dense data 1" + + - do: + get: + index: test-copy-to-index + id: doc_1 + + - match: { _source.sparse_field.text: "sparse data 1" } + - length: { _source.sparse_field.inference.chunks: 1 } + - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 1" } + - match: { _source.dense_field.text: "dense data 1" } + - length: { _source.dense_field.inference.chunks: 1 } + - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" } + + - do: + bulk: + index: test-copy-to-index + body: + - '{"update": {"_id": "doc_1"}}' + - > + { + "doc": { + "sparse_field": "sparse data 1", + "object_source.sparse_field": "sparse data 2", + "dense_field": "dense data 1", + "object_source.dense_field": "dense data 2" + } + } + + - do: + get: + index: test-copy-to-index + id: doc_1 + + - match: { _source.sparse_field.text: "sparse 
data 1" }
+  - length: { _source.sparse_field.inference.chunks: 2 }
+  - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 2" }
+  - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 1" }
+  - match: { _source.dense_field.text: "dense data 1" }
+  - length: { _source.dense_field.inference.chunks: 2 }
+  - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" }
+  - match: { _source.dense_field.inference.chunks.1.text: "dense data 2" }
+
+  - do:
+      update:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          doc:
+            {
+              "sparse_field": "sparse data 1",
+              "object_source.sparse_field": "sparse data 3",
+              "dense_field": "dense data 1",
+              "object_source.dense_field": "dense data 3"
+            }
+
+  - do:
+      get:
+        index: test-copy-to-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "sparse data 1" }
+  - length: { _source.sparse_field.inference.chunks: 2 }
+  - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 3" }
+  - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 1" }
+  - match: { _source.dense_field.text: "dense data 1" }
+  - length: { _source.dense_field.inference.chunks: 2 }
+  - match: { _source.dense_field.inference.chunks.0.text: "dense data 1" }
+  - match: { _source.dense_field.inference.chunks.1.text: "dense data 3" }
+
+---
+"Partial updates work when using the update API":
+  - do:
+      indices.create:
+        index: test-copy-to-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: true
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              sparse_source_field:
+                type: text
+                copy_to: sparse_field
+              dense_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              dense_source_field:
+                type: text
+                copy_to: dense_field
+
+  - do:
+      index:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          sparse_field: "sparse data 1"
+          sparse_source_field: "sparse data 2"
+          dense_field: "dense data 1"
+          dense_source_field: "dense data 2"
+
+  - do:
+      get:
+        index: test-copy-to-index
+        id: doc_1
+
+  - length: { _source.sparse_field.inference.chunks: 2 }
+  - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 2" }
+  - exists: _source.sparse_field.inference.chunks.1.embeddings
+  - length: { _source.dense_field.inference.chunks: 2 }
+  - match: { _source.dense_field.inference.chunks.1.text: "dense data 2" }
+  - exists: _source.dense_field.inference.chunks.1.embeddings
+
+  - do:
+      update:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          doc: { "sparse_source_field": "sparse data 3", "dense_source_field": "dense data 3" }
+
+  - do:
+      get:
+        index: test-copy-to-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "sparse data 1" }
+  - length: { _source.sparse_field.inference.chunks: 2 }
+  - match: { _source.sparse_field.inference.chunks.1.text: "sparse data 3" }
+  - exists: _source.sparse_field.inference.chunks.1.embeddings
+  - match: { _source.dense_field.text: "dense data 1" }
+  - length: { _source.dense_field.inference.chunks: 2 }
+  - match: { _source.dense_field.inference.chunks.1.text: "dense data 3" }
+  - exists: _source.dense_field.inference.chunks.1.embeddings
+
+---
+"Partial updates work when using the update API and the semantic_text field's original value is null":
+  - do:
+      indices.create:
+        index: test-copy-to-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: true
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              sparse_source_field:
+                type: text
+                copy_to: sparse_field
+              dense_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              dense_source_field:
+                type: text
+                copy_to: dense_field
+
+  # Don't set sparse_field or dense_field so their original value is null
+  - do:
+      index:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          sparse_source_field: "sparse data 2"
+          dense_source_field: "dense data 2"
+
+  - do:
+      get:
+        index: test-copy-to-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: null }
+  - length: { _source.sparse_field.inference.chunks: 1 }
+  - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 2" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.text: null }
+  - length: { _source.dense_field.inference.chunks: 1 }
+  - match: { _source.dense_field.inference.chunks.0.text: "dense data 2" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+
+  - do:
+      update:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          doc: { "sparse_source_field": "sparse data 3", "dense_source_field": "dense data 3" }
+
+  - do:
+      get:
+        index: test-copy-to-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: null }
+  - length: { _source.sparse_field.inference.chunks: 1 }
+  - match: { _source.sparse_field.inference.chunks.0.text: "sparse data 3" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.text: null }
+  - length: { _source.dense_field.inference.chunks: 1 }
+  - match: { _source.dense_field.inference.chunks.0.text: "dense data 3" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+
+---
+"Updates with script are not allowed":
+  - do:
+      bulk:
+        index: test-index
+        body:
+          - '{"index": {"_id": "doc_1"}}'
+          - '{"doc":{"sparse_field": "I am a test", "dense_field": "I am a teapot"}}'
+
+  - match: { errors: false }
+
+  - do:
+      bulk:
+        index: test-index
+        body:
+          - '{"update": {"_id": "doc_1"}}'
+          - '{"script": "ctx._source.new_field = \"hello\"", "scripted_upsert": true}'
+
+  - match: { errors: true }
+  - match: { items.0.update.status: 400 }
+  - match: { items.0.update.error.reason: "Cannot apply update with a script on indices that contain [semantic_text] field(s)" }
+
+  - do:
+      catch: bad_request
+      update:
+        index: test-index
+        id: doc_1
+        body:
+          script:
+            source: "ctx._source.new_field = \"hello\""
+            lang: "painless"
+
+  - match: { error.type: "status_exception" }
+  - match: { error.reason: "Cannot apply update with a script on indices that contain inference field(s)" }
+
+---
+"semantic_text copy_to needs values for every source field for bulk updates":
+  - do:
+      indices.create:
+        index: test-copy-to-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: true
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              source_field:
+                type: text
+                copy_to: sparse_field
+              another_source_field:
+                type: text
+                copy_to: sparse_field
+
+  # Not every source field needed on creation
+  - do:
+      index:
+        index: test-copy-to-index
+        id: doc_1
+        body:
+          source_field: "a single source field provided"
+          sparse_field: "inference test"
+
+  # Every source field needed on bulk updates
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-copy-to-index", "_id": "doc_1"}}'
+          - '{"doc": {"source_field": "a single source field is kept as provided via bulk", "sparse_field": "updated inference test" }}'
+
+  - match: { items.0.update.status: 400 }
+  - match: { items.0.update.error.reason: "Field [another_source_field] must be specified on an update request to calculate inference for field [sparse_field]" }
+
+---
+"Calculates embeddings for bulk operations - update":
+  - do:
+      bulk:
+        body:
+          - '{"index": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test"}'
+
+  - match: { errors: false }
+  - match: { items.0.index.result: "created" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "updated inference test", "dense_field": "another updated inference test", "non_inference_field": "updated non inference test" }}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "updated inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" }
+  - match: { _source.dense_field.text: "another updated inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" }
+  - match: { _source.non_inference_field: "updated non inference test" }
+
+  # Script update not supported
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"script": {"source": {"ctx.sparse_field": "updated inference test"}}}'
+
+  - match: { errors: true }
+  - match: { items.0.update.status: 400 }
+  - match: { items.0.update.error.reason: "Cannot apply update with a script on indices that contain [semantic_text] field(s)" }
+
+---
+"Calculates embeddings for bulk operations - upsert":
+  # Initial update fails
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }}'
+
+  - match: { errors: true }
+  - match: { items.0.update.status: 404 }
+
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "inference test" }
+  - match: { _source.dense_field.text: "another inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
+  - match: { _source.non_inference_field: "non inference test" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "updated inference test", "dense_field": "another updated inference test", "non_inference_field": "updated non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "updated inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "updated inference test" }
+  - match: { _source.dense_field.text: "another updated inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another updated inference test" }
+  - match: { _source.non_inference_field: "updated non inference test" }
+
+---
+"Bypass inference on bulk update operation":
+  - requires:
+      cluster_features: semantic_text.single_field_update_fix
+      reason: Standalone semantic text fields are now optional in a bulk update operation
+
+  # Update as upsert
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": "inference test", "dense_field": "another inference test", "non_inference_field": "non inference test" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "created" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "non_inference_field": "another value" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field.text: "inference test" }
+  - exists: _source.sparse_field.inference.chunks.0.embeddings
+  - match: { _source.sparse_field.inference.chunks.0.text: "inference test" }
+  - match: { _source.dense_field.text: "another inference test" }
+  - exists: _source.dense_field.inference.chunks.0.embeddings
+  - match: { _source.dense_field.inference.chunks.0.text: "another inference test" }
+  - match: { _source.non_inference_field: "another value" }
+
+  - do:
+      bulk:
+        body:
+          - '{"update": {"_index": "test-index", "_id": "doc_1"}}'
+          - '{"doc": { "sparse_field": null, "dense_field": null, "non_inference_field": "updated value" }, "doc_as_upsert": true}'
+
+  - match: { errors: false }
+  - match: { items.0.update.result: "updated" }
+
+  - do:
+      get:
+        index: test-index
+        id: doc_1
+
+  - match: { _source.sparse_field: null }
+  - match: { _source.dense_field: null }
+  - match: { _source.non_inference_field: "updated value" }
diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
index 25cd1b5aec48a..9e2bd8fefd15a 100644
--- a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
+++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
@@ -2,6 +2,15 @@ setup:
   - requires:
       cluster_features: "semantic_text.highlighter"
       reason: a new highlighter for semantic text field
+      test_runner_features: [ capabilities ]
+
+  # TODO: Remove once highlighter supports inference metadata fields
+  - skip:
+      reason: Test suite targets semantic text without inference metadata fields
+      capabilities:
+        - method: GET
+          path: /_inference
+          capabilities: [ inference_metadata_fields ]
 
   - do:
       inference.put:
diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml
index 049895bc9f31a..e100f30717aef 100644
--- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml
+++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/40_unsupported_types.yml
@@ -504,52 +504,3 @@ double nested declared in mapping:
 
   # The `nested` field is not visible, nor are any of it's subfields.
   - match: { columns: [{name: name, type: keyword}] }
-
----
-semantic_text declared in mapping:
-  - requires:
-      test_runner_features: [ capabilities ]
-      capabilities:
-        - method: POST
-          path: /_query
-          parameters: [ ]
-          capabilities: [ semantic_text_type ]
-      reason: "support for semantic_text type"
-  - do:
-      indices.create:
-        index: test_semantic_text
-        body:
-          settings:
-            number_of_shards: 5
-          mappings:
-            properties:
-              semantic_text_field:
-                type: semantic_text
-                inference_id: my_inference_id
-  - do:
-      bulk:
-        index: test_semantic_text
-        refresh: true
-        body:
-          - { "index": { } }
-          - {
-              "semantic_text_field": {
-                "text": "be excellent to each other",
-                "inference": {
-                  "inference_id": "my_inference_id",
-                  "model_settings": {
-                    "task_type": "sparse_embedding"
-                  },
-                  "chunks": [{ "text": "be excellent to each other", "embeddings": { "a": 1,"b": 2 } }]
-                }
-              }
-            }
-  - do:
-      allowed_warnings_regex:
-        - "No limit defined, adding default limit of \\[.*\\]"
-      esql.query:
-        body:
-          query: 'FROM test_semantic_text'
-  - match: { columns: [{name: semantic_text_field, type: semantic_text}] }
-  - length: { values: 1 }
-  - match: { values.0: ["be excellent to each other"] }