Merge branch 'main' into entitlements/reorg-projects

rjernst · Oct 30, 2024 · 71da655 · 71da655
2 parents 29844e1 + 5cc2a47
commit 71da655
Show file tree

Hide file tree

Showing 185 changed files with 9,816 additions and 1,404 deletions.
diff --git a/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle b/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
@@ -137,15 +137,15 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
     }
   }
 
-  // modifies the idea module config to enable preview features on 'elasticsearch-native' module
+  // modifies the idea module config to enable preview features on ':libs:native' module
   tasks.register("enablePreviewFeatures") {
     group = 'ide'
     description = 'Enables preview features on native library module'
     dependsOn tasks.named("enableExternalConfiguration")
 
     doLast {
       ['main', 'test'].each { sourceSet ->
-        modifyXml(".idea/modules/libs/native/elasticsearch.libs.elasticsearch-native.${sourceSet}.iml") { xml ->
+        modifyXml(".idea/modules/libs/native/elasticsearch.libs.${project.project(':libs:native').name}.${sourceSet}.iml") { xml ->
           xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = 'JDK_21_PREVIEW'
         }
       }

diff --git a/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/DockerBase.java b/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/DockerBase.java
@@ -24,7 +24,7 @@ public enum DockerBase {
     // Chainguard based wolfi image with latest jdk
     // This is usually updated via renovatebot
     // spotless:off
-    WOLFI("docker.elastic.co/wolfi/chainguard-base:latest@sha256:bf163e1977002301f7b9fd28fe6837a8cb2dd5c83e4cd45fb67fb28d15d5d40f",
+    WOLFI("docker.elastic.co/wolfi/chainguard-base:latest@sha256:973431347ad45f40e01afbbd010bf9de929c088a63382239b90dd84f39618bc8",
         "-wolfi",
         "apk"
     ),

diff --git a/...ools-internal/src/main/java/org/elasticsearch/gradle/internal/ResolveAllDependencies.java b/...ools-internal/src/main/java/org/elasticsearch/gradle/internal/ResolveAllDependencies.java
@@ -12,8 +12,11 @@
 import org.elasticsearch.gradle.VersionProperties;
 import org.gradle.api.DefaultTask;
 import org.gradle.api.artifacts.Configuration;
+import org.gradle.api.artifacts.FileCollectionDependency;
+import org.gradle.api.artifacts.component.ModuleComponentIdentifier;
 import org.gradle.api.file.FileCollection;
 import org.gradle.api.model.ObjectFactory;
+import org.gradle.api.provider.ProviderFactory;
 import org.gradle.api.tasks.InputFiles;
 import org.gradle.api.tasks.Internal;
 import org.gradle.api.tasks.TaskAction;
@@ -26,9 +29,6 @@
 
 import javax.inject.Inject;
 
-import static org.elasticsearch.gradle.DistributionDownloadPlugin.DISTRO_EXTRACTED_CONFIG_PREFIX;
-import static org.elasticsearch.gradle.internal.test.rest.compat.compat.LegacyYamlRestCompatTestPlugin.BWC_MINOR_CONFIG_NAME;
-
 public abstract class ResolveAllDependencies extends DefaultTask {
 
     private boolean resolveJavaToolChain = false;
@@ -37,18 +37,28 @@ public abstract class ResolveAllDependencies extends DefaultTask {
     protected abstract JavaToolchainService getJavaToolchainService();
 
     private final ObjectFactory objectFactory;
+    private final ProviderFactory providerFactory;
 
     private Collection<Configuration> configs;
 
     @Inject
-    public ResolveAllDependencies(ObjectFactory objectFactory) {
+    public ResolveAllDependencies(ObjectFactory objectFactory, ProviderFactory providerFactory) {
         this.objectFactory = objectFactory;
+        this.providerFactory = providerFactory;
     }
 
     @InputFiles
     public FileCollection getResolvedArtifacts() {
-        return objectFactory.fileCollection()
-            .from(configs.stream().filter(ResolveAllDependencies::canBeResolved).collect(Collectors.toList()));
+        return objectFactory.fileCollection().from(configs.stream().filter(ResolveAllDependencies::canBeResolved).map(c -> {
+            // Make a copy of the configuration, omitting file collection dependencies to avoid building project artifacts
+            Configuration copy = c.copyRecursive(d -> d instanceof FileCollectionDependency == false);
+            copy.setCanBeConsumed(false);
+            return copy;
+        })
+            // Include only module dependencies, ignoring things like project dependencies so we don't unnecessarily build stuff
+            .map(c -> c.getIncoming().artifactView(v -> v.lenient(true).componentFilter(i -> i instanceof ModuleComponentIdentifier)))
+            .map(artifactView -> providerFactory.provider(artifactView::getFiles))
+            .collect(Collectors.toList()));
     }
 
     @TaskAction
@@ -95,8 +105,8 @@ private static boolean canBeResolved(Configuration configuration) {
                 return false;
             }
         }
-        return configuration.getName().startsWith(DISTRO_EXTRACTED_CONFIG_PREFIX) == false
-            && configuration.getName().equals(BWC_MINOR_CONFIG_NAME) == false;
+
+        return true;
     }
 
 }
diff --git a/docs/changelog/114862.yaml b/docs/changelog/114862.yaml
@@ -0,0 +1,5 @@
+pr: 114862
+summary: "[Inference API] Add API to get configuration of inference services"
+area: Machine Learning
+type: enhancement
+issues: []
diff --git a/docs/changelog/115624.yaml b/docs/changelog/115624.yaml
@@ -0,0 +1,7 @@
+pr: 115624
+summary: "ES|QL: fix LIMIT pushdown past MV_EXPAND"
+area: ES|QL
+type: bug
+issues:
+ - 102084
+ - 102061
diff --git a/docs/changelog/115811.yaml b/docs/changelog/115811.yaml
@@ -0,0 +1,5 @@
+pr: 115811
+summary: "Prohibit changes to index mode, source, and sort settings during restore"
+area: Logs
+type: bug
+issues: []
diff --git a/docs/changelog/115812.yaml b/docs/changelog/115812.yaml
@@ -0,0 +1,5 @@
+pr: 115812
+summary: "Prohibit changes to index mode, source, and sort settings during resize"
+area: Logs
+type: bug
+issues: []
diff --git a/docs/changelog/115868.yaml b/docs/changelog/115868.yaml
@@ -0,0 +1,5 @@
+pr: 115868
+summary: Forward bedrock connection errors to user
+area: Machine Learning
+type: bug
+issues: []
diff --git a/docs/changelog/115923.yaml b/docs/changelog/115923.yaml
@@ -0,0 +1,16 @@
+pr: 115923
+summary: Apply more strict parsing of actions in bulk API
+area: Indices APIs
+type: breaking
+issues: [ ]
+breaking:
+  title: Apply more strict parsing of actions in bulk API
+  area: REST API
+  details: >-
+    Previously, the following classes of malformed input were deprecated but not rejected in the action lines of a
+    bulk request: missing closing brace; additional keys after the action (which were ignored); additional data after
+    the closing brace (which was ignored). They will now be considered errors and rejected.
+  impact: >-
+    Users must provide well-formed input when using the bulk API. (They can request REST API compatibility with v8 to
+    get the previous behaviour back as an interim measure.)
+  notable: false
diff --git a/docs/changelog/115952.yaml b/docs/changelog/115952.yaml
@@ -0,0 +1,5 @@
+pr: 115952
+summary: "ESQL: Fix a bug in VALUES agg"
+area: ES|QL
+type: bug
+issues: []
diff --git a/docs/reference/landing-page.asciidoc b/docs/reference/landing-page.asciidoc
@@ -128,7 +128,7 @@
       <a href="https://www.elastic.co/guide/en/cloud/current/ec-cloud-ingest-data.html">Adding data to Elasticsearch</a>
     </li>
     <li>
-      <a href="https://www.elastic.co/guide/en/enterprise-search/current/connectors.html">Connectors</a>
+      <a href="es-connectors.html">Connectors</a>
     </li>
     <li>
       <a href="https://www.elastic.co/guide/en/enterprise-search/current/crawler.html">Web crawler</a>

diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc
@@ -62,10 +62,17 @@ multiplied by `boost` to produce final documents' scores. Defaults to `1.0`.
 ===== Use relevance scores in a script
 
 Within a script, you can
-{ref}/modules-scripting-fields.html#scripting-score[access] 
+{ref}/modules-scripting-fields.html#scripting-score[access]
 the `_score` variable which represents the current relevance score of a
 document.
 
+[[script-score-access-term-statistics]]
+===== Use term statistics in a script
+
+Within a script, you can
+{ref}/modules-scripting-fields.html#scripting-term-statistics[access]
+the `_termStats` variable which provides statistical information about the terms used in the child query of the `script_score` query.
+
 [[script-score-predefined-functions]]
 ===== Predefined functions
 You can use any of the available {painless}/painless-contexts.html[painless
@@ -147,7 +154,7 @@ updated since update operations also update the value of the `_seq_no` field.
 
 [[decay-functions-numeric-fields]]
 ====== Decay functions for numeric fields
-You can read more about decay functions 
+You can read more about decay functions
 {ref}/query-dsl-function-score-query.html#function-decay[here].
 
 * `double decayNumericLinear(double origin, double scale, double offset, double decay, double docValue)`
@@ -233,7 +240,7 @@ The `script_score` query calculates the score for
 every matching document, or hit. There are faster alternative query types that
 can efficiently skip non-competitive hits:
 
-* If you want to boost documents on some static fields, use the 
+* If you want to boost documents on some static fields, use the
  <<query-dsl-rank-feature-query, `rank_feature`>> query.
  * If you want to boost documents closer to a date or geographic point, use the
  <<query-dsl-distance-feature-query, `distance_feature`>> query.

diff --git a/docs/reference/reranking/learning-to-rank-model-training.asciidoc b/docs/reference/reranking/learning-to-rank-model-training.asciidoc
@@ -38,11 +38,21 @@ Feature extractors are defined using templated queries. https://eland.readthedoc
 from eland.ml.ltr import QueryFeatureExtractor
 
 feature_extractors=[
-    # We want to use the score of the match query for the title field as a feature:
+    # We want to use the BM25 score of the match query for the title field as a feature:
     QueryFeatureExtractor(
         feature_name="title_bm25",
         query={"match": {"title": "{{query}}"}}
     ),
+    # We want to use the number of matched terms in the title field as a feature:
+    QueryFeatureExtractor(
+        feature_name="title_matched_term_count",
+        query={
+            "script_score": {
+                "query": {"match": {"title": "{{query}}"}},
+                "script": {"source": "return _termStats.matchedTermsCount();"},
+            }
+        },
+    ),
     # We can use a script_score query to get the value
     # of the field rating directly as a feature:
     QueryFeatureExtractor(
@@ -54,26 +64,29 @@ feature_extractors=[
             }
         },
     ),
-    # We can execute a script on the value of the query
-    # and use the return value as a feature:
-    QueryFeatureExtractor(
-        feature_name="query_length",
+    # We extract the number of unique terms in the query as a feature:
+    QueryFeatureExtractor(
+        feature_name="query_term_count",
         query={
             "script_score": {
-                "query": {"match_all": {}},
-                "script": {
-                    "source": "return params['query'].splitOnToken(' ').length;",
-                    "params": {
-                        "query": "{{query}}",
-                    }
-                },
+                "query": {"match": {"title": "{{query}}"}},
+                "script": {"source": "return _termStats.uniqueTermsCount();"},
             }
         },
     ),
 ]
 ----
 // NOTCONSOLE
 
+[NOTE]
+.Term statistics as features
+===================================================
+
+It is very common for an LTR model to leverage raw term statistics as features.
+To extract this information, you can use the {ref}/modules-scripting-fields.html#scripting-term-statistics[term statistics feature] provided as part of the <<query-dsl-script-score-query,`script_score`>> query.
+
+===================================================
+
 Once the feature extractors have been defined, they are wrapped in an `eland.ml.ltr.LTRModelConfig` object for use in later training steps:
 
 [source,python]

diff --git a/docs/reference/reranking/learning-to-rank-search-usage.asciidoc b/docs/reference/reranking/learning-to-rank-search-usage.asciidoc
@@ -61,10 +61,3 @@ When exposing pagination to users, `window_size` should remain constant as each
 ====== Negative scores
 
 Depending on how your model is trained, it’s possible that the model will return negative scores for documents. While negative scores are not allowed from first-stage retrieval and ranking, it is possible to use them in the LTR rescorer.
-
-[discrete]
-[[learning-to-rank-rescorer-limitations-term-statistics]]
-====== Term statistics as features
-
-We do not currently support term statistics as features, however future releases will introduce this capability.
-
diff --git a/docs/reference/scripting/fields.asciidoc b/docs/reference/scripting/fields.asciidoc
@@ -80,6 +80,79 @@ GET my-index-000001/_search
 }
 -------------------------------------
 
+[discrete]
+[[scripting-term-statistics]]
+=== Accessing term statistics of a document within a script
+
+Scripts used in a <<query-dsl-script-score-query,`script_score`>> query have access to the `_termStats` variable which provides statistical information about the terms in the child query.
+
+In the following example, `_termStats` is used within a <<query-dsl-script-score-query,`script_score`>> query to retrieve the average term frequency for the terms `quick`, `brown`, and `fox` in the `text` field:
+
+[source,console]
+-------------------------------------
+PUT my-index-000001/_doc/1?refresh
+{
+  "text": "quick brown fox"
+}
+
+PUT my-index-000001/_doc/2?refresh
+{
+  "text": "quick fox"
+}
+
+GET my-index-000001/_search
+{
+  "query": {
+    "script_score": {
+      "query": { <1>
+        "match": {
+          "text": "quick brown fox"
+        }
+      },
+      "script": {
+        "source": "_termStats.termFreq().getAverage()" <2>
+      }
+    }
+  }
+}
+-------------------------------------
+
+<1> Child query used to infer the field and the terms considered in term statistics.
+
+<2> The script calculates the average term frequency for the terms in the query using `_termStats`.
+
+`_termStats` provides access to the following functions for working with term statistics:
+
+- `uniqueTermsCount`: Returns the total number of unique terms in the query. This value is the same across all documents.
+- `matchedTermsCount`: Returns the count of query terms that matched within the current document.
+- `docFreq`: Provides document frequency statistics for the terms in the query, indicating how many documents contain each term. This value is consistent across all documents.
+- `totalTermFreq`: Provides the total frequency of terms across all documents, representing how often each term appears in the entire corpus. This value is consistent across all documents.
+- `termFreq`: Returns the frequency of query terms within the current document, showing how often each term appears in that document.
+
+[NOTE]
+.Functions returning aggregated statistics
+===================================================
+
+The `docFreq`, `termFreq` and `totalTermFreq` functions return objects that represent statistics across all terms of the child query.
+
+The returned statistics objects support the following methods:
+
+- `getAverage()`: Returns the average value of the metric.
+- `getMin()`: Returns the minimum value of the metric.
+- `getMax()`: Returns the maximum value of the metric.
+- `getSum()`: Returns the sum of the metric values.
+- `getCount()`: Returns the count of terms included in the metric calculation.
+
+===================================================
+
+
+[NOTE]
+.Painless language required
+===================================================
+
+The `_termStats` variable is only available when using the <<modules-scripting-painless, Painless>> scripting language.
+
+===================================================
 
 [discrete]
 [[modules-scripting-doc-vals]]

diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml
@@ -579,11 +579,6 @@
             <sha256 value="c8fb4839054d280b3033f800d1f5a97de2f028eb8ba2eb458ad287e536f3f25f" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="com.google.crypto.tink" name="tink" version="1.14.0">
-         <artifact name="tink-1.14.0.jar">
-            <sha256 value="47b2248705e0c9771bc259f22465a79655c1296e2d47aaee852adb7cdacb6198" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="com.google.errorprone" name="error_prone_annotations" version="2.11.0">
          <artifact name="error_prone_annotations-2.11.0.jar">
             <sha256 value="721cb91842b46fa056847d104d5225c8b8e1e8b62263b993051e1e5a0137b7ec" origin="Generated by Gradle"/>
@@ -759,11 +754,6 @@
             <sha256 value="8540247fad9e06baefa8fb45eb313802d019f485f14300e0f9d6b556ed88e753" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="com.google.protobuf" name="protobuf-java" version="4.27.0">
-         <artifact name="protobuf-java-4.27.0.jar">
-            <sha256 value="9072e60fe66cff5d6c0f11a1df21d8f3e4b29b5ee782b45c3fc75f59fbe2b839" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="com.google.protobuf" name="protobuf-java-util" version="3.25.5">
          <artifact name="protobuf-java-util-3.25.5.jar">
             <sha256 value="dacc58b2c3d2fa8d4bddc1acb881e78d6cf7c137dd78bc1d67f6aca732436a8d" origin="Generated by Gradle"/>

diff --git a/...as/src/internalClusterTest/java/org/elasticsearch/index/mapper/MatchOnlyTextMapperIT.java b/...as/src/internalClusterTest/java/org/elasticsearch/index/mapper/MatchOnlyTextMapperIT.java
@@ -12,6 +12,7 @@
 import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.bulk.BulkResponse;
 import org.elasticsearch.action.support.WriteRequest;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.plugins.Plugin;
@@ -89,13 +90,14 @@ public void testHighlightingWithMatchOnlyTextFieldSyntheticSource() throws IOExc
         // load the source.
 
         String mappings = """
-            { "_source" : { "mode" : "synthetic" },
+            {
               "properties" : {
                 "message" : { "type" : "match_only_text" }
               }
             }
             """;
-        assertAcked(prepareCreate("test").setMapping(mappings));
+        Settings.Builder settings = Settings.builder().put(indexSettings()).put("index.mapping.source.mode", "synthetic");
+        assertAcked(prepareCreate("test").setSettings(settings).setMapping(mappings));
         BulkRequestBuilder bulk = client().prepareBulk("test").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
         for (int i = 0; i < 2000; i++) {
             bulk.add(