elastic · demjened · Jan 29, 2025 · Jan 17, 2025 · Jan 20, 2025 · Jan 20, 2025
diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -159,6 +159,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion BYTE_SIZE_VALUE_ALWAYS_USES_BYTES_1 = def(8_825_00_0);
     public static final TransportVersion REVERT_BYTE_SIZE_VALUE_ALWAYS_USES_BYTES_1 = def(8_826_00_0);
     public static final TransportVersion ESQL_SKIP_ES_INDEX_SERIALIZATION = def(8_827_00_0);
+    public static final TransportVersion INFERENCE_REQUEST_ADAPTIVE_RATE_LIMITING_ADDED = def(8_828_00_0);
 
     /*
      * STOP! READ THIS FIRST! No, really,

diff --git a/...c/main/java/org/elasticsearch/xpack/core/inference/action/BaseInferenceActionRequest.java b/...c/main/java/org/elasticsearch/xpack/core/inference/action/BaseInferenceActionRequest.java
@@ -7,25 +7,76 @@
 
 package org.elasticsearch.xpack.core.inference.action;
 
+import org.elasticsearch.TransportVersions;
 import org.elasticsearch.action.ActionRequest;
 import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.inference.TaskType;
 
 import java.io.IOException;
 
+/**
+ * Base class for inference action requests. Tracks request routing state to prevent potential routing loops
+ * and supports both streaming and non-streaming inference operations.
+ */
 public abstract class BaseInferenceActionRequest extends ActionRequest {
 
+    /** Routing flag; stays {@code false} for locally constructed requests unless {@link #setHasBeenRerouted(boolean)} is called. */
+    private boolean hasBeenRerouted;
+
     public BaseInferenceActionRequest() {
         super();
     }
 
+    /**
+     * Deserializes a request from the wire.
+     *
+     * @param in stream to read from; the rerouted flag is read only when the stream's transport version is on or after
+     *           {@link TransportVersions#INFERENCE_REQUEST_ADAPTIVE_RATE_LIMITING_ADDED}, otherwise it is set to {@code true}
+     * @throws IOException if reading from the stream fails
+     */
     public BaseInferenceActionRequest(StreamInput in) throws IOException {
         super(in);
+        if (in.getTransportVersion().onOrAfter(TransportVersions.INFERENCE_REQUEST_ADAPTIVE_RATE_LIMITING_ADDED)) {
+            this.hasBeenRerouted = in.readBoolean();
+        } else {
+            // For backwards compatibility, we treat all inference requests coming from ES nodes having
+            // a version pre-node-local-rate-limiting as already rerouted to maintain pre-node-local-rate-limiting behavior.
+            this.hasBeenRerouted = true;
+        }
     }
 
     public abstract boolean isStreaming();
 
     public abstract TaskType getTaskType();
 
     public abstract String getInferenceEntityId();
+
+    /**
+     * Sets the routing flag that {@link #hasBeenRerouted()} reports and {@link #writeTo(StreamOutput)} serializes.
+     *
+     * @param hasBeenRerouted the new value of the flag
+     */
+    public void setHasBeenRerouted(boolean hasBeenRerouted) {
+        this.hasBeenRerouted = hasBeenRerouted;
+    }
+
+    /**
+     * @return the routing flag; initialised to {@code true} when deserialized from a transport version that predates the flag
+     */
+    public boolean hasBeenRerouted() {
+        return hasBeenRerouted;
+    }
+
+    /**
+     * Serializes the request; the rerouted flag is written only when the target transport version is on or after
+     * {@link TransportVersions#INFERENCE_REQUEST_ADAPTIVE_RATE_LIMITING_ADDED}, mirroring the stream constructor.
+     */
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        if (out.getTransportVersion().onOrAfter(TransportVersions.INFERENCE_REQUEST_ADAPTIVE_RATE_LIMITING_ADDED)) {
+            out.writeBoolean(hasBeenRerouted);
+        }
+    }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java
@@ -72,6 +72,7 @@
 import org.elasticsearch.xpack.inference.action.TransportUnifiedCompletionInferenceAction;
 import org.elasticsearch.xpack.inference.action.TransportUpdateInferenceModelAction;
 import org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter;
+import org.elasticsearch.xpack.inference.common.InferenceServiceNodeLocalRateLimitCalculator;
 import org.elasticsearch.xpack.inference.common.Truncator;
 import org.elasticsearch.xpack.inference.external.amazonbedrock.AmazonBedrockRequestSender;
 import org.elasticsearch.xpack.inference.external.http.HttpClientManager;
@@ -133,6 +134,7 @@
 import java.util.stream.Stream;
 
 import static java.util.Collections.singletonList;
+import static org.elasticsearch.xpack.inference.common.InferenceAPIClusterAwareRateLimitingFeature.*;
 import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceService.ELASTIC_INFERENCE_SERVICE_IDENTIFIER;
 import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.DEPRECATED_ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;
 import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;
@@ -243,6 +245,9 @@ public List<RestHandler> getRestHandlers(
 
     @Override
     public Collection<?> createComponents(PluginServices services) {
+        var components = new ArrayList<Object>();
+
+        var clusterService = services.clusterService();
-        var throttlerManager = new ThrottlerManager(settings, services.threadPool(), services.clusterService());
-        var truncator = new Truncator(settings, services.clusterService());
+        var throttlerManager = new ThrottlerManager(settings, services.threadPool(), clusterService);
+        var truncator = new Truncator(settings, clusterService);
         serviceComponents.set(new ServiceComponents(services.threadPool(), throttlerManager, settings, truncator));
@@ -298,26 +303,37 @@ public Collection<?> createComponents(PluginServices services) {
         var factoryContext = new InferenceServiceExtension.InferenceServiceFactoryContext(
             services.client(),
             services.threadPool(),
-            services.clusterService(),
+            clusterService,
             settings
         );
 
         // This must be done after the HttpRequestSenderFactory is created so that the services can get the
         // reference correctly
-        var registry = new InferenceServiceRegistry(inferenceServices, factoryContext);
-        registry.init(services.client());
-        for (var service : registry.getServices().values()) {
+        var serviceRegistry = new InferenceServiceRegistry(inferenceServices, factoryContext);
+        serviceRegistry.init(services.client());
+        for (var service : serviceRegistry.getServices().values()) {
             service.defaultConfigIds().forEach(modelRegistry::addDefaultIds);
         }
-        inferenceServiceRegistry.set(registry);
+        inferenceServiceRegistry.set(serviceRegistry);
 
-        var actionFilter = new ShardBulkInferenceActionFilter(services.clusterService(), registry, modelRegistry);
+        var actionFilter = new ShardBulkInferenceActionFilter(clusterService, serviceRegistry, modelRegistry);
         shardBulkInferenceActionFilter.set(actionFilter);
 
         var meterRegistry = services.telemetryProvider().getMeterRegistry();
-        var stats = new PluginComponentBinding<>(InferenceStats.class, InferenceStats.create(meterRegistry));
+        var inferenceStats = new PluginComponentBinding<>(InferenceStats.class, InferenceStats.create(meterRegistry));
+
+        components.add(serviceRegistry);
+        components.add(modelRegistry);
+        components.add(httpClientManager);
+        components.add(inferenceStats);
+
+        // Only add InferenceServiceNodeLocalRateLimitCalculator (which is a ClusterStateListener) for cluster aware rate limiting,
+        // if elastic inference service and the rate limiting feature flags are enabled
+        if (isElasticInferenceServiceEnabled() && INFERENCE_API_CLUSTER_AWARE_RATE_LIMITING_FEATURE_FLAG.isEnabled()) {
+            components.add(new InferenceServiceNodeLocalRateLimitCalculator(clusterService, serviceRegistry));
+        }
 
-        return List.of(modelRegistry, registry, httpClientManager, stats);
+        return components;
     }
 
     @Override