elastic · timgrein · Jan 31, 2025 · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/muted-tests.yml b/muted-tests.yml
@@ -362,12 +362,6 @@ tests:
 - class: org.elasticsearch.xpack.security.CoreWithSecurityClientYamlTestSuiteIT
   method: test {yaml=indices.get_alias/10_basic/Get aliases via /*/_alias/}
   issue: https://github.com/elastic/elasticsearch/issues/121290
-- class: org.elasticsearch.xpack.inference.action.TransportInferenceActionTests
-  method: testRerouting_HandlesTransportException_FromOtherNode
-  issue: https://github.com/elastic/elasticsearch/issues/121292
-- class: org.elasticsearch.xpack.inference.action.TransportInferenceActionTests
-  method: testRerouting_ToOtherNode
-  issue: https://github.com/elastic/elasticsearch/issues/121293
 - class: org.elasticsearch.xpack.inference.common.InferenceServiceNodeLocalRateLimitCalculatorTests
   issue: https://github.com/elastic/elasticsearch/issues/121294
 - class: org.elasticsearch.entitlement.runtime.policy.PolicyManagerTests

diff --git a/.../src/main/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceAction.java b/.../src/main/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceAction.java
@@ -55,7 +55,6 @@
 
 import static org.elasticsearch.core.Strings.format;
 import static org.elasticsearch.xpack.inference.InferencePlugin.INFERENCE_API_FEATURE;
-import static org.elasticsearch.xpack.inference.common.InferenceAPIClusterAwareRateLimitingFeature.INFERENCE_API_CLUSTER_AWARE_RATE_LIMITING_FEATURE_FLAG;
 import static org.elasticsearch.xpack.inference.telemetry.InferenceStats.modelAttributes;
 import static org.elasticsearch.xpack.inference.telemetry.InferenceStats.responseAttributes;
 
@@ -188,10 +187,6 @@ private void validateRequest(Request request, UnparsedModel unparsedModel) {
     }
 
     private NodeRoutingDecision determineRouting(String serviceName, Request request, UnparsedModel unparsedModel) {
-        if (INFERENCE_API_CLUSTER_AWARE_RATE_LIMITING_FEATURE_FLAG.isEnabled() == false) {
-            return NodeRoutingDecision.handleLocally();
-        }
-
         var modelTaskType = unparsedModel.taskType();
 
         // Rerouting not supported or request was already rerouted

diff --git a/...t/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceActionTestCase.java b/...t/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceActionTestCase.java
@@ -28,7 +28,7 @@
 import org.elasticsearch.xpack.core.inference.action.InferenceAction;
 import org.elasticsearch.xpack.inference.InferencePlugin;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceNodeLocalRateLimitCalculator;
+import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.registry.ModelRegistry;
 import org.elasticsearch.xpack.inference.telemetry.InferenceStats;
 import org.junit.Before;
@@ -64,7 +64,7 @@ public abstract class BaseTransportInferenceActionTestCase<Request extends BaseI
     protected static final String inferenceId = "inferenceEntityId";
     protected InferenceServiceRegistry serviceRegistry;
     protected InferenceStats inferenceStats;
-    protected InferenceServiceNodeLocalRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator;
+    protected InferenceServiceRateLimitCalculator inferenceServiceRateLimitCalculator;
     protected TransportService transportService;
     protected NodeClient nodeClient;
 
@@ -79,7 +79,7 @@ public void setUp() throws Exception {
         ThreadPool threadPool = mock();
         nodeClient = mock();
         transportService = mock();
-        inferenceServiceNodeLocalRateLimitCalculator = mock();
+        inferenceServiceRateLimitCalculator = mock();
         licenseState = mock();
         modelRegistry = mock();
         serviceRegistry = mock();
@@ -94,7 +94,7 @@ public void setUp() throws Exception {
             serviceRegistry,
             inferenceStats,
             streamingTaskManager,
-            inferenceServiceNodeLocalRateLimitCalculator,
+            inferenceServiceRateLimitCalculator,
             nodeClient,
             threadPool
         );
@@ -110,7 +110,7 @@ protected abstract BaseTransportInferenceAction<Request> createAction(
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceNodeLocalRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
+        InferenceServiceRateLimitCalculator inferenceServiceRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     );

diff --git a/...src/test/java/org/elasticsearch/xpack/inference/action/TransportInferenceActionTests.java b/...src/test/java/org/elasticsearch/xpack/inference/action/TransportInferenceActionTests.java
@@ -19,7 +19,7 @@
 import org.elasticsearch.transport.TransportService;
 import org.elasticsearch.xpack.core.inference.action.InferenceAction;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceNodeLocalRateLimitCalculator;
+import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.common.RateLimitAssignment;
 import org.elasticsearch.xpack.inference.registry.ModelRegistry;
 import org.elasticsearch.xpack.inference.telemetry.InferenceStats;
@@ -50,7 +50,7 @@ protected BaseTransportInferenceAction<InferenceAction.Request> createAction(
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceNodeLocalRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
+        InferenceServiceRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     ) {
@@ -77,7 +77,7 @@ public void testNoRerouting_WhenTaskTypeNotSupported() {
         TaskType unsupportedTaskType = TaskType.COMPLETION;
         mockService(listener -> listener.onResponse(mock()));
 
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, unsupportedTaskType)).thenReturn(false);
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, unsupportedTaskType)).thenReturn(false);
 
         var listener = doExecute(unsupportedTaskType);
 
@@ -89,8 +89,8 @@ public void testNoRerouting_WhenTaskTypeNotSupported() {
     public void testNoRerouting_WhenNoGroupingCalculatedYet() {
         mockService(listener -> listener.onResponse(mock()));
 
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        when(inferenceServiceNodeLocalRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(null);
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
+        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(null);
 
         var listener = doExecute(taskType);
 
@@ -102,8 +102,8 @@ public void testNoRerouting_WhenNoGroupingCalculatedYet() {
     public void testNoRerouting_WhenEmptyNodeList() {
         mockService(listener -> listener.onResponse(mock()));
 
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        when(inferenceServiceNodeLocalRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
+        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(
             new RateLimitAssignment(List.of())
         );
 
@@ -120,10 +120,10 @@ public void testRerouting_ToOtherNode() {
 
         // The local node is different to the "other-node" responsible for serviceId
         when(nodeClient.getLocalNodeId()).thenReturn("local-node");
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
         // Requests for serviceId are always routed to "other-node"
         var assignment = new RateLimitAssignment(List.of(otherNode));
-        when(inferenceServiceNodeLocalRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
+        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
 
         mockService(listener -> listener.onResponse(mock()));
         var listener = doExecute(taskType);
@@ -141,9 +141,9 @@ public void testRerouting_ToLocalNode_WithoutGoingThroughTransportLayerAgain() {
 
         // The local node is the only one responsible for serviceId
         when(nodeClient.getLocalNodeId()).thenReturn(localNodeId);
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
         var assignment = new RateLimitAssignment(List.of(localNode));
-        when(inferenceServiceNodeLocalRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
+        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
 
         mockService(listener -> listener.onResponse(mock()));
         var listener = doExecute(taskType);
@@ -158,9 +158,9 @@ public void testRerouting_HandlesTransportException_FromOtherNode() {
         when(otherNode.getId()).thenReturn("other-node");
 
         when(nodeClient.getLocalNodeId()).thenReturn("local-node");
-        when(inferenceServiceNodeLocalRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
+        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
         var assignment = new RateLimitAssignment(List.of(otherNode));
-        when(inferenceServiceNodeLocalRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
+        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
 
         mockService(listener -> listener.onResponse(mock()));
 
@@ -173,6 +173,10 @@ public void testRerouting_HandlesTransportException_FromOtherNode() {
 
         var listener = doExecute(taskType);
 
+        // Verify request was rerouted
+        verify(transportService).sendRequest(same(otherNode), eq(InferenceAction.NAME), any(), any());
+        // Verify local execution didn't happen
+        verify(listener, never()).onResponse(any());
         // Verify exception was propagated from "other-node" to "local-node"
         verify(listener).onFailure(same(expectedException));
     }

diff --git a/.../java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionActionTests.java b/.../java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionActionTests.java
@@ -18,7 +18,7 @@
 import org.elasticsearch.transport.TransportService;
 import org.elasticsearch.xpack.core.inference.action.UnifiedCompletionAction;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceNodeLocalRateLimitCalculator;
+import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.registry.ModelRegistry;
 import org.elasticsearch.xpack.inference.telemetry.InferenceStats;
 
@@ -49,7 +49,7 @@ protected BaseTransportInferenceAction<UnifiedCompletionAction.Request> createAc
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceNodeLocalRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
+        InferenceServiceRateLimitCalculator inferenceServiceRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     ) {
@@ -61,7 +61,7 @@ protected BaseTransportInferenceAction<UnifiedCompletionAction.Request> createAc
             serviceRegistry,
             inferenceStats,
             streamingTaskManager,
-            inferenceServiceNodeLocalRateLimitCalculator,
+            inferenceServiceRateLimitCalculator,
             nodeClient,
             threadPool
         );