From 5cc7871b2c7488330ec2576c5f92c8f2c8a3db92 Mon Sep 17 00:00:00 2001 From: Josh Karpel Date: Wed, 8 Jan 2025 17:54:24 -0600 Subject: [PATCH] [Serve] Deflake `test_reconfigure_with_queries` (#49696) Small isolated PR to (hopefully) fix flakiness issues with `python/ray/serve/tests/test_deploy.py::test_reconfigure_with_queries`, noticed while working on #48807 and other PRs. --------- Signed-off-by: Josh Karpel Co-authored-by: Edward Oakes --- python/ray/serve/tests/test_deploy.py | 37 +++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/python/ray/serve/tests/test_deploy.py b/python/ray/serve/tests/test_deploy.py index 769bff110093..0068f0c07466 100644 --- a/python/ray/serve/tests/test_deploy.py +++ b/python/ray/serve/tests/test_deploy.py @@ -10,7 +10,7 @@ import ray from ray import serve from ray._private.pydantic_compat import ValidationError -from ray._private.test_utils import SignalActor +from ray._private.test_utils import SignalActor, wait_for_condition from ray.serve._private.constants import RAY_SERVE_EAGERLY_START_REPLACEMENT_REPLICAS from ray.serve._private.utils import get_random_string from ray.serve.exceptions import RayServeException @@ -324,10 +324,16 @@ def make_nonblocking_calls(expected, expect_blocking=False): make_nonblocking_calls({"2": 2}) -def test_reconfigure_with_queries(serve_instance): +def test_reconfigure_does_not_run_while_there_are_active_queries(serve_instance): + """ + This tests checks that reconfigure can't trigger while there are active requests, + so that the actor's state is not mutated mid-request. + + https://github.com/ray-project/ray/pull/20315 + """ signal = SignalActor.remote() - @serve.deployment(max_ongoing_requests=10, num_replicas=3) + @serve.deployment(max_ongoing_requests=10, num_replicas=1) class A: def __init__(self): self.state = None @@ -340,17 +346,38 @@ async def __call__(self): return self.state["a"] handle = serve.run(A.options(version="1", user_config={"a": 1}).bind()) - responses = [handle.remote() for _ in range(30)] + responses = [handle.remote() for _ in range(10)] + + # Give the queries time to get to the replicas before the reconfigure. + wait_for_condition( + lambda: ray.get(signal.cur_num_waiters.remote()) == len(responses) + ) @ray.remote(num_cpus=0) def reconfigure(): serve.run(A.options(version="1", user_config={"a": 2}).bind()) + # Start the reconfigure; + # this will not complete until the signal is released + # to allow the queries to complete. reconfigure_ref = reconfigure.remote() + + # Release the signal to allow the queries to complete. signal.send.remote() + + # Wait for the reconfigure to complete. ray.get(reconfigure_ref) - assert all([r.result() == 1 for r in responses]) + # These should all be 1 because the queries were sent before the reconfigure, + # the reconfigure blocks until they complete, + # and we just waited for the reconfigure to finish. + results = [r.result() for r in responses] + print(results) + assert all([r == 1 for r in results]) + + # If we query again, it should be 2, + # because the reconfigure will have gone through after the + # original queries completed. assert handle.remote().result() == 2