problematic test case
Signed-off-by: Rui Qiao <[email protected]>
ruisearch42 committed Dec 27, 2024
1 parent ab6b61f commit 5d35b71
Showing 1 changed file with 57 additions and 0 deletions.
python/ray/dag/tests/experimental/test_torch_tensor_dag.py
@@ -64,6 +64,9 @@ def send_or_raise(self, shape, dtype, value: int, raise_exception=False):
            raise RuntimeError()
        return torch.ones(shape, dtype=dtype, device=self.device) * value

    def echo(self, tensor):
        return tensor

    def recv(self, tensor):
        # Check that tensor got loaded to the correct device.
        assert tensor.device == self.device
@@ -344,6 +347,60 @@ def test_torch_tensor_nccl_overlap2(ray_start_regular):
        assert ray.get(ref) == [(i, shape, dtype)] * 2


@pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True)
def test_torch_tensor_nccl_overlap3(ray_start_regular):
"""
Test GPU communication can be overlapped when NCCL read and write
peers are different.
Our prior overlap algorithm works by moving a candidate READ (NCCL)
operation before a COMPUTE operation. If there are other READ (NCCL)
or WRITE (NCCL) operations in between, the swap is skipped to avoid
deadlock.
Now we make the algorithm less conservative by removing the constraint for a
WRITE (NCCL) operation in between. Specifically, if there is a WRITE (NCCL)
operation in between, and if its peer rank is different from the peer rank
of the candidate READ (NCCL) operation, we still allow the swap to happen.
We test this case here.
"""

    if not USE_GPU:
        pytest.skip("NCCL tests require GPUs")

    assert (
        sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) >= 3
    ), "This test requires at least 3 GPUs"

    actor_cls = TorchTensorWorker.options(num_cpus=0, num_gpus=1)

    worker1 = actor_cls.remote()
    worker2 = actor_cls.remote()
    worker3 = actor_cls.remote()

    shape = (10,)
    dtype = torch.float16

    # Test normal execution. Chain across three workers: worker2 reads from
    # worker1 and writes to worker3, so its NCCL read and write peers differ.
    with InputNode() as inp:
        w1 = worker1.send.bind(inp.shape, inp.dtype, inp[0])
        w1 = w1.with_type_hint(TorchTensorType(transport="nccl"))
        w2 = worker2.echo.bind(w1)
        w2 = w2.with_type_hint(TorchTensorType(transport="nccl"))
        w3 = worker3.echo.bind(w2)
        w3 = w3.with_type_hint(TorchTensorType(transport="nccl"))
        dag = worker1.recv.bind(w3)

    compiled_dag = dag.experimental_compile(_overlap_gpu_communication=True)

    # Test that we can pass different shapes and data.
    for i in range(3):
        shape = (10 * (i + 1),)
        ref = compiled_dag.execute(i, shape=shape, dtype=dtype)
        print(ray.get(ref))
        # assert ray.get(ref) == [(i, shape, dtype)] * 2
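
Below is a minimal, hypothetical sketch of the relaxed swap condition that the docstring above describes. All names here (Op, OpType, can_move_read_before) and the rank values are illustrative assumptions, not Ray's actual compiled-graph scheduler API.

from dataclasses import dataclass
from enum import Enum
from typing import List


class OpType(Enum):
    READ = "READ"
    COMPUTE = "COMPUTE"
    WRITE = "WRITE"


@dataclass
class Op:
    op_type: OpType
    requires_nccl: bool = False
    peer_rank: int = -1  # NCCL peer rank; -1 when the op is not a NCCL op


def can_move_read_before(candidate_read: Op, ops_in_between: List[Op]) -> bool:
    """Return True if a candidate NCCL READ may be hoisted above ops_in_between.

    Old rule: any intervening NCCL READ or WRITE blocks the swap.
    Relaxed rule: an intervening NCCL WRITE blocks the swap only when it
    targets the same peer rank as the candidate READ.
    """
    for op in ops_in_between:
        if not op.requires_nccl:
            continue
        if op.op_type == OpType.READ:
            # Reordering against another NCCL read could deadlock; skip.
            return False
        if op.op_type == OpType.WRITE and op.peer_rank == candidate_read.peer_rank:
            # A NCCL write to the same peer could deadlock if reordered;
            # a write to a different peer is allowed through.
            return False
    return True


# Example matching the DAG above (assumed ranks): worker2 reads from worker1
# (rank 0) and writes to worker3 (rank 2), so the write does not block the swap.
read_from_rank0 = Op(OpType.READ, requires_nccl=True, peer_rank=0)
write_to_rank2 = Op(OpType.WRITE, requires_nccl=True, peer_rank=2)
assert can_move_read_before(read_from_rank0, [write_to_rank2])
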


def test_torch_tensor_nccl_disallows_driver(ray_start_regular):
"""
Check that the driver cannot participate in the NCCL group, i.e. DAG input
Expand Down
