Skip to content

Commit

Permalink
fabtests: Bugfixes for neuron
Browse files Browse the repository at this point in the history
This commit fixes the following bugs in the neuron fabtests:
1. Neuron accelerator detection was broken on some OSes because the
   full path of the `neuron-ls` executable was not used.

2. Before this commit, each pytest worker was assigned a single neuron
   core. This works for multi-node tests but fails for single-node tests
   because a neuron core can only be opened by a single process. This
   commit assigns two different neuron cores to each pytest worker for
   client-server tests: one for the server and one for the client. Trn1 has
   2 cores per neuron device and Trn2 has 8 cores per neuron device, so
   this assignment works for both.

3. When running in serial mode, the env var PYTEST_XDIST_WORKER is not
   set, so the NEURON_RT_VISIBLE_CORES env var was also not set. This
   caused the server to occupy all neuron cores, so the client failed.
   This commit assigns device 0 to both the server and the client when
   running with one worker.

Signed-off-by: Sai Sunku <[email protected]>
  • Loading branch information
sunkuamzn authored and shijin-aws committed Dec 28, 2024
1 parent 442fa89 commit f893f5f
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions fabtests/pytest/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def num_cuda_devices(ip):
@functools.lru_cache(10)
@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
def num_neuron_devices(ip):
proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
timeout=60, encoding="utf-8")

Expand All @@ -84,7 +84,7 @@ def num_neuron_devices(ip):
@functools.lru_cache(10)
@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
def num_neuron_cores_on_device(ip, device_id):
proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
timeout=60, encoding="utf-8")

Expand All @@ -97,7 +97,7 @@ def num_neuron_cores_on_device(ip, device_id):

@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
def is_neuron_device_available(ip, device_id):
proc = run("ssh {} neuron-ls -j".format(ip), shell=True,
proc = run("ssh {} /opt/aws/neuron/bin/neuron-ls -j".format(ip), shell=True,
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
timeout=60, encoding="utf-8")

Expand Down Expand Up @@ -455,19 +455,26 @@ def prepare_base_command(self, command_type, executable,
if "PYTEST_XDIST_WORKER" in os.environ:
worker_id = int(os.environ["PYTEST_XDIST_WORKER"].replace("gw", ""))
hmem_device_id = worker_id % num_hmem
if host_memory_type == "cuda":
command += " -i {}".format(hmem_device_id)
else:
assert host_memory_type == "neuron"
num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
else:
hmem_device_id = 0

if host_memory_type == "cuda":
command += " -i {}".format(hmem_device_id)
else:
assert host_memory_type == "neuron"
num_cores = num_neuron_cores_on_device(host_ip, hmem_device_id)
if command_type == "server":
additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
hmem_device_id * num_cores)
wait_until_neuron_device_available(host_ip, hmem_device_id)
else:
additional_environment = "NEURON_RT_VISIBLE_CORES={}".format(
hmem_device_id * num_cores + 1)
wait_until_neuron_device_available(host_ip, hmem_device_id)

if self._cmdline_args.provider == "efa":
import efa.efa_common
efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
command += " -d {}-rdm".format(efa_device)
if self._cmdline_args.provider == "efa":
import efa.efa_common
efa_device = efa.efa_common.get_efa_device_name_for_cuda_device(host_ip, hmem_device_id, num_hmem)
command += " -d {}-rdm".format(efa_device)

return command, additional_environment

Expand Down

0 comments on commit f893f5f

Please sign in to comment.