Skip to content

Commit

Permalink
Stabilize MPI tests for Azure Linux
Browse files Browse the repository at this point in the history
  • Loading branch information
cyberbandya007 committed Nov 20, 2024
1 parent 5c8d025 commit 81559cd
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 12 deletions.
53 changes: 51 additions & 2 deletions lisa/features/infiniband.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from retry import retry

from lisa.base_tools import Cat, Sed, Uname, Wget
from lisa.tools.git import Git
from lisa.feature import Feature
from lisa.features import Disk
from lisa.operating_system import CBLMariner, Oracle, Redhat, Ubuntu
from lisa.tools import Firewall, Ls, Lspci, Make, Service
from lisa.tools import Chmod, Find, Firewall, Ls, Lspci, Make, Service
from lisa.tools.tar import Tar
from lisa.util import (
LisaException,
Expand Down Expand Up @@ -466,7 +467,6 @@ def install_intel_mpi(self) -> None:

def install_open_mpi(self) -> None:
node = self._node
# Install Open MPI
wget = node.tools[Wget]
tar_file = (
"https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz"
Expand Down Expand Up @@ -497,6 +497,55 @@ def install_open_mpi(self) -> None:
make.make("", cwd=openmpi_folder, sudo=True)
make.make_install(cwd=openmpi_folder, sudo=True)

def install_intel_mpi_benchmarking_tool(
    self, tool_names: List[str] = ["IMB-MPI1"]
) -> None:
    """Build Intel MPI Benchmarks (IMB) binaries from source on CBL-Mariner.

    Assumes an MPI package providing ``mpicc`` and ``mpicxx`` has already
    been built and installed on the node. On any other distro this method
    returns immediately without doing anything.

    The benchmarks are cloned into ``<node.working_path>/mpi-benchmarks`` and
    each requested target is built there with ``make``.

    Args:
        tool_names: IMB make targets to build, e.g. "IMB-MPI1", "IMB-RMA",
            "IMB-NBC". NOTE(review): the default is a list object shared
            between calls; it is only iterated here and must never be
            mutated.

    Raises:
        AssertionError: if ``mpicc`` or ``mpicxx`` cannot be located.
    """
    node = self._node
    if not isinstance(node.os, CBLMariner):
        # These tools are included in other distro packages
        return

    # Clone Intel MPI Benchmarks into the node's working directory.
    node.tools[Git].clone(
        url="https://github.com/intel/mpi-benchmarks.git",
        cwd=node.working_path,
    )
    imb_src_folder = node.get_pure_path(f"{node.working_path}/mpi-benchmarks")

    # Locate the MPI compiler wrappers. The whole filesystem is searched and
    # the first match for each wrapper is used.
    find = node.tools[Find]
    compiler_paths = []
    for compiler in ("mpicc", "mpicxx"):
        not_found_message = (
            f"Could not find location of {compiler} from MPI package"
        )
        find_results = find.find_files(
            node.get_pure_path("/"), compiler, sudo=True
        )
        assert_that(len(find_results)).described_as(
            not_found_message
        ).is_greater_than(0)
        compiler_path = find_results[0]
        assert_that(compiler_path).described_as(
            not_found_message
        ).is_not_empty()
        compiler_paths.append(compiler_path)
    mpicc_path, mpicxx_path = compiler_paths

    # Both wrappers are located before either one's permissions are touched.
    chmod = node.tools[Chmod]
    chmod.chmod(mpicc_path, "755", sudo=True)
    chmod.chmod(mpicxx_path, "755", sudo=True)

    make = node.tools[Make]
    for tool in tool_names:
        # Run make without a shell and without the "yes ''" pipe, passing the
        # located compiler wrappers explicitly.
        make.make(
            f"{tool} CC={mpicc_path} CXX={mpicxx_path}",
            cwd=imb_src_folder,
            sudo=True,
            shell=False,
            sendYesCmd=False,
        )
        # The build runs under sudo; set mode 755 on the produced binary.
        chmod.chmod(f"{imb_src_folder}/{tool}", "755", sudo=True)


def install_ibm_mpi(self, platform_mpi_url: str) -> None:
node = self._node
if isinstance(node.os, Redhat):
Expand Down
12 changes: 9 additions & 3 deletions lisa/tools/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ def make(
thread_count: int = 0,
update_envs: Optional[Dict[str, str]] = None,
ignore_error: bool = False,
shell: bool = True,
sendYesCmd: bool = True
) -> ExecutableResult:
expected_exit_code: Optional[int] = 0
if thread_count == 0:
Expand All @@ -95,13 +97,17 @@ def make(

if ignore_error:
expected_exit_code = None
# yes '' answers all questions with default value.
command = ""
if sendYesCmd:
# yes '' answers all questions with default value.
command = "yes '' | "

result = self.node.execute(
f"yes '' | make -j{thread_count} {arguments}",
f"{command} make -j{thread_count} {arguments}",
cwd=cwd,
timeout=timeout,
sudo=sudo,
shell=True,
shell=shell,
update_envs=update_envs,
expected_exit_code=expected_exit_code,
expected_exit_code_failure_message="Failed to make",
Expand Down
32 changes: 25 additions & 7 deletions microsoft/testsuites/hpc/infinibandsuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
simple_requirement,
)
from lisa.features import AvailabilitySetEnabled, Infiniband, Sriov
from lisa.operating_system import BSD, Windows
from lisa.operating_system import BSD, CBLMariner, Windows
from lisa.sut_orchestrator.azure.tools import Waagent
from lisa.tools import Find, KernelConfig, Ls, Modprobe, Ssh
from lisa.util import (
Expand Down Expand Up @@ -286,6 +286,9 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True

# Note: Using bash because script is not supported by Dash
# sh points to dash on Ubuntu
Expand All @@ -295,6 +298,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed intra-node pingpong test "
"with intel mpi",
Expand All @@ -306,6 +310,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
"/opt/intel/oneapi/mpi/2021.1.1/bin/IMB-MPI1 pingpong",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message="Failed inter-node pingpong test "
"with intel mpi",
Expand All @@ -319,6 +324,7 @@ def verify_intel_mpi(self, environment: Environment, log: Logger) -> None:
"-n 44 -env I_MPI_FABRICS=shm:ofi -env SECS_PER_SAMPLE=600 "
"-env FI_PROVIDER=mlx -env I_MPI_DEBUG=5 -env I_MPI_PIN_DOMAIN=numa "
f"/opt/intel/oneapi/mpi/2021.1.1/bin/{test}",
sudo=sudo,
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test with intel mpi",
timeout=3000,
Expand Down Expand Up @@ -360,10 +366,13 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_open_mpi, client_ib.install_open_mpi])

server_node.execute("ldconfig", sudo=True)
client_node.execute("ldconfig", sudo=True)

# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool()

# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
server_node.close()
Expand All @@ -386,7 +395,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
# Ping Pong test
find = server_node.tools[Find]
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of IMB-MPI1 for Open MPI"
Expand All @@ -407,7 +416,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:

# IMB-MPI Tests
find_results = find.find_files(
server_node.get_pure_path("/usr"), "IMB-MPI1", sudo=True
server_node.get_pure_path("/"), "IMB-MPI1", sudo=True
)
assert_that(len(find_results)).described_as(
"Could not find location of Open MPI test: IMB-MPI1"
Expand All @@ -417,7 +426,7 @@ def verify_open_mpi(self, environment: Environment, log: Logger) -> None:
"Could not find location of Open MPI test: IMB-MPI1"
).is_not_empty()
server_node.execute(
f"/usr/local/bin/mpirun --host {server_ip},{client_ip} "
f"/usr/local/bin/mpirun -hosts {server_ip},{client_ip} "
"-n 2 --mca btl self,vader,openib --mca btl_openib_cq_size 4096 "
"--mca btl_openib_allow_ib 1 --mca "
f"btl_openib_warn_no_device_params_found 0 {test_path}",
Expand Down Expand Up @@ -571,6 +580,12 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
raise SkippedException(err)

run_in_parallel([server_ib.install_mvapich_mpi, client_ib.install_mvapich_mpi])
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
# Only for mariner, we need to build intel benchmarking tools
# as they are not included in our packages
server_ib.install_intel_mpi_benchmarking_tool(tool_names=test_names)

server_node.execute("ldconfig", sudo=True)

# Restart the ssh sessions for changes to /etc/security/limits.conf
# to take effect
Expand All @@ -590,13 +605,15 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
client_ssh.enable_public_key(server_ssh.generate_key_pairs())
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)
sudo=False
if isinstance(server_node.os, CBLMariner):
sudo=True

# Run MPI tests
find = server_node.tools[Find]
test_names = ["IMB-MPI1", "IMB-RMA", "IMB-NBC"]
for test in test_names:
find_results = find.find_files(
server_node.get_pure_path("/usr"), test, sudo=True
server_node.get_pure_path("/"), test, sudo=True
)
assert_that(len(find_results)).described_as(
f"Could not find location of MVAPICH MPI test: {test}"
Expand All @@ -611,6 +628,7 @@ def verify_mvapich_mpi(self, environment: Environment, log: Logger) -> None:
expected_exit_code=0,
expected_exit_code_failure_message=f"Failed {test} test "
"with MVAPICH MPI",
sudo=sudo
)

def _check_nd_enabled(self, node: Node) -> None:
Expand Down

0 comments on commit 81559cd

Please sign in to comment.