From d61075a94588ec30ae80e5c79929ed5f1179993e Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 22 Mar 2024 13:50:07 -0700 Subject: [PATCH 1/4] contrib/intel/jenkins: Point Jenkinsfile at Intel Internal Repo Scripts Intel CI scripts have been moved to an internal repository. The jenkinsfile needs to be updated to point at those instead of the ones in libfabric Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 70b9a874e9f..292dfbb904a 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -3,7 +3,6 @@ import groovy.transform.Field properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def DO_RUN=true @Field def TARGET="main" -@Field def SCRIPT_LOCATION="upstream/libfabric/contrib/intel/jenkins" @Field def RELEASE=false @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def PYTHON_VERSION="3.9" @@ -320,9 +319,9 @@ pipeline { environment { JOB_CADENCE = 'PR' WITH_ENV="'PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH'" - RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/" CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" DELETE_LOCATION="${env.CUSTOM_WORKSPACE}/middlewares" + RUN_LOCATION="${env.CUSTOM_WORKSPACE}/ci_resources/legacy_pipeline_scripts/" LOG_DIR = "${env.CUSTOM_WORKSPACE}/log_dir" } stages { @@ -766,7 +765,7 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") { + dir (RUN_LOCATION) { dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf" cmd = """ python3.9 runtests.py --test=dmabuf \ --prov=verbs --util=rxm --build_hw=gpu""" From 418a84ac62465bee05f712d4bc81399e6d2ac799 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 22 Mar 2024 13:53:54 -0700 Subject: [PATCH 2/4] contrib/intel/jenkins: Remove Intel CI pipeline scripts from ofiwg libfabric Pipeline scripts have been put into an Intel internal repository for use instead Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/build.py | 253 ------ contrib/intel/jenkins/common.py | 169 ---- contrib/intel/jenkins/run.py | 228 ------ contrib/intel/jenkins/runtests.py | 154 ---- contrib/intel/jenkins/summary.py | 955 ---------------------- contrib/intel/jenkins/tests.py | 1238 ----------------------------- 6 files changed, 2997 deletions(-) delete mode 100755 contrib/intel/jenkins/build.py delete mode 100755 contrib/intel/jenkins/common.py delete mode 100755 contrib/intel/jenkins/run.py delete mode 100755 contrib/intel/jenkins/runtests.py delete mode 100755 contrib/intel/jenkins/summary.py delete mode 100755 contrib/intel/jenkins/tests.py diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py deleted file mode 100755 index 122a707cb66..00000000000 --- a/contrib/intel/jenkins/build.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import sys - -# add jenkins config location to PATH -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config - -import argparse -import subprocess -import shlex -import common -import re -import shutil - -def build_libfabric(libfab_install_path, mode, hw_type, gpu=False, cuda=False): - - if (os.path.exists(libfab_install_path) != True): - os.makedirs(libfab_install_path) - - config_cmd = ['./configure', f'--prefix={libfab_install_path}'] - enable_prov_val = 'yes' - - if (mode == 'dbg'): - config_cmd.append('--enable-debug') - elif (mode == 'dl'): - enable_prov_val = 'dl' - - for prov in common.providers[hw_type]['enable']: - config_cmd.append(f'--enable-{prov}={enable_prov_val}') - - for prov in common.providers[hw_type]['disable']: - config_cmd.append(f'--enable-{prov}=no') - - for op in common.common_disable_list: - config_cmd.append(f'--enable-{op}=no') - - if (gpu): - config_cmd.append('--enable-ze-dlopen') - else: - config_cmd.append('--with-ze=no') - - if (cuda): - config_cmd.append(f'--with-cuda={os.environ["CUDA_INSTALL"]}') - - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(" ".join(config_cmd))) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make','install']) - - -def build_fabtests(libfab_install_path, mode, cuda=False): - if (mode == 'dbg'): - config_cmd = ['./configure', '--enable-debug', - f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - else: - config_cmd = ['./configure', f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - - if cuda: - config_cmd.append(f'--with-cuda={os.environ["CUDA_INSTALL"]}') - - common.run_command(['./autogen.sh']) - common.run_command(config_cmd) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make', 'install']) - -def build_mpich(install_path, libfab_installpath, hw_type): - mpich_build_dir = f'{install_path}/middlewares/mpich_{hw_type}/mpich' - cwd = os.getcwd() - if (os.path.exists(mpich_build_dir)): - print("configure mpich") - os.chdir(mpich_build_dir) - configure_cmd = f"./configure " - configure_cmd += f"--prefix={install_path}/middlewares/mpich_{hw_type} " - configure_cmd += f"--with-libfabric={libfab_installpath} " - configure_cmd += "--disable-oshmem " - configure_cmd += "--disable-fortran " - configure_cmd += "--without-ch4-shmmods " - configure_cmd += "--with-device=ch4:ofi " - configure_cmd += "--without-ze " - print(configure_cmd) - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(configure_cmd)) - common.run_command(['make','-j']) - common.run_command(['make','install']) - os.chdir(cwd) - -def build_mpich_osu(install_path, libfab_installpath, hw_type): - mpich_build = f'{install_path}/middlewares/mpich_{hw_type}' - osu_build_dir = f'{install_path}/middlewares/mpich_{hw_type}/osu_source' - cwd = os.getcwd() - if (os.path.exists(osu_build_dir)): - os.chdir(osu_build_dir) - if 'LD_LIBRARY_PATH' in dict(os.environ).keys(): - ld_library_path = os.environ['LD_LIBRARY_PATH'] - else: - ld_library_path = '' - - if 'PATH' in dict(os.environ).keys(): - path = os.environ['PATH'] - else: - path = '' - - os.environ['CC']=f'{mpich_build}/bin/mpicc' - os.environ['CXX']=f'{mpich_build}/bin/mpicxx' - os.environ['CFLAGS']=f'-I{osu_build_dir}/util' - os.environ['PATH']=f'{libfab_installpath}/bin:{mpich_build}/bin/:{path}' - os.environ['LD_LIBRARY_PATH']=f'{libfab_installpath}/lib:'\ - f'{mpich_build}/bin/lib:{ld_library_path}' - configure_cmd = f"./configure " - configure_cmd += f"--prefix={mpich_build}/osu " - print(f"Building OSU Tests: {configure_cmd}") - common.run_command(shlex.split(configure_cmd)) - common.run_command(shlex.split("make -j install")) - os.chdir(cwd) - os.environ['PATH'] = path - os.environ['LD_LIBRARY_PATH'] = ld_library_path - -def build_shmem(install_path, libfab_installpath, hw_type): - shmem_build = f'{install_path}/middlewares/shmem_{hw_type}' - shmem_build_dir = f'{shmem_build}/SOS' - cwd = os.getcwd() - if (os.path.exists(shmem_build_dir)): - os.chdir(shmem_build_dir) - - command = "bash -c \'" - command += "./autogen.sh; " - command += "./configure " - command += f"--prefix={shmem_build} " - command += "--disable-fortran " - command += "--enable-pmi-simple " - command += "--enable-hard-polling " - command += "--enable-manual-progress " - if hw_type == 'water': - command += "--enable-ofi-mr=basic " - - command += f"--with-ofi={libfab_installpath}; " - - command += "make clean; " - command += "make -j; " - command += "make check TESTS=; " - command += "make install" - command += "\'" - - common.run_command(shlex.split(command)) - - os.chdir(cwd) - - -def copy_build_dir(install_path): - middlewares_path = f'{install_path}/middlewares' - if (os.path.exists(middlewares_path) != True): - os.makedirs(f'{install_path}/middlewares') - - shutil.copytree(f'{cloudbees_config.build_dir}/shmem_grass', - f'{middlewares_path}/shmem_grass') - shutil.copytree(f'{cloudbees_config.build_dir}/shmem_water', - f'{middlewares_path}/shmem_water') - shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', - f'{middlewares_path}/oneccl') - shutil.copytree(f'{cloudbees_config.build_dir}/mpich_water', - f'{middlewares_path}/mpich_water') - shutil.copytree(f'{cloudbees_config.build_dir}/mpich_grass', - f'{middlewares_path}/mpich_grass') - - os.symlink(f'{cloudbees_config.build_dir}/impi', - f'{middlewares_path}/impi') - os.symlink(f'{cloudbees_config.build_dir}/ompi', - f'{middlewares_path}/ompi') - os.symlink(f'{cloudbees_config.build_dir}/oneccl_gpu', - f'{middlewares_path}/oneccl_gpu') - -def copy_file(file_name): - if (os.path.exists(f'{workspace}/{file_name}')): - shutil.copyfile(f'{workspace}/{file_name}', - f'{install_path}/log_dir/{file_name}') - -def log_dir(install_path, release=False): - if (os.path.exists(f'{install_path}/log_dir') != True): - os.makedirs(f'{install_path}/log_dir') - - if (release): - copy_file('Makefile.am.diff') - copy_file('configure.ac.diff') - copy_file('release_num.txt') - -if __name__ == "__main__": -#read Jenkins environment variables - # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' - # job name is better to use to distinguish between builds of different - # jobs but with same branch name. - jobname = os.environ['JOB_NAME'] - buildno = os.environ['BUILD_NUMBER'] - workspace = os.environ['WORKSPACE'] - custom_workspace = os.environ['CUSTOM_WORKSPACE'] - - parser = argparse.ArgumentParser() - parser.add_argument('--build_item', help="build libfabric or fabtests", \ - choices=['libfabric', 'fabtests', 'builddir', 'logdir',\ - 'mpich', 'shmem']) - parser.add_argument('--build_hw', help="HW type for build", - choices=['water', 'grass', 'fire', 'electric', 'ucx', - 'daos', 'gpu', 'ivysaur', 'cyndaquil', - 'quilava']) - parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ - "build mode", choices=['reg', 'dbg', 'dl']) - parser.add_argument('--build_loc', help="build location for libfabric "\ - "and fabtests", type=str, default='./') - parser.add_argument('--release', help="This job is likely testing a "\ - "release and will be checked into a git tree.", - action='store_true') - parser.add_argument('--gpu', help="Enable ZE dlopen", action='store_true') - parser.add_argument('--cuda', help="Enable cuda", action='store_true') - - args = parser.parse_args() - build_item = args.build_item - build_hw = args.build_hw - build_loc = args.build_loc - release = args.release - gpu = args.gpu - cuda = args.cuda - - if (args.ofi_build_mode): - ofi_build_mode = args.ofi_build_mode - else: - ofi_build_mode = 'reg' - - libfab_install_path = f'{custom_workspace}/{build_hw}/{ofi_build_mode}' - - p = re.compile('mpi*') - - curr_dir = os.getcwd() - os.chdir(build_loc) - - if (build_item == 'libfabric'): - build_libfabric(libfab_install_path, ofi_build_mode, build_hw, gpu, - cuda) - elif (build_item == 'fabtests'): - build_fabtests(libfab_install_path, ofi_build_mode, cuda) - elif (build_item == 'builddir'): - copy_build_dir(custom_workspace) - elif (build_item == 'logdir'): - log_dir(custom_workspace, release) - elif(build_item == 'mpich'): - build_mpich(custom_workspace, libfab_install_path, build_hw) - build_mpich_osu(custom_workspace, libfab_install_path, build_hw) - elif (build_item == 'shmem'): - build_shmem(custom_workspace, libfab_install_path, build_hw) - - os.chdir(curr_dir) diff --git a/contrib/intel/jenkins/common.py b/contrib/intel/jenkins/common.py deleted file mode 100755 index 205c678bf8d..00000000000 --- a/contrib/intel/jenkins/common.py +++ /dev/null @@ -1,169 +0,0 @@ -import collections -import subprocess -import sys -import os -from subprocess import Popen, TimeoutExpired -from time import sleep - -def get_node_name(host, interface): - return '%s-%s' % (host, interface) - -def run_command(command): - print(" ".join(command)) - p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) - print(p.returncode) - while True: - out = p.stdout.read(1) - if (out == '' and p.poll() != None): - break - if (out != ''): - sys.stdout.write(out) - sys.stdout.flush() - - print(f"Return code is {p.returncode}") - if (p.returncode != 0): - print("exiting with " + str(p.poll())) - sys.exit(p.returncode) - -def run_logging_command(command, log_file): - print("filename: ".format(log_file)) - f = open(log_file, 'a') - print(" ".join(command)) - p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) - print(p.returncode) - f.write(" ".join(command) + '\n') - while True: - out = p.stdout.read(1) - f.write(out) - if (out == '' and p.poll() != None): - break - if (out != ''): - sys.stdout.write(out) - sys.stdout.flush() - - print(f"Return code is {p.returncode}") - if (p.returncode != 0): - print("exiting with " + str(p.poll())) - f.close() - sys.exit(p.returncode) - f.close() - -def read_file(file_name): - with open(file_name) as file_out: - output = file_out.read() - return output - -class ClientServerTest: - def __init__(self, server_cmd, client_cmd, server_log, client_log, - timeout=None): - self.server_cmd = server_cmd - self.client_cmd = client_cmd - self.server_log = server_log - self.client_log = client_log - self._timeout = timeout - - def run(self): - server_process = Popen( - f"{self.server_cmd} > {self.server_log} 2>&1", - shell=True, close_fds=True - ) - sleep(1) - client_process = Popen( - f"{self.client_cmd} > {self.client_log} 2>&1", - shell=True, close_fds=True - ) - - try: - server_process.wait(timeout=self._timeout) - except TimeoutExpired: - server_process.terminate() - - try: - client_process.wait(timeout=self._timeout) - except TimeoutExpired: - client_process.terminate() - - server_output = read_file(self.server_log) - client_output = read_file(self.client_log) - - print("") - print(f"server_command: {self.server_cmd}") - print('server_stdout:') - print(server_output) - print(f"client_command: {self.client_cmd}") - print('client_stdout:') - print(client_output) - - return (server_process.returncode, client_process.returncode) - -Prov = collections.namedtuple('Prov', 'core util') -prov_list = [ - Prov('psm3', None), - Prov('verbs', None), - Prov('verbs', 'rxd'), - Prov('verbs', 'rxm'), - Prov('sockets', None), - Prov('tcp', None), - Prov('udp', None), - Prov('udp', 'rxd'), - Prov('shm', None), - Prov('ucx', None) -] - -providers = { - 'daos' : { - 'enable' : ['verbs', 'tcp'], - 'disable' : [] - }, - 'gpu' : { - 'enable' : ['verbs', 'shm'], - 'disable' : ['psm3'] - }, - 'dsa' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'ucx' : { - 'enable' : ['ucx'], - 'disable' : [] - }, - 'water' : { - 'enable' : ['tcp', 'verbs', 'psm3', 'sockets'], - 'disable' : [] - }, - 'grass' : { - 'enable' : ['tcp', 'sockets', 'udp', 'shm'], - 'disable' : [] - }, - 'fire' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'cyndaquil' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'quilava' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'ivysaur': { - 'enable' : ['tcp'], - 'disable' : [] - }, - 'electric' : { - 'enable' : ['shm'], - 'disable' : [] - } -} - -common_disable_list = [ - 'usnic', - 'efa', - 'perf', - 'hook_debug', - 'mrail', - 'opx' -] - -cloudbees_log_start_string = "Begin Cloudbees Test Output" diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py deleted file mode 100755 index e7f286acce7..00000000000 --- a/contrib/intel/jenkins/run.py +++ /dev/null @@ -1,228 +0,0 @@ -import tests -import subprocess -import sys -import argparse -import os -import common - -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config - -# read Jenkins environment variables -# In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' -# job name is better to use to distinguish between builds of different -# jobs but with the same branch name. -fab = os.environ['FABRIC'] -if 'slurm' in fab: - fab = cloudbees_config.fabric_map[f"{os.environ['SLURM_JOB_PARTITION']}"] - -jbname = os.environ['JOB_NAME']#args.jobname -bno = os.environ['BUILD_NUMBER']#args.buildno - -def fi_info_test(hw, core, hosts, mode, user_env, log_file, util): - - fi_info_test = tests.FiInfoTest(jobname=jbname,buildno=bno, - testname='fi_info', hw=hw, core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - print('-------------------------------------------------------------------') - print(f"Running fi_info test for {core}-{util}-{fab}") - fi_info_test.execute_cmd() - print('-------------------------------------------------------------------') - -def fabtests(hw, core, hosts, mode, user_env, log_file, util, way): - - runfabtest = tests.Fabtest(jobname=jbname,buildno=bno, - testname='runfabtests', hw=hw, core_prov=core, - fabric=fab, hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util, way=way) - - print('-------------------------------------------------------------------') - if (runfabtest.execute_condn): - print(f"Running Fabtests for {core}-{util}-{fab}") - runfabtest.execute_cmd() - else: - print(f"Skipping {core} {runfabtest.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def shmemtest(hw, core, hosts, mode, user_env, log_file, util, weekly): - runshmemtest = tests.ShmemTest(jobname=jbname,buildno=bno, - testname="shmem test", hw=hw, core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util, - weekly=weekly) - - print('-------------------------------------------------------------------') - if (runshmemtest.execute_condn): - print(f"Running shmem SOS test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("sos") - - print('--------------------------------------------------------------') - print(f"Running shmem PRK test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("prk") - - print('--------------------------------------------------------------') - print(f"Running shmem ISx test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("isx") - else: - print(f"Skipping {core} {runshmemtest.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def multinodetest(hw, core, hosts, mode, user_env, log_file, util): - - runmultinodetest = tests.MultinodeTests(jobname=jbname,buildno=bno, - testname="multinode performance test", - hw=hw, core_prov=core, fabric=fab, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util) - - print("-------------------------------------------------------------------") - if (runmultinodetest.execute_condn): - print("Running multinode performance test for {}-{}-{}" \ - .format(core, util, fab)) - runmultinodetest.execute_cmd() - - print("---------------------------------------------------------------") - else: - print("Skipping {} as execute condition fails" \ - .format(runmultinodetest.testname)) - print("-------------------------------------------------------------------") - -def intel_mpi_benchmark(hw, core, hosts, mpi, mode, group, user_env, log_file, - util): - - imb = tests.IMBtests(jobname=jbname, buildno=bno, - testname='IntelMPIbenchmark', core_prov=core, hw=hw, - fabric=fab, hosts=hosts, mpitype=mpi, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, test_group=group, util_prov=util) - - print('-------------------------------------------------------------------') - if (imb.execute_condn == True): - print(f"Running IMB-tests for {core}-{util}-{fab}-{mpi}") - imb.execute_cmd() - else: - print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def mpich_test_suite(hw, core, hosts, mpi, mode, user_env, log_file, util, - weekly=None): - - mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno, - testname="MpichTestSuite",core_prov=core, - hw=hw, fabric=fab, mpitype=mpi, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util, weekly=weekly) - - print('-------------------------------------------------------------------') - if (mpich_tests.execute_condn == True): - print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}") - mpich_tests.execute_cmd() - else: - print(f"Skipping {mpi.upper()} {mpich_tests.testname} exec condn fails") - print('-------------------------------------------------------------------') - -def osu_benchmark(hw, core, hosts, mpi, mode, user_env, log_file, util): - - osu_test = tests.OSUtests(jobname=jbname, buildno=bno, - testname='osu-benchmarks', core_prov=core, - hw=hw, fabric=fab, mpitype=mpi, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (osu_test.execute_condn == True): - print(f"Running OSU-Test for {core}-{util}-{fab}-{mpi}") - osu_test.execute_cmd() - else: - print(f"Skipping {mpi.upper()} {osu_test.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def oneccltest(hw, core, hosts, mode, user_env, log_file, util): - - runoneccltest = tests.OneCCLTests(jobname=jbname,buildno=bno, - testname="oneccl test", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (runoneccltest.execute_condn): - print(f"Running oneCCL cpu tests for {core}-{util}-{fab}") - runoneccltest.execute_cmd() - else: - print(f"Skipping {runoneccltest.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def oneccltestgpu(hw, core, hosts, mode, user_env, log_file, util): - - runoneccltestgpu = tests.OneCCLTestsGPU(jobname=jbname,buildno=bno, - testname="oneccl GPU test", - core_prov=core, hw=hw, fabric=fab, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util) - - print('-------------------------------------------------------------------') - if (runoneccltestgpu.execute_condn): - print(f"Running oneCCL GPU examples test for {core}-{util}-{fab}") - runoneccltestgpu.execute_cmd('examples') - - print('---------------------------------------------------------------') - print(f"Running oneCCL GPU functional test for {core}-{util}-{fab}") - runoneccltestgpu.execute_cmd('functional') - else: - print(f"Skipping {runoneccltestgpu.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def daos_cart_tests(hw, core, hosts, mode, user_env, log_file, util): - - runcarttests = tests.DaosCartTest(jobname=jbname, buildno=bno, - testname="Daos Cart Test", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (runcarttests.execute_condn): - print(f"Running cart test for {core}-{util}-{fab}") - runcarttests.execute_cmd() - print('-------------------------------------------------------------------') - -def dmabuftests(hw, core, hosts, mode, user_env, log_file, util): - - rundmabuftests = tests.DMABUFTest(jobname=jbname,buildno=bno, - testname="DMABUF Tests", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (rundmabuftests.execute_condn): - print(f"Running dmabuf H->H tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('H2H') - - print('---------------------------------------------------------------') - print(f"Running dmabuf H->D tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('H2D') - - print('---------------------------------------------------------------') - print(f"Running dmabuf D->H tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('D2H') - - print('---------------------------------------------------------------') - print(f"Running dmabuf D->D tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('D2D') - - print('---------------------------------------------------------------') - else: - print(f"Skipping {rundmabuftests.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -if __name__ == "__main__": - pass diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py deleted file mode 100755 index 6a974335c32..00000000000 --- a/contrib/intel/jenkins/runtests.py +++ /dev/null @@ -1,154 +0,0 @@ -import argparse -import os -import sys -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config -import subprocess -import run -import common -import shlex - -class ParseDict(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, dict()) - for value in values: - key, value = value.split('=') - getattr(namespace, self.dest)[key] = value - -parser = argparse.ArgumentParser() -parser.add_argument('--build_hw', help="HW type for build", - choices=['water', 'grass', 'fire', 'electric', 'daos',\ - 'gpu', 'ucx', 'ivysaur', 'cyndaquil', 'quilava']) -parser.add_argument('--prov', help="core provider", choices=['verbs', \ - 'tcp', 'udp', 'sockets', 'shm', 'psm3', 'ucx']) -parser.add_argument('--util', help="utility provider", choices=['rxd', 'rxm']) -parser.add_argument('--ofi_build_mode', help="specify the build configuration",\ - choices = ['reg', 'dbg', 'dl'], default='reg') -parser.add_argument('--test', help="specify test to execute", \ - choices = ['all', 'shmem', 'IMB', 'osu', 'oneccl', \ - 'mpichtestsuite', 'fabtests', 'onecclgpu', \ - 'fi_info', 'daos', 'multinode', 'dmabuf']) - -parser.add_argument('--imb_grp', help="IMB test group 1:[MPI1, P2P], \ - 2:[EXT, IO], 3:[NBC, RMA, MT]", choices=['1', '2', '3']) -parser.add_argument('--way', help="direction to run with device option", - choices=['h2d', 'd2d', 'xd2d'], default=None) -parser.add_argument('--user_env', help="Run with additional environment " \ - "variables", nargs='*', action=ParseDict, default={}) -parser.add_argument('--mpi', help="Select mpi to use for middlewares", - choices=['impi', 'mpich', 'ompi'], default='impi') -parser.add_argument('--log_file', help="Full path to log file", - default=os.environ['DEFAULT_LOG_LOCATION'], type=str) -parser.add_argument('--weekly', help="run weekly", default=False, type=bool) - -args = parser.parse_args() -build_hw = args.build_hw -args_core = args.prov -args_util = args.util -user_env = args.user_env -log_file = args.log_file -weekly = args.weekly - -if (args.ofi_build_mode): - ofi_build_mode = args.ofi_build_mode -else: - ofi_build_mode='reg' - -if (args.test): - run_test = args.test -else: - run_test = 'all' - -if (args.imb_grp): - imb_group = args.imb_grp -else: - imb_group = '1' - -mpi = args.mpi -way = args.way - -hosts = [] -if 'slurm' in os.environ['FABRIC']: - slurm_nodes = os.environ['SLURM_JOB_NODELIST'] # example cb[1-4,11] - common.run_command(shlex.split(f"sinfo --Format=Features -n {slurm_nodes}")) - if int(os.environ['SLURM_NNODES']) == 1: - hosts.append(slurm_nodes) - else: - prefix = slurm_nodes[0:slurm_nodes.find('[')] - nodes = slurm_nodes[slurm_nodes.find('[') + 1 : - slurm_nodes.find(']')].split(',') # ['1-4', '11'] - for item in nodes: # ['1-4', '11'] -> ['cb1', 'cb2', 'cb3', 'cb4', 'cb11'] - if '-' in item: - rng = item.split('-') - node_list = list(range(int(rng[0]), int(rng[1]) + 1)) - for node in node_list: - hosts.append(f'{prefix}{node}') - else: - hosts.append(f'{prefix}{item}') -else: - node = (os.environ['NODE_NAME']).split('_')[0] - hosts = [node] - for host in cloudbees_config.node_map[node]: - hosts.append(host) - print(f"hosts = {hosts}") - -print(common.cloudbees_log_start_string) - -#this script is executed from /tmp -#this is done since some mpi tests -#look for a valid location before running -# the test on the secondary host(client) -# but jenkins only creates a valid path on -# the primary host (server/test node) - -os.chdir('/tmp/') - -if(args_core): - if (run_test == 'all' or run_test == 'fi_info'): - run.fi_info_test(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, util=args.util) - - if (run_test == 'all' or run_test == 'fabtests'): - run.fabtests(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util, way) - - if (run_test == 'all' or run_test == 'shmem'): - run.shmemtest(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util, weekly) - - if (run_test == 'all' or run_test == 'oneccl'): - run.oneccltest(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util) - - if (run_test == 'all' or run_test == 'onecclgpu'): - run.oneccltestgpu(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'daos'): - run.daos_cart_tests(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'multinode'): - run.multinodetest(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'mpichtestsuite'): - run.mpich_test_suite(build_hw, args_core, hosts, mpi, - ofi_build_mode, user_env, log_file, - args_util, weekly) - - if (run_test == 'all' or run_test == 'IMB'): - run.intel_mpi_benchmark(build_hw, args_core, hosts, mpi, - ofi_build_mode, imb_group, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'osu'): - run.osu_benchmark(build_hw, args_core, hosts, mpi, - ofi_build_mode, user_env, log_file, - args_util) - - if (run_test == 'all' or run_test == 'dmabuf'): - run.dmabuftests(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) -else: - print("Error : Specify a core provider to run tests") diff --git a/contrib/intel/jenkins/summary.py b/contrib/intel/jenkins/summary.py deleted file mode 100755 index fa8a7a37f1a..00000000000 --- a/contrib/intel/jenkins/summary.py +++ /dev/null @@ -1,955 +0,0 @@ -from abc import ABC, abstractmethod -import shutil -from datetime import datetime -from typing import Tuple -import os -from pickle import FALSE -import sys -import smtplib -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText -from email.mime.base import MIMEBase -from email import encoders - -# add jenkins config location to PATH -sys.path.append(f"{os.environ['CUSTOM_WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") - -import cloudbees_config -import argparse -import common - -verbose = False - -class SendEmail: - def __init__(self, sender=None, receivers=None, attachment=None): - self.sender = sender if sender is not None else os.environ['SENDER'] - self.receivers = (receivers if receivers is not None else \ - f"{os.environ['RECEIVER']}").split(',') - self.attachment = attachment - self.work_week = datetime.today().isocalendar()[1] - self.msg = MIMEMultipart() - - def __add_attachments(self): - print(f"Attachment is {self.attachment}") - if self.attachment is None: - return - - attachment = MIMEBase('application', 'octet-stream') - attachment.set_payload(open(self.attachment, 'rb').read()) - encoders.encode_base64(attachment) - name = f"Jenkins_Summary_ww{self.work_week}" - if (verbose): - name = f"{name}_all" - attachment.add_header('Content-Disposition', - f"attachment; filename={name}") - self.msg.attach(attachment) - - def __write_msg(self): - self.msg['Subject'] = f"Cloudbees Summary {os.environ['JOB_NAME']}" - self.msg['From'] = self.sender - self.msg['To'] = ", ".join(self.receivers) - self.msg.attach(MIMEText(f"WW{self.work_week} Summary for Libfabric "\ - "From Cloudbees")) - - def send_mail(self): - self.__write_msg() - self.__add_attachments() - server = smtplib.SMTP(os.environ['SMTP_SERVER'], - os.environ['SMTP_PORT']) - server.sendmail(self.sender, self.receivers, self.msg.as_string()) - server.quit() - -class Release: - def __init__(self, log_dir, output_file, logger, release_num): - self.log_dir = log_dir - self.output_file = output_file - self.logger = logger - self.release_num = release_num - - def __log_entire_file(self, file_name): - with open(file_name) as f: - for line in f: - self.logger.log(line, end_delimiter = '') - - def __append_release_changes(self, file_name): - if os.path.exists(file_name): - self.__log_entire_file(file_name) - - def add_release_changes(self): - self.logger.log(F"Release number: {self.release_num}") - self.__append_release_changes(f'{self.log_dir}/Makefile.am.diff') - self.__append_release_changes(f'{self.log_dir}/configure.ac.diff') - -class Logger: - def __init__(self, output_file, release): - self.output_file = output_file - self.release = release - self.padding = '\t' - - def log(self, line, end_delimiter='\n', lpad=0, ljust=0): - print(f'{self.padding * lpad}{line}'.ljust(ljust), end = end_delimiter) - self.output_file.write(f'{self.padding * lpad}{line}{end_delimiter}') - -class Summarizer(ABC): - @classmethod - def __subclasshook__(cls, subclass): - return ( - hasattr(subclass, "print_results") - and callable(subclass.print_results) - and hasattr(subclass, "check_features") - and callable(subclass.check_features) - and hasattr(subclass, "check_node") - and callable(subclass.check_node) - and hasattr(subclass, "check_name") - and callable(subclass.check_name) - and hasattr(subclass, "check_pass") - and callable(subclass.check_pass) - and hasattr(subclass, "check_fail") - and callable(subclass.check_fail) - and hasattr(subclass, "check_exclude") - and callable(subclass.check_exclude) - and hasattr(subclass, "fast_forward") - and callable(subclass.fast_forward) - and hasattr(subclass, "read_file") - and callable(subclass.read_file) - and hasattr(subclass, "run") - and callable(subclass.run) - or NotImplemented - ) - - @abstractmethod - def __init__(self, logger, log_dir, prov, file_name, stage_name): - self.logger = logger - self.log_dir = log_dir - self.prov = prov - self.file_name = file_name - self.stage_name = stage_name - self.file_path = os.path.join(self.log_dir, self.file_name) - self.exists = os.path.exists(self.file_path) - self.log = None - self.passes = 0 - self.passed_tests = [] - self.fails = 0 - self.failed_tests = [] - self.excludes = 0 - self.excluded_tests = [] - self.error = 0 - self.errored_tests = [] - self.test_name ='no_test' - self.name = 'no_name' - self.features = "no_features_found" - self.node = "no_node_found" - - def print_results(self): - total = self.passes + self.fails - # log was empty or not valid - if not total: - return - - percent = self.passes/total * 100 - if (verbose): - self.logger.log( - f"<>{self.stage_name} : ", lpad=1, ljust=50, end_delimiter = '' - ) - else: - self.logger.log( - f"{self.stage_name} : ", - lpad=1, ljust=50, end_delimiter = '' - ) - self.logger.log( - f"{self.node} : ", - lpad=1, ljust=20, end_delimiter = '' - ) - self.logger.log( - f"[{self.features}] : ", - lpad=1, ljust=30, end_delimiter = '' - ) - self.logger.log(f"{self.passes}:{total} ", ljust=10, end_delimiter = '') - self.logger.log(f": {percent:.2f}% : ", ljust=12, end_delimiter = '') - self.logger.log("Pass", end_delimiter = '') - if (self.excludes > 0): - self.logger.log(f" : {self.excludes:3.0f} : Excluded/Notrun") - else: - self.logger.log("") - - if (verbose and self.passes): - self.logger.log(f"Passed tests: {self.passes}", lpad=2) - for test in self.passed_tests: - self.logger.log(f'{test}', lpad=3) - if self.fails: - self.logger.log(f"Failed tests: {self.fails}", lpad=2) - for test in self.failed_tests: - self.logger.log(f'{test}', lpad=3) - if (verbose): - if self.excludes: - self.logger.log( - f"Excluded/Notrun tests: {self.excludes} ", lpad=2 - ) - for test in self.excluded_tests: - self.logger.log(f'{test}', lpad=3) - - if self.error: - self.logger.log( - "Errored, Interrupt, or Canceled Tests: "\ - f"{self.excludes} ", lpad=2 - ) - for test in self.errored_tests: - self.logger.log(f'{test}', lpad=3) - - def check_features(self, previous, line): - if ('avail_features') in previous: - self.features = line.strip() - - def check_node(self, line): - if ('slurm_nodelist' in line): - self.node = line.strip().split('=')[1] - - def check_name(self, line): - return - - def check_pass(self, line): - return - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - - def check_exclude(self, line): - return - - def check_line(self, line): - self.check_name(line) - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - - def fast_forward(self, log_file): - previous = "" - line = log_file.readline().lower() - while line != "": - self.check_node(line) - self.check_features(previous, line) - if common.cloudbees_log_start_string.lower() in line: - break - - previous = line - line = log_file.readline().lower() - - def read_file(self): - with open(self.file_path, 'r') as log_file: - self.fast_forward(log_file) - for line in log_file: - self.check_line(line.lower()) - - def summarize(self): - if not self.exists: - return 0 - - self.read_file() - self.print_results() - return int(self.fails) - -class FiInfoSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"fi_info {self.prov}") - - def read_file(self): - super().read_file() - - if not self.fails: - self.passes += 1 - self.passed_tests.append(f"fi_info {self.prov}") - -class FabtestsSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.trace = False - - def check_name(self, line): - # don't double count ubertest output and don't count fi_ubertest's - # invocation - if 'ubertest' in line and 'client_cmd:' in line: - self.test_name = 'no_test' - if 'name:' not in line: # skip past client output in ubertest - return - - test_name = line.split("name:") - if len(test_name) > 1: - self.test_name = test_name[-1].lower().strip() - - def get_result_line(self, line) -> Tuple[str,str]: - result = line.split("result:") - if len(result) > 1: - return (result[-1].lower().strip(), line.split()) - return None, None - - def check_pass(self, line): - result, result_line = self.get_result_line(line) - if result == 'pass' or result == 'success' or result == 'passed': - self.passes += 1 - if 'ubertest' in self.test_name: - idx = (result_line.index('result:') - 1) - try: - int((result_line[idx].split(',')[0])) - except: - return - - ubertest_number = int((result_line[idx].split(',')[0])) - self.passed_tests.append(f"{self.test_name}: "\ - f"{ubertest_number}") - else: - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - result, result_line = self.get_result_line(line) - if result == 'fail': - self.fails += 1 - if 'ubertest' in self.test_name: - idx = (result_line.index('result:') - 1) - try: - int((result_line[idx].split(',')[0])) - except: - return - ubertest_number = int((result_line[idx].split(',')[0])) - self.failed_tests.append(f"{self.test_name}: " \ - f"{ubertest_number}") - else: - self.failed_tests.append(self.test_name) - - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - def check_exclude(self, line): - result, _ = self.get_result_line(line) - if result == 'excluded' or result == 'notrun': - self.excludes += 1 - self.excluded_tests.append(self.test_name) - - def check_trace(self, line): - if not self.trace: - cmd_count = 0 - faults_count = 0 - if ("user to sar buffer" in line): - tokens = line.split(' ') - for i in range(0, len(tokens)): - if 'cmd' in tokens[i]: - cmd_count += int(tokens[i + 1]) - if 'faults' in tokens[i]: - faults_count += int(tokens[i + 1]) - - if (cmd_count > 0 or faults_count > 0): - self.trace = True - - def check_line(self, line): - self.check_name(line) - if (self.test_name != 'no_test'): - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - if ('dsa' in self.file_name): - self.check_trace(line) - - def summarize(self): - if not self.exists: - return 0 - - self.read_file() - self.print_results() - if ('dsa' in self.file_name and not self.trace): - exit("Expected: DSA to run. Actual: DSA Not Run") - - return int(self.fails) - -class MultinodePerformanceSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - def check_name(self, line): - #name lines look like "starting ... " - if 'starting' in line and '...' in line: - self.test_name = line.split()[1].split('.')[0] - - def check_pass(self, line): - if 'pass' in line: - self.passes += 1 - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - if 'fail' in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - -class OnecclSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.file_path = os.path.join(self.log_dir, self.file_name) - self.exists = os.path.exists(self.file_path) - self.name = 'no_test' - self.trace = False - - def check_name(self, line): - #OneCCL GPU tests: - if "bash -c" in line and "./run.sh" not in line: - tokens = line.split('./')[1] - self.name = tokens.split()[0] - #OneCCL CPU tests: - if "Running" in line and "CCL_LOG_LEVEL=debug" not in line: - if './' in line: - tokens = line.split('./')[1] - self.name = tokens.split()[0] - - def check_pass(self, line): - if '[0] PASSED' in line or "All done" in line: - self.passes += 1 - self.passed_tests.append(f"{self.name}: 1") - if ("[0] [ PASSED ]" in line and "tests." in line) or \ - ("tests." in line and "[1] [ PASSED ]" not in line and \ - "[0] [ PASSED ]" not in line): - token = line.split() - no_of_tests = f"{token[token.index('tests.') - 1]} " - self.passes += int(no_of_tests) - self.passed_tests.append(f"{self.name}: {no_of_tests}") - - def check_fail(self, line): - if 'failed' in line or "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.name) - - def check_trace(self, line): - if not self.trace: - cmd_count = 0 - faults_count = 0 - if ("user to sar buffer" in line): - tokens = line.split(' ') - for i in range(0, len(tokens)): - if 'cmd' in tokens[i]: - cmd_count += int(tokens[i + 1]) - if 'faults' in tokens[i]: - faults_count += int(tokens[i + 1]) - if (cmd_count > 0 or faults_count > 0): - self.trace = True - - def check_line(self, line): - self.check_name(line) - if (self.name != 'no_test'): - self.check_pass(line) - self.check_fail(line) - if ('DSA' in self.file_name): - self.check_trace(line.lower()) - - def read_file(self): - with open(self.file_path, 'r') as log_file: - self.fast_forward(log_file) - for line in log_file: - self.check_line(line) - - def summarize(self): - if not self.exists: - return 0 - - self.read_file() - self.print_results() - if ('DSA' in self.file_name and not self.trace): - exit("Expected: DSA to run. Actual: DSA Not Run") - - return int(self.fails) - -class ShmemSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.name = 'no_test' - - def check_name(self, line): - line = line.strip() - if "running " in line: - tokens = line.split(' ') - self.name = ' '.join(tokens[1:]) - - def check_pass(self, line): - line = line.strip() - if "pass!" in line: - self.passes += 1 - self.passed_tests.append(self.name) - - -class MpichTestSuiteSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.mpi = mpi - self.run = 'mpiexec' - - def read_file(self): - with open(self.file_path,'r') as log_file: - super().fast_forward(log_file) - for line in log_file: - super().check_line(line.lower().strip()) - - def check_exclude(self, line): - if line.startswith('excluding:'): - test = line.split(':')[-1] - self.excludes += 1 - self.excluded_tests.append(test) - - def check_name(self, line): - if (line.startswith('ok') or - line.startswith('not ok')): - self.name = line.split('-')[1].split('#')[0].strip() - - def check_pass(self, line): - if (line.startswith('ok') and not - line.split('#')[1].strip().startswith('skip')): - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if (line.startswith('not ok') and not - line.split('#')[1].strip().startswith('skip')): - self.fails += 1 - self.failed_tests.append(self.name) - - -class ImbSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.mpi = mpi - if self.mpi == 'impi': - self.run = 'mpiexec' - else: - self.run = 'mpirun' - self.test_type = '' - - def check_type(self, line): - if 'part' in line: - self.test_type = line.split()[len(line.split()) - 2] - - def check_name(self, line): - if "benchmarking" in line: - self.name = line.split()[len(line.split()) - 1] - - def check_pass(self, line): - if "benchmarking" in line: - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.test_type} {self.name}") - self.passes -= 1 - - def check_line(self, line): - self.check_type(line) - self.check_name(line) - self.check_pass(line) - self.check_fail(line) - super().check_exclude(line) - -class OsuSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.mpi = mpi - if self.mpi == 'impi': - self.run = 'mpiexec' - else: - self.run = 'mpirun' - - self.type = '' - self.tokens = [] - - def get_tokens(self, line): - if "# osu" in line: - self.tokens = line.split() - else: - self.tokens = [] - - def check_name(self, line): - if 'osu' in self.tokens: - self.name = " ".join(self.tokens[self.tokens.index('osu') + \ - 1:self.tokens.index('test')]) - - def check_type(self): - if self.tokens: - self.test_type = self.tokens[1] - - def check_pass(self, line): - if 'osu' in self.tokens: - # Assume pass - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.test_type} {self.name}") - # Remove assumed pass - self.passes -= 1 - - def check_line(self, line): - self.get_tokens(line) - self.check_name(line) - self.check_type() - self.check_pass(line) - self.check_fail(line) - super().check_exclude(line) - -class DaosSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - if (self.exists): - if ('verbs' in file_name): - self.node = cloudbees_config.daos_prov_node_map['verbs'] - if ('tcp' in file_name): - self.node = cloudbees_config.daos_prov_node_map['tcp'] - - self.features = cloudbees_config.daos_node_features - - def check_name(self, line): - if "reading ." in line: - self.test_name = line.split('/')[len(line.split('/')) - 1] \ - .rstrip('.yaml\n') - - def check_pass(self, line): - res_string = line.lstrip("results :").rstrip() - res_list = res_string.split(' | ') - for elem in res_list: - if 'pass' in elem: - self.passes += [int(s) for s in elem.split() if s.isdigit()][0] - display_testname = self.test_name.ljust(20) - self.passed_tests.append(f"{display_testname} : {res_string}") - - def check_fail(self, line): - res_list = line.lstrip("results :").rstrip().split('|') - for elem in res_list: - total = [int(s) for s in elem.split() if s.isdigit()][0] - if total != 0: - if 'fail' in elem: - self.fails += total - self.failed_tests.append(f'{self.test_name}') - if 'error' in elem: - self.error += total - self.errored_tests.append(f'error: {self.test_name}') - if 'interrupt' in elem: - self.error += total - self.errored_tests.append(f'interrupt: {self.test_name}') - if 'cancel' in elem: - self.error += total - self.errored_tests.append(f'cancel: {self.test_name}') - - def check_exclude(self, line): - res_list = line.lstrip("results :").rstrip().split('|') - for elem in res_list: - total = [int(s) for s in elem.split() if s.isdigit()][0] - if total != 0: - if 'skip' in elem: - self.excludes += total - self.excluded_tests.append(f'skip: {self.test_name}') - if 'warn' in elem: - self.excludes += total - self.excluded_tests.append(f'warn: {self.test_name}') - - def check_line(self, line): - self.check_name(line) - if "results :" in line: - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - -class DmabufSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.test_type = '' - - def check_type(self, line): - if "Running" in line: - self.test_type = line.split()[2] - - def check_num_node(self, line): - if "SLURM_NNODES" in line: - self.num_nodes = line.split("=")[-1].strip() - self.num_nodes = ' '.join([self.num_nodes, 'node']) - - def check_name(self, line): - if "client_command" in line: - name_list = line.split()[-2:] - name_list.insert(0, str(self.num_nodes)) - name_list.insert(1, str(self.test_type)) - self.test_name = name_list - - def check_pass(self, line): - if "TEST COMPLETED" in line: - self.passes += 1 - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - if "TEST FAILED" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - def fast_forward(self, log_file): - previous = "" - line = log_file.readline() - while line != "": - self.check_num_node(line) - self.check_node(line.lower()) - self.check_features(previous.lower(), line.lower()) - if common.cloudbees_log_start_string.lower() in line.lower(): - break - - previous = line - line = log_file.readline() - - def read_file(self): - with open(self.file_path, 'r') as log_file: - self.fast_forward(log_file) - for line in log_file: - self.check_type(line) - self.check_line(line) - -def get_release_num(): - file_name = f'{os.environ["CUSTOM_WORKSPACE"]}/source/libfabric/'\ - 'release_num.txt' - if os.path.exists(file_name): - with open(file_name) as f: - num = f.readline() - - return num.strip() - - raise Exception("No release num") - -def summarize_items(summary_item, logger, log_dir, mode): - err = 0 - mpi_list = ['impi', 'mpich', 'ompi'] - logger.log(f"Summarizing {mode} build mode:") - provs = common.prov_list + [('tcp-iouring', None)] - if summary_item == 'fabtests' or summary_item == 'all': - for prov,util in provs: - if util: - prov = f'{prov}-{util}' - ret = FabtestsSummarizer( - logger, log_dir, prov, - f'{prov}_fabtests_{mode}', - f"{prov} fabtests {mode}" - ).summarize() - err += ret if ret else 0 - ret = FiInfoSummarizer( - logger, log_dir, prov, - f'{prov}_fi_info_{mode}', - f"{prov} fi_info {mode}" - ).summarize() - err += ret if ret else 0 - - if ((summary_item == 'daos' or summary_item == 'all') - and mode == 'reg'): - for prov in ['tcp-rxm', 'verbs-rxm']: - ret = DaosSummarizer( - logger, log_dir, prov, - f'daos_{prov}_{mode}', - f"{prov} daos {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'imb' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: - ret = ImbSummarizer( - logger, log_dir, item, mpi, - f'MPI_{item}_{mpi}_IMB_{mode}', - f"{item} {mpi} IMB {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'osu' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: - ret = OsuSummarizer( - logger, log_dir, item, mpi, - f'MPI_{item}_{mpi}_osu_{mode}', - f"{item} {mpi} OSU {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'mpichtestsuite' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp', 'verbs-rxm']: - ret = MpichTestSuiteSummarizer( - logger, log_dir, item, mpi, - f'mpichtestsuite_{item}_{mpi}_'\ - f'mpichtestsuite_{mode}', - f"{item} {mpi} mpichtestsuite {mode}" - ).summarize() - err += ret if ret else 0 - if summary_item == 'multinode' or summary_item == 'all': - for prov,util in common.prov_list: - if util: - prov = f'{prov}-{util}' - - ret = MultinodePerformanceSummarizer( - logger, log_dir, prov, - f'multinode_performance_{prov}_multinode_{mode}', - f"multinode performance {prov} {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'oneccl' or summary_item == 'all': - for prov in ['tcp', 'verbs', 'psm3', 'shm']: - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL', - f'oneCCL_{prov}_oneccl_{mode}', - f'oneCCL {prov} {mode}' - ).summarize() - err += ret if ret else 0 - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL-GPU', - f'oneCCL-GPU_{prov}_onecclgpu_{mode}', - f'oneCCL-GPU {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'shmem' or summary_item == 'all': - for prov in ['tcp', 'verbs-rxm', 'sockets']: - ret= ShmemSummarizer( - logger, log_dir, prov, - f'SHMEM_{prov}_shmem_{mode}', - f'shmem {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'v3' or summary_item == 'all': - test_types = ['h2d', 'd2d', 'xd2d'] - for t in test_types: - ret = FabtestsSummarizer( - logger, log_dir, 'shm', - f'ze_v3_shm_{t}_fabtests_{mode}', - f"ze v3 shm {t} fabtests {mode}" - ).summarize() - err += ret if ret else 0 - for prov in ['tcp', 'verbs', 'psm3']: - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL-GPU', - f'oneCCL-GPU-v3_{prov}_onecclgpu_{mode}', - f'oneCCL-GPU-v3 {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'dsa' or summary_item == 'all': - for prov in ['shm']: - ret = FabtestsSummarizer( - logger, log_dir, 'shm', - f'{prov}_dsa_fabtests_{mode}', - f"{prov} dsa fabtests {mode}" - ).summarize() - err += ret if ret else 0 - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL', - f'oneCCL_DSA_shm_oneccl_{mode}', - f'oneCCL DSA {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'dmabuf' or summary_item == 'all': - for prov in ['verbs-rxm']: - for num_nodes in range(1,3): - ret = DmabufSummarizer( - logger, log_dir, 'verbs-rxm', - f'DMABUF-Tests_{prov}_dmabuf_{num_nodes}_{mode}', - f"DMABUF-Tests {prov} dmabuf {num_nodes} node {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'cuda' or summary_item == 'all': - test_types = ['h2d', 'd2d', 'xd2d'] - for v in range(1, 3): - for t in test_types: - ret = FabtestsSummarizer( - logger, log_dir, 'shm', - f'cuda_v{v}_shm_{t}_fabtests_{mode}', - f"cuda v{v} shm {t} fabtests {mode}" - ).summarize() - err += ret if ret else 0 - - return err - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--summary_item', help="functional test to summarize", - choices=['fabtests', 'imb', 'osu', 'mpichtestsuite', - 'oneccl', 'shmem', 'multinode', 'daos', 'v3', - 'dsa', 'dmabuf', 'all']) - parser.add_argument('--ofi_build_mode', help="select buildmode debug or dl", - choices=['dbg', 'dl', 'reg'], default='all') - parser.add_argument('-v', help="Verbose mode. Print all tests", \ - action='store_true') - parser.add_argument('--release', help="This job is testing a release."\ - "It will be saved and checked into a git tree.", - action='store_true') - parser.add_argument('--send_mail', help="Email mailing list with summary "\ - "results", action='store_true') - - args = parser.parse_args() - verbose = args.v - summary_item = args.summary_item - release = args.release - ofi_build_mode = args.ofi_build_mode - send_mail = args.send_mail - - mpi_list = ['impi', 'mpich', 'ompi'] - custom_workspace = os.environ['CUSTOM_WORKSPACE'] - log_dir = f'{custom_workspace}/log_dir' - if (not os.path.exists(log_dir)): - os.makedirs(log_dir) - - job_name = os.environ['JOB_NAME'].replace('/', '_') - - print(f"Files to be summarized: {os.listdir(log_dir)}") - - if (release): - release_num = get_release_num() - date = datetime.now().strftime("%Y%m%d%H%M%S") - output_name = f'summary_{release_num}_{job_name}_{date}.log' - else: - output_name = f'summary_{job_name}.log' - - full_file_name = f'{log_dir}/{output_name}' - - with open(full_file_name, 'a') as output_file: - if (ofi_build_mode == 'all'): - output_file.truncate(0) - - logger = Logger(output_file, release) - if (release): - Release( - log_dir, output_file, logger, release_num - ).add_release_changes() - - err = 0 - build_modes = ['reg', 'dbg', 'dl'] - for mode in build_modes: - if ofi_build_mode != 'all' and mode != ofi_build_mode: - continue - - err += summarize_items(summary_item, logger, log_dir, mode) - - if (release): - shutil.copyfile(f'{full_file_name}', f'{custom_workspace}/{output_name}') - - if (send_mail): - SendEmail(sender = os.environ['SENDER'], - receivers = os.environ['mailrecipients'], - attachment = full_file_name - ).send_mail() - - exit(err) diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py deleted file mode 100755 index 7d224aa3488..00000000000 --- a/contrib/intel/jenkins/tests.py +++ /dev/null @@ -1,1238 +0,0 @@ -import sys -import os -import io - -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") - -import subprocess -import re -import cloudbees_config -import common -import shlex -import time - -# A Jenkins env variable for job name is composed of the name of the jenkins job and the branch name -# it is building for. for e.g. in our case jobname = 'ofi_libfabric/master' -class Test: - - def __init__ (self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, mpitype=None, - util_prov=None, way=None): - self.jobname = jobname - self.buildno = buildno - self.testname = testname - self.hw = hw - self.core_prov = core_prov - self.util_prov = f'ofi_{util_prov}' if util_prov != None else '' - self.fabric = fabric - self.hosts = hosts - self.log_file = log_file - self.mpi_type = mpitype - self.ofi_build_mode = ofi_build_mode - if (len(hosts) == 1): - self.server = hosts[0] - self.client = hosts[0] - elif (len(hosts) == 2): - self.server = hosts[0] - self.client = hosts[1] - - self.nw_interface = cloudbees_config.interface_map[self.fabric] - self.custom_workspace = os.environ['CUSTOM_WORKSPACE'] - self.libfab_installpath = f'{self.custom_workspace}/'\ - f'{self.hw}/{self.ofi_build_mode}' - - self.middlewares_path = f'{self.custom_workspace}/middlewares' - self.ci_logdir_path = f'{self.custom_workspace}/log_dir' - self.env = user_env - self.way = way - - self.mpi = '' - if (self.mpi_type == 'impi'): - self.mpi = IMPI(self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - elif (self.mpi_type == 'ompi'): - self.mpi = OMPI(self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - elif (self.mpi_type == 'mpich'): - self.mpi = MPICH(self.hw, self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - - -class FiInfoTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.fi_info_testpath = f'{self.libfab_installpath}/bin' - - @property - def cmd(self): - return f"{self.fi_info_testpath}/fi_info " - - @property - def options(self): - if (self.util_prov): - opts = f"-f {self.fabric} -p {self.core_prov};{self.util_prov}" - elif (self.core_prov == 'psm3'): - opts = f"-p {self.core_prov}" - else: - opts = f"-f {self.fabric} -p {self.core_prov}" - - return opts - - def execute_cmd(self): - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - - -class Fabtest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None, - way=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, - util_prov, way) - self.fabtestpath = f'{self.libfab_installpath}/bin' - self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' - self.device = cloudbees_config.fabric_map[self.hw] - - def get_exclude_file(self): - path = self.libfab_installpath - efile_path = f'{path}/share/fabtests/test_configs' - - if self.hw == 'ivysaur': - efile = f'{efile_path}/{self.core_prov}/io_uring.exclude' - elif self.hw == 'cyndaquil' or self.hw == 'quilava': - efile = f'{efile_path}/{self.core_prov}/cuda.exclude' - else: - prov = self.util_prov if self.util_prov else self.core_prov - efile_old = f'{efile_path}/{prov}/{prov}.exclude' - - if self.util_prov: - efile = f'{efile_path}/{self.util_prov}/{self.core_prov}/exclude' - else: - efile = f'{efile_path}/{self.core_prov}/exclude' - - if os.path.isfile(efile): - return efile - elif os.path.isfile(efile_old): - return efile_old - else: - print(f"Exclude file: {efile} not found!") - return None - - @property - def cmd(self): - return f"{self.fabtestpath}/runfabtests.sh " - - @property - def options(self): - opts = f"-T 300 -vvv -p {self.fabtestpath} -S " - if (self.core_prov != 'shm' and self.nw_interface): - opts += f"-s {common.get_node_name(self.server, self.nw_interface)} " - opts += f"-c {common.get_node_name(self.client, self.nw_interface)} " - - if (self.core_prov == 'shm'): - opts += f"-s {self.server} " - opts += f"-c {self.client} " - opts += "-N " - - if (self.core_prov == 'ucx'): - opts += "-b " - - if (self.ofi_build_mode == 'dl'): - opts += "-t short " - else: - opts += "-t all " - - if (self.way == 'h2d'): - opts += f"-C \"-H\" -L \"-D {self.device}\" " - elif (self.way == 'd2d'): - opts += f"-C \"-D {self.device}\" -L \"-D {self.device}\" " - elif (self.way == 'xd2d'): - opts += f"-C \"-D {self.device}\" -L \"-D {self.device} -i 1\" " - - if (self.core_prov == 'sockets' and self.ofi_build_mode == 'reg'): - complex_test_file = f'{self.libfab_installpath}/share/fabtests/'\ - f'test_configs/{self.core_prov}/quick.test' - if (os.path.isfile(complex_test_file)): - opts += "-u {complex_test_file} " - else: - print(f"{self.core_prov} Complex test file not found") - - if (self.ofi_build_mode != 'reg' or self.core_prov == 'udp'): - opts += "-e \'ubertest,multinode\' " - - efile = self.get_exclude_file() - if efile: - opts += "-R " - opts += f"-f {efile} " - - for key in self.env: - opts += f"-E {key}={self.env[key]} " - - if self.util_prov: - opts += f"{self.core_prov};{self.util_prov} " - else: - opts += f"{self.core_prov} " - - if (self.core_prov == 'shm'): - opts += f"{self.server} {self.server} " - else: - opts += f"{self.server} {self.client} " - - return opts - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - curdir = os.getcwd() - os.chdir(self.fabtestconfigpath) - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curdir) - - -class ShmemTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None, - weekly=False): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, - util_prov) - - self.n = 2 - self.ppn = 1 - self.weekly = weekly - self.shmem_dir = f'{self.middlewares_path}/shmem_{self.hw}' - self.oshrun = f'{self.shmem_dir}/bin/oshrun' - self.hydra = f'{cloudbees_config.hydra}' - self.shmem_testname = '' - self.threshold = '1' - self.isx_shmem_total_size = 33554432 - self.isx_shmem_kernel_max = 134217728 - self.prk_iterations = 10 - self.prk_first_arr_dim = 1000 - self.prk_second_arr_dim = 1000 - if self.util_prov: - self.prov = f'{self.core_prov}\\;{self.util_prov}' - else: - self.prov = self.core_prov - - self.test_dir = { - 'sos' : 'SOS/test', - 'isx' : 'ISx/SHMEM', - 'prk' : 'PRK/SHMEM' - } - - self.shmem_environ = { - 'SHMEM_OFI_USE_PROVIDER': self.prov, - 'OSHRUN_LAUNCHER' : self.hydra, - 'PATH' : f'{self.shmem_dir}/bin:$PATH', - 'LD_LIBRARY_PATH' : f'{self.shmem_dir}/lib:'\ - f'{self.libfab_installpath}/lib', - 'SHMEM_SYMMETRIC_SIZE' : '4G', - 'LD_PRELOAD' : f'{self.libfab_installpath}'\ - '/lib/libfabric.so', - 'threshold' : self.threshold, - 'SHMEM_DEBUG' : '1' - } - - self.exclude_extensions = ['.cpp', '.c', '.o', '.h', '.f90', '.log', - '.am', '.in', '.deps', '.libs'] - - self.SOS_tests = [ - 'unit', - 'shmemx', - 'apps', - 'spec-example' - ] - - if self.weekly: - self.SOS_tests.append('performance/shmem_perf_suite') - self.SOS_tests.append('performance/tests') - - self.exclude = { - 'sos' : { - 'verbs' : [ - 'makefile', - 'readme' - ], - 'tcp' : [ - 'makefile', - 'readme' - ], - 'sockets' : [ - 'makefile', - 'readme' - ] - } - } - - def export_env(self): - environ = '' - if self.shmem_testname == 'isx' or self.shmem_testname == 'prk': - self.threshold = '0' - - for key,val in self.shmem_environ.items(): - environ += f"export {key}={val}; " - return environ - - def check_ending(self, f_name): - """ - Returns True if ending is okay, false if not - """ - for ext in self.exclude_extensions: - if f_name.lower().endswith(ext): - return False - - return True - - def get_cmds(self): - cmd_list = [] - if self.shmem_testname == 'sos': - for test_dir in self.SOS_tests: - test_dir_path = f'{self.shmem_dir}/' \ - f'{self.test_dir[self.shmem_testname]}/' \ - f'{test_dir}' - for f_name in os.listdir(test_dir_path): - if not self.check_ending(f_name) or \ - f_name.lower() in \ - self.exclude[self.shmem_testname][self.core_prov]: - continue - - cmd_list.append(f"{test_dir_path}/{f_name}") - - elif self.shmem_testname == 'isx': - exec_path = f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}/bin' - cmd_list.append(f"{exec_path}/isx.strong {self.isx_shmem_kernel_max} " \ - "output_strong") - cmd_list.append(f"{exec_path}/isx.weak " \ - f"{self.isx_shmem_total_size} output_weak") - cmd_list.append(f"{exec_path}/isx.weak_iso " \ - f"{self.isx_shmem_total_size} output_weak_iso") - elif self.shmem_testname == 'prk': - exec_path = f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}' - cmd_list.append(f"{exec_path}/Stencil/stencil " \ - f"{self.prk_iterations} {self.prk_first_arr_dim}") - cmd_list.append(f"{exec_path}/Synch_p2p/p2p " \ - f"{self.prk_iterations} {self.prk_first_arr_dim} "\ - f"{self.prk_second_arr_dim}") - cmd_list.append(f"{exec_path}/Transpose/transpose " \ - f"{self.prk_iterations} {self.prk_first_arr_dim}") - - return cmd_list - - @property - def execute_condn(self): - return True - - def execute_cmd(self, shmem_testname): - self.shmem_testname = shmem_testname - base_cmd = f"{self.oshrun}" - base_cmd = f"{base_cmd} -n {self.n}" - base_cmd = f"{base_cmd} -ppn {self.ppn}" - cmds = self.get_cmds() - for cmd in self.get_cmds(): - command = f"bash -c \'{self.export_env()} {base_cmd} {cmd}\'" - outputcmd = shlex.split(command) - print(f"Running {self.shmem_testname} {cmd.split('/')[-1]}") - common.run_command(outputcmd) - print(f"{self.shmem_testname} {cmd.split('/')[-1]} PASS!") - -class MultinodeTests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - self.fabtestpath = f'{self.libfab_installpath}/bin' - self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' - self.n = 2 - self.ppn = 64 - self.iterations = 1 - self.method = 'msg' - self.pattern = "full_mesh" - - @property - def cmd(self): - return f"{self.fabtestpath}/runmultinode.sh " - - @property - def options(self): - opts = f"-h {common.get_node_name(self.server, self.nw_interface)}" - opts += f",{common.get_node_name(self.client, self.nw_interface)}" - opts += f" -n {self.ppn}" - opts += f" -I {self.iterations}" - opts += f" -z {self.pattern}" - opts += f" -C {self.method}" - if self.util_prov: - opts += f" -p {self.core_prov};{self.util_prov}" - else: - opts += f" -p {self.core_prov}" - opts += f" --ci {self.fabtestpath}/" #enable ci mode to disable tput - - return opts - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - if self.util_prov: - prov = f"{self.core_prov}-{self.util_prov} " - else: - prov = self.core_prov - curdir = os.getcwd() - os.chdir(self.fabtestconfigpath) - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curdir) - -class OMPI: - def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.ompi_src = f'{middlewares_path}/ompi' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 2 - - @property - def env(self): - cmd = "bash -c \'" - if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += f"export LD_LIBRARY_PATH={self.ompi_src}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.ompi_src}/bin:$PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-np {self.n} " - hosts = '\',\''.join([':'.join([common.get_node_name(host, \ - self.nw_interface), str(self.ppn)]) \ - for host in self.hosts]) - opts += f"--host \'{hosts}\' " - if self.util_prov: - opts += f"--mca mtl_ofi_provider_include {self.core_prov}\\;"\ - f"{self.util_prov} " - opts += f"--mca btl_ofi_provider_include {self.core_prov}\\;"\ - f"{self.util_prov} " - else: - opts += f"--mca mtl_ofi_provider_include {self.core_prov} " - opts += f"--mca btl_ofi_provider_include {self.core_prov} " - opts += "--mca orte_base_help_aggregate 0 " - # This is necessary to prevent verbs from printing warning messages - # The test still uses libfabric verbs even when enabled. - # if (self.core_prov == 'verbs'): - # opts += "--mca btl_openib_allow_ib 1 " - opts += "--mca mtl ofi " - opts += "--mca pml cm -tag-output " - for key in self.environ: - opts += f"-x {key}={self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.ompi_src}/bin/mpirun {self.options}" - -class MPICH: - def __init__(self, hw, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.mpich_dir = f'{middlewares_path}/mpich_{hw}' - self.mpichpath = f'{self.mpich_dir}/mpich' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 1 - - @property - def env(self): - cmd = "bash -c \'" - if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += "export HYDRA_LAUNCHER=fork;" - cmd += "export MPIR_CVAR_CH4_OFI_ENABLE_ATOMICS=0; " - cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=0; " - cmd += f"export LD_LIBRARY_PATH={self.mpich_dir}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.mpich_dir}/bin:$PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += "-launcher ssh " - # Removed because sbatch does this for us whenwe use mpirun - # opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ - # f"{common.get_node_name(self.client, self.nw_interface)} " - for key in self.environ: - opts += f"-genv {key} {self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.mpich_dir}/bin/mpirun {self.options}" - -class IMPI: - def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.impi_src = f'{cloudbees_config.impi_root}' - self.mpichpath = f'{middlewares_path}/impi/mpichsuite/' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 1 - - @property - def env(self): - cmd = f"bash -c \'source {self.impi_src}/env/vars.sh "\ - "-i_mpi_ofi_internal=0; " - cmd += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " - if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - if (self.core_prov == 'tcp'): - cmd += "export FI_IFACE=eth0; " - elif (self.core_prov == 'verbs'): - cmd += "export FI_IFACE=ib0; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib/release:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ - f"{common.get_node_name(self.client, self.nw_interface)} " - for key in self.environ: - opts += f"-genv {key} {self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.impi_src}/bin/mpiexec {self.options}" - - -class IMBtests(Test): - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, test_group, - util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - - self.test_group = test_group - self.mpi_type = mpitype - self.imb_src = '' - self.imb_tests = { - '1' :[ - 'MPI1', - 'P2P' - ], - '2' :[ - 'EXT', - 'IO' - ], - '3' :[ - 'NBC', - 'RMA', - 'MT' - ] - } - self.iter = 100 - self.include = { - 'MPI1':[ - 'Biband', - 'Uniband', - 'PingPongAnySource', - 'PingPingAnySource', - 'PingPongSpecificSource', - 'PingPingSpecificSource' - ], - 'P2P':[], - 'EXT':[], - 'IO':[], - 'NBC':[], - 'RMA':[], - 'MT':[] - } - self.exclude = { - 'MPI1':[], - 'P2P':[], - 'EXT':[ - 'Accumulate' - ], - 'IO':[], - 'NBC':[], - 'RMA':[ - 'Accumulate', - 'Get_accumulate', - 'Fetch_and_op', - 'Compare_and_swap', - 'All_put_all', - 'All_get_all' - ], - 'MT':[] - } - self.imb_src = f'{self.middlewares_path}/{self.mpi_type}/imb' - - @property - def execute_condn(self): - # Mpich and ompi are excluded to save time. Run manually if needed - return (self.mpi_type == 'impi') - - def imb_cmd(self, imb_test): - print(f"Running IMB-{imb_test}") - cmd = f"{self.imb_src}/IMB-{imb_test} " - if (imb_test != 'MT'): - cmd += f"-iter {self.iter} " - - if (len(self.include[imb_test]) > 0): - cmd += f"-include {','.join(self.include[imb_test])}" - - if (len(self.exclude[imb_test]) > 0): - cmd += f"-exclude {','.join(self.exclude[imb_test])}" - - return cmd - - def execute_cmd(self): - for test_type in self.imb_tests[self.test_group]: - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ - self.imb_cmd(test_type) + '\'') - common.run_command(outputcmd) - - -class OSUtests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - - self.n_ppn = { - 'pt2pt': (2, 1), - 'collective': (4, 2), - 'one-sided': (2, 1), - 'startup': (2, 1) - } - if mpitype == 'mpich' and hw in ['water', 'grass']: - self.mpitype = f'{mpitype}_{hw}' - else: - self.mpitype = mpitype - - self.osu_src = f'{self.middlewares_path}/{self.mpitype}/osu/libexec/'\ - 'osu-micro-benchmarks/mpi/' - - @property - def execute_condn(self): - # mpich-tcp, ompi are the only osu test combinations failing - return False if ((self.mpi_type == 'mpich' and self.core_prov == 'tcp') or \ - self.mpi_type == 'ompi') \ - else True - - def osu_cmd(self, test_type, test): - print(f"Running OSU-{test_type}-{test}") - cmd = f'{self.osu_src}/{test_type}/{test} ' - return cmd - - def execute_cmd(self): - assert(self.osu_src) - p = re.compile('osu_put*') - for root, dirs, tests in os.walk(self.osu_src): - for test in tests: - self.mpi.n = self.n_ppn[os.path.basename(root)][0] - self.mpi.ppn = self.n_ppn[os.path.basename(root)][1] - - if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): - self.env['IBV_FORK_SAFE'] = '1' - - if(p.search(test) == None): - osu_command = self.osu_cmd(os.path.basename(root), test) - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ - osu_command + '\'') - common.run_command(outputcmd) - - if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): - self.env.pop('IBV_FORK_SAFE') - - -class MpichTestSuite(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None, weekly=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - self.mpi_type = mpitype - if (mpitype != 'ompi'): - self.mpichsuitepath = f'{self.mpi.mpichpath}/test/mpi/' - self.pwd = os.getcwd() - self.weekly = weekly - self.mpichtests_exclude = { - 'tcp' : { 'rma' : [('win_shared_put_flush_load 3', 'test')], - 'threads/comm' : [('idup_nb 4','test')] - }, - 'verbs' : { 'threads/comm' : [('idup_nb 4','test')], - 'spawn' : [('concurrent_spawns 1', 'test')], - 'pt2pt' : [('sendrecv3 2','test'), - ('sendrecv3 2 arg=-isendrecv','test')], - 'threads/pt2pt': [(f"mt_improbe_sendrecv_huge 2 " - f"arg=-iter=64 arg=-count=4194304 " - f"env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE" - f"=16384", 'test')] - } - } - - def create_hostfile(self, file, hostlist): - with open(file, "w") as f: - for host in hostlist: - f.write(f"{host}\n") - - def update_testlists(self, filename, category): - with open(filename, 'r') as file: - lines = file.read().splitlines() - for line in lines: - if (line == category): - lines[lines.index(line)] = f'#{line}' - else: - continue - with open(filename, 'w') as file: - file.write('\n'.join(lines)) - - def exclude_tests(self, test_root, provider): - for path,exclude_list in self.mpichtests_exclude[f'{provider}'].items(): - for item in exclude_list: - self.update_testlists(f'{test_root}/{path}/testlist', item[0]) - if (item[1] == 'dir'): - filename = f'{test_root}/{path}/{item[0]}/testlist' - with open(filename,'r') as file: - for line in file: - line = line.strip() - if (not line.startswith('#')): - print(f'excluding:{path}/{item[0]}:{line}') - else: #item[1]=test - print(f'excluding:{path}/{item[0]}') - - @property - def execute_condn(self): - return ((self.mpi_type == 'impi' and self.weekly) or \ - self.mpi_type == 'mpich') - - def execute_cmd(self): - if (self.mpi_type == 'mpich'): - configure_cmd = f"./configure --with-mpi={self.mpi.mpich_dir} " - if (self.weekly): - print(f'Weekly {self.mpi_type} mpichsuite tests') - os.chdir(self.mpichsuitepath) - common.run_command(shlex.split(self.mpi.env + - configure_cmd + '\'')) - self.exclude_tests(self.mpichsuitepath, self.core_prov) - testcmd = 'make testing' - outputcmd = shlex.split(self.mpi.env + testcmd + '\'') - common.run_command(outputcmd) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - else: - print(f"PR {self.mpi_type} mpichsuite tests") - os.chdir(self.mpichsuitepath) - common.run_command(shlex.split(self.mpi.env + - configure_cmd + '\'')) - common.run_command(['make', '-j']) - self.exclude_tests(self.mpichsuitepath, self.core_prov) - testcmd = "./runtests -tests=testlist " - testcmd += f" -xmlfile=summary.xml -tapfile=summary.tap " \ - f"-junitfile=summary.junit.xml " - common.run_command(shlex.split(self.mpi.env + testcmd + '\'')) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - if (self.mpi_type == 'impi' and self.weekly == True): - print (f'Weekly {self.mpi_type} mpichsuite tests') - os.chdir(self.mpi.mpichpath) - print(self.hosts) - self.create_hostfile(f'{self.mpi.mpichpath}/hostfile', - self.hosts) - os.environ["I_MPI_HYDRA_HOST_FILE"] = \ - f'{self.mpi.mpichpath}/hostfile' - test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \ - f"{self.mpi.mpichpath}/hostfile; " - test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; " - common.run_command(shlex.split(self.mpi.env + test_cmd + '\'')) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - -class OneCCLTests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.oneccl_path = f'{self.middlewares_path}/oneccl/' - self.test_dir = f'{self.middlewares_path}/oneccl/ci_tests' - if self.util_prov: - self.prov = f"{self.core_prov}\;{self.util_prov}" - else: - self.prov = self.core_prov - self.oneccl_environ = { - 'FI_PROVIDER' : f"\"{self.prov}\"", - 'CCL_ATL_TRANSPORT' : 'ofi', - 'CCL_ATL_TRANSPORT_LIST' : 'ofi' - } - - if self.env: - for key in self.env: - self.oneccl_environ[key] = self.env[key] - - self.ld_library = [ - f'{self.libfab_installpath}/lib', - f'{self.oneccl_path}/build/_install/lib' - ] - - def export_env(self): - environ = f"source {cloudbees_config.oneapi_root}/setvars.sh; " - environ += f"source {self.oneccl_path}/build/_install/env/setvars.sh; " - if self.core_prov == 'psm3': - self.oneccl_environ['PSM3_MULTI_EP'] = '1' - - if self.core_prov == 'shm': - self.oneccl_environ['CCL_ATL_SHM'] = '1' - - for key, val in self.oneccl_environ.items(): - environ += f"export {key}={val}; " - - ld_library_path = 'LD_LIBRARY_PATH=' - for item in self.ld_library: - ld_library_path += f'{item}:' - - environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " - return environ - - def cmd(self): - return './run.sh ' - - def options(self): - opts = "--mode cpu " - return opts - - @property - def execute_condn(self): - return True - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - curr_dir = os.getcwd() - os.chdir(self.test_dir) - command = f"bash -c \'{self.export_env()} {self.cmd()} "\ - f"{self.options()}\'" - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curr_dir) - -class OneCCLTestsGPU(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.n = 2 - self.ppn = 1 - self.oneccl_path = f'{self.middlewares_path}/oneccl_gpu/build' - if self.util_prov: - self.prov = f"{self.core_prov}\;{self.util_prov}" - else: - self.prov = self.core_prov - - self.onecclgpu_environ = { - 'FI_PROVIDER' : self.prov, - # 'LD_PRELOAD' : f"{self.libfab_installpath}/lib/libfabric.so", - 'CCL_ATL_TRANSPORT' : 'ofi', - 'CCL_ROOT' : f"{self.oneccl_path}/_install" - } - - self.ld_library = [ - f'{self.libfab_installpath}/lib', - '$LD_LIBRARY_PATH', - f'{self.oneccl_path}/_install/lib' - ] - - self.tests = { - 'examples' : [ - 'sycl_allgatherv_custom_usm_test', - 'sycl_allgatherv_inplace_test', - 'sycl_allgatherv_inplace_usm_test', - 'sycl_allgatherv_test', - 'sycl_allgatherv_usm_test', - 'sycl_allreduce_inplace_usm_test', - 'sycl_allreduce_test', - 'sycl_allreduce_usm_test', - 'sycl_alltoall_test', - 'sycl_alltoall_usm_test', - 'sycl_alltoallv_test', - 'sycl_alltoallv_usm_test', - 'sycl_broadcast_test', - 'sycl_broadcast_usm_test', - 'sycl_reduce_inplace_usm_test', - 'sycl_reduce_scatter_test', - 'sycl_reduce_scatter_usm_test', - 'sycl_reduce_test', - 'sycl_reduce_usm_test' - ], - 'functional' : [ - 'allgatherv_test', - 'alltoall_test', - 'alltoallv_test', - 'bcast_test', - 'reduce_scatter_test', - 'reduce_test' - ] - } - - def export_env(self): - environ = f"source {cloudbees_config.impi_root}/env/vars.sh "\ - "-i_mpi_internal=0; " - environ += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " - for key, val in self.onecclgpu_environ.items(): - environ += f"export {key}={val}; " - - ld_library_path = 'LD_LIBRARY_PATH=' - for item in self.ld_library: - ld_library_path += f'{item}:' - - environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " - return environ - - def cmd(self): - return f"{self.oneccl_path}/_install/bin/mpiexec " - - def options(self): - opts = "-l " - opts += f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += f"-hosts {self.server},{self.client} " - return opts - - @property - def execute_condn(self): - return True - - - def execute_cmd(self, oneccl_test_gpu): - curr_dir = os.getcwd() - if 'examples' in oneccl_test_gpu: - os.chdir(f"{self.oneccl_path}/_install/examples/sycl") - else: - os.chdir(f"{self.oneccl_path}/tests/functional") - - for test in self.tests[oneccl_test_gpu]: - if '_usm_' in test: - gpu_selector = 'device' - else: - gpu_selector = 'default' - - if self.core_prov == 'psm3': - command = f"bash -c \'{self.export_env()} export PSM3_MULTI_EP=1; {self.cmd()} "\ - f"{self.options()} ./{test} " - elif self.core_prov == 'shm': - command = f"bash -c \'{self.export_env()} export CCL_ATL_SHM=1; {self.cmd()} "\ - f"{self.options()} ./{test} " - else: - command = f"bash -c \'{self.export_env()} {self.cmd()} "\ - f"{self.options()} ./{test} " - - if 'examples' in oneccl_test_gpu: - command += f"gpu {gpu_selector}" - command += "\'" - - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curr_dir) - -class DaosCartTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - - self.set_paths(core_prov) - print(core_prov) - self.daos_nodes = cloudbees_config.prov_node_map[core_prov] - print(self.daos_nodes) - self.launch_node = self.daos_nodes[0] - - self.cart_tests = { - 'corpc_one_node' : {'tags' :'cart,corpc,one_node', 'numservers':1, 'numclients':0}, - 'corpc_two_node' : {'tags' :'cart,corpc,two_node', 'numservers':2, 'numclients':0}, - 'ctl_one_node' : {'tags' :'cart,ctl,one_node', 'numservers':1, 'numclients':1}, - 'ghost_rank_rpc_one_node' : {'tags' :'cart,ghost_rank_rpc,one_node', 'numservers':1, 'numclients':0}, - 'group_test' : {'tags' :'cart,group_test,one_node', 'numservers':1, 'numclients':0}, - 'iv_one_node' : {'tags' :'cart,iv,one_node', 'numservers':1, 'numclients':1}, - 'iv_two_node' : {'tags' :'cart,iv,two_node', 'numservers':2, 'numclients':1}, - 'launcher_one_node' : {'tags' :'cart,no_pmix_launcher,one_node','numservers':1, 'numclients':1}, - 'multictx_one_node' : {'tags' :'cart,no_pmix,one_node', 'numservers':1, 'numclients':0}, - 'rpc_one_node' : {'tags' :'cart,rpc,one_node', 'numservers':1, 'numclients':1}, - 'rpc_two_node' : {'tags' :'cart,rpc,two_node','numservers':2, 'numclients':1}, - 'swim_notification' : {'tags' :'cart,rpc,swim_rank_eviction,one_node', 'numservers':1, 'numclients':1} - } - - - def set_paths(self, core_prov): - self.ci_middlewares_path = f'{cloudbees_config.build_dir}/{core_prov}' - self.daos_install_root = f'{self.ci_middlewares_path}/daos/install' - self.cart_test_scripts = f'{self.daos_install_root}/lib/daos/TESTING/ftest' - self.mpipath = f'{cloudbees_config.daos_mpi}/bin' - self.pathlist = [f'{self.daos_install_root}/bin/', self.cart_test_scripts, self.mpipath, \ - f'{self.daos_install_root}/lib/daos/TESTING/tests'] - self.daos_prereq = f'{self.daos_install_root}/prereq' - common.run_command(['rm', '-rf', f'{self.ci_middlewares_path}/daos_logs/*']) - common.run_command(['rm','-rf', f'{self.daos_prereq}/debug/ofi']) - common.run_command(['ln', '-sfn', self.libfab_installpath, f'{self.daos_prereq}/debug/ofi']) - - @property - def cmd(self): - return f"env; echo {common.cloudbees_log_start_string}; "\ - "python3.6 launch.py " - - def remote_launch_cmd(self, testname): - -# The following env variables must be set appropriately prior -# to running the daos/cart tests OFI_DOMAIN, OFI_INTERFACE, -# CRT_PHY_ADDR_STR, PATH, DAOS_TEST_SHARED_DIR DAOS_TEST_LOG_DIR, -# LD_LIBRARY_PATH in the script being sourced below. - launch_cmd = f"ssh {self.launch_node} \"source {self.ci_middlewares_path}/daos_ci_env_setup.sh && \ - cd {self.cart_test_scripts} &&\" " - return launch_cmd - - def options(self, testname): - opts = "-s " - opts += f"{self.cart_tests[testname]['tags']} " - - if (self.cart_tests[testname]['numservers'] != 0): - servers = ",".join(self.daos_nodes[:self.cart_tests[testname]['numservers']]) - opts += f"--test_servers={servers} " - if (self.cart_tests[testname]['numclients'] != 0): - clients = ",".join(self.daos_nodes[:self.cart_tests[testname]['numclients']]) - opts += f"--test_clients={clients}" - return opts - - @property - def execute_condn(self): - return True - def execute_cmd(self): - sys.path.append(f'{self.daos_install_root}/lib64/python3.6/site-packages') - os.environ['PYTHONPATH']=f'{self.daos_install_root}/lib64/python3.6/site-packages' - - test_dir=self.cart_test_scripts - curdir=os.getcwd() - os.chdir(test_dir) - for test in self.cart_tests: - print(test) - command = self.remote_launch_cmd(test) + self.cmd + self.options(test) - outputcmd = shlex.split(command) - common.run_logging_command(outputcmd, self.log_file) - print("--------------------TEST COMPLETED----------------------") - os.chdir(curdir) - -class DMABUFTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, - None, util_prov) - self.DMABUFtestpath = f'{self.libfab_installpath}/bin' - self.timeout = 300 - self.n = os.environ['SLURM_NNODES'] if 'SLURM_NNODES' \ - in os.environ.keys() \ - else 0 - - if util_prov: - self.prov = f"{self.core_prov}\;{self.util_prov}" - else: - self.prov = self.core_prov - - self.dmabuf_environ = { - 'ZEX_NUMBER_OF_CCS' : '0:4,1:4', - 'NEOReadDebugKeys' : '1', - 'EnableImplicitScaling' : '0', - 'MLX5_SCATTER_TO_CQE' : '0' - } - - self.single_node_combinations = { - 'H2H' : { - '-m malloc' : ['-m malloc'] - }, - 'H2D' : { - '-m malloc' : [ - '-m device -d 0', - '-m device -d 1' - ] - }, - 'D2H' : { - '-m device -d 0' : ['-m malloc'], - '-m device -d 1' : ['-m malloc'] - }, - 'D2D' : { - '-m device -d 0' : [ - '-m device -d 1', - '-m device -d 2', - '-m device -d 3' - ], - '-m device -d 1' : [ - '-m device -d 2', - '-m device -d 3' - ] - } - } - - self.double_node_combinations = { - 'H2H' : { - '-m malloc' : ['-m malloc'] - }, - 'H2D' : { - '-m malloc' : [ - '-m device -d 0', - '-m device -d 1', - '-m device -d 2' - ] - }, - 'D2H' : { - '-m device -d 0' : ['-m malloc'], - '-m device -d 1' : ['-m malloc'], - '-m device -d 2' : ['-m malloc'], - '-m device -d 3' : ['-m malloc'] - }, - 'D2D' : { - '-m device -d 0' : [ - '-m device -d 0', - '-m device -d 1', - '-m device -d 2', - '-m device -d 3' - ], - '-m device -d 1' : [ - '-m device -d 1', - '-m device -d 2', - '-m device -d 3' - ] - } - } - - @property - def execute_condn(self): - return True if (self.core_prov == 'verbs') \ - else False - - @property - def cmd(self): - return f"{self.DMABUFtestpath}/fi_xe_rdmabw" - - def dmabuf_env(self): - return ' '.join([f"{key}={self.dmabuf_environ[key]}" \ - for key in self.dmabuf_environ]) - - def execute_cmd(self, test_type): - os.chdir(self.DMABUFtestpath) - base_cmd = '' - operations = ['write', 'read', 'send'] - log_prefix = f"{os.environ['LOG_DIR']}/dmabuf_{self.n}" - if self.n == '1': - self.tests = self.single_node_combinations - else: - self.tests = self.double_node_combinations - for operation in operations: - for key,value in self.tests[test_type].items(): - for values in value: - server_command = f"{self.cmd} {values} -p {self.core_prov}" - if 'send' in operation: - server_command += f" -t {operation}" - base_cmd = f"-t {operation} -p {self.core_prov} {self.server}" - client_command = f"{self.cmd} {key} {base_cmd}" - RC = common.ClientServerTest( - f"ssh {self.server} {self.dmabuf_env()} {server_command}", \ - f"ssh {self.client} {self.dmabuf_env()} {client_command}", \ - f"{log_prefix}_server.log", f"{log_prefix}_client.log", \ - self.timeout - ).run() - - if RC == (0, 0): - print("-------------- TEST COMPLETED ---------------") - else: - print("-------------- TEST FAILED ---------------") - sys.exit(f"Exiting with returncode: {RC}") From 361fd549d2d30de69e6ba9573cb44438417092e2 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Fri, 22 Mar 2024 15:15:33 -0700 Subject: [PATCH 3/4] contrib/intel/jenkins: Update README to reflect move of scripts README needs to be updated to explain why there is only a Jenkinsfile in this folder now. Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/README | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/contrib/intel/jenkins/README b/contrib/intel/jenkins/README index 01fef9f964b..a3334f72021 100644 --- a/contrib/intel/jenkins/README +++ b/contrib/intel/jenkins/README @@ -1,8 +1,11 @@ Introduction ============ -Jenkins is a CI/CD (Continuous Integration/Continuous Development) Pipelining tool that Intel uses to test code changes to libfabric. It follows the Jenkinsfile pipeline stages to build, test, and cleanup resources. +Jenkins is a CI/CD (Continuous Integration/Continuous Development) Pipelining +tool that Intel uses to test code changes to libfabric. It follows the +Jenkinsfile pipeline stages to build, test, and cleanup resources. -The runtime flow generally follows, Jenkinsfile -> build.py -> runtests.py -> run.py -> tests.py. +The scripts that this pipeline uses are stored in an internal Intel CI +repository. Tests, Middlewares and Libraries supported by Intel CI/CD are: Fabtests From 0281986980fb69ee7d655440a5eeb3961e296543 Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Thu, 14 Mar 2024 10:56:56 -0700 Subject: [PATCH 4/4] contrib/intel/jenkins: Introduce new CI Tool for running tests New CI Tool for running tests is an Intel internal project. It will be referred to as CI in our Jenkinsfile. Signed-off-by: Zach Dworkin --- contrib/intel/jenkins/Jenkinsfile | 100 +++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index 292dfbb904a..465be7362ea 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -93,6 +93,14 @@ def run_middleware(providers, stage_name, test, hw, partition, node_num, } } +def run_ci(stage_name, config_name) { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python run.py \ + --output=${env.LOG_DIR}/${stage_name} \ + --job=${config_name} + """ +} + def gather_logs(cluster, key, dest, source) { def address = "${env.USER}@${cluster}" @@ -159,9 +167,22 @@ def checkout_ci_resources() { """ } +def checkout_ci() { + sh """ + if [[ ! -d ${env.WORKSPACE}/ci ]]; then + mkdir ${env.WORKSPACE}/ci + else + rm -rf ${env.WORKSPACE}/ci && mkdir ${env.WORKSPACE}/ci + fi + + git clone --recurse-submodules ${env.CI} ${env.WORKSPACE}/ci + """ +} + def checkout_external_resources() { checkout_ci_resources() checkout_upstream() + checkout_ci() } def generate_diff(def branch_name, def output_loc) { @@ -240,6 +261,10 @@ def build(item, mode=null, hw=null, additional_args=null) { run_python(PYTHON_VERSION, cmd) } +def build_ci() { + sh "${CI_LOCATION}/${env.CI_MODULE}/bootstrap.sh" +} + def check_target() { echo "CHANGE_TARGET = ${env.CHANGE_TARGET}" if (changeRequest()) { @@ -322,6 +347,7 @@ pipeline { CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" DELETE_LOCATION="${env.CUSTOM_WORKSPACE}/middlewares" RUN_LOCATION="${env.CUSTOM_WORKSPACE}/ci_resources/legacy_pipeline_scripts/" + CI_LOCATION="${env.CUSTOM_WORKSPACE}/ci" LOG_DIR = "${env.CUSTOM_WORKSPACE}/log_dir" } stages { @@ -394,7 +420,17 @@ pipeline { stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { + stage ('build-ci') { + steps { + script { + build_ci() + } + } + } stage ('build-water') { + environment { + build_type = "water" + } steps { script { slurm_build(BUILD_MODES, "water", "water", "water") @@ -412,6 +448,9 @@ pipeline { } } stage ('build-grass') { + environment { + build_type = "grass" + } steps { script { slurm_build(BUILD_MODES, "grass", "grass", "grass") @@ -429,6 +468,9 @@ pipeline { } } stage ('build-electric') { + environment { + build_type = "electric" + } steps { script { slurm_build(BUILD_MODES, "electric", "electric", "electric") @@ -505,48 +547,50 @@ pipeline { stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { - stage('MPI_verbs-rxm_IMB') { + stage ('CI_MPI_verbs-rxm_IMB') { + environment { + build_type = "water" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (def mpi in ["impi"]) { - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", "water", - "squirtle,totodile", "2", "${mpi}", - "${imb_grp}") - } - } + dir (CI_LOCATION) { + run_ci("CI_MPI_verbs-rxm_IMB", "pr_imb_water.json") } } } } - stage('MPI_verbs-rxm_OSU') { + stage ('CI_MPI_verbs-rxm_OSU') { + environment { + build_type = "water" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (def mpi in ["impi", "mpich"]) { - run_middleware(providers, "MPI", "osu", "water", - "squirtle,totodile", "2", "${mpi}") - } + dir (CI_LOCATION) { + run_ci("CI_MPI_verbs-rxm_OSU", "pr_osu_water.json") } } } } - stage('MPI_tcp') { + stage ('CI_MPI_tcp_IMB') { + environment { + build_type = "grass" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["tcp", null]] - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", "grass", - "bulbasaur", "2", "impi", "${imb_grp}") - } - for (def mpi in ["impi", "mpich"]) { - run_middleware(providers, "MPI", "osu", "grass", "bulbasaur", - "2", "${mpi}") - } + dir (CI_LOCATION) { + run_ci("CI_MPI_tcp_IMB", "pr_imb_grass.json") + } + } + } + } + stage ('CI_MPI_tcp_OSU') { + environment { + build_type = "grass" + } + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_MPI_tcp_OSU", "pr_osu_grass.json") } } }