From c7a14f080be68fc52d7846b0359464a5b5a42fa5 Mon Sep 17 00:00:00 2001 From: Carlo Emilio MONTANARI Date: Fri, 13 Oct 2023 23:05:01 +0200 Subject: [PATCH] General updates to new htcondor knowledge! --- setup.py | 2 +- simanager/cli_tools.py | 136 ++++++++++++++++++++++++++++ simanager/job_run_htcondor.py | 10 +- simanager/simulation_study.py | 134 +++++++++++++++++++-------- simanager/templates/run_config.yaml | 2 +- 5 files changed, 241 insertions(+), 43 deletions(-) diff --git a/setup.py b/setup.py index 57d5000..af98b10 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="simanager", - version="0.0.3", + version="0.0.4", author="Carlo Emilio Montanari", author_email="carlo.emilio.montanari@cern.ch", description="A Python package for managing simulations locally, on HTCondor and on Slurm, with some specific elements that are good in a CERNy environment.", diff --git a/simanager/cli_tools.py b/simanager/cli_tools.py index 10866b6..44fffdc 100644 --- a/simanager/cli_tools.py +++ b/simanager/cli_tools.py @@ -1,6 +1,8 @@ # simanager/cli_tool.py import argparse import os +import re +import subprocess import sys import time @@ -104,6 +106,51 @@ def generate_parser(): status_parser = subparsers.add_parser("status", help="Print simulation status") status_parser.add_argument("--simpath", help="Simulation path", default="./") + # Subcommand: cat-err + cat_err_parser = subparsers.add_parser( + "cat-err", help="Print contents of err files" + ) + cat_err_parser.add_argument("--simpath", help="Simulation path", default="./") + cat_err_parser.add_argument("--errpath", help="Error path", default="err") + cat_err_parser.add_argument("--idx", help="Simulation index", default=-1, type=int) + + # Subcommand: cat-out + cat_out_parser = subparsers.add_parser( + "cat-out", help="Print contents of out files" + ) + cat_out_parser.add_argument("--simpath", help="Simulation path", default="./") + cat_out_parser.add_argument("--outpath", help="Output path", default="out") + 
cat_out_parser.add_argument("--idx", help="Simulation index", default=-1, type=int) + + # Subcommand: cat-log + cat_log_parser = subparsers.add_parser( + "cat-log", help="Print contents of log files" + ) + cat_log_parser.add_argument("--simpath", help="Simulation path", default="./") + cat_log_parser.add_argument("--logpath", help="Log path", default="log") + cat_log_parser.add_argument("--idx", help="Simulation index", default=-1, type=int) + + # Subcommand: extract-file + extract_file_parser = subparsers.add_parser( + "extract-file", + help="Extract output files from a simulation and places them in a target folder. If the target folder does not exist, it will be created. If the target file is a symlink, an equivalent symlink will be created in the target folder.", + ) + extract_file_parser.add_argument("--simpath", help="Simulation path", default="./") + extract_file_parser.add_argument( + "--target", help="Target folder", default="extracted_files" + ) + extract_file_parser.add_argument( + "--file", + help="Regex of files to extract. If not specified, all .h5 and .pkl files will be extracted.", + default=None, + ) + + # Subcommand: self-update + subparsers.add_parser( + "self-update", + help="CURSED AND CRISPY: Update simanager to the latest version. 
Assumes that the package is installed with 'pip install -e' and that the directory is a clone of the git repo.", + ) + return parser @@ -215,6 +262,95 @@ def main(): sim = SimulationStudy.load_folder(args.simpath) # print the simulation status sim.print_sim_status() + elif args.subcommand == "cat-err": + # load the simulation + sim = SimulationStudy.load_folder(args.simpath) + # get the path of the err folder + sim_folder = os.path.join(sim.study_path, sim.study_name) + err_folder = os.path.join(sim_folder, args.errpath) + err_files = os.listdir(err_folder) + if args.idx == -1: + # print the contents of all err files + for err_file in err_files: + with open(os.path.join(err_folder, err_file), "r") as f: + print(f.read()) + else: + # print the contents of the err file with index args.idx + with open(os.path.join(err_folder, err_files[args.idx]), "r") as f: + print(f.read()) + elif args.subcommand == "cat-out": + # load the simulation + sim = SimulationStudy.load_folder(args.simpath) + # get the path of the out folder + sim_folder = os.path.join(sim.study_path, sim.study_name) + out_folder = os.path.join(sim_folder, args.outpath) + out_files = os.listdir(out_folder) + if args.idx == -1: + # print the contents of all out files + for out_file in out_files: + with open(os.path.join(out_folder, out_file), "r") as f: + print(f.read()) + else: + # print the contents of the out file with index args.idx + with open(os.path.join(out_folder, out_files[args.idx]), "r") as f: + print(f.read()) + elif args.subcommand == "cat-log": + # load the simulation + sim = SimulationStudy.load_folder(args.simpath) + # get the path of the log folder + sim_folder = os.path.join(sim.study_path, sim.study_name) + log_folder = os.path.join(sim_folder, args.logpath) + log_files = os.listdir(log_folder) + if args.idx == -1: + # print the contents of all log files + for log_file in log_files: + with open(os.path.join(log_folder, log_file), "r") as f: + print(f.read()) + else: + # print the contents 
of the log file with index args.idx + with open(os.path.join(log_folder, log_files[args.idx]), "r") as f: + print(f.read()) + elif args.subcommand == "extract-file": + # load the simulation + sim = SimulationStudy.load_folder(args.simpath) + # extract the file + sim_folder = os.path.join(sim.study_path, sim.study_name) + scan_folder = os.path.join(sim_folder, "scan") + # get list of all files in a sim finished folder + files = os.listdir(os.path.join(scan_folder, sim.finished[0])) + # filter files based on regex + if args.file is not None: + files = [f for f in files if re.match(args.file, f)] + else: + files = [f for f in files if re.match(r".*\.(h5|pkl)", f)] + + # create target folder (resolved once, before sim_folder is rebound in the loop below) + target_folder = os.path.join(sim_folder, args.target) + os.makedirs(target_folder, exist_ok=True) + # extract files into the target folder, prefixing each name with its simulation + for sim in sim.finished: + sim_folder = os.path.join(scan_folder, sim) + for f in files: + new_filename = f"{sim}_{f}" + is_symlink = os.path.islink(os.path.join(sim_folder, f)) + if is_symlink: + target = os.readlink(os.path.join(sim_folder, f)) + os.symlink(target, os.path.join(target_folder, new_filename)) + else: + # copy the file + os.system( + f"cp {os.path.join(sim_folder, f)} {os.path.join(target_folder, new_filename)}" + ) + elif args.subcommand == "self-update": + # get the directory of this python script + this_directory = os.path.dirname(os.path.realpath(__file__)) + # attempt a git pull + print("Attempting to update simanager...") + subprocess.run( + ["git", "pull"], + cwd=this_directory, + check=True, + ) if __name__ == "__main__": diff --git a/simanager/job_run_htcondor.py b/simanager/job_run_htcondor.py index e53c385..14d3dd5 100644 --- a/simanager/job_run_htcondor.py +++ b/simanager/job_run_htcondor.py @@ -91,13 +91,12 @@ transfer_output_files = "" -requirements = (TARGET.OpSysAndVer =?= "CentOS7") - request_cpus = __REPLACE_WITH_REQUEST_CPUS__ +JobFlavour = "__REPLACE_WITH_TIME_LIMIT__" +AccountingGroup = "group_u_BE.ABP.normal" ++WantOS = "el9" queue 
Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__ """ @@ -114,7 +113,7 @@ transfer_output_files = "" -requirements = (regexp("(V100|A100)", Target.CUDADeviceName) && (TARGET.OpSysAndVer =?= "CentOS7")) +requirements = (regexp("(V100|A100)", Target.CUDADeviceName)) request_GPUs = __REPLACE_WITH_REQUEST_GPUS__ request_cpus = __REPLACE_WITH_REQUEST_CPUS__ @@ -122,6 +121,7 @@ +JobFlavour = "__REPLACE_WITH_TIME_LIMIT__" +AccountingGroup = "group_u_BE.ABP.normal" ++WantOS = "el9" queue Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__ """ @@ -168,7 +168,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs): default is "longlunch". cvmfs_path : str The path to the CVMFS environment to use. - Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh". + Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh". venv_path : str The path to the virtual environment to use. Default is the same as cvmfs_path. 
@@ -208,7 +208,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs): cvmfs_path = kwargs.pop( "cvmfs_path", - "/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh", + "/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh", ) # if no venv path is provided, just reload the cvmfs environment venv_path = kwargs.pop("venv_path", cvmfs_path) diff --git a/simanager/simulation_study.py b/simanager/simulation_study.py index 4fa5c87..64f0b91 100644 --- a/simanager/simulation_study.py +++ b/simanager/simulation_study.py @@ -296,6 +296,48 @@ def set_sim_status(self, sim_name, status): with open(parameter_file, "w", encoding="utf-8") as f: yaml.dump(parameters, f) + def _update_remote_status(self): + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + sim_to_check = ( + simulation_info["sim_not_started"] + simulation_info["sim_running"] + ) + for sim in sim_to_check: + folder_path = os.path.join(self.study_path, self.study_name, "scan", sim) + if os.path.exists(os.path.join(folder_path, "remote_finished")): + try: + simulation_info["sim_running"].remove(sim) + print(f"Removed {sim} from sim_running") + except ValueError: + pass + try: + simulation_info["sim_not_started"].remove(sim) + print(f"Removed {sim} from sim_not_started") + except ValueError: + pass + try: + simulation_info["sim_error"].remove(sim) + print(f"Removed {sim} from sim_error") + except ValueError: + pass + try: + simulation_info["sim_interrupted"].remove(sim) + print(f"Removed {sim} from sim_interrupted") + except ValueError: + pass + try: + simulation_info["sim_finished"].remove(sim) + print(f"{sim} has indeed finished") + except ValueError: + pass + simulation_info["sim_finished"].append(sim) + self.set_sim_status(sim, "finished") + print(f"Simulation {sim} finished remotely.") + def 
print_sim_status(self, update_remote_status=True): """Prints the simulation status. If update_remote_status is True, also checks if the simulations running remotely are finished by checking the @@ -318,42 +360,7 @@ def print_sim_status(self, update_remote_status=True): simulation_info = yaml.safe_load(f) if update_remote_status: - sim_to_check = ( - simulation_info["sim_not_started"] + simulation_info["sim_running"] - ) - for sim in sim_to_check: - folder_path = os.path.join( - self.study_path, self.study_name, "scan", sim - ) - if os.path.exists(os.path.join(folder_path, "remote_finished")): - try: - simulation_info["sim_running"].remove(sim) - print(f"Removed {sim} from sim_running") - except ValueError: - pass - try: - simulation_info["sim_not_started"].remove(sim) - print(f"Removed {sim} from sim_not_started") - except ValueError: - pass - try: - simulation_info["sim_error"].remove(sim) - print(f"Removed {sim} from sim_error") - except ValueError: - pass - try: - simulation_info["sim_interrupted"].remove(sim) - print(f"Removed {sim} from sim_interrupted") - except ValueError: - pass - try: - simulation_info["sim_finished"].remove(sim) - print(f"{sim} has indeed finished") - except ValueError: - pass - simulation_info["sim_finished"].append(sim) - self.set_sim_status(sim, "finished") - print(f"Simulation {sim} finished remotely.") + self._update_remote_status() print("------------------------------------------------------------") print("Simulation status:") @@ -524,3 +531,58 @@ def nuke_simulation(self): shutil.rmtree(main_folder) print("NUKING COMPLETE!") print("DO YOU FEEL LIKE OPPENHEIMER YET?") + + @property + def finished(self): + """Returns a list of the simulations that are finished.""" + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + return simulation_info["sim_finished"] + + @property + def 
not_started(self): + """Returns a list of the simulations that are not started.""" + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + return simulation_info["sim_not_started"] + + @property + def running(self): + """Returns a list of the simulations that are running.""" + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + return simulation_info["sim_running"] + + @property + def interrupted(self): + """Returns a list of the simulations that are interrupted.""" + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + return simulation_info["sim_interrupted"] + + @property + def error(self): + """Returns a list of the simulations that have error.""" + simulation_info_file = os.path.join( + self.study_path, self.study_name, "simulation_info.yaml" + ) + with open(simulation_info_file, "r", encoding="utf-8") as f: + simulation_info = yaml.safe_load(f) + + return simulation_info["sim_error"] diff --git a/simanager/templates/run_config.yaml b/simanager/templates/run_config.yaml index 1e728fa..e40db68 100644 --- a/simanager/templates/run_config.yaml +++ b/simanager/templates/run_config.yaml @@ -4,7 +4,7 @@ run_htcondor: time_limit: testmatch request_gpus: True request_cpus: 1 - cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-centos7-gcc11-opt/setup.sh + cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh venv_path: /path/to/my/venv eos_dir: /eos/user/c/camontan/data run_slurm: