General updates to new htcondor knowledge!

carlidel · Oct 13, 2023 · c7a14f0 · c7a14f0
1 parent 42d6262
commit c7a14f0
Show file tree

Hide file tree

Showing 5 changed files with 241 additions and 43 deletions.
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="simanager",
-    version="0.0.3",
+    version="0.0.4",
     author="Carlo Emilio Montanari",
     author_email="[email protected]",
     description="A Python package for managing simulations locally, on HTCondor and on Slurm, with some specific elements that are good in a CERNy environment.",

diff --git a/simanager/cli_tools.py b/simanager/cli_tools.py
@@ -1,6 +1,8 @@
 # simanager/cli_tool.py
 import argparse
 import os
+import re
+import subprocess
 import sys
 import time
 
@@ -104,6 +106,51 @@ def generate_parser():
     status_parser = subparsers.add_parser("status", help="Print simulation status")
     status_parser.add_argument("--simpath", help="Simulation path", default="./")
 
+    # Subcommand: cat-err
+    cat_err_parser = subparsers.add_parser(
+        "cat-err", help="Print contents of err files"
+    )
+    cat_err_parser.add_argument("--simpath", help="Simulation path", default="./")
+    cat_err_parser.add_argument("--errpath", help="Error path", default="err")
+    cat_err_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)
+
+    # Subcommand: cat-out
+    cat_out_parser = subparsers.add_parser(
+        "cat-out", help="Print contents of out files"
+    )
+    cat_out_parser.add_argument("--simpath", help="Simulation path", default="./")
+    cat_out_parser.add_argument("--outpath", help="Output path", default="out")
+    cat_out_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)
+
+    # Subcommand: cat-log
+    cat_log_parser = subparsers.add_parser(
+        "cat-log", help="Print contents of log files"
+    )
+    cat_log_parser.add_argument("--simpath", help="Simulation path", default="./")
+    cat_log_parser.add_argument("--logpath", help="Log path", default="log")
+    cat_log_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)
+
+    # Subcommand: extract-file
+    extract_file_parser = subparsers.add_parser(
+        "extract-file",
+        help="Extract output files from a simulation and places them in a target folder. If the target folder does not exist, it will be created. If the target file is a symlink, an equivalent symlink will be created in the target folder.",
+    )
+    extract_file_parser.add_argument("--simpath", help="Simulation path", default="./")
+    extract_file_parser.add_argument(
+        "--target", help="Target folder", default="extracted_files"
+    )
+    extract_file_parser.add_argument(
+        "--file",
+        help="Regex of files to extract. If not specified, all .h5 and .pkl files will be extracted.",
+        default=None,
+    )
+
+    # Subcommand: self-update
+    subparsers.add_parser(
+        "self-update",
+        help="CURSED AND CRISPY: Update simanager to the latest version. Assumes that the package is installed with 'pip install -e' and that the directory is a clone of the git repo.",
+    )
+
     return parser
 
 
@@ -215,6 +262,95 @@ def main():
         sim = SimulationStudy.load_folder(args.simpath)
         # print the simulation status
         sim.print_sim_status()
+    elif args.subcommand == "cat-err":
+        # load the simulation
+        sim = SimulationStudy.load_folder(args.simpath)
+        # get the path of the err folder
+        sim_folder = os.path.join(sim.study_path, sim.study_name)
+        err_folder = os.path.join(sim_folder, args.errpath)
+        # os.listdir returns entries in arbitrary order: sort them so that
+        # --idx always selects the same file
+        err_files = sorted(os.listdir(err_folder))
+        if args.idx == -1:
+            # default --idx (-1): print the contents of all err files
+            for err_file in err_files:
+                with open(os.path.join(err_folder, err_file), "r") as f:
+                    print(f.read())
+        else:
+            # print the contents of the err file with index args.idx
+            with open(os.path.join(err_folder, err_files[args.idx]), "r") as f:
+                print(f.read())
+    elif args.subcommand == "cat-out":
+        # load the simulation
+        sim = SimulationStudy.load_folder(args.simpath)
+        # get the path of the out folder
+        sim_folder = os.path.join(sim.study_path, sim.study_name)
+        out_folder = os.path.join(sim_folder, args.outpath)
+        # os.listdir returns entries in arbitrary order: sort them so that
+        # --idx always selects the same file
+        out_files = sorted(os.listdir(out_folder))
+        if args.idx == -1:
+            # default --idx (-1): print the contents of all out files
+            for out_file in out_files:
+                with open(os.path.join(out_folder, out_file), "r") as f:
+                    print(f.read())
+        else:
+            # print the contents of the out file with index args.idx
+            with open(os.path.join(out_folder, out_files[args.idx]), "r") as f:
+                print(f.read())
+    elif args.subcommand == "cat-log":
+        # load the simulation
+        sim = SimulationStudy.load_folder(args.simpath)
+        # get the path of the log folder
+        sim_folder = os.path.join(sim.study_path, sim.study_name)
+        log_folder = os.path.join(sim_folder, args.logpath)
+        # os.listdir returns entries in arbitrary order: sort them so that
+        # --idx always selects the same file
+        log_files = sorted(os.listdir(log_folder))
+        if args.idx == -1:
+            # default --idx (-1): print the contents of all log files
+            for log_file in log_files:
+                with open(os.path.join(log_folder, log_file), "r") as f:
+                    print(f.read())
+        else:
+            # print the contents of the log file with index args.idx
+            with open(os.path.join(log_folder, log_files[args.idx]), "r") as f:
+                print(f.read())
+    elif args.subcommand == "extract-file":
+        # local import: only this subcommand needs shutil
+        import shutil
+
+        # load the simulation
+        sim = SimulationStudy.load_folder(args.simpath)
+        study_folder = os.path.join(sim.study_path, sim.study_name)
+        scan_folder = os.path.join(study_folder, "scan")
+        # read the list of finished simulations once and reuse it
+        finished = sim.finished
+
+        if not finished:
+            # nothing to list or copy: avoid an IndexError on finished[0]
+            print("No finished simulations found, nothing to extract.")
+        else:
+            # the first finished simulation provides the candidate file names
+            files = os.listdir(os.path.join(scan_folder, finished[0]))
+            # filter files based on regex
+            if args.file is not None:
+                pattern = re.compile(args.file)
+                files = [f for f in files if pattern.match(f)]
+            else:
+                # only names that actually end in .h5 or .pkl
+                files = [f for f in files if f.endswith((".h5", ".pkl"))]
+
+            # create target folder
+            target_folder = os.path.join(study_folder, args.target)
+            os.makedirs(target_folder, exist_ok=True)
+
+            # extract files: each one is placed in the target folder and
+            # prefixed with the name of the simulation it comes from
+            for sim_name in finished:
+                sim_folder = os.path.join(scan_folder, sim_name)
+                for f in files:
+                    source = os.path.join(sim_folder, f)
+                    destination = os.path.join(target_folder, f"{sim_name}_{f}")
+                    if os.path.islink(source):
+                        link_target = os.readlink(source)
+                        if not os.path.isabs(link_target):
+                            # a relative link is resolved from the folder that
+                            # contains it; anchor it there so the new link,
+                            # which lives elsewhere, points to the same file
+                            link_target = os.path.join(
+                                os.path.abspath(sim_folder), link_target
+                            )
+                        if os.path.lexists(destination):
+                            # os.symlink refuses to overwrite an existing entry
+                            os.remove(destination)
+                        os.symlink(link_target, destination)
+                    elif os.path.isfile(source):
+                        # copy the file without going through a shell
+                        shutil.copy(source, destination)
+                    else:
+                        # missing in this simulation, or not a regular file
+                        print(f"Skipping {source}: not a regular file or symlink.")
+    elif args.subcommand == "self-update":
+        # get the directory of this python script
+        this_directory = os.path.dirname(os.path.realpath(__file__))
+        # attempt a git pull
+        print("Attempting to update simanager...")
+        subprocess.run(
+            ["git", "pull"],
+            cwd=this_directory,
+            check=True,
+        )
 
 
 if __name__ == "__main__":

diff --git a/simanager/job_run_htcondor.py b/simanager/job_run_htcondor.py
@@ -91,13 +91,12 @@
 
 transfer_output_files = ""
 
-requirements = (TARGET.OpSysAndVer =?= "CentOS7")
-
 request_cpus = __REPLACE_WITH_REQUEST_CPUS__
 
 +JobFlavour = "__REPLACE_WITH_TIME_LIMIT__"
 
 +AccountingGroup = "group_u_BE.ABP.normal"
++WantOS = "el9"
 
 queue Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__
 """
@@ -114,14 +113,15 @@
 
 transfer_output_files = ""
 
-requirements = (regexp("(V100|A100)", Target.CUDADeviceName) && (TARGET.OpSysAndVer =?= "CentOS7"))
+requirements = (regexp("(V100|A100)", Target.CUDADeviceName))
 
 request_GPUs = __REPLACE_WITH_REQUEST_GPUS__
 request_cpus = __REPLACE_WITH_REQUEST_CPUS__
 
 +JobFlavour = "__REPLACE_WITH_TIME_LIMIT__"
 
 +AccountingGroup = "group_u_BE.ABP.normal"
++WantOS = "el9"
 
 queue Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__
 """
@@ -168,7 +168,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs):
         default is "longlunch".
     cvmfs_path : str
         The path to the CVMFS environment to use.
-        Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh".
+        Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh".
     venv_path : str
         The path to the virtual environment to use.
         Default is the same as cvmfs_path.
@@ -208,7 +208,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs):
 
     cvmfs_path = kwargs.pop(
         "cvmfs_path",
-        "/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh",
+        "/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh",
     )
     # if no venv path is provided, just reload the cvmfs environment
     venv_path = kwargs.pop("venv_path", cvmfs_path)

diff --git a/simanager/simulation_study.py b/simanager/simulation_study.py
@@ -296,6 +296,48 @@ def set_sim_status(self, sim_name, status):
         with open(parameter_file, "w", encoding="utf-8") as f:
             yaml.dump(parameters, f)
 
+    def _update_remote_status(self):
+        """Mark simulations that have a ``remote_finished`` entry as finished.
+
+        Reads ``simulation_info.yaml`` of the study and, for every simulation
+        listed under ``sim_not_started`` or ``sim_running``, checks whether
+        ``scan/<sim>/remote_finished`` exists. Such a simulation is removed
+        from every status list, appended to ``sim_finished`` and passed to
+        ``set_sim_status(sim, "finished")``.
+
+        Returns
+        -------
+        dict
+            The content of ``simulation_info.yaml`` with the status lists
+            updated in memory. This method does not write the dictionary back
+            to disk itself.
+        """
+        study_folder = os.path.join(self.study_path, self.study_name)
+        simulation_info_file = os.path.join(study_folder, "simulation_info.yaml")
+        with open(simulation_info_file, "r", encoding="utf-8") as f:
+            simulation_info = yaml.safe_load(f)
+
+        # status lists a remotely finished simulation is removed from, in order
+        status_keys = (
+            "sim_running",
+            "sim_not_started",
+            "sim_error",
+            "sim_interrupted",
+            "sim_finished",
+        )
+
+        # the concatenation builds a new list, so removing entries from the
+        # status lists inside the loop does not disturb the iteration
+        sim_to_check = (
+            simulation_info["sim_not_started"] + simulation_info["sim_running"]
+        )
+        for sim in sim_to_check:
+            folder_path = os.path.join(study_folder, "scan", sim)
+            if not os.path.exists(os.path.join(folder_path, "remote_finished")):
+                continue
+
+            for key in status_keys:
+                try:
+                    simulation_info[key].remove(sim)
+                except ValueError:
+                    # the simulation was not in this list
+                    continue
+                if key == "sim_finished":
+                    print(f"{sim} has indeed finished")
+                else:
+                    print(f"Removed {sim} from {key}")
+
+            # removed above if already present, so it is listed exactly once
+            simulation_info["sim_finished"].append(sim)
+            self.set_sim_status(sim, "finished")
+            print(f"Simulation {sim} finished remotely.")
+
+        return simulation_info
+
     def print_sim_status(self, update_remote_status=True):
         """Prints the simulation status. If update_remote_status is True, also
         checks if the simulations running remotely are finished by checking the
@@ -318,42 +360,7 @@ def print_sim_status(self, update_remote_status=True):
             simulation_info = yaml.safe_load(f)
 
         if update_remote_status:
-            sim_to_check = (
-                simulation_info["sim_not_started"] + simulation_info["sim_running"]
-            )
-            for sim in sim_to_check:
-                folder_path = os.path.join(
-                    self.study_path, self.study_name, "scan", sim
-                )
-                if os.path.exists(os.path.join(folder_path, "remote_finished")):
-                    try:
-                        simulation_info["sim_running"].remove(sim)
-                        print(f"Removed {sim} from sim_running")
-                    except ValueError:
-                        pass
-                    try:
-                        simulation_info["sim_not_started"].remove(sim)
-                        print(f"Removed {sim} from sim_not_started")
-                    except ValueError:
-                        pass
-                    try:
-                        simulation_info["sim_error"].remove(sim)
-                        print(f"Removed {sim} from sim_error")
-                    except ValueError:
-                        pass
-                    try:
-                        simulation_info["sim_interrupted"].remove(sim)
-                        print(f"Removed {sim} from sim_interrupted")
-                    except ValueError:
-                        pass
-                    try:
-                        simulation_info["sim_finished"].remove(sim)
-                        print(f"{sim} has indeed finished")
-                    except ValueError:
-                        pass
-                    simulation_info["sim_finished"].append(sim)
-                    self.set_sim_status(sim, "finished")
-                    print(f"Simulation {sim} finished remotely.")
+            self._update_remote_status()
 
         print("------------------------------------------------------------")
         print("Simulation status:")
@@ -524,3 +531,58 @@ def nuke_simulation(self):
         shutil.rmtree(main_folder)
         print("NUKING COMPLETE!")
         print("DO YOU FEEL LIKE OPPENHEIMER YET?")
+
+    def _read_simulation_info_file(self):
+        """Load and return the content of the study's ``simulation_info.yaml``.
+
+        The file is read again on every call, so the status properties below
+        always reflect what is currently on disk.
+        """
+        simulation_info_file = os.path.join(
+            self.study_path, self.study_name, "simulation_info.yaml"
+        )
+        with open(simulation_info_file, "r", encoding="utf-8") as f:
+            return yaml.safe_load(f)
+
+    @property
+    def finished(self):
+        """Returns a list of the simulations that are finished."""
+        return self._read_simulation_info_file()["sim_finished"]
+
+    @property
+    def not_started(self):
+        """Returns a list of the simulations that are not started."""
+        return self._read_simulation_info_file()["sim_not_started"]
+
+    @property
+    def running(self):
+        """Returns a list of the simulations that are running."""
+        return self._read_simulation_info_file()["sim_running"]
+
+    @property
+    def interrupted(self):
+        """Returns a list of the simulations that are interrupted."""
+        return self._read_simulation_info_file()["sim_interrupted"]
+
+    @property
+    def error(self):
+        """Returns a list of the simulations that ended with an error."""
+        return self._read_simulation_info_file()["sim_error"]
diff --git a/simanager/templates/run_config.yaml b/simanager/templates/run_config.yaml
@@ -4,7 +4,7 @@ run_htcondor:
   time_limit: testmatch
   request_gpus: True
   request_cpus: 1
-  cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-centos7-gcc11-opt/setup.sh
+  cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh
   venv_path: /path/to/my/venv
   eos_dir: /eos/user/c/camontan/data
 run_slurm: