Skip to content

Commit

Permalink
General updates to new htcondor knowledge!
Browse files Browse the repository at this point in the history
  • Loading branch information
carlidel committed Oct 13, 2023
1 parent 42d6262 commit c7a14f0
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 43 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="simanager",
version="0.0.3",
version="0.0.4",
author="Carlo Emilio Montanari",
author_email="[email protected]",
description="A Python package for managing simulations locally, on HTCondor and on Slurm, with some specific elements that are good in a CERNy environment.",
Expand Down
136 changes: 136 additions & 0 deletions simanager/cli_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# simanager/cli_tool.py
import argparse
import os
import re
import subprocess
import sys
import time

Expand Down Expand Up @@ -104,6 +106,51 @@ def generate_parser():
status_parser = subparsers.add_parser("status", help="Print simulation status")
status_parser.add_argument("--simpath", help="Simulation path", default="./")

# Subcommand: cat-err
cat_err_parser = subparsers.add_parser(
"cat-err", help="Print contents of err files"
)
cat_err_parser.add_argument("--simpath", help="Simulation path", default="./")
cat_err_parser.add_argument("--errpath", help="Error path", default="err")
cat_err_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)

# Subcommand: cat-out
cat_out_parser = subparsers.add_parser(
"cat-out", help="Print contents of out files"
)
cat_out_parser.add_argument("--simpath", help="Simulation path", default="./")
cat_out_parser.add_argument("--outpath", help="Output path", default="out")
cat_out_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)

# Subcommand: cat-log
cat_log_parser = subparsers.add_parser(
"cat-log", help="Print contents of log files"
)
cat_log_parser.add_argument("--simpath", help="Simulation path", default="./")
cat_log_parser.add_argument("--logpath", help="Log path", default="log")
cat_log_parser.add_argument("--idx", help="Simulation index", default=-1, type=int)

# Subcommand: extract-file
extract_file_parser = subparsers.add_parser(
"extract-file",
help="Extract output files from a simulation and places them in a target folder. If the target folder does not exist, it will be created. If the target file is a symlink, an equivalent symlink will be created in the target folder.",
)
extract_file_parser.add_argument("--simpath", help="Simulation path", default="./")
extract_file_parser.add_argument(
"--target", help="Target folder", default="extracted_files"
)
extract_file_parser.add_argument(
"--file",
help="Regex of files to extract. If not specified, all .h5 and .pkl files will be extracted.",
default=None,
)

# Subcommand: self-update
subparsers.add_parser(
"self-update",
help="CURSED AND CRISPY: Update simanager to the latest version. Assumes that the package is installed with 'pip install -e' and that the directory is a clone of the git repo.",
)

return parser


Expand Down Expand Up @@ -215,6 +262,95 @@ def main():
sim = SimulationStudy.load_folder(args.simpath)
# print the simulation status
sim.print_sim_status()
elif args.subcommand == "cat-err":
# load the simulation
sim = SimulationStudy.load_folder(args.simpath)
# get the path of the err folder
sim_folder = os.path.join(sim.study_path, sim.study_name)
err_folder = os.path.join(sim_folder, args.errpath)
err_files = os.listdir(err_folder)
if args.idx == -1:
# print the contents of all err files
for err_file in err_files:
with open(os.path.join(err_folder, err_file), "r") as f:
print(f.read())
else:
# print the contents of the err file with index args.idx
with open(os.path.join(err_folder, err_files[args.idx]), "r") as f:
print(f.read())
elif args.subcommand == "cat-out":
# load the simulation
sim = SimulationStudy.load_folder(args.simpath)
# get the path of the out folder
sim_folder = os.path.join(sim.study_path, sim.study_name)
out_folder = os.path.join(sim_folder, args.outpath)
out_files = os.listdir(out_folder)
if args.idx == -1:
# print the contents of all out files
for out_file in out_files:
with open(os.path.join(out_folder, out_file), "r") as f:
print(f.read())
else:
# print the contents of the out file with index args.idx
with open(os.path.join(out_folder, out_files[args.idx]), "r") as f:
print(f.read())
elif args.subcommand == "cat-log":
# load the simulation
sim = SimulationStudy.load_folder(args.simpath)
# get the path of the log folder
sim_folder = os.path.join(sim.study_path, sim.study_name)
log_folder = os.path.join(sim_folder, args.logpath)
log_files = os.listdir(log_folder)
if args.idx == -1:
# print the contents of all log files
for log_file in log_files:
with open(os.path.join(log_folder, log_file), "r") as f:
print(f.read())
else:
# print the contents of the log file with index args.idx
with open(os.path.join(log_folder, log_files[args.idx]), "r") as f:
print(f.read())
elif args.subcommand == "extract-file":
# load the simulation
sim = SimulationStudy.load_folder(args.simpath)
# extract the file
sim_folder = os.path.join(sim.study_path, sim.study_name)
scan_folder = os.path.join(sim_folder, "scan")
# get list of all files in a sim finished folder
files = os.listdir(os.path.join(scan_folder, sim.finished[0]))
# filter files based on regex
if args.file is not None:
files = [f for f in files if re.match(args.file, f)]
else:
files = [f for f in files if re.match(r".*\.(h5|pkl)", f)]

# create target folder
os.makedirs(os.path.join(sim_folder, args.target), exist_ok=True)

# extract files
for sim in sim.finished:
sim_folder = os.path.join(scan_folder, sim)
for f in files:
new_filename = f"{sim}_{f}"
is_symlink = os.path.islink(os.path.join(sim_folder, f))
if is_symlink:
target = os.readlink(os.path.join(sim_folder, f))
os.symlink(target, os.path.join(sim_folder, new_filename))
else:
# copy the file
os.system(
f"cp {os.path.join(sim_folder, f)} {os.path.join(sim_folder, new_filename)}"
)
elif args.subcommand == "self-update":
# get the directory of this python script
this_directory = os.path.dirname(os.path.realpath(__file__))
# attempt a git pull
print("Attempting to update simanager...")
subprocess.run(
["git", "pull"],
cwd=this_directory,
check=True,
)


if __name__ == "__main__":
Expand Down
10 changes: 5 additions & 5 deletions simanager/job_run_htcondor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,12 @@
transfer_output_files = ""
requirements = (TARGET.OpSysAndVer =?= "CentOS7")
request_cpus = __REPLACE_WITH_REQUEST_CPUS__
+JobFlavour = "__REPLACE_WITH_TIME_LIMIT__"
+AccountingGroup = "group_u_BE.ABP.normal"
+WantOS = "el9"
queue Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__
"""
Expand All @@ -114,14 +113,15 @@
transfer_output_files = ""
requirements = (regexp("(V100|A100)", Target.CUDADeviceName) && (TARGET.OpSysAndVer =?= "CentOS7"))
requirements = (regexp("(V100|A100)", Target.CUDADeviceName)
request_GPUs = __REPLACE_WITH_REQUEST_GPUS__
request_cpus = __REPLACE_WITH_REQUEST_CPUS__
+JobFlavour = "__REPLACE_WITH_TIME_LIMIT__"
+AccountingGroup = "group_u_BE.ABP.normal"
+WantOS = "el9"
queue Executable,Simpath,Outpath,Errpath from __REPLACE_WITH_QUEUE_FILE__
"""
Expand Down Expand Up @@ -168,7 +168,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs):
default is "longlunch".
cvmfs_path : str
The path to the CVMFS environment to use.
Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh".
Default is "/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh".
venv_path : str
The path to the virtual environment to use.
Default is the same as cvmfs_path.
Expand Down Expand Up @@ -208,7 +208,7 @@ def job_run_htcondor(simulation_study: SimulationStudy, **kwargs):

cvmfs_path = kwargs.pop(
"cvmfs_path",
"/cvmfs/sft.cern.ch/lcg/views/LCG_102b_cuda/x86_64-centos7-gcc8-opt/setup.sh",
"/cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh",
)
# if no venv path is provided, just reload the cvmfs environment
venv_path = kwargs.pop("venv_path", cvmfs_path)
Expand Down
134 changes: 98 additions & 36 deletions simanager/simulation_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,48 @@ def set_sim_status(self, sim_name, status):
with open(parameter_file, "w", encoding="utf-8") as f:
yaml.dump(parameters, f)

def _update_remote_status(self):
    """Mark simulations whose remote job has completed as finished.

    Loads ``simulation_info.yaml`` from the study folder and walks over the
    simulations listed under ``sim_not_started`` and ``sim_running``. For
    each one whose scan folder contains a ``remote_finished`` path, the
    simulation is removed from the ``sim_running``, ``sim_not_started``,
    ``sim_error``, ``sim_interrupted`` and ``sim_finished`` lists of the
    loaded dictionary (printing a line for every list it was found in),
    appended once to ``sim_finished``, and ``self.set_sim_status`` is
    called with ``"finished"``.
    """
    # NOTE(review): the dictionary edited below is never dumped back to disk
    # by this method; persistence presumably relies on set_sim_status --
    # confirm against its implementation.
    study_root = os.path.join(self.study_path, self.study_name)
    info_path = os.path.join(study_root, "simulation_info.yaml")
    with open(info_path, "r", encoding="utf-8") as info_stream:
        simulation_info = yaml.safe_load(info_stream)

    pending = simulation_info["sim_not_started"]
    active = simulation_info["sim_running"]
    # The concatenation is a fresh list, so removing entries from the
    # individual status lists inside the loop does not disturb iteration.
    for sim in pending + active:
        marker = os.path.join(study_root, "scan", sim, "remote_finished")
        if not os.path.exists(marker):
            continue

        # (status list, message printed when the simulation was in it)
        notices = (
            ("sim_running", f"Removed {sim} from sim_running"),
            ("sim_not_started", f"Removed {sim} from sim_not_started"),
            ("sim_error", f"Removed {sim} from sim_error"),
            ("sim_interrupted", f"Removed {sim} from sim_interrupted"),
            ("sim_finished", f"{sim} has indeed finished"),
        )
        for list_key, notice in notices:
            try:
                simulation_info[list_key].remove(sim)
                print(notice)
            except ValueError:
                # Not present in this list: nothing to remove or report.
                pass

        # Re-add after the removal above so the entry is not duplicated.
        simulation_info["sim_finished"].append(sim)
        self.set_sim_status(sim, "finished")
        print(f"Simulation {sim} finished remotely.")

def print_sim_status(self, update_remote_status=True):
"""Prints the simulation status. If update_remote_status is True, also
checks if the simulations running remotely are finished by checking the
Expand All @@ -318,42 +360,7 @@ def print_sim_status(self, update_remote_status=True):
simulation_info = yaml.safe_load(f)

if update_remote_status:
sim_to_check = (
simulation_info["sim_not_started"] + simulation_info["sim_running"]
)
for sim in sim_to_check:
folder_path = os.path.join(
self.study_path, self.study_name, "scan", sim
)
if os.path.exists(os.path.join(folder_path, "remote_finished")):
try:
simulation_info["sim_running"].remove(sim)
print(f"Removed {sim} from sim_running")
except ValueError:
pass
try:
simulation_info["sim_not_started"].remove(sim)
print(f"Removed {sim} from sim_not_started")
except ValueError:
pass
try:
simulation_info["sim_error"].remove(sim)
print(f"Removed {sim} from sim_error")
except ValueError:
pass
try:
simulation_info["sim_interrupted"].remove(sim)
print(f"Removed {sim} from sim_interrupted")
except ValueError:
pass
try:
simulation_info["sim_finished"].remove(sim)
print(f"{sim} has indeed finished")
except ValueError:
pass
simulation_info["sim_finished"].append(sim)
self.set_sim_status(sim, "finished")
print(f"Simulation {sim} finished remotely.")
self._update_remote_status()

print("------------------------------------------------------------")
print("Simulation status:")
Expand Down Expand Up @@ -524,3 +531,58 @@ def nuke_simulation(self):
shutil.rmtree(main_folder)
print("NUKING COMPLETE!")
print("DO YOU FEEL LIKE OPPENHEIMER YET?")

def _read_simulation_info(self):
    """Load and return the parsed content of ``simulation_info.yaml``.

    The file is looked up in ``<study_path>/<study_name>/`` and is re-read
    on every call, so the result reflects what is currently on disk.
    Shared by the status properties below so the path construction and
    parsing live in a single place.
    """
    simulation_info_file = os.path.join(
        self.study_path, self.study_name, "simulation_info.yaml"
    )
    with open(simulation_info_file, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

@property
def finished(self):
    """Return the ``sim_finished`` list stored in ``simulation_info.yaml``."""
    return self._read_simulation_info()["sim_finished"]

@property
def not_started(self):
    """Return the ``sim_not_started`` list stored in ``simulation_info.yaml``."""
    return self._read_simulation_info()["sim_not_started"]

@property
def running(self):
    """Return the ``sim_running`` list stored in ``simulation_info.yaml``."""
    return self._read_simulation_info()["sim_running"]

@property
def interrupted(self):
    """Return the ``sim_interrupted`` list stored in ``simulation_info.yaml``."""
    return self._read_simulation_info()["sim_interrupted"]

@property
def error(self):
    """Return the ``sim_error`` list stored in ``simulation_info.yaml``."""
    return self._read_simulation_info()["sim_error"]
2 changes: 1 addition & 1 deletion simanager/templates/run_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ run_htcondor:
time_limit: testmatch
request_gpus: True
request_cpus: 1
cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-centos7-gcc11-opt/setup.sh
cvmfs_path: /cvmfs/sft.cern.ch/lcg/views/LCG_104a_cuda/x86_64-el9-gcc11-opt/setup.sh
venv_path: /path/to/my/venv
eos_dir: /eos/user/c/camontan/data
run_slurm:
Expand Down

0 comments on commit c7a14f0

Please sign in to comment.