From d141e7ea784f5b4cbb56bccc4de7f450761feaf8 Mon Sep 17 00:00:00 2001 From: tomeichlersmith Date: Tue, 13 Jul 2021 09:25:57 -0500 Subject: [PATCH] allow for user to avoid recursion on input directories this is required to be able to run over a directory of DB event libraries (which are directories themselves) --- batch/README.md | 16 +++- batch/python/umn_htcondor/submit.py | 16 +++- batch/run_fire.sh | 139 ---------------------------- batch/submit_jobs.py | 3 +- 4 files changed, 28 insertions(+), 146 deletions(-) delete mode 100644 batch/run_fire.sh diff --git a/batch/README.md b/batch/README.md index 93036a4..c47e827 100644 --- a/batch/README.md +++ b/batch/README.md @@ -58,7 +58,7 @@ ldmx-submit-jobs -c production.py -o EXAMPLE -d ldmx/pro:v2.3.0 -n 5 *Comments* : - The output directory defined using `-o` is relative to your hdfs directory (so you will find the output of these five jobs in `/EXAMPLE/`. If you want the output in some other directory, you need to specify the full path. -- The version of ldmx-sw you want to use can be defined using the name of the directory it is in when using `ldmx-make-stable`. Your options for a stable installation are in `/local/cms/user/$USER/ldmx/stable-installs/`. +- The version of ldmx-sw you want to use can be defined by providing the production container using a DockerHub tag (`-d`) or providing the path to the singularity file you built (`-s`). - By default, the run numbers will start at `0` and count up from there. You can change the first run number by using the `--start_job` option. This is helpful when (for example), you want to run small group of jobs to make sure everything is working, but you don't want to waste time re-running the same run numbers. #### 2. Analysis @@ -72,7 +72,7 @@ ldmx-submit-jobs -c analysis.py -o EXAMPLE/hists -i EXAMPLE -d ldmx/pro:v2.3.0 - *Comments*: - Like the output directory, the input directory is also relative to your hdfs directory unless a full path is specified. - **The current `run_fire.sh` script only mounts hdfs, so the container will think directories/files outside of hdfs don't exist.** + **The current `run_ldmx.sh` script only mounts hdfs, so the container will think directories/files outside of hdfs don't exist.** - Since there are five files to analyze and we are asking for two files per job, we will have three jobs (two with two files and one with one). @@ -112,3 +112,15 @@ We put all of these generated files in the `/detail` directory - You can use the command `condor_q` to see the current status of your jobs. - The `-long` option to `condor_q` or `condor_history` dumps all of the information about the job(s) that you have selected with the other command line options. This is helpful for seeing exactly what was run. - If you see a long list of sequential jobs "fail", it might be that a specific worker node isn't configured properly. Check that it is one worker-node's fault by running `my-q -held -long | uniq-hosts`. If only one worker node shows up (but you know that you have tens of failed jobs), then you can `ssh` to that machine to try to figure it out (or email csehelp if you aren't sure what to do). In the mean time, you can put that machine in your list of `Machine != ` at the top of the submit file. + +# Dark Brem Signal Generation + +This sample generation is a special case that requires some modification. +Normally, we want to recursively enter directories in order to get a list of all `.root` files to use as input. 
+The DB event libraries are directories themselves, so we need to turn off recursion.
+Here is an example of submitting a job where we provide the directory holding the DB event libraries.
+Notice that we need _both_ `--no_recursive` _and_ `--files_per_job 1` so that we can run the DB sim once for each event library we have.
+
+```
+ldmx-submit-jobs -c db_sim.py -d ldmx/pro:edge -i /hdfs/cms/user/eichl008/ldmx/dark-brem-event-libraries --no_recursive -o TEST --files_per_job 1 --config_args "--num_events 20000 --material tungsten"
+```
diff --git a/batch/python/umn_htcondor/submit.py b/batch/python/umn_htcondor/submit.py
index a565f40..c3feede 100644
--- a/batch/python/umn_htcondor/submit.py
+++ b/batch/python/umn_htcondor/submit.py
@@ -255,7 +255,7 @@ def periodic_release(self) :
 
         self['periodic_release'] = held_by_us.and_((exit_code == 99).or_(exit_code == 100).or_(exit_code == 117).or_(exit_code == 118))
 
-    def run_over_input_dirs(self, input_dirs, num_files_per_job) :
+    def run_over_input_dirs(self, input_dirs, num_files_per_job, recursive = True) :
         """Have the config script run over num_files_per_job files taken from input_dirs,
         generating jobs until all of the files in input_dirs are included.
 
@@ -265,6 +265,8 @@ def run_over_input_dirs(self, input_dirs, num_files_per_job) :
             List of input directories, files, or file listings to run over
         num_files_per_job : int
            Number of files for each job to have (maximum, could be less)
+        recursive : bool
+            True if we should recursively search for root and list files in the supplied directories
         """
 
         if self.__items_to_loop_over is not None :
@@ -283,14 +285,20 @@ def smart_recursive_input(file_or_dir) :
                     file_listing = listing.readlines()
                 full_list.extend(smart_recursive_input([f.strip() for f in file_listing]))
-            elif os.path.isdir(file_or_dir) :
-                full_list.extend(smart_recursive_input([os.path.join(file_or_dir,f) for f in os.listdir(file_or_dir)]))
+            elif os.path.isdir(utility.full_dir(file_or_dir)) :
+                d = utility.full_dir(file_or_dir)
+                full_list.extend(smart_recursive_input([os.path.join(d,f) for f in os.listdir(d)]))
             else :
                 print(f"'{file_or_dir}' is not a ROOT file, a directory, or a list of files. Skipping.")
             #file or directory
 
             return full_list
 
-        input_file_list = smart_recursive_input(input_dirs)
+        if recursive :
+            input_file_list = smart_recursive_input(input_dirs)
+        else :
+            input_file_list = []
+            for d in [utility.full_dir(d) for d in input_dirs] :
+                input_file_list.extend([os.path.join(d,f) for f in os.listdir(d)])
 
         # we need to define a list of dictionaries that htcondor submission will loop over
         #  we partition the list of input files into space separate lists of maximum length arg.files_per_job
diff --git a/batch/run_fire.sh b/batch/run_fire.sh
deleted file mode 100644
index 943a7d4..0000000
--- a/batch/run_fire.sh
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/bin/bash
-
-set -x
-
-###############################################################################
-# run_fire.sh
-# Batch running script for executing fire on a worker node and then copying
-# the results to an output directory.
-############################################################################### - -_job_id=$1 #should be unique between jobs submitted by the same user -_singularity_img=$2 #singularity img to use to run -_config_script=$3 #script itself to run, should be in output directory -_output_dir=$4 #output directory to copy products to, should be in /hdfs/cms/user/$USER/ldmx/ -_config_args=${@:5} #arguments to configuration script, input files should be in /hdfs/cms/user/$USER/ldmx/ - -if [[ ! -d /hdfs/cms/user ]]; then - echo "Worker node is not connected to hdfs." - exit 99 -fi - -if ! hash singularity &> /dev/null; then - echo "Worker node does not have singularity installed." - exit 99 -fi - -# make sure we go to our scratch area -_scratch_root=/export/scratch/users/$USER/ -mkdir -p $_scratch_root -cd $_scratch_root - -# cleanup the directory if it already exists -# (it shouldn't) -if [[ -d $_job_id ]]; then - rm -r $_job_id -fi - -# make the working directory for this job and go into it -mkdir $_job_id -if ! cd $_job_id; then - echo "Can't setup working directory." - exit 100 -fi - -# Now that we have entered our working directory, -# clean-up entails exiting the directory for this -# specific job and deleting the whole thing. -clean-up() { - cd $_scratch_root - rm -r $_job_id -} - -# Singularity command to run the fire executable -# --no-home : don't mount home directory -# --bind : mount our current directory and /hdfs/ (for reading input files) -# --cleanenv : don't copy current environment into container -if ! singularity run --no-home --bind $(pwd),/mnt/hdfs/phys/ --cleanenv $_singularity_img . fire $_config_script $_config_args; then - echo "fire returned an non-zero error status." - clean-up - exit 115 -fi - -# Our special copying function, -# sometimes jobs interrupt the copying mid-way through -# (don't know why this happens) -# but this means we need to check that the copied file -# matches the actually generated file. This is done -# using 'cmp -s' which does a bit-wise comparison and -# returns a failure status upon the first mis-match. -# -# Sometimes (usually for larger files like ours), -# the kernel decides to put the file into a buffer -# and have cp return success. This is done because -# the computer can have the copy continue on in the -# background without interfering with the user. -# In our case, this sometimes causes a failure because -# we attempt to compare the copied file (which is only -# partial copied) to the original. To solve this -# niche issue, we can simply add the 'sync' command -# which tells the terminal to wait for these write -# buffers to finish before moving on. -# -# We return a success-status of 0 if we cp and cmp. -# Otherwise, we make sure any partially-copied files -# are removed from the destination directory and try again -# until the input number of tries are attempted. -# If we get through all tries without return success, -# then we return a failure status of 1. -# -# Arguments -# 1 - Time in seconds to sleep between tries -# 2 - Number of tries to attempt before giving up -# 3 - source file to copy -# 4 - destination directory to put copy in -copy-and-check() { - local _sleep_between_tries="$1" - local _num_tries="$2" - local _source="$3" - local _dest_dir="$4" - for try in $(seq $_num_tries); do - if cp -t $_dest_dir $_source; then - sync #wait for large files to actually leave buffer - if cmp -s $_source $_dest_dir/$_source; then - #SUCCESS! 
- return 0; - else - #Interrupted during copying - # delete half-copied file - rm $_dest_dir/$_source - fi - fi - sleep $_sleep_between_tries - done - # make it here if we didn't have a success - return 1 -} - -# check if output directory exists -# we wait until here because sometimes -# hdfs is connected when we start the job -# but isn't connected at the end -if [[ ! -d $_output_dir ]]; then - echo "Output directory '$_output_dir' doesn't exist!" - exit 117 -fi - -# copy over each output file, checking to make sure it worked -# most of the time this is only one file, but sometimes -# we create both a event and a histogram file -for _output_file in *.root; do - if ! copy-and-check 30 10 $_output_file $_output_dir; then - # Coulding copy after trying 10 times, waiting - # 30s between each try. - echo "Copying failed after several tries." - exit 118 - fi -done - -clean-up diff --git a/batch/submit_jobs.py b/batch/submit_jobs.py index 3e6e4c3..10b85b5 100644 --- a/batch/submit_jobs.py +++ b/batch/submit_jobs.py @@ -29,6 +29,7 @@ parser.add_argument("--input_arg_name",type=str,default='',help='Name of argument that should go before the input file or run number when passing it to the config script.') parser.add_argument("--start_job",type=int,default=0,help="Starting number to use when run numbers. Only used if NOT running over items in a directory.") parser.add_argument("--files_per_job",type=int,default=10,help="If running over an input directory, this argument defines how many files to group together per job.") +parser.add_argument("--no_recursive",default=False,action='store_true',help='Should we NOT recursively enter the input directories?') # rarely-used optional args full_path_to_dir_we_are_in=os.path.dirname(os.path.realpath(__file__)) @@ -101,7 +102,7 @@ job_instructions.periodic_release() if arg.input_dir is not None : - job_instructions.run_over_input_dirs(arg.input_dir, arg.files_per_job) + job_instructions.run_over_input_dirs(arg.input_dir, arg.files_per_job, not arg.no_recursive) elif arg.refill : job_instructions.run_refill() else :
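
For illustration only (not part of the patch): a minimal standalone sketch of the input-gathering behavior that the new `recursive` flag selects. It simplifies the real `smart_recursive_input` (no `.list` handling, no skip warnings), and `resolve_dir` is a hypothetical stand-in for the repository's `utility.full_dir` helper, which is assumed here to expand directories given relative to the user's hdfs area.

```
import os

def resolve_dir(d, hdfs_root='/hdfs/cms/user'):
    # Hypothetical stand-in for umn_htcondor.utility.full_dir:
    # assume it expands a directory given relative to the hdfs user area.
    if os.path.isabs(d):
        return d
    return os.path.join(hdfs_root, os.environ.get('USER', ''), 'ldmx', d)

def gather_inputs(input_dirs, recursive=True):
    """Collect the entries that jobs will be partitioned over.

    recursive=True  : descend into sub-directories and keep only '.root' files
                      (the pre-existing behavior, minus '.list' handling).
    recursive=False : keep the immediate entries of each directory, so a
                      directory of DB event libraries yields one entry per library.
    """
    inputs = []
    for d in (resolve_dir(i) for i in input_dirs):
        for entry in sorted(os.listdir(d)):
            full = os.path.join(d, entry)
            if not recursive:
                # the entry itself may be a directory (e.g. a DB event library)
                inputs.append(full)
            elif os.path.isdir(full):
                inputs.extend(gather_inputs([full], recursive=True))
            elif full.endswith('.root'):
                inputs.append(full)
    return inputs
```

With `--no_recursive` and `--files_per_job 1`, each DB event library directory becomes the single input of its own job, which is what the dark-brem simulation needs.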