forked from anmoisio/morphogen-dbca
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05-prepare-divide-data.sh
92 lines (81 loc) · 2.19 KB
/
05-prepare-divide-data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/bin/bash
#SBATCH --time=1:00:00
#SBATCH --mem=1G
#SBATCH --job-name=prep_divide
#SBATCH --output=log/%x_%j.out

# Option defaults.
# NOTE(review): utils/parse_options.sh presumably overrides these from the
# command line -- confirm against that helper.

# Value options: an empty value is treated as "not given" when the
# freq_mats.py command line is assembled further down.
exp_name=
parsed_data=
ranges=
com_weight_threshold=
part=
num_parts=
feats_type=
train_test_split_idx=
suffix=

# Stage window; `stage` is also compared numerically later in this script.
stage=0
stop_after_stage=10

# Boolean switches: forwarded as bare flags only when set to the word "true".
overwrite=false
weight_compounds=false
profile=false

source ./utils/parse_options.sh
# Per-experiment settings; the path depends on exp_name, so this must come
# after option parsing.
source ./exp/${exp_name}/config.sh
# positional args
# The argument string starts with the parsed-data path, followed by the
# output directory derived from it.
args="$parsed_data"
# Quoted (and terminated with --) so that a path containing whitespace is not
# split into basename's "name suffix" operands, which would return the wrong
# file name, and so that a leading dash is not taken as an option.
filename=$(basename -- "$parsed_data")
# ${com_weight_threshold#0.} strips a leading "0." so that e.g. 0.5 becomes 5
# in the directory name.
splitdirname="${filename}_ranges${ranges}_comweight${com_weight_threshold#0.}"
output_dir="exp/${exp_name}/splits/${splitdirname}"
args="$args $output_dir"

# When running as a SLURM array task, the task id selects the part.
if [ -n "$SLURM_ARRAY_TASK_ID" ]; then
  # zero padding
  part=$(printf "%02d" "$SLURM_ARRAY_TASK_ID")
  echo "part: $part"
fi
# keyword args
# Every name listed here is both a freq_mats.py flag ("--<name>") and the
# shell variable holding its value. Names without a default in this script
# (e.g. min_lemma_len) are presumably set by the sourced config -- TODO confirm.
kwargs=""
for opt_name in \
  stage stop_after_stage com_weight_threshold ranges part num_parts \
  feats_type included_lemmas_file noisy_chars_file noisy_tags_file \
  ignored_morph_tags_file ignored_tags_file ignored_compounds_file \
  noisy_pos_tags_file included_tags_file excluded_tags_file \
  min_lemma_len train_test_split_idx suffix
do
  kwargs="$kwargs $opt_name"
done

# Forward "--name value" for each option whose variable is non-empty;
# ${!kwarg} is indirect expansion (the value of the variable named by kwarg).
for kwarg in $kwargs; do
  kwarg_value=${!kwarg}
  [ -n "$kwarg_value" ] || continue
  args="$args --$kwarg $kwarg_value"
done

# Boolean options become bare flags, and only when set to exactly "true".
for flag_name in overwrite weight_compounds profile; do
  case "${!flag_name}" in
    true) args="$args --$flag_name" ;;
  esac
done
# Run freq_mats.py in a traced subshell so the exact command line is echoed,
# and abort the script if it fails. $args is intentionally unquoted: it is a
# space-separated string that must be word-split into individual arguments.
if ! (set -x; python freq_mats.py $args); then
  exit 1
fi
#######################################
# Concatenate the part files "<dir>/<filetype>.*.txt" (in glob order) into
# "<dir>/<filetype>.txt" and delete the parts afterwards.
# When there are no part files, an already existing merged file is left
# untouched (it is never truncated); if no merged file exists yet, an empty
# one is created so the merged file is always present afterwards.
# Arguments: $1 - directory holding the files
#            $2 - file type, i.e. the base name without ".txt"
# Returns:   0 on success or when there was nothing to merge, 1 on failure
#######################################
_merge_part_files() {
  local dir=$1 filetype=$2
  local merged="$dir/$filetype.txt"
  # The temp name ends in ".tmp.<pid>", so it never matches the "*.txt" glob.
  local tmp="$merged.tmp.$$"
  local parts
  parts=("$dir/$filetype".*.txt)

  # An unmatched glob stays as the literal pattern, which does not exist.
  if [ ! -e "${parts[0]}" ]; then
    [ -e "$merged" ] || : > "$merged"
    return 0
  fi

  # Write to a temp file first so a failed cat cannot clobber earlier output.
  if ! cat -- "${parts[@]}" > "$tmp"; then
    rm -f -- "$tmp"
    return 1
  fi
  mv -f -- "$tmp" "$merged" || return 1
  rm -f -- "${parts[@]}"
}

# Stage 6: merge the per-part outputs and drop the per-part frequency tensors.
if [ "${stage:-0}" -eq 6 ]; then
  # Refuse to run the removals below against "/" if output_dir is empty.
  : "${output_dir:?output_dir is not set}"
  for filetype in used_sent_ids compounds_per_sent atoms_per_sent subcompounds_per_sent; do
    # Best effort per file type: report a failure and carry on with the rest.
    _merge_part_files "$output_dir" "$filetype" \
      || echo "warning: could not merge $filetype parts in $output_dir" >&2
  done
  # -f: missing part files are not an error.
  rm -f -- "$output_dir"/atom_freqs.*.pt
  rm -f -- "$output_dir"/compound_freqs.*.pt
fi
# different array jobs get different (consecutive) JOB_IDs
# Same naming as the "#SBATCH --output=log/%x_%j.out" directive
# (%x = job name, %j = job id), so this points at this job's own log.
logfilename="log/${SLURM_JOB_NAME}_${SLURM_JOB_ID}.out"
# Kept as an if-statement: when no log exists the script must still end with
# exit status 0.
if [ -f "$logfilename" ]; then
  # NOTE(review): the pause presumably lets pending log output reach the file
  # before it is copied -- confirm.
  sleep 5
  # Best effort: a failed copy must not fail the job. Paths are quoted so a
  # job name or output directory containing whitespace is handled correctly.
  (set -x; cp -- "$logfilename" "$output_dir/") || true
fi