diff --git a/Snakefile b/Snakefile
index e57ae3030..0fbca7bf6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -268,10 +268,10 @@ else:
     shell.prefix(f"{envs} ")
 
-results = expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"], dataset=eval_datasets)
+#results = expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"], dataset=eval_datasets)
 
 # For base model, only generate the metrics once
-results.extend(expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/basemodel-eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"][0], dataset=eval_datasets))
+#results.extend(expand(f"{data_root_dir}/{experiment}/{src}-{trg}/corpus_{{corpus}}/finetune_{{learning_rate}}_opusTCv20210807+bt-2021-09-01/eval/basemodel-eval-{{dataset}}.metrics", corpus=config["datasets"]["train"][0], learning_rate=config["experiment"]["finetune"]["learning-rates"][0], dataset=eval_datasets))
 
 #print(results)
diff --git a/data.smk b/data.smk
index 8631fd65a..127cc4e31 100644
--- a/data.smk
+++ b/data.smk
@@ -27,10 +27,11 @@ rule download_tatoeba_corpus:
     shell: 'bash pipeline/data/download-tc-data.sh {wildcards.src} {wildcards.trg} {params.prefix} {params.version} inf >> {log} 2>&1'
 
 #TODO: explicitly defined dev and eval linking, the glob might cause problems
-rule extract_tc_scored:
+checkpoint extract_tc_scored:
     message: "Extracting corpora from scored tc training set"
-    log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/extract_tc_scored.log"
-    conda: "envs/base.yml"
+    log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}.log"
+    conda: None
+    container: None
     wildcard_constraints:
         min_score="0\.\d+",
     threads: 1
@@ -38,15 +39,22 @@ rule extract_tc_scored:
     output:
         src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{src}.gz",
         trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.{trg}.gz",
+        train_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/train.ids.gz",
+        domeval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{src}.gz",
+        domeval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.{trg}.gz",
+        domeval_ids="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/domeval.ids.gz",
         dev_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{src}.gz",
         dev_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/dev.{trg}.gz",
         eval_src="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{src}.gz",
-        eval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{trg}.gz"
+        eval_trg="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/eval.{trg}.gz",
+        subcorpora=directory("{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/subcorpora")
     params:
        input_dir="{project_name}/{src}-{trg}/{download_tc_dir}/",
        output_dir="{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/"
-    shell: '''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} && ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''
+    shell:
+        '''python3 pipeline/data/filter-tc-data.py --source_corpus {input.train_src} --target_corpus {input.train_trg} --source_lang {wildcards.src} --target_lang {wildcards.trg} --id_file {input.train_ids} --score_file {input.scores} --domain_eval_lines 1000 --output_dir {params.output_dir} --min_score {wildcards.min_score} >> {log} 2>&1 && \
+        ln {params.input_dir}/{{eval,dev}}.*.gz {params.output_dir} >> {log} 2>&1'''
 
 rule baseline_preprocessing:
     message: "Preprocessing data for baseline training"
diff --git a/eval.smk b/eval.smk
index 16a836e73..f95821a8b 100644
--- a/eval.smk
+++ b/eval.smk
@@ -2,10 +2,16 @@ wildcard_constraints:
     src="\w{2,3}",
     trg="\w{2,3}",
     train_vocab="train_joint_spm_vocab[^/]+",
+    extract_tc="extract_tc[^/]+",
     learn_rate="\d+"
 
 gpus_num=config["gpus-num"]
 
+# Resolve the domain subcorpora produced by the extract_tc_scored checkpoint, so
+# per-domain eval targets can be expanded once the checkpoint has run.
+def find_domain_sets(wildcards, checkpoint):
+    checkpoint_output = checkpoint.get(src=wildcards.src,trg=wildcards.trg,project_name=wildcards.project_name,download_tc_dir=wildcards.download_tc_dir,min_score=wildcards.min_score).output["subcorpora"]
+    print(checkpoint_output)
+    return glob_wildcards(os.path.join(checkpoint_output,f"{{domain,.*}}.{wildcards.src}.gz")).domain
+
 #TODO: combine model evaluation rules by storing vocabs in model dir with normally trained models as well
 rule evaluate_opus_model:
     message: "Evaluating an OPUS model"
@@ -33,6 +39,24 @@ rule evaluate_opus_model:
     shell: '''bash pipeline/eval/eval-gpu.sh "{params.res_prefix}" "{params.dataset_prefix}" {wildcards.src} {wildcards.trg} {params.decoder} "{params.decoder_config}" >> {log} 2>&1'''
 
+#TODO: this needs to output a single report on the domain evaluations. Input should be a directory containing the domain indices and the domain src, trg and id files. Translate the domain source with all domain indices, then separate the output according to ids for evaluation. Skip crawled data sets.
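+#A possible sketch of the merge step (hypothetical, not wired in yet): instead of only
+#touching a flag file, the shell directive could concatenate the per-domain metrics
+#into a single tab-separated report, e.g.:
+#    for f in {input}; do
+#        printf "%s\t%s\n" "$(basename "$f" -domeval.metrics)" "$(paste -sd' ' "$f")"
+#    done > {output}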
+rule merge_domain_evaluation:
+    message: "Merging domain evaluation results"
+    log: "{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/evaluate_domains.log"
+    conda: None
+    container: None
+    threads: 7
+    resources: gpu=1
+    priority: 50
+    wildcard_constraints:
+        min_score="0\.\d+",
+        train_model="[\w-]+"
+    input:
+        lambda wildcards: expand("{{project_name}}/{{src}}-{{trg}}/{{download_tc_dir}}/extract_tc_scored_{{min_score}}/{{preprocessing}}/{{train_vocab}}/{{train_model}}/eval/{domain}-domeval.metrics", domain=find_domain_sets(wildcards, checkpoints.extract_tc_scored))
+    output:
+        report('{project_name}/{src}-{trg}/{download_tc_dir}/extract_tc_scored_{min_score}/{preprocessing}/{train_vocab}/{train_model}/eval/domeval.done',
+               category='evaluation', subcategory='{train_model}', caption='reports/evaluation.rst')
+    shell: '''touch {output} >> {log} 2>&1'''
 
 rule evaluate:
     message: "Evaluating a model"
diff --git a/pipeline/data/filter-tc-data.py b/pipeline/data/filter-tc-data.py
index 8a18dad71..3d4ad45a8 100644
--- a/pipeline/data/filter-tc-data.py
+++ b/pipeline/data/filter-tc-data.py
@@ -3,23 +3,35 @@ import os
 from collections import defaultdict
 
-def process_files(src_file, trg_file, id_file, score_file, min_score, domain_eval_lines, output_dir):
+def process_files(src_path, trg_path, src_lang, trg_lang, id_path, score_path, min_score, domain_eval_lines, output_dir):
     print(f"Creating output dir in {output_dir}")
     if not os.path.exists(output_dir):
        os.makedirs(output_dir)
-    train_src_path = f"{output_dir}/{os.path.basename(src_file)}"
-    train_trg_path = f"{output_dir}/{os.path.basename(trg_file)}"
+    train_src_path = f"{output_dir}/train.{src_lang}.gz"
+    train_trg_path = f"{output_dir}/train.{trg_lang}.gz"
+    train_id_path = f"{output_dir}/train.ids.gz"
+    domaineval_src_path = f"{output_dir}/domeval.{src_lang}.gz"
+    domaineval_trg_path = f"{output_dir}/domeval.{trg_lang}.gz"
+    domaineval_id_path = f"{output_dir}/domeval.ids.gz"
 
-    eval_lines = defaultdict(lambda: ([], []))  # dictionary to store eval lines per corpus
-    domain_files = {} # dictionary to store open file handles for domains
+    domain_files = {}  # dictionary to store open file handles for domains
 
-    with gzip.open(src_file, 'rt', encoding='utf-8') as src, \
-        gzip.open(trg_file, 'rt', encoding='utf-8') as trg, \
-        gzip.open(id_file, 'rt', encoding='utf-8') as ids, \
-        gzip.open(score_file, 'rt', encoding='utf-8') as scores, \
+    if not os.path.exists(os.path.join(output_dir,"subcorpora")):
+        os.makedirs(os.path.join(output_dir,"subcorpora"))
+
+    with gzip.open(src_path, 'rt', encoding='utf-8') as src, \
+        gzip.open(trg_path, 'rt', encoding='utf-8') as trg, \
+        gzip.open(id_path, 'rt', encoding='utf-8') as ids, \
+        gzip.open(score_path, 'rt', encoding='utf-8') as scores, \
         gzip.open(train_src_path, 'wt', encoding='utf-8') as train_src, \
-        gzip.open(train_trg_path, 'wt', encoding='utf-8') as train_trg:
+        gzip.open(train_trg_path, 'wt', encoding='utf-8') as train_trg, \
+        gzip.open(train_id_path, 'wt', encoding='utf-8') as train_ids, \
+        gzip.open(domaineval_src_path, 'wt', encoding='utf-8') as eval_src, \
+        gzip.open(domaineval_trg_path, 'wt', encoding='utf-8') as eval_trg, \
+        gzip.open(domaineval_id_path, 'wt', encoding='utf-8') as eval_ids:
 
         eval_counts = defaultdict(int)
 
@@ -30,45 +42,33 @@ def process_files(src_file, trg_file, id_file, score_file, min_score, domain_eva
 
             corpus_name = id_line.split("\t")[0]
 
-
             # Open domain-specific files if not already opened
             if corpus_name not in domain_files:
-                domain_src_path = f"{output_dir}/{corpus_name}.train.src.gz"
-                domain_trg_path = f"{output_dir}/{corpus_name}.train.trg.gz"
+                domain_src_path = f"{output_dir}/subcorpora/{corpus_name}.{src_lang}.gz"
+                domain_trg_path = f"{output_dir}/subcorpora/{corpus_name}.{trg_lang}.gz"
                 domain_files[corpus_name] = (
                     gzip.open(domain_src_path, 'wt', encoding='utf-8'),
                     gzip.open(domain_trg_path, 'wt', encoding='utf-8')
                 )
-
             if domain_eval_lines > 0 and eval_counts[corpus_name] < domain_eval_lines:
-                eval_lines[corpus_name][0].append(src_line)
-                eval_lines[corpus_name][1].append(trg_line)
+                eval_src.write(src_line)
+                eval_trg.write(trg_line)
+                eval_ids.write(id_line)
                 eval_counts[corpus_name] += 1
             else:
                 domain_files[corpus_name][0].write(src_line)
                 domain_files[corpus_name][1].write(trg_line)
                 train_src.write(src_line)
                 train_trg.write(trg_line)
-
-    # Write the eval data to respective files
-    for corpus_name, (src_lines, trg_lines) in eval_lines.items():
-        eval_src_file_path = f"{output_dir}/{corpus_name}.eval.src.gz"
-        eval_trg_file_path = f"{output_dir}/{corpus_name}.eval.trg.gz"
-        with gzip.open(eval_src_file_path, 'wt', encoding='utf-8') as eval_src_file, \
-             gzip.open(eval_trg_file_path, 'wt', encoding='utf-8') as eval_trg_file:
-            for src_line, trg_line in zip(src_lines, trg_lines):
-                eval_src_file.write(src_line)
-                eval_trg_file.write(trg_line)
-
-    # Close all domain-specific files
-    for src_file, trg_file in domain_files.values():
-        src_file.close()
-        trg_file.close()
+                train_ids.write(id_line)
+
+    # Close the domain-specific handles so buffered gzip output is flushed to disk
+    for domain_src, domain_trg in domain_files.values():
+        domain_src.close()
+        domain_trg.close()
 
 def main():
     parser = argparse.ArgumentParser(description='Process and filter corpus data based on score.')
     parser.add_argument('--source_corpus', required=True, help='Path to the source corpus file (gzipped)')
     parser.add_argument('--target_corpus', required=True, help='Path to the target corpus file (gzipped)')
+    parser.add_argument('--source_lang', required=True, help='Source language code')
+    parser.add_argument('--target_lang', required=True, help='Target language code')
     parser.add_argument('--id_file', required=True, help='Path to the ID file')
     parser.add_argument('--score_file', required=True, help='Path to the score file')
     parser.add_argument('--min_score', type=float, required=True, help='Minimum score for filtering (0 to 1)')
@@ -82,6 +82,8 @@ def main():
     process_files(
         args.source_corpus,
         args.target_corpus,
+        args.source_lang,
+        args.target_lang,
         args.id_file,
         args.score_file,
         args.min_score,
diff --git a/pipeline/opusmt/finetune.sh b/pipeline/opusmt/finetune.sh
index f2c27ff90..42b89f6e0 100755
--- a/pipeline/opusmt/finetune.sh
+++ b/pipeline/opusmt/finetune.sh
@@ -85,6 +85,7 @@ echo "### Training ${model_dir}"
     --devices ${GPUS} \
     --beam-size 6 \
     --sharding local \
+    --lr-report \
    --learn-rate "${learn_rate}" \
     --sync-sgd \
     --valid-metrics "${best_model_metric}" ${all_model_metrics[@]/$best_model_metric} \
diff --git a/pipeline/rat/build_index.sh b/pipeline/rat/build_index.sh
index f8fae414f..b36e0dcf6 100755
--- a/pipeline/rat/build_index.sh
+++ b/pipeline/rat/build_index.sh
@@ -14,10 +14,10 @@ index_file=$4
 echo "##### Building a fuzzy match index"
 
 # index building runs on single thread, --nthreads is only for matching
-${fuzzy_match_cli} --action index --corpus ${src_corpus}
+# ${fuzzy_match_cli} --action index --corpus ${src_corpus}
 
-# The add-target flag adds the target sentence to the db but there seems to be some bugs associated, so don't use it
-#${fuzzy_match_cli} --action index --corpus ${src_corpus},${trg_corpus} --nthreads ${threads} --add-target
+# The add-target flag adds the target sentence to the db; it previously seemed buggy but is now in use
+${fuzzy_match_cli} --action index --corpus ${src_corpus},${trg_corpus} --add-target
 
 # index is saved as src_corpus.fmi, move it to the correct place
 mv "${src_corpus}.fmi" "${index_file}"
diff --git a/profiles/slurm-lumi-test/jobscript.sh b/profiles/slurm-lumi-test/jobscript.sh
index d6649ce58..d973e1e12 100755
--- a/profiles/slurm-lumi-test/jobscript.sh
+++ b/profiles/slurm-lumi-test/jobscript.sh
@@ -5,7 +5,7 @@
 log_file=$(echo '{properties}' | jq -r .log[0])
 gpu=$(echo '{properties}' | jq -r .resources.gpu)
 
-mkdir -p $(dirname $log_file)
+#mkdir -p $(dirname $log_file)
 mkdir /tmp/$USER
 
 if [ $gpu != "null" ] && [ $gpu != "0" ]; then
diff --git a/profiles/slurm-lumi/config.cluster.yaml b/profiles/slurm-lumi/config.cluster.yaml
index 6ec57dc50..76b7e7b76 100755
--- a/profiles/slurm-lumi/config.cluster.yaml
+++ b/profiles/slurm-lumi/config.cluster.yaml
@@ -1,7 +1,8 @@
 # CSC Puhti
 single-gpu-partition: small-g
 multi-gpu-partition: standard-g
-cpu-partition: small
+partialnode-cpu-partition: small
+fullnode-cpu-partition: standard
 cpu-account: project_462000447
 gpu-account: project_462000447
-time-limit: "24:00:00"
+time-limit: "48:00:00"
diff --git a/profiles/slurm-lumi/submit.py b/profiles/slurm-lumi/submit.py
index 422cb41a7..b01ee30aa 100755
--- a/profiles/slurm-lumi/submit.py
+++ b/profiles/slurm-lumi/submit.py
@@ -25,7 +25,6 @@
 options += ['--job-name', name]
 
-partition = cluster_config['cpu-partition']
 account = cluster_config['cpu-account']
 
 if "resources" in job_properties:
@@ -43,6 +42,15 @@
         partition = cluster_config['multi-gpu-partition']
         rocm_dir = os.getenv("ROCM_PATH")
         options += ['--export', f'ALL,SINGULARITY_BIND="{rocm_dir}"']
+    else:
+        #this is a LUMI-C job
+        if 'threads' in job_properties and int(job_properties['threads']) >= 128:
+            if 'mem_mb' in resources and int(resources['mem_mb']) < 256000:
+                partition = cluster_config['fullnode-cpu-partition']
+            else:
+                partition = cluster_config['partialnode-cpu-partition'] # LUMI-C nodes with more than 256GB of memory are in the small partition
+        else:
+            partition = cluster_config['partialnode-cpu-partition'] # we don't need explicit memory limiting for now
 
     if 'mem_mb' in resources:
"{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/find_{index_type}-{set}_matches.log" conda: None container: None threads: workflow.cores - input: - source="{project_name}/{src}-{trg}/{preprocessing}/{set}.{src}.gz", - target="{project_name}/{src}-{trg}/{preprocessing}/{set}.{trg}.gz", - index="{project_name}/{src}-{trg}/{preprocessing}/build_index/index.{index_type}.{src}-{trg}.fmi" + resources: mem_mb=500000 + input: + source="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{set}.{src}.gz", + target="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{set}.{trg}.gz", + index="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/index.{index_type}.{src}-{trg}.fmi" output: - matches="{project_name}/{src}-{trg}/{preprocessing}/build_index/find_matches_{contrast_factor}/{index_type}-{set}.{src}-{trg}.matches" + matches="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/{index_type}-{set}.{src}-{trg}.matches" shell: f'''bash pipeline/rat/find_matches.sh "{fuzzy_match_cli}" "{{input.source}}" {{threads}} "{{input.index}}" "{{output.matches}}" {{wildcards.contrast_factor}} >> {{log}} 2>&1''' +use rule find_fuzzy_matches as find_domain_fuzzy_matches with: + input: + source="{project_name}/{src}-{trg}/{tc_processing}/domeval.{src}.gz", + target="{project_name}/{src}-{trg}/{tc_processing}/domeval.{trg}.gz", + index="{project_name}/{src}-{trg}/{tc_processing}/domeval_indexes/build_index/index.{index_type}.{src}-{trg}.fmi" + +ruleorder: augment_data_with_domain_fuzzies > augment_data_with_fuzzies + rule augment_data_with_fuzzies: message: "Augmenting data with fuzzies" wildcard_constraints: fuzzy_score="[01]\.\d+", min_fuzzies="\d+", max_fuzzies="\d+", - set="[\w\d_-]+", - log: "{project_name}/{src}-{trg}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/augment_{index_type}-{set}_matches.log" + set="[_\-\w\d]+", + log: "{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/augment_{index_type}-{set}_matches.log" conda: "envs/base.yml" threads: 1 resources: mem_mb=60000 input: - index_source="{project_name}/{src}-{trg}/{preprocessing}/{index_type}.{src}.gz", - index_target="{project_name}/{src}-{trg}/{preprocessing}/{index_type}.{trg}.gz", - augment_source="{project_name}/{src}-{trg}/{preprocessing}/{set}.{src}.gz", - augment_target="{project_name}/{src}-{trg}/{preprocessing}/{set}.{trg}.gz", - matches="{project_name}/{src}-{trg}/{preprocessing}/build_index/find_matches_{contrast_factor}/{index_type}-{set}.{src}-{trg}.matches" + #TODO: add support for domain indexes, either link them to build_index or refer to them in extract_tc + index_source="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{index_type}.{src}.gz", + index_target="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{index_type}.{trg}.gz", + augment_source="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{set}.{src}.gz", + augment_target="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/{set}.{trg}.gz", + matches="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/{index_type}-{set}.{src}-{trg}.matches" output: - 
source="{project_name}/{src}-{trg}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/{index_type}-{set}.{src}.gz", - target="{project_name}/{src}-{trg}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/{index_type}-{set}.{trg}.gz" + source="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/{index_type}-{set}.{src}.gz", + target="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/augment_train_{fuzzy_score}_{min_fuzzies}_{max_fuzzies}/{index_type}-{set}.{trg}.gz" params: max_sents=lambda wildcards: 2000 if wildcards.set == "dev" else -1 shell: f'''python pipeline/rat/get_matches.py \ @@ -66,3 +80,12 @@ rule augment_data_with_fuzzies: --min_score {{wildcards.fuzzy_score}} \ --min_fuzzies {{wildcards.min_fuzzies}} \ --max_fuzzies {{wildcards.max_fuzzies}} >> {{log}} 2>&1''' + + +use rule augment_data_with_fuzzies as augment_data_with_domain_fuzzies with: + input: + index_source="{project_name}/{src}-{trg}/{tc_processing}/domeval_indexes/{index_type}.{src}.gz", + index_target="{project_name}/{src}-{trg}/{tc_processing}/domeval_indexes/{index_type}.{trg}.gz", + augment_source="{project_name}/{src}-{trg}/{tc_processing}/domeval.{src}.gz", + augment_target="{project_name}/{src}-{trg}/{tc_processing}/domeval.{trg}.gz", + matches="{project_name}/{src}-{trg}/{tc_processing}/{preprocessing}/build_index/find_matches_{contrast_factor}/{index_type}-{set}.{src}-{trg}.matches" diff --git a/translate.smk b/translate.smk index 6ca0b7ede..0b1fa8274 100644 --- a/translate.smk +++ b/translate.smk @@ -92,7 +92,8 @@ rule collect_copyrate_corpus: train_target="{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/{train_model}/translate/extract_by_copyrate/train.{trg}.gz" params: split_dir="{project_name}/{src}-{trg}/{preprocessing}/{train_vocab}/{train_model}/translate/corpus" - shell: 'bash pipeline/translate/collect_copyrate.sh {params.split_dir} {output.train_source} {output.train_target} >> {log} 2>&1' + shell: + 'bash pipeline/translate/collect_copyrate.sh {params.split_dir} {output.train_source} {output.train_target} >> {log} 2>&1' rule collect_corpus: message: "Collecting translated corpus"