Skip to content

Commit

Permalink
Merge pull request #11 from Helsinki-NLP/eflomal
Browse files Browse the repository at this point in the history
Changed fast_align to Eflomal
  • Loading branch information
onadegibert authored Oct 4, 2024
2 parents 631dcfa + 18497a6 commit 9a285b9
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 18 deletions.
1 change: 1 addition & 0 deletions envs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ dependencies:
- opusfilter==3.0.0rc2
- git+https://github.com/hplt-project/opustrainer.git#egg=opustrainer
- unbabel-comet==2.2.2
- eflomal==2.0.0
23 changes: 14 additions & 9 deletions pipeline/alignment/generate-alignment-and-shortlist.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ mkdir -p "${dir}"
corpus_src="${corpus_prefix}.${SRC}.gz"
corpus_trg="${corpus_prefix}.${TRG}.gz"

# Drop sentence pairs whose source or target side is empty. This rewrites
# ${corpus_src} and ${corpus_trg} in place, so the marker file
# ${dir}/cleaned_empty_lines records that the step already ran and it is
# skipped on re-runs. The marker is created with `touch` (zero bytes), hence
# the existence check (-e) rather than a non-empty check (-s).
if [ ! -e "${dir}/cleaned_empty_lines" ]; then
  echo "### Removing empty target lines"
  # Join both sides with a tab and keep a pair only when the first tab is
  # neither the first character (empty source) nor the last one (empty target).
  # A dedicated scratch file is used instead of ${dir}/corpus, because that
  # path is checked later to decide whether the alignment corpus already exists.
  paste <(pigz -dc "${corpus_src}") <(pigz -dc "${corpus_trg}") |
    awk '{ i = index($0, "\t"); if (i > 1 && i < length($0)) print }' \
      >"${dir}/corpus.nonempty.tsv"

  echo "### Splitting corpus back into source and target files and overwriting source and target files"
  # Compress into temporary files first and rename afterwards, so an
  # interrupted run does not leave a truncated corpus file behind.
  cut -f1 "${dir}/corpus.nonempty.tsv" | pigz -c >"${corpus_src}.tmp"
  cut -f2- "${dir}/corpus.nonempty.tsv" | pigz -c >"${corpus_trg}.tmp"
  mv "${corpus_src}.tmp" "${corpus_src}"
  mv "${corpus_trg}.tmp" "${corpus_trg}"
  rm "${dir}/corpus.nonempty.tsv"
  touch "${dir}/cleaned_empty_lines"
fi

echo "### Subword segmentation with SentencePiece"
test -s "${dir}/corpus.spm.${SRC}.gz" ||
Expand All @@ -44,16 +55,10 @@ test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/corpus" ||
sed 's/\t/ ||| /' >"${dir}/corpus"

echo "### Training alignments"
test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.s2t.gz" ||
"${BIN}/fast_align" -vod -i "${dir}/corpus" |
pigz >"${dir}/align.s2t.gz"
test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.t2s.gz" ||
"${BIN}/fast_align" -vodr -i "${dir}/corpus" |
pigz >"${dir}/align.t2s.gz"

test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.s2t" || test -s "${dir}/align.t2s" ||
"eflomal-align" -i "${dir}/corpus" -f "${dir}/align.s2t" -r "${dir}/align.t2s" -m 3

echo "### Symmetrizing alignments"
test -s "${output_dir}/corpus.aln.gz" || test -s "${dir}/align.t2s" ||
pigz -d "${dir}/align.s2t.gz" "${dir}/align.t2s.gz"
test -s "${output_dir}/corpus.aln.gz" ||
"${BIN}/atools" -i "${dir}/align.s2t" -j "${dir}/align.t2s" -c grow-diag-final-and |
pigz >"${output_dir}/corpus.aln.gz"
Expand Down
25 changes: 17 additions & 8 deletions pipeline/alignment/generate-alignment-tsv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ mkdir -p "${dir}"
corpus_src="${corpus_prefix}.${SRC}.gz"
corpus_trg="${corpus_prefix}.${TRG}.gz"

# Drop sentence pairs whose source or target side is empty. This rewrites
# ${corpus_src} and ${corpus_trg} in place, so the marker file
# ${dir}/cleaned_empty_lines records that the step already ran and it is
# skipped on re-runs. The marker is created with `touch` (zero bytes), hence
# the existence check (-e) rather than a non-empty check (-s).
if [ ! -e "${dir}/cleaned_empty_lines" ]; then
  echo "### Removing empty target lines"
  # Join both sides with a tab and keep a pair only when the first tab is
  # neither the first character (empty source) nor the last one (empty target).
  # A dedicated scratch file is used instead of ${dir}/corpus, because that
  # path is checked later to decide whether the alignment corpus already exists.
  paste <(pigz -dc "${corpus_src}") <(pigz -dc "${corpus_trg}") |
    awk '{ i = index($0, "\t"); if (i > 1 && i < length($0)) print }' \
      >"${dir}/corpus.nonempty.tsv"

  echo "### Splitting corpus back into source and target files and overwriting source and target files"
  # Compress into temporary files first and rename afterwards, so an
  # interrupted run does not leave a truncated corpus file behind.
  cut -f1 "${dir}/corpus.nonempty.tsv" | pigz -c >"${corpus_src}.tmp"
  cut -f2- "${dir}/corpus.nonempty.tsv" | pigz -c >"${corpus_trg}.tmp"
  mv "${corpus_src}.tmp" "${corpus_src}"
  mv "${corpus_trg}.tmp" "${corpus_trg}"
  rm "${dir}/corpus.nonempty.tsv"
  touch "${dir}/cleaned_empty_lines"
fi

echo "### Subword segmentation with SentencePiece"
test -s "${dir}/corpus.spm.${SRC}.gz" ||
Expand All @@ -43,17 +54,15 @@ test -s "${output_dir}/corpus.aln" || test -s "${dir}/corpus" ||
paste <(pigz -dc "${dir}/corpus.spm.${SRC}.gz") <(pigz -dc "${dir}/corpus.spm.${TRG}.gz") |
sed 's/\t/ ||| /' >"${dir}/corpus"

echo "### Removing empty target lines"
awk -F ' \|\|\| ' '$1!="" && $2!=""' "${dir}/corpus" > "${dir}/corpus_clean"
mv "${dir}/corpus_clean" "${dir}/corpus"

echo "### Training alignments"
test -s "${output_dir}/corpus.aln" || test -s "${dir}/align.s2t.gz" ||
"${BIN}/fast_align" -vod -i "${dir}/corpus" |
pigz >"${dir}/align.s2t.gz"
test -s "${output_dir}/corpus.aln" || test -s "${dir}/align.t2s.gz" ||
"${BIN}/fast_align" -vodr -i "${dir}/corpus" |
pigz >"${dir}/align.t2s.gz"
test -s "${output_dir}/corpus.aln" || test -s "${dir}/align.s2t" || test -s "${dir}/align.t2s" ||
"eflomal-align" -i "${dir}/corpus" -f "${dir}/align.s2t" -r "${dir}/align.t2s" -m 3

echo "### Symmetrizing alignments"
test -s "${output_dir}/corpus.aln" || test -s "${dir}/align.t2s" ||
pigz -d "${dir}/align.s2t.gz" "${dir}/align.t2s.gz"
test -s "${output_dir}/corpus.aln" ||
"${BIN}/atools" -i "${dir}/align.s2t" -j "${dir}/align.t2s" -c grow-diag-final-and >"${output_dir}/corpus.aln"

Expand Down
17 changes: 16 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,17 @@
langcodes==3.3.0
mtdata==0.4.0
mtdata==0.4.0
sacrebleu==2.0.0
fasttext==0.9.2
regex==2019.8.19
sacremoses==0.1.0
langcodes==3.3.0
opusfilter==3.0.0rc2
git+https://github.com/hplt-project/opustrainer.git#egg=opustrainer
unbabel-comet==2.2.2
eflomal==2.0.0
torch>2.2.1
torchvision>0.15.2
torchaudio>0.13.0
transformers==4.38.2
sentencepiece==0.1.99
accelerate==0.31.0

0 comments on commit 9a285b9

Please sign in to comment.