From 542449f2232926de0cd57e141f7286604f8eb080 Mon Sep 17 00:00:00 2001 From: lannliat Date: Thu, 29 Dec 2022 06:23:27 +0000 Subject: [PATCH] Add yield tuple write mode --- opustools_pkg/opustools/formatting.py | 10 +++++++++- opustools_pkg/opustools/opus_read.py | 12 ++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/opustools_pkg/opustools/formatting.py b/opustools_pkg/opustools/formatting.py index 1391cb6..c7868c1 100644 --- a/opustools_pkg/opustools/formatting.py +++ b/opustools_pkg/opustools/formatting.py @@ -244,7 +244,12 @@ def links_print_id(*args): str_link = '\n'.format(' '.join( ['{}="{}"'.format(k, v) for k, v in args[5].items()])) print(str_link, end='') - write_id_line(args[5], args[6], args[7], args[8]) + write_id_line(args[5], args[6], args[7], args[8]) + def yield_tuple(*args): + src = args[0].rstrip('\n').replace('\n', ' ') + tgt = args[1].rstrip('\n').replace('\n', ' ') + return src, tgt + def nothing(*args): pass @@ -279,6 +284,8 @@ def nothing(*args): return links_write if wmode == 'links'and not write: return links_print + if wmode == "yield_tuple": + return yield_tuple return nothing def sentence_format_type(wmode, fromto): @@ -319,6 +326,7 @@ def moses(sentences, ids): format_fs = {'normal': (normal_src, normal_trg), 'tmx': (tmx_src, tmx_trg), 'moses': (moses, moses), + 'yield_tuple': (moses, moses), 'links': (None, None)} return format_fs[wmode] diff --git a/opustools_pkg/opustools/opus_read.py b/opustools_pkg/opustools/opus_read.py index 42fdc73..dfda16a 100644 --- a/opustools_pkg/opustools/opus_read.py +++ b/opustools_pkg/opustools/opus_read.py @@ -84,8 +84,8 @@ def __init__(self, directory=None, source=None, target=None, N -- Skip all doucments that match the regex chunk_size -- Number of sentence pairs in chunks to be processed (default 1000000) verbose -- Print progress messages - """ + """ self.fromto = sorted([source, target]) fromto_copy = [source, target] self.switch_langs = fromto_copy != self.fromto @@ -281,8 +281,13 @@ def printPairs(self): continue link_attr = attrs_list[i] if i < len(attrs_list) else None - - self.out_put_pair(src_result, trg_result, self.resultfile, + + if self.write_mode == "yield_tuple": + yield self.out_put_pair(src_result, trg_result, self.resultfile, + self.mosessrc, self.mosestrg, link_attr, self.id_file, + src_doc_name, trg_doc_name) + else: + self.out_put_pair(src_result, trg_result, self.resultfile, self.mosessrc, self.mosestrg, link_attr, self.id_file, src_doc_name, trg_doc_name) @@ -317,4 +322,3 @@ def printPairs(self): self.id_file.close() self.of_handler.close_zipfiles() -