Piped processors #14

Open
wants to merge 7 commits into base: master
7 changes: 7 additions & 0 deletions config-example/worker.cfg
@@ -4,3 +4,10 @@ RECASE_PORT = 9000 # comment out if you do not need a recaser
SOURCE_LANG = en
TARGET_LANG = de
THREADS = 4
#uncomment and specify the absolute path of the pre-processor models, or leave commented out if not needed
#SPLITTER_MODEL =
#TRUECASER_MODEL =
#uncomment to enable the punctuation normalizer
#NORMALIZER = TRUE
#uncomment to use the original Moses Perl tokenizer and detokenizer
#PERL_TOKENIZER = TRUE
13 changes: 13 additions & 0 deletions config-example/worker.processors.cfg
@@ -0,0 +1,13 @@
PORT = 7001
TRANSLATE_PORT = 8080
#RECASE_PORT = 9000 # comment out if you use a truecaser
SOURCE_LANG = en
TARGET_LANG = de
THREADS = 4
#comment out if no pre- or post-processors are needed
SPLITTER_MODEL =
TRUECASER_MODEL =
#comment out to disable normalizer
NORMALIZER = TRUE
#comment out to use the embedded python tokenizer
PERL_TOKENIZER = TRUE
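
For reference, a minimal sketch of how these settings could be forwarded to the extended MosesTranslator constructor introduced in the translate.py diff below. The read_cfg helper and the import path are assumptions for illustration only; the worker's real start-up code may read the file differently.

from tasks.translate import MosesTranslator   # import path assumed from the worker/src layout

def read_cfg(path):
    # toy KEY = VALUE parser; '#' starts a comment, so commented-out keys stay absent
    cfg = {}
    for line in open(path):
        line = line.split('#', 1)[0].strip()
        if '=' in line:
            key, value = line.split('=', 1)
            cfg[key.strip()] = value.strip()
    return cfg

cfg = read_cfg('config-example/worker.processors.cfg')
translator = MosesTranslator(
    translate_port=cfg['TRANSLATE_PORT'],
    recase_port=cfg.get('RECASE_PORT'),          # None or empty -> no recaser
    source_lang=cfg['SOURCE_LANG'],
    target_lang=cfg['TARGET_LANG'],
    threads=int(cfg['THREADS']),
    truecaser_model=cfg.get('TRUECASER_MODEL'),  # absolute path of the truecaser model
    splitter_model=cfg.get('SPLITTER_MODEL'),    # absolute path of the compound-splitter model
    perl_tokenizer=cfg.get('PERL_TOKENIZER'),    # "TRUE" -> Moses Perl (de)tokenizer
    normalizer=cfg.get('NORMALIZER'))            # "TRUE" -> punctuation normalizer
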
104 changes: 81 additions & 23 deletions worker/src/tasks/translate.py
@@ -5,11 +5,19 @@
import xmlrpclib
import operator
import os
import logging as logger
from util.parallel import parallel_map
from util.tokenize import Tokenizer
from util.detokenize import Detokenizer
from util.split_sentences import SentenceSplitter

from util.preprocessor import Tokenizer as PerlTokenizer
from util.preprocessor import Detokenizer as PerlDetokenizer
from util.preprocessor import Truecaser
from util.preprocessor import Detruecaser
from util.preprocessor import Normalizer
from util.preprocessor import CompoundSplitter

class Translator(object):
"""Base class for all classes that handle the 'translate' task for MTMonkeyWorkers"""

@@ -62,10 +70,20 @@ def process_task(self, task):

class MosesTranslator(Translator):
"""Handles the 'translate' task for MTMonkeyWorkers using Moses XML-RPC servers
and built-in segmentation, tokenization, and detokenization.
and built-in segmentation, tokenization, and detokenization.
@ivar translate_proxy_addr: proxy address for the translation server
@ivar recase_proxy_addr: proxy address for the recaser-decoder server
@ivar splitter: the sentence splitter class to be used along all threads
@type splitter: SentenceSplitter
@ivar preprocessors: a list of pre-processing objects, each providing a
"process_string" method, applied before text is sent to the decoder
@ivar postprocessors: a list of post-processing objects applied after text
comes back from the decoder
@ivar threads: desired number of threads
@type threads: int
"""

def __init__(self, translate_port, recase_port, source_lang, target_lang, threads):
def __init__(self, translate_port, recase_port, source_lang, target_lang, threads, truecaser_model=None, splitter_model=None, perl_tokenizer=None, normalizer=None):
"""Initialize a MosesTranslator object according to the given
configuration settings.

@@ -77,16 +95,43 @@ def __init__(self, translate_port, recase_port, source_lang, target_lang, thread
# precompile XML-RPC Moses server addresses
self.translate_proxy_addr = "http://localhost:" + translate_port + "/RPC2"
self.recase_proxy_addr = None
if recase_port is not None:
if recase_port is not None and recase_port.strip() != "":
self.recase_proxy_addr = "http://localhost:" + recase_port + "/RPC2"

# initialize text processing tools (can be shared among threads)
self.splitter = SentenceSplitter({'language': source_lang})
self.tokenizer = Tokenizer({'lowercase': True,
# put sentence-level pre- and post-processors in two lists
# depending on whether they are enabled from the settings
self.preprocessors = []
self.postprocessors = []
if normalizer:
normalizer = Normalizer(source_lang)
self.preprocessors.append(normalizer)
if not perl_tokenizer:
tokenizer = Tokenizer({'lowercase': True,
'moses_escape': True})
self.detokenizer = Detokenizer({'moses_deescape': True,
self.preprocessors.append(tokenizer)
detokenizer = Detokenizer({'moses_deescape': True,
'capitalize_sents': True,
'language': target_lang})
'language': target_lang})
self.postprocessors.append(detokenizer)
else:
tokenizer = PerlTokenizer(source_lang)
self.preprocessors.append(tokenizer)
detokenizer = PerlDetokenizer(target_lang)
self.postprocessors.append(detokenizer)
if truecaser_model:
truecaser = Truecaser(source_lang, truecaser_model)
self.preprocessors.append(truecaser)
detruecaser = Detruecaser(target_lang)
self.postprocessors.append(detruecaser)
if splitter_model:
compound_splitter = CompoundSplitter(source_lang, splitter_model)
self.preprocessors.append(compound_splitter)

#post-processors run in the opposite order of the pre-processors
self.postprocessors.reverse()

self.threads = threads


@@ -106,10 +151,14 @@ def process_task(self, task):
src_lines = self.splitter.split_sentences(task['text']) if dosegment else [ task['text'] ]
ret_src_tok = doalign or len(src_lines) > 1

def _translator(line):
return self._translate(line, doalign, dodetok, nbestsize, ret_src_tok, dotok, dosegment)
#def _translator(line):
# return self._translate(line, doalign, dodetok, nbestsize, ret_src_tok, dotok, dosegment)
#
#translated = parallel_map(_translator, src_lines)
translated = []
for line in src_lines:
translated.append(self._translate(line, doalign, dodetok, nbestsize, ret_src_tok, dotok, dosegment))

translated = parallel_map(_translator, src_lines)

return {
'translationId': uuid.uuid4().hex,
@@ -132,12 +181,17 @@ def _translate(self, src, doalign, dodetok, nbestsize, ret_src_tok, dotok, doseg
if self.recase_proxy_addr is not None: # recasing only if there is a recaser set up
recase_proxy = xmlrpclib.ServerProxy(self.recase_proxy_addr)

# tokenize
src_tokenized = self.tokenizer.tokenize(src) if dotok else src

# preprocess
src_original = src
if dotok:
for preprocessor in self.preprocessors:
#logger.warning("Preprocessed source before {}: {}".format(preprocessor.__class__.__name__, src))
src = preprocessor.process_string(src)
#commented out as causing utf-8 errors
#logger.warning("Preprocessed source after {}: {}".format(preprocessor.__class__.__name__, src))
# translate
translation = translate_proxy.translate({
"text": src_tokenized,
"text": src,
"align": doalign,
"nbest": nbestsize,
"nbest-distinct": True,
@@ -149,33 +203,37 @@
for hypo in translation['nbest']:
# recase (if there is a recaser set up)
if recase_proxy is not None:
recased = recase_proxy.translate({"text": hypo['hyp']})['text'].strip()
postprocessed = recase_proxy.translate({"text": hypo['hyp']})['text'].strip()
else:
recased = hypo['hyp']
postprocessed = hypo['hyp']

# construct the output
parsed_hypo = {
'text': recased,
'text': postprocessed,
'text-unprocessed': hypo['hyp'],
'score': hypo['totalScore'],
'rank': rank,
}
if dodetok: # detokenize if needed
parsed_hypo['text'] = self.detokenizer.detokenize(recased)

if dodetok: # postprocess if needed
for postprocessor in self.postprocessors:
#logger.warning("Postprocessed output before {}: {}".format(postprocessor.__class__.__name__, postprocessed))
postprocessed = postprocessor.process_string(postprocessed)
#logger.warning("Postprocessed output after {}: {}".format(postprocessor.__class__.__name__, postprocessed))
parsed_hypo['text'] = postprocessed
if doalign: # provide alignment information if needed
parsed_hypo['tokenized'] = recased
parsed_hypo['alignment-raw'] = _add_tgt_end(hypo['align'], recased)
parsed_hypo['tokenized'] = postprocessed
parsed_hypo['alignment-raw'] = _add_tgt_end(hypo['align'], postprocessed)

rank += 1
hypos.append(parsed_hypo)

result = {
'src': src,
'src': src_original,
'translated': hypos,
}

if ret_src_tok:
result['src-tokenized'] = src_tokenized
result['src-tokenized'] = src

return result

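The net effect of this file is that tokenization, truecasing, normalization, and compound splitting are no longer hard-wired: each enabled tool is appended to self.preprocessors, its counterpart to self.postprocessors, and the post-processor list is reversed so that the last pre-processing step is undone first. A self-contained sketch of that piping pattern follows; the toy Tokenize/Detokenize and Lowercase/Capitalize classes stand in for the real util.preprocessor classes and only illustrate the process_string contract.

class Tokenize(object):
    """Toy stand-in: split punctuation off with a space."""
    def process_string(self, text):
        return text.replace(",", " ,").replace(".", " .")

class Detokenize(object):
    """Toy inverse of Tokenize."""
    def process_string(self, text):
        return text.replace(" ,", ",").replace(" .", ".")

class Lowercase(object):
    def process_string(self, text):
        return text.lower()

class Capitalize(object):
    def process_string(self, text):
        return text.capitalize()

preprocessors = [Tokenize(), Lowercase()]
postprocessors = [Detokenize(), Capitalize()]
postprocessors.reverse()   # post-processors run in the opposite order of the pre-processors

def pipe(processors, text):
    # run every processor over the running string, as _translate does above
    for processor in processors:
        text = processor.process_string(text)
    return text

tokenized = pipe(preprocessors, "Hello, world.")
print(tokenized)                        # hello , world .
print(pipe(postprocessors, tokenized))  # Hello, world.
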
4 changes: 4 additions & 0 deletions worker/src/util/basic-protected-patterns
@@ -0,0 +1,4 @@
<\/?\S+\/?>
<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
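
The four lines in this new file are regular expressions; they appear to be protected patterns for the tokenizer, i.e. spans matching them (inline XML tags and URLs) are kept intact instead of being split. A quick sanity check, assuming the file is read as one pattern per line (the path is the one added here; the exact usage is an assumption):

import re

with open("worker/src/util/basic-protected-patterns") as f:
    patterns = [re.compile(line.rstrip("\n")) for line in f if line.strip()]

# the first pattern keeps simple XML tags such as <br/> intact
print(bool(patterns[0].match("<br/>")))                          # True
# the last pattern is the generic RFC 3986 URI regex; group 2 is the scheme
print(patterns[3].match("http://example.com/x?y=1").group(2))    # http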