Feature/Multiprocessing (#20)
* Enable Language.pipe() execution

* Refactor tests

* Bump version
asajatovic authored May 9, 2020
1 parent ed5288a commit 3610350
Showing 6 changed files with 120 additions and 70 deletions.
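
In short, `Language.pipe()` can now fan texts out across multiple worker processes. A minimal usage sketch of the new API (assuming the `en` model has already been downloaded; the sample texts are placeholders):

    import spacy_udpipe

    spacy_udpipe.download("en")  # one-time model download
    nlp = spacy_udpipe.load("en")

    texts = ["This is a sentence.", "This is another one."]
    # n_process=-1 uses all available CPU cores (see Language.pipe below)
    docs = list(nlp.pipe(texts, n_process=-1))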
setup.py: 1 addition & 1 deletion

@@ -26,7 +26,7 @@

 setuptools.setup(
     name="spacy_udpipe",
-    version="0.2.1",
+    version="0.3.0",
     description="Use fast UDPipe models directly in spaCy",
     long_description=readme,
     long_description_content_type="text/markdown",
spacy_udpipe/language.py: 10 additions & 6 deletions

@@ -1,3 +1,4 @@
+import multiprocessing as mp
 import re
 from typing import Dict, Iterable, List, Optional, Tuple, Union

@@ -8,7 +9,7 @@
 from spacy.vocab import Vocab
 from ufal.udpipe import Sentence, Word
 
-from .udpipe import UDPipeModel
+from .udpipe import NO_SPACE, UDPipeModel
 from .utils import get_defaults


@@ -80,7 +81,7 @@ def __call__(
text = ""
for token in tokens:
text += token.form
if "SpaceAfter=No" not in token.misc:
if NO_SPACE not in token.misc:
text += " "
for i, token in enumerate(tokens):
span = text[offset:]
@@ -99,7 +100,7 @@ def __call__(
             lemmas.append(self.vocab.strings.add(token.lemma or ""))
             offset += len(token.form)
             span = text[offset:]
-            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
+            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                 spaces.append(False)
             elif not is_aligned:
                 spaces.append(True)
@@ -143,15 +144,18 @@ def pipe(
             Iterable[str],
             Iterable[List[str]],
             Iterable[List[List[str]]]
-        ]
+        ],
+        n_process: Optional[int] = 1
     ) -> Iterable[Doc]:
         """Tokenize a stream of texts.
         texts: A sequence of unicode texts (raw, presegmented or pretokenized).
+        n_process: Number of processes to use.
         YIELDS: A sequence of Doc objects, in order.
         """
-        for text in texts:
-            yield self(text)
+        n_process = mp.cpu_count() if n_process == -1 else n_process
+        with mp.Pool(processes=n_process) as pool:
+            return pool.map(self.__call__, texts)
 
     def _get_tokens_with_heads(
         self,
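`Pool.map` pickles the bound `self.__call__` along with each text, farms the calls out to worker processes, and returns the parsed `Doc`s as a fully materialized list in input order, so the result is now eager rather than lazily yielded. A standalone sketch of the same pattern, with a stand-in `parse` function in place of the real `__call__` (illustration only):

    import multiprocessing as mp

    def parse(text: str) -> str:
        # stand-in for the pipeline's __call__; must be picklable
        return text.upper()

    if __name__ == "__main__":
        texts = ["one", "two", "three"]
        with mp.Pool(processes=2) as pool:
            print(pool.map(parse, texts))  # ['ONE', 'TWO', 'THREE']

This is also why `udpipe.py` below gains a picklable `Model` wrapper: everything reachable from `self.__call__` has to survive the round-trip to the worker.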
spacy_udpipe/udpipe.py: 46 additions & 7 deletions

@@ -1,11 +1,14 @@
 import re
 from typing import Dict, List, Optional, Union
 
-from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError,
-                         Sentence, Word)
+from ufal.udpipe import InputFormat
+from ufal.udpipe import Model as _Model
+from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word
 
 from .utils import get_path
 
+NO_SPACE = "SpaceAfter=No"
+
 
 class PretokenizedInputFormat(object):
     """Dummy tokenizer for pretokenized input.
@@ -14,7 +17,6 @@ class PretokenizedInputFormat(object):
     due to pure Python implementation. Mocks InputFormat API to enable
     plug-and-play behaviour.
     """
-    NO_SPACE = "SpaceAfter=No"
 
     def setText(self, text: str) -> None:
         """Store text in iterable lines for tokenization.
@@ -40,11 +42,50 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
             if re.match(r"\W", token):
                 # leave no space after previous token iff current token
                 # is non-alphanumeric (i.e. punctuation)
-                prev_word.misc = self.NO_SPACE
+                prev_word.misc = NO_SPACE
             prev_word = word
         return True
 
 
+class Model(_Model):
+    """Model wrapper with pickling support, enabling multiprocess execution."""
+
+    def __init__(self, path: str):
+        self.path = path
+        self._model = super().load(path)
+        if self._model is None:
+            raise Exception(f"Cannot load UDPipe model from file '{path}'")
+
+    def __reduce__(self):
+        # pickle support
+        return (self.__class__, (self.path,))
+
+    @property
+    def DEFAULT(self) -> InputFormat:
+        return self._model.DEFAULT
+
+    @property
+    def TOKENIZER_NORMALIZED_SPACES(self) -> InputFormat:
+        return self._model.TOKENIZER_NORMALIZED_SPACES
+
+    @property
+    def TOKENIZER_PRESEGMENTED(self) -> InputFormat:
+        return self._model.TOKENIZER_PRESEGMENTED
+
+    @property
+    def TOKENIZER_RANGES(self) -> InputFormat:
+        return self._model.TOKENIZER_RANGES
+
+    def newTokenizer(self, input_format: InputFormat) -> None:
+        return self._model.newTokenizer(input_format)
+
+    def parse(self, sentence: Sentence, input_format: InputFormat) -> None:
+        return self._model.parse(sentence, input_format)
+
+    def tag(self, sentence: Sentence, input_format: InputFormat) -> None:
+        return self._model.tag(sentence, input_format)
+
+
 class UDPipeModel(object):
 
     def __init__(
@@ -60,9 +101,7 @@ def __init__(
         meta: Meta-information about the UDPipe model.
         """
         path = path or get_path(lang=lang)
-        self.model = Model.load(path)
-        if self.model is None:
-            raise Exception(f"Cannot load UDPipe model from file '{path}'")
+        self.model = Model(path)
         self._lang = lang.split("-")[0]
         self._meta = meta or {"author": "Milan Straka & Jana Straková",
                               "description": "UDPipe pretrained model.",
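Because `__reduce__` returns `(self.__class__, (self.path,))`, unpickling in a worker process simply calls `Model(path)` again and re-loads the model file from disk, instead of trying to serialize the underlying native UDPipe object. A minimal round-trip sketch (assuming the `en` model has already been downloaded; illustration only):

    import pickle

    from spacy_udpipe import UDPipeModel

    model = UDPipeModel(lang="en")
    clone = pickle.loads(pickle.dumps(model))  # what each worker effectively does
    assert clone.model.path == model.model.path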
tests/test_morph_exception.py: 33 deletions

This file was deleted.

tests/test_serialization.py: 23 deletions

This file was deleted.

tests/test_spacy_udpipe.py: 63 additions (new file)

@@ -0,0 +1,63 @@
+import tempfile
+
+import pytest
+import spacy
+
+from spacy_udpipe import UDPipeModel, download, load
+
+EN = "en"
+RO = "ro"
+SPACY_VERSION = "2.2.4"
+
+
+@pytest.fixture
+def lang() -> str:
+    return EN
+
+
+@pytest.fixture(autouse=True)
+def download_lang(lang: str) -> None:
+    download(lang=lang)
+
+
+def test_serialization(lang: str) -> None:
+    with tempfile.TemporaryDirectory() as tdir:
+        nlp = load(lang=lang)
+        nlp.to_disk(tdir)
+
+        udpipe_model = UDPipeModel(lang=lang)
+        nlp = spacy.load(tdir, udpipe_model=udpipe_model)
+
+
+def test_pipe(lang: str) -> None:
+    nlp = load(lang=lang)
+    assert nlp._meta["lang"] == f"udpipe_{lang}"
+
text = "spacy-udpipe package now support multiprocess execution."
+    doc = nlp(text)
+
+    texts = [text for _ in range(10)]
+    docs = list(nlp.pipe(texts, n_process=-1))
+
+    assert len(docs) == len(texts)
+    assert docs[0].to_json() == doc.to_json()
+
+
+def test_morph_exception() -> None:
+    assert spacy.__version__ <= SPACY_VERSION
+
+    lang = RO
+    text = "Ce mai faci?"
+
+    download(lang=lang)
+
+    try:
+        nlp = load(lang=lang)
+        assert nlp._meta["lang"] == f"udpipe_{lang}"
+        doc = nlp(text)
+    except ValueError:
+        nlp = load(lang=lang, ignore_tag_map=True)
+        assert nlp._meta["lang"] == f"udpipe_{lang}"
+        doc = nlp(text)
+
+    assert doc
