From 36103501c4bc9b4461a87a44571b389f97e0ec38 Mon Sep 17 00:00:00 2001 From: asajatovic Date: Sat, 9 May 2020 22:12:30 +0200 Subject: [PATCH] Feature/Multiprocessing (#20) * Enable Language.pipe() execution * Refactor tests * Bump version --- setup.py | 2 +- spacy_udpipe/language.py | 16 +++++---- spacy_udpipe/udpipe.py | 53 +++++++++++++++++++++++++---- tests/test_morph_exception.py | 33 ------------------ tests/test_serialization.py | 23 ------------- tests/test_spacy_udpipe.py | 63 +++++++++++++++++++++++++++++++++++ 6 files changed, 120 insertions(+), 70 deletions(-) delete mode 100644 tests/test_morph_exception.py delete mode 100644 tests/test_serialization.py create mode 100644 tests/test_spacy_udpipe.py diff --git a/setup.py b/setup.py index 0c6bd02..951507f 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="spacy_udpipe", - version="0.2.1", + version="0.3.0", description="Use fast UDPipe models directly in spaCy", long_description=readme, long_description_content_type="text/markdown", diff --git a/spacy_udpipe/language.py b/spacy_udpipe/language.py index cb15aef..febd67e 100644 --- a/spacy_udpipe/language.py +++ b/spacy_udpipe/language.py @@ -1,3 +1,4 @@ +import multiprocessing as mp import re from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -8,7 +9,7 @@ from spacy.vocab import Vocab from ufal.udpipe import Sentence, Word -from .udpipe import UDPipeModel +from .udpipe import NO_SPACE, UDPipeModel from .utils import get_defaults @@ -80,7 +81,7 @@ def __call__( text = "" for token in tokens: text += token.form - if "SpaceAfter=No" not in token.misc: + if NO_SPACE not in token.misc: text += " " for i, token in enumerate(tokens): span = text[offset:] @@ -99,7 +100,7 @@ def __call__( lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.form) span = text[offset:] - if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc: + if i == len(tokens) - 1 or NO_SPACE in token.misc: spaces.append(False) 
elif not is_aligned: spaces.append(True) @@ -143,15 +144,18 @@ def pipe( Iterable[str], Iterable[List[str]], Iterable[List[List[str]]] - ] + ], + n_process: Optional[int] = 1 ) -> Iterable[Doc]: """Tokenize a stream of texts. texts: A sequence of unicode texts (raw, presegmented or pretokenized). + n_process: Number of processes to use. YIELDS: A sequence of Doc objects, in order. """ - for text in texts: - yield self(text) + n_process = mp.cpu_count() if n_process == -1 else n_process + with mp.Pool(processes=n_process) as pool: + return pool.map(self.__call__, texts) def _get_tokens_with_heads( self, diff --git a/spacy_udpipe/udpipe.py b/spacy_udpipe/udpipe.py index 3f62285..13dd1eb 100644 --- a/spacy_udpipe/udpipe.py +++ b/spacy_udpipe/udpipe.py @@ -1,11 +1,14 @@ import re from typing import Dict, List, Optional, Union -from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError, - Sentence, Word) +from ufal.udpipe import InputFormat +from ufal.udpipe import Model as _Model +from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word from .utils import get_path +NO_SPACE = "SpaceAfter=No" + class PretokenizedInputFormat(object): """Dummy tokenizer for pretokenized input. @@ -14,7 +17,6 @@ class PretokenizedInputFormat(object): due to pure Python implementation. Mocks InputFormat API to enable plug-and-play behaviour. """ - NO_SPACE = "SpaceAfter=No" def setText(self, text: str) -> None: """Store text in iterable lines for tokenization. @@ -40,11 +42,50 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool: if re.match(r"\W", token): # leave no space after previous token iff current token # is non-alphanumeric (i.e. 
punctuation) - prev_word.misc = self.NO_SPACE + prev_word.misc = NO_SPACE prev_word = word return True +class Model(_Model): + """Model wrapper with pickling support, enabling multiprocess execution.""" + + def __init__(self, path: str): + self.path = path + self._model = super().load(path) + if self._model is None: + raise Exception(f"Cannot load UDPipe model from file '{path}'") + + def __reduce__(self): + # pickle support + return (self.__class__, (self.path,)) + + @property + def DEFAULT(self) -> InputFormat: + return self._model.DEFAULT + + @property + def TOKENIZER_NORMALIZED_SPACES(self) -> InputFormat: + return self._model.TOKENIZER_NORMALIZED_SPACES + + @property + def TOKENIZER_PRESEGMENTED(self) -> InputFormat: + return self._model.TOKENIZER_PRESEGMENTED + + @property + def TOKENIZER_RANGES(self) -> InputFormat: + return self._model.TOKENIZER_RANGES + + def newTokenizer(self, input_format: InputFormat) -> None: + return self._model.newTokenizer(input_format) + + def parse(self, sentence: Sentence, input_format: InputFormat) -> None: + return self._model.parse(sentence, input_format) + + def tag(self, sentence: Sentence, input_format: InputFormat) -> None: + return self._model.tag(sentence, input_format) + + class UDPipeModel(object): def __init__( @@ -60,9 +101,7 @@ def __init__( meta: Meta-information about the UDPipe model. 
""" path = path or get_path(lang=lang) - self.model = Model.load(path) - if self.model is None: - raise Exception(f"Cannot load UDPipe model from file '{path}'") + self.model = Model(path) self._lang = lang.split("-")[0] self._meta = meta or {"author": "Milan Straka & Jana Straková", "description": "UDPipe pretrained model.", diff --git a/tests/test_morph_exception.py b/tests/test_morph_exception.py deleted file mode 100644 index cc83018..0000000 --- a/tests/test_morph_exception.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest -import spacy -from spacy_udpipe import download, load - -RO = "ro" -SPACY_VERSION = "2.2.4" - - -@pytest.fixture -def lang() -> str: - return RO - - -@pytest.fixture(autouse=True) -def download_lang(lang: str) -> None: - download(lang) - - -def test_morph_exception_ro(lang: str) -> None: - assert spacy.__version__ <= SPACY_VERSION - - text = "Ce mai faci?" - - try: - nlp = load(lang=lang) - assert nlp._meta["lang"] == f"udpipe_{lang}" - doc = nlp(text) - except ValueError: - nlp = load(lang=lang, ignore_tag_map=True) - assert nlp._meta["lang"] == f"udpipe_{lang}" - doc = nlp(text) - - assert doc diff --git a/tests/test_serialization.py b/tests/test_serialization.py deleted file mode 100644 index 350bf70..0000000 --- a/tests/test_serialization.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import spacy -from spacy_udpipe import UDPipeModel, download, load - -EN = "en" - - -@pytest.fixture -def lang() -> str: - return EN - - -@pytest.fixture(autouse=True) -def download_lang(lang: str) -> None: - download(lang=lang) - - -def test_serialization(lang: str) -> None: - nlp = load(lang=lang) - nlp.to_disk("./udpipe-spacy-model") - - udpipe_model = UDPipeModel(lang=lang) - nlp = spacy.load("./udpipe-spacy-model", udpipe_model=udpipe_model) diff --git a/tests/test_spacy_udpipe.py b/tests/test_spacy_udpipe.py new file mode 100644 index 0000000..d1acbdd --- /dev/null +++ b/tests/test_spacy_udpipe.py @@ -0,0 +1,63 @@ +import tempfile + +import pytest 
+import spacy + +from spacy_udpipe import UDPipeModel, download, load + +EN = "en" +RO = "ro" +SPACY_VERSION = "2.2.4" + + +@pytest.fixture +def lang() -> str: + return EN + + +@pytest.fixture(autouse=True) +def download_lang(lang: str) -> None: + download(lang=lang) + + +def test_serialization(lang: str) -> None: + with tempfile.TemporaryDirectory() as tdir: + nlp = load(lang=lang) + nlp.to_disk(tdir) + + udpipe_model = UDPipeModel(lang=lang) + nlp = spacy.load(tdir, udpipe_model=udpipe_model) + + +def test_pipe(lang: str) -> None: + nlp = load(lang=lang) + assert nlp._meta["lang"] == f"udpipe_{lang}" + + text = "spacy-udpipe package now supports multiprocess execution." + doc = nlp(text) + + texts = [text for _ in range(10)] + docs = list(nlp.pipe(texts, n_process=-1)) + + assert len(docs) == len(texts) + assert docs[0].to_json() == doc.to_json() + + +def test_morph_exception() -> None: + assert spacy.__version__ <= SPACY_VERSION + + lang = RO + text = "Ce mai faci?" + + download(lang=lang) + + try: + nlp = load(lang=lang) + assert nlp._meta["lang"] == f"udpipe_{lang}" + doc = nlp(text) + except ValueError: + nlp = load(lang=lang, ignore_tag_map=True) + assert nlp._meta["lang"] == f"udpipe_{lang}" + doc = nlp(text) + + assert doc