From 36103501c4bc9b4461a87a44571b389f97e0ec38 Mon Sep 17 00:00:00 2001 From: asajatovic Date: Sat, 9 May 2020 22:12:30 +0200 Subject: [PATCH] Feature/Multiprocessing (#20) * Enable Language.pipe() execution * Refactor tests * Bump version --- setup.py | 2 +- spacy_udpipe/language.py | 16 +++++---- spacy_udpipe/udpipe.py | 53 +++++++++++++++++++++++++---- tests/test_morph_exception.py | 33 ------------------ tests/test_serialization.py | 23 ------------- tests/test_spacy_udpipe.py | 63 +++++++++++++++++++++++++++++++++++ 6 files changed, 120 insertions(+), 70 deletions(-) delete mode 100644 tests/test_morph_exception.py delete mode 100644 tests/test_serialization.py create mode 100644 tests/test_spacy_udpipe.py diff --git a/setup.py b/setup.py index 0c6bd02..951507f 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="spacy_udpipe", - version="0.2.1", + version="0.3.0", description="Use fast UDPipe models directly in spaCy", long_description=readme, long_description_content_type="text/markdown", diff --git a/spacy_udpipe/language.py b/spacy_udpipe/language.py index cb15aef..febd67e 100644 --- a/spacy_udpipe/language.py +++ b/spacy_udpipe/language.py @@ -1,3 +1,4 @@ +import multiprocessing as mp import re from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -8,7 +9,7 @@ from spacy.vocab import Vocab from ufal.udpipe import Sentence, Word -from .udpipe import UDPipeModel +from .udpipe import NO_SPACE, UDPipeModel from .utils import get_defaults @@ -80,7 +81,7 @@ def __call__( text = "" for token in tokens: text += token.form - if "SpaceAfter=No" not in token.misc: + if NO_SPACE not in token.misc: text += " " for i, token in enumerate(tokens): span = text[offset:] @@ -99,7 +100,7 @@ def __call__( lemmas.append(self.vocab.strings.add(token.lemma or "")) offset += len(token.form) span = text[offset:] - if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc: + if i == len(tokens) - 1 or NO_SPACE in token.misc: spaces.append(False) 
elif not is_aligned: spaces.append(True) @@ -143,15 +144,18 @@ def pipe( Iterable[str], Iterable[List[str]], Iterable[List[List[str]]] - ] + ], + n_process: Optional[int] = 1 ) -> Iterable[Doc]: """Tokenize a stream of texts. texts: A sequence of unicode texts (raw, presegmented or pretokenized). + n_process: Number of processes to use. YIELDS: A sequence of Doc objects, in order. """ - for text in texts: - yield self(text) + n_process = mp.cpu_count() if n_process == -1 else n_process + with mp.Pool(processes=n_process) as pool: + return pool.map(self.__call__, texts) def _get_tokens_with_heads( self, diff --git a/spacy_udpipe/udpipe.py b/spacy_udpipe/udpipe.py index 3f62285..13dd1eb 100644 --- a/spacy_udpipe/udpipe.py +++ b/spacy_udpipe/udpipe.py @@ -1,11 +1,14 @@ import re from typing import Dict, List, Optional, Union -from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError, - Sentence, Word) +from ufal.udpipe import InputFormat +from ufal.udpipe import Model as _Model +from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word from .utils import get_path +NO_SPACE = "SpaceAfter=No" + class PretokenizedInputFormat(object): """Dummy tokenizer for pretokenized input. @@ -14,7 +17,6 @@ class PretokenizedInputFormat(object): due to pure Python implementation. Mocks InputFormat API to enable plug-and-play behaviour. """ - NO_SPACE = "SpaceAfter=No" def setText(self, text: str) -> None: """Store text in iterable lines for tokenization. @@ -40,11 +42,50 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool: if re.match(r"\W", token): # leave no space after previous token iff current token # is non-alphanumeric (i.e. 
punctuation) - prev_word.misc = self.NO_SPACE + prev_word.misc = NO_SPACE prev_word = word return True +class Model(_Model): + """Model wrapper with pickling support, enabling multiprocess execution.""" + + def __init__(self, path: str): + self.path = path + self._model = super().load(path) + if self._model is None: + raise Exception(f"Cannot load UDPipe model from file '{path}'") + + def __reduce__(self): + # pickle support + return (self.__class__, (self.path,)) + + @property + def DEFAULT(self) -> InputFormat: + return self._model.DEFAULT + + @property + def TOKENIZER_NORMALIZED_SPACES(self) -> InputFormat: + return self._model.TOKENIZER_NORMALIZED_SPACES + + @property + def TOKENIZER_PRESEGMENTED(self) -> InputFormat: + return self._model.TOKENIZER_PRESEGMENTED + + @property + def TOKENIZER_RANGES(self) -> InputFormat: + return self._model.TOKENIZER_RANGES + + def newTokenizer(self, input_format: InputFormat) -> None: + return self._model.newTokenizer(input_format) + + def parse(self, sentence: Sentence, input_format: InputFormat) -> None: + return self._model.parse(sentence, input_format) + + def tag(self, sentence: Sentence, input_format: InputFormat) -> None: + return self._model.tag(sentence, input_format) + + class UDPipeModel(object): def __init__( @@ -60,9 +101,7 @@ def __init__( meta: Meta-information about the UDPipe model. 
""" path = path or get_path(lang=lang) - self.model = Model.load(path) - if self.model is None: - raise Exception(f"Cannot load UDPipe model from file '{path}'") + self.model = Model(path) self._lang = lang.split("-")[0] self._meta = meta or {"author": "Milan Straka & Jana Straková", "description": "UDPipe pretrained model.", diff --git a/tests/test_morph_exception.py b/tests/test_morph_exception.py deleted file mode 100644 index cc83018..0000000 --- a/tests/test_morph_exception.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest -import spacy -from spacy_udpipe import download, load - -RO = "ro" -SPACY_VERSION = "2.2.4" - - -@pytest.fixture -def lang() -> str: - return RO - - -@pytest.fixture(autouse=True) -def download_lang(lang: str) -> None: - download(lang) - - -def test_morph_exception_ro(lang: str) -> None: - assert spacy.__version__ <= SPACY_VERSION - - text = "Ce mai faci?" - - try: - nlp = load(lang=lang) - assert nlp._meta["lang"] == f"udpipe_{lang}" - doc = nlp(text) - except ValueError: - nlp = load(lang=lang, ignore_tag_map=True) - assert nlp._meta["lang"] == f"udpipe_{lang}" - doc = nlp(text) - - assert doc diff --git a/tests/test_serialization.py b/tests/test_serialization.py deleted file mode 100644 index 350bf70..0000000 --- a/tests/test_serialization.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import spacy -from spacy_udpipe import UDPipeModel, download, load - -EN = "en" - - -@pytest.fixture -def lang() -> str: - return EN - - -@pytest.fixture(autouse=True) -def download_lang(lang: str) -> None: - download(lang=lang) - - -def test_serialization(lang: str) -> None: - nlp = load(lang=lang) - nlp.to_disk("./udpipe-spacy-model") - - udpipe_model = UDPipeModel(lang=lang) - nlp = spacy.load("./udpipe-spacy-model", udpipe_model=udpipe_model) diff --git a/tests/test_spacy_udpipe.py b/tests/test_spacy_udpipe.py new file mode 100644 index 0000000..d1acbdd --- /dev/null +++ b/tests/test_spacy_udpipe.py @@ -0,0 +1,63 @@ +import tempfile + +import pytest 
+import spacy + +from spacy_udpipe import UDPipeModel, download, load + +EN = "en" +RO = "ro" +SPACY_VERSION = "2.2.4" + + +@pytest.fixture +def lang() -> str: + return EN + + +@pytest.fixture(autouse=True) +def download_lang(lang: str) -> None: + download(lang=lang) + + +def test_serialization(lang: str) -> None: + with tempfile.TemporaryDirectory() as tdir: + nlp = load(lang=lang) + nlp.to_disk(tdir) + + udpipe_model = UDPipeModel(lang=lang) + nlp = spacy.load(tdir, udpipe_model=udpipe_model) + + +def test_pipe(lang: str) -> None: + nlp = load(lang=lang) + assert nlp._meta["lang"] == f"udpipe_{lang}" + + text = "spacy-udpipe package now supports multiprocess execution." + doc = nlp(text) + + texts = [text for _ in range(10)] + docs = list(nlp.pipe(texts, n_process=-1)) + + assert len(docs) == len(texts) + assert docs[0].to_json() == doc.to_json() + + +def test_morph_exception() -> None: + assert spacy.__version__ <= SPACY_VERSION + + lang = RO + text = "Ce mai faci?" + + download(lang=lang) + + try: + nlp = load(lang=lang) + assert nlp._meta["lang"] == f"udpipe_{lang}" + doc = nlp(text) + except ValueError: + nlp = load(lang=lang, ignore_tag_map=True) + assert nlp._meta["lang"] == f"udpipe_{lang}" + doc = nlp(text) + + assert doc