Feature/Multiprocessing (#20)
* Enable Language.pipe() execution

* Refactor tests

* Bump version
asajatovic authored May 9, 2020
1 parent ed5288a commit 3610350
Showing 6 changed files with 120 additions and 70 deletions.
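
In short, `Language.pipe()` can now fan texts out across multiple worker processes. A minimal usage sketch of the new API (assuming the `en` model has already been downloaded; the sample texts are placeholders):

    import spacy_udpipe

    spacy_udpipe.download("en")  # one-time model download
    nlp = spacy_udpipe.load("en")

    texts = ["This is a sentence.", "This is another one."]
    # n_process=-1 uses all available CPU cores (see Language.pipe below)
    docs = list(nlp.pipe(texts, n_process=-1))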
setup.py: 1 addition & 1 deletion

@@ -26,7 +26,7 @@

 setuptools.setup(
     name="spacy_udpipe",
-    version="0.2.1",
+    version="0.3.0",
     description="Use fast UDPipe models directly in spaCy",
     long_description=readme,
     long_description_content_type="text/markdown",
spacy_udpipe/language.py: 10 additions & 6 deletions

@@ -1,3 +1,4 @@
+import multiprocessing as mp
 import re
 from typing import Dict, Iterable, List, Optional, Tuple, Union

@@ -8,7 +9,7 @@
 from spacy.vocab import Vocab
 from ufal.udpipe import Sentence, Word
 
-from .udpipe import UDPipeModel
+from .udpipe import NO_SPACE, UDPipeModel
 from .utils import get_defaults


@@ -80,7 +81,7 @@ def __call__(
text = ""
for token in tokens:
text += token.form
if "SpaceAfter=No" not in token.misc:
if NO_SPACE not in token.misc:
text += " "
for i, token in enumerate(tokens):
span = text[offset:]
@@ -99,7 +100,7 @@ def __call__(
             lemmas.append(self.vocab.strings.add(token.lemma or ""))
             offset += len(token.form)
             span = text[offset:]
-            if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
+            if i == len(tokens) - 1 or NO_SPACE in token.misc:
                 spaces.append(False)
             elif not is_aligned:
                 spaces.append(True)
@@ -143,15 +144,18 @@ def pipe(
             Iterable[str],
             Iterable[List[str]],
             Iterable[List[List[str]]]
-        ]
+        ],
+        n_process: Optional[int] = 1
     ) -> Iterable[Doc]:
         """Tokenize a stream of texts.
         texts: A sequence of unicode texts (raw, presegmented or pretokenized).
+        n_process: Number of processes to use.
         YIELDS: A sequence of Doc objects, in order.
         """
-        for text in texts:
-            yield self(text)
+        n_process = mp.cpu_count() if n_process == -1 else n_process
+        with mp.Pool(processes=n_process) as pool:
+            return pool.map(self.__call__, texts)
 
     def _get_tokens_with_heads(
         self,
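`Pool.map` pickles the bound `self.__call__` along with each text, farms the calls out to worker processes, and returns the parsed `Doc`s as a fully materialized list in input order, so the result is now eager rather than lazily yielded. A standalone sketch of the same pattern, with a stand-in `parse` function in place of the real `__call__` (illustration only):

    import multiprocessing as mp

    def parse(text: str) -> str:
        # stand-in for the pipeline's __call__; must be picklable
        return text.upper()

    if __name__ == "__main__":
        texts = ["one", "two", "three"]
        with mp.Pool(processes=2) as pool:
            print(pool.map(parse, texts))  # ['ONE', 'TWO', 'THREE']

This is also why `udpipe.py` below gains a picklable `Model` wrapper: everything reachable from `self.__call__` has to survive the round-trip to the worker.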
spacy_udpipe/udpipe.py: 46 additions & 7 deletions

@@ -1,11 +1,14 @@
 import re
 from typing import Dict, List, Optional, Union
 
-from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError,
-                         Sentence, Word)
+from ufal.udpipe import InputFormat
+from ufal.udpipe import Model as _Model
+from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word
 
 from .utils import get_path
 
+NO_SPACE = "SpaceAfter=No"
+
 
 class PretokenizedInputFormat(object):
     """Dummy tokenizer for pretokenized input.
@@ -14,7 +17,6 @@ class PretokenizedInputFormat(object):
     due to pure Python implementation. Mocks InputFormat API to enable
     plug-and-play behaviour.
     """
-    NO_SPACE = "SpaceAfter=No"
 
     def setText(self, text: str) -> None:
         """Store text in iterable lines for tokenization.
@@ -40,11 +42,50 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
             if re.match(r"\W", token):
                 # leave no space after previous token iff current token
                 # is non-alphanumeric (i.e. punctuation)
-                prev_word.misc = self.NO_SPACE
+                prev_word.misc = NO_SPACE
             prev_word = word
         return True
 
 
+class Model(_Model):
+    """Model wrapper with pickling support, enabling multiprocess execution."""
+
+    def __init__(self, path: str):
+        self.path = path
+        self._model = super().load(path)
+        if self._model is None:
+            raise Exception(f"Cannot load UDPipe model from file '{path}'")
+
+    def __reduce__(self):
+        # pickle support
+        return (self.__class__, (self.path,))
+
+    @property
+    def DEFAULT(self) -> InputFormat:
+        return self._model.DEFAULT
+
+    @property
+    def TOKENIZER_NORMALIZED_SPACES(self) -> InputFormat:
+        return self._model.TOKENIZER_NORMALIZED_SPACES
+
+    @property
+    def TOKENIZER_PRESEGMENTED(self) -> InputFormat:
+        return self._model.TOKENIZER_PRESEGMENTED
+
+    @property
+    def TOKENIZER_RANGES(self) -> InputFormat:
+        return self._model.TOKENIZER_RANGES
+
+    def newTokenizer(self, input_format: InputFormat) -> None:
+        return self._model.newTokenizer(input_format)
+
+    def parse(self, sentence: Sentence, input_format: InputFormat) -> None:
+        return self._model.parse(sentence, input_format)
+
+    def tag(self, sentence: Sentence, input_format: InputFormat) -> None:
+        return self._model.tag(sentence, input_format)
+
+
 class UDPipeModel(object):
 
     def __init__(
@@ -60,9 +101,7 @@ def __init__(
         meta: Meta-information about the UDPipe model.
         """
         path = path or get_path(lang=lang)
-        self.model = Model.load(path)
-        if self.model is None:
-            raise Exception(f"Cannot load UDPipe model from file '{path}'")
+        self.model = Model(path)
         self._lang = lang.split("-")[0]
         self._meta = meta or {"author": "Milan Straka & Jana Straková",
                               "description": "UDPipe pretrained model.",
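Because `__reduce__` returns `(self.__class__, (self.path,))`, unpickling in a worker process simply calls `Model(path)` again and re-loads the model file from disk, instead of trying to serialize the underlying native UDPipe object. A minimal round-trip sketch (assuming the `en` model has already been downloaded; illustration only):

    import pickle

    from spacy_udpipe import UDPipeModel

    model = UDPipeModel(lang="en")
    clone = pickle.loads(pickle.dumps(model))  # what each worker effectively does
    assert clone.model.path == model.model.path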
tests/test_morph_exception.py: 33 deletions

This file was deleted.

tests/test_serialization.py: 23 deletions

This file was deleted.

tests/test_spacy_udpipe.py: 63 additions (new file)

@@ -0,0 +1,63 @@
+import tempfile
+
+import pytest
+import spacy
+
+from spacy_udpipe import UDPipeModel, download, load
+
+EN = "en"
+RO = "ro"
+SPACY_VERSION = "2.2.4"
+
+
+@pytest.fixture
+def lang() -> str:
+    return EN
+
+
+@pytest.fixture(autouse=True)
+def download_lang(lang: str) -> None:
+    download(lang=lang)
+
+
+def test_serialization(lang: str) -> None:
+    with tempfile.TemporaryDirectory() as tdir:
+        nlp = load(lang=lang)
+        nlp.to_disk(tdir)
+
+        udpipe_model = UDPipeModel(lang=lang)
+        nlp = spacy.load(tdir, udpipe_model=udpipe_model)
+
+
+def test_pipe(lang: str) -> None:
+    nlp = load(lang=lang)
+    assert nlp._meta["lang"] == f"udpipe_{lang}"
+
text = "spacy-udpipe package now support multiprocess execution."
+    doc = nlp(text)
+
+    texts = [text for _ in range(10)]
+    docs = list(nlp.pipe(texts, n_process=-1))
+
+    assert len(docs) == len(texts)
+    assert docs[0].to_json() == doc.to_json()
+
+
+def test_morph_exception() -> None:
+    assert spacy.__version__ <= SPACY_VERSION
+
+    lang = RO
+    text = "Ce mai faci?"
+
+    download(lang=lang)
+
+    try:
+        nlp = load(lang=lang)
+        assert nlp._meta["lang"] == f"udpipe_{lang}"
+        doc = nlp(text)
+    except ValueError:
+        nlp = load(lang=lang, ignore_tag_map=True)
+        assert nlp._meta["lang"] == f"udpipe_{lang}"
+        doc = nlp(text)
+
+    assert doc
