Patch summary (free text before the first "diff --git" header; patch tools
skip it):

  * phunspell/phunspell.py: comment out the functools.lru_cache decorator on
    Phunspell.dictionary_load and memoize loaded dictionaries in a new
    module-level dict, DICTIONARIES_LOADED, keyed by locale. Remove the
    logging import and the logging.debug calls. Extend the __main__ demo
    with a multi-locale lookup loop.
  * phunspell/tests/test_multi_load_cache.py (new): look up words in several
    locales through one shared Phunspell instance.
  * phunspell/tests/test_multi_load_no_cache.py (new): same lookups, but a
    fresh Phunspell instance is created per locale.

NOTE(review): this patch was recovered from a copy whose line breaks and
runs of whitespace had been collapsed. Line boundaries were restored so that
every hunk matches its "@@ -a,b +c,d @@" line counts; leading indentation
and intra-line spacing (including the whitespace-only change to the
'pspell = Phunspell()  # default "en_US"' line) are reconstructed from
Python syntax and may differ from the target tree. Verify against the tree,
or apply with whitespace-tolerant options, before relying on it.

diff --git a/phunspell/phunspell.py b/phunspell/phunspell.py
index cec9509..f5076fa 100644
--- a/phunspell/phunspell.py
+++ b/phunspell/phunspell.py
@@ -40,8 +40,8 @@
 import string
 import pickle
 import tempfile
-import logging
-import functools
+
+# import functools
 from spylls.hunspell import Dictionary
 
 TEMPDIR = tempfile.gettempdir()
@@ -142,6 +142,10 @@
     # "hyph_zu_ZA" : ["zu_ZA", "LANG"],
 }
 
+# memoize
+DICTIONARIES_LOADED = {}
+
+
 class PhunspellError(Exception):
     def __init__(self, message):
         Exception.__init__(self, "%s" % (message))
@@ -163,6 +167,8 @@ def __init__(self, loc_lang="en_US", loc_list=[], load_all=False):
         """
         try:
             if load_all:
+                # NOTE: meant to be run once to create/store local pickled dictionary
+                # objects
                 self.dictionary_loader(self.dictionaries_all())
             elif len(loc_list) > 0:
                 # if locales passed, load specified dictionaries on init
@@ -191,27 +197,30 @@ def dictionaries_all(self):
         """Load 'all' dictionaries"""
         return [n for n in DICTIONARIES.keys() if n.find('_') != -1]
 
-    @functools.lru_cache  # memoize
+    # TODO: debug
+    # @functools.lru_cache
     def dictionary_load(self, loc):
         """load stored dictionary object for locale"""
         try:
+            if loc in DICTIONARIES_LOADED:
+                self.dictionary = DICTIONARIES_LOADED[loc]
+                return
+
             datadir = os.getenv("PICKLED_DATADIR")
             if datadir:
                 filepath = os.path.join(datadir, loc)
             else:
                 filepath = os.path.join(TEMPDIR, loc)
 
-            logging.debug(f'Load dictionary from directory {filepath}')
-
             pfile = open(filepath, 'rb')
             stored_dic = pickle.load(pfile)
-            # self.dictionary = Dictionary.from_files(stored_dic)
             self.dictionary = stored_dic
             pfile.close()
 
-            logging.debug(f'Loaded dictionary from directory {filepath}')
+            # memoize
+            DICTIONARIES_LOADED[loc] = stored_dic
         except (TypeError, OSError) as error:
-            raise PhunspellError(f'Cannot load dictionary: {filepath} {error}')
+            raise PhunspellError(f'Cannot load dictionary: {error}')
 
     def dictionary_store(self, loc):
         """iterate locale dump dictionary to object
@@ -230,7 +239,6 @@ def dictionary_store(self, loc):
             else:
                 filepath = os.path.join(TEMPDIR, loc)
 
-            logging.debug(f'Store dictionary to directory {filepath}')
             # if os.path.exists(filepath):
             self.dict_dirpath(loc)
             pfile = open(filepath, 'wb')
@@ -255,7 +263,6 @@ def dictionary_loader(self, loc_list):
                 continue
 
             self.dictionary_store(loc)
-            # self.dictionary_load(loc)
 
     def find_dict_dirpath(self, dictdir, loc_lang):
         """find directory for dictionary `loc_lang`
@@ -398,11 +405,45 @@ def dictionaries(self):
     # import sys
     # sys.exit()
 
+    dicts_words = {
+        "af_ZA": "voortgewoed",
+        "an_ES": "vengar",
+        "be_BY": "ідалапаклонніцкі",
+        "bg_BG": "удържехме",
+        "br_FR": "c'huñvderioù",
+        "de_DE": "schilffrei",
+        "en_GB": "indict",
+        "es_MX": "pianista",
+        "fr_FR": "zoomorphe",
+    }
+    # use cache if already seen
+    dicts_words_cached = {
+        "an_ES": "vengar",
+        "be_BY": "ідалапаклонніцкі",
+        "bg_BG": "удържехме",
+    }
+
+    # 16.41s user 0.48s system 99% cpu 16.986 total
+    pspell = Phunspell()
+
+    for loc in dicts_words.keys():
+        # 36.08s user 0.65s system 99% cpu 36.788 total
+        # pspell = Phunspell(loc)
+        print(pspell.lookup(dicts_words[loc], locs=loc))
+
+    for loc in dicts_words_cached.keys():
+        # 36.08s user 0.65s system 99% cpu 36.788 total
+        # pspell = Phunspell(loc)
+        print(pspell.lookup(dicts_words[loc], locs=loc))
+
+    # import sys
+    # sys.exit()
+
     pspell = Phunspell(loc_lang="en_US")
     print(pspell.lookup_list("Wonder Woman 1984"))
     print(pspell.lookup_list(pspell.to_list("Wonder Woman 1984")))
 
-    pspell = Phunspell() # default "en_US"
+    pspell = Phunspell()  # default "en_US"
     # pspell = Phunspell("af_ZA")
     print(pspell.lookup("phunspell"))  # False
 
diff --git a/phunspell/tests/test_multi_load_cache.py b/phunspell/tests/test_multi_load_cache.py
new file mode 100644
index 0000000..566e90a
--- /dev/null
+++ b/phunspell/tests/test_multi_load_cache.py
@@ -0,0 +1,37 @@
+import phunspell
+import inspect
+import unittest
+
+dicts_words = {
+    "af_ZA": "voortgewoed",
+    "an_ES": "vengar",
+    "be_BY": "ідалапаклонніцкі",
+    "bg_BG": "удържехме",
+    "br_FR": "c'huñvderioù",
+    "de_DE": "schilffrei",
+    "en_GB": "indict",
+    "es_MX": "pianista",
+    "fr_FR": "zoomorphe",
+}
+
+# use cache if already seen
+dicts_words_cached = {
+    "an_ES": "vengar",
+    "be_BY": "ідалапаклонніцкі",
+    "bg_BG": "удържехме",
+}
+
+
+class TestMultiLoadCache(unittest.TestCase):
+    pspell = phunspell.Phunspell()
+
+    def test_multi_load_cache(self):
+        for loc in dicts_words.keys():
+            self.assertTrue(self.pspell.lookup(dicts_words[loc], locs=loc))
+
+        for loc in dicts_words_cached.keys():
+            self.assertTrue(self.pspell.lookup(dicts_words[loc], locs=loc))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/phunspell/tests/test_multi_load_no_cache.py b/phunspell/tests/test_multi_load_no_cache.py
new file mode 100644
index 0000000..e28bfa4
--- /dev/null
+++ b/phunspell/tests/test_multi_load_no_cache.py
@@ -0,0 +1,46 @@
+import phunspell
+import inspect
+import unittest
+
+dicts_words = {
+    "af_ZA": "voortgewoed",
+    "an_ES": "vengar",
+    "be_BY": "ідалапаклонніцкі",
+    "bg_BG": "удържехме",
+    "br_FR": "c'huñvderioù",
+    "de_DE": "schilffrei",
+    "en_GB": "indict",
+    "es_MX": "pianista",
+    "fr_FR": "zoomorphe",
+}
+
+dicts_words_cached = {
+    "an_ES": "vengar",
+    "be_BY": "ідалапаклонніцкі",
+    "bg_BG": "удържехме",
+}
+
+# TODO:
+# fix this upstream
+# re: reloading dictionaries is not handled upstream
+# ResourceWarning: Enable tracemalloc to get the object allocation traceback
+# /Users/dwright/Dev/python/misc/dw/lib/python3.8/site-packages/spylls/hunspell/dictionary.py:141: ResourceWarning: unclosed file <_io.TextIOWrapper name='/Users/dwright/Dev/python/Phunspell/phunspell/data/dictionary/de/de_DE.aff' mode='r' encoding='ISO8859-1'>
+#   aff, context = readers.read_aff(FileReader(path + '.aff'))
+# ResourceWarning: Enable tracemalloc to get the object allocation traceback
+
+
+class TestMultiLoadNoCache(unittest.TestCase):
+    def test_multi_load_no_cache(self):
+        for loc in dicts_words.keys():
+            # slower performance
+            pspell = phunspell.Phunspell(loc)
+            self.assertTrue(pspell.lookup(dicts_words[loc], locs=loc))
+
+        for loc in dicts_words_cached.keys():
+            # slower performance
+            pspell = phunspell.Phunspell(loc)
+            self.assertTrue(pspell.lookup(dicts_words[loc], locs=loc))
+
+
+if __name__ == "__main__":
+    unittest.main()