Skip to content

Commit

Permalink
pickle dictionary store, +memoize on reloads
Browse files Browse the repository at this point in the history
  • Loading branch information
David Wright committed Mar 10, 2021
1 parent 941b211 commit d996dc4
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 11 deletions.
63 changes: 52 additions & 11 deletions phunspell/phunspell.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@
import string
import pickle
import tempfile
import logging
import functools

# import functools
from spylls.hunspell import Dictionary

TEMPDIR = tempfile.gettempdir()
Expand Down Expand Up @@ -142,6 +142,10 @@
# "hyph_zu_ZA" : ["zu_ZA", "LANG"],
}

# Module-level memo of dictionary objects that have already been unpickled,
# keyed by locale string. dictionary_load() checks it before reading from disk
# and adds each newly loaded object to it, so the cache is shared by every
# Phunspell instance in the process.
DICTIONARIES_LOADED = {}


class PhunspellError(Exception):
def __init__(self, message):
Exception.__init__(self, "%s" % (message))
Expand All @@ -163,6 +167,8 @@ def __init__(self, loc_lang="en_US", loc_list=[], load_all=False):
"""
try:
if load_all:
# NOTE: meant to be run once to create/store local pickled dictionary
# objects
self.dictionary_loader(self.dictionaries_all())
elif len(loc_list) > 0:
# if locales passed, load specified dictionaries on init
Expand Down Expand Up @@ -191,27 +197,30 @@ def dictionaries_all(self):
"""Load 'all' dictionaries"""
return [n for n in DICTIONARIES.keys() if n.find('_') != -1]

# NOTE: deliberately not wrapped in functools.lru_cache. This method works by
# side effect (it assigns self.dictionary), so a cache hit would skip the body
# and leave self.dictionary untouched; an lru_cache on a method also keeps
# every instance alive. Loaded objects are memoized in DICTIONARIES_LOADED.
def dictionary_load(self, loc):
    """Load the stored (pickled) dictionary object for locale `loc`.

    The object is taken from the module-level DICTIONARIES_LOADED memo when
    present; otherwise it is unpickled from a file named `loc` located in
    the directory given by the PICKLED_DATADIR environment variable, or in
    TEMPDIR when that variable is unset or empty.

    Args:
        loc: locale string used both as memo key and as file name.

    Returns:
        None. The loaded object is assigned to `self.dictionary`.

    Raises:
        PhunspellError: if a TypeError or OSError occurs while resolving,
            opening or reading the file.
    """
    try:
        # Fast path: reuse the object unpickled by an earlier call.
        if loc in DICTIONARIES_LOADED:
            self.dictionary = DICTIONARIES_LOADED[loc]
            return

        datadir = os.getenv("PICKLED_DATADIR")
        filepath = os.path.join(datadir if datadir else TEMPDIR, loc)

        # NOTE(security): pickle.load executes arbitrary code embedded in
        # the file. The fallback directory is the shared system temp dir,
        # so only point this at locations whose contents are trusted.
        # The context manager closes the file even if unpickling fails.
        with open(filepath, 'rb') as pfile:
            stored_dic = pickle.load(pfile)

        self.dictionary = stored_dic
        # Remember the object so later loads of this locale skip the disk.
        DICTIONARIES_LOADED[loc] = stored_dic
    except (TypeError, OSError) as error:
        # `filepath` may not be bound yet if the failure happened early, so
        # the message relies on the error text alone.
        raise PhunspellError(f'Cannot load dictionary: {error}') from error

def dictionary_store(self, loc):
"""iterate locale dump dictionary to object
Expand All @@ -230,7 +239,6 @@ def dictionary_store(self, loc):
else:
filepath = os.path.join(TEMPDIR, loc)

logging.debug(f'Store dictionary to directory {filepath}')
# if os.path.exists(filepath):
self.dict_dirpath(loc)
pfile = open(filepath, 'wb')
Expand All @@ -255,7 +263,6 @@ def dictionary_loader(self, loc_list):
continue

self.dictionary_store(loc)
# self.dictionary_load(loc)

def find_dict_dirpath(self, dictdir, loc_lang):
"""find directory for dictionary `loc_lang`
Expand Down Expand Up @@ -398,11 +405,45 @@ def dictionaries(self):
# import sys
# sys.exit()

# Demo data: one sample word per locale to look up in that locale's
# dictionary.
dicts_words = {
    "af_ZA": "voortgewoed",
    "an_ES": "vengar",
    "be_BY": "ідалапаклонніцкі",
    "bg_BG": "удържехме",
    "br_FR": "c'huñvderioù",
    "de_DE": "schilffrei",
    "en_GB": "indict",
    "es_MX": "pianista",
    "fr_FR": "zoomorphe",
}
# Locales looked up a second time (use cache if already seen).
dicts_words_cached = {
    "an_ES": "vengar",
    "be_BY": "ідалапаклонніцкі",
    "bg_BG": "удържехме",
}

# Timing with one shared instance:
# 16.41s user 0.48s system 99% cpu 16.986 total
pspell = Phunspell()

for loc, word in dicts_words.items():
    # Timing with a new instance per locale instead:
    # 36.08s user 0.65s system 99% cpu 36.788 total
    # pspell = Phunspell(loc)
    print(pspell.lookup(word, locs=loc))

# Second pass: read each word from dicts_words_cached itself so the loop
# does not depend on dicts_words containing the same keys.
for loc, word in dicts_words_cached.items():
    # 36.08s user 0.65s system 99% cpu 36.788 total
    # pspell = Phunspell(loc)
    print(pspell.lookup(word, locs=loc))

# import sys
# sys.exit()

pspell = Phunspell(loc_lang="en_US")
print(pspell.lookup_list("Wonder Woman 1984"))
print(pspell.lookup_list(pspell.to_list("Wonder Woman 1984")))

pspell = Phunspell() # default "en_US"
pspell = Phunspell() # default "en_US"
# pspell = Phunspell("af_ZA")

print(pspell.lookup("phunspell")) # False
Expand Down
37 changes: 37 additions & 0 deletions phunspell/tests/test_multi_load_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import phunspell
import inspect
import unittest

# Locale -> sample word; the test asserts that looking the word up with that
# locale returns a truthy result.
dicts_words = {
"af_ZA": "voortgewoed",
"an_ES": "vengar",
"be_BY": "ідалапаклонніцкі",
"bg_BG": "удържехме",
"br_FR": "c'huñvderioù",
"de_DE": "schilffrei",
"en_GB": "indict",
"es_MX": "pianista",
"fr_FR": "zoomorphe",
}

# Locales looked up a second time after the first pass; every key here also
# appears in dicts_words with the same word (use cache if already seen).
dicts_words_cached = {
"an_ES": "vengar",
"be_BY": "ідалапаклонніцкі",
"bg_BG": "удържехме",
}


class TestMultiLoadCache(unittest.TestCase):
    """Look up words in several locales through one shared Phunspell instance.

    The second pass repeats locales already used in the first pass; it is
    intended to exercise the path where a locale has been seen before.
    """

    # One instance shared by the whole test case.
    pspell = phunspell.Phunspell()

    def test_multi_load_cache(self):
        """Every sample word is found, on first use and on repeat use."""
        for loc, word in dicts_words.items():
            # subTest reports the failing locale and keeps checking the rest.
            with self.subTest(locale=loc, phase="first"):
                self.assertTrue(self.pspell.lookup(word, locs=loc))

        # Take the word from dicts_words_cached itself rather than indexing
        # dicts_words with its keys, so the two tables can evolve separately.
        for loc, word in dicts_words_cached.items():
            with self.subTest(locale=loc, phase="repeat"):
                self.assertTrue(self.pspell.lookup(word, locs=loc))


if __name__ == "__main__":
unittest.main()
46 changes: 46 additions & 0 deletions phunspell/tests/test_multi_load_no_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import phunspell
import inspect
import unittest

# Locale -> sample word; the test asserts that looking the word up with that
# locale returns a truthy result.
dicts_words = {
"af_ZA": "voortgewoed",
"an_ES": "vengar",
"be_BY": "ідалапаклонніцкі",
"bg_BG": "удържехме",
"br_FR": "c'huñvderioù",
"de_DE": "schilffrei",
"en_GB": "indict",
"es_MX": "pianista",
"fr_FR": "zoomorphe",
}

# Locales looked up a second time after the first pass; every key here also
# appears in dicts_words with the same word.
dicts_words_cached = {
"an_ES": "vengar",
"be_BY": "ідалапаклонніцкі",
"bg_BG": "удържехме",
}

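# TODO: fix this upstream (spylls).
# Reloading a dictionary is not handled upstream: the .aff file opened while
# reading the dictionary is left unclosed, which produces the warning below.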
# ResourceWarning: Enable tracemalloc to get the object allocation traceback
# /Users/dwright/Dev/python/misc/dw/lib/python3.8/site-packages/spylls/hunspell/dictionary.py:141: ResourceWarning: unclosed file <_io.TextIOWrapper name='/Users/dwright/Dev/python/Phunspell/phunspell/data/dictionary/de/de_DE.aff' mode='r' encoding='ISO8859-1'>
# aff, context = readers.read_aff(FileReader(path + '.aff'))
# ResourceWarning: Enable tracemalloc to get the object allocation traceback


class TestMultiLoadNoCache(unittest.TestCase):
    """Look up words in several locales, building a new Phunspell each time."""

    def test_multi_load_no_cache(self):
        """Every sample word is found when each lookup uses a fresh instance."""
        for loc, word in dicts_words.items():
            # subTest reports the failing locale and keeps checking the rest.
            with self.subTest(locale=loc, phase="first"):
                # slower performance: a new instance per locale
                pspell = phunspell.Phunspell(loc)
                self.assertTrue(pspell.lookup(word, locs=loc))

        # Take the word from dicts_words_cached itself rather than indexing
        # dicts_words with its keys, so the two tables can evolve separately.
        for loc, word in dicts_words_cached.items():
            with self.subTest(locale=loc, phase="repeat"):
                # slower performance: a new instance per locale
                pspell = phunspell.Phunspell(loc)
                self.assertTrue(pspell.lookup(word, locs=loc))


if __name__ == "__main__":
unittest.main()

0 comments on commit d996dc4

Please sign in to comment.