Skip to content

Commit

Permalink
add fuzzy search function
Browse files Browse the repository at this point in the history
  • Loading branch information
aperrin66 committed Jul 25, 2024
1 parent 8412d80 commit 8a9bac3
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
1 change: 1 addition & 0 deletions pythesint/pythesint.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def _process_config():
'search_'+cnf['name']+'_list',
vocabulary.search)
setattr(current_module, 'update_'+cnf['name'], vocabulary.update)
setattr(current_module, 'fuzzy_search_' + cnf['name'], vocabulary.fuzzy_search)

# Module-level dictionary of vocabularies; it starts out empty.
# NOTE(review): presumably populated by _process_config() -- confirm.
vocabularies = {}
# Executed when the module is imported. The visible part of
# _process_config() uses setattr() to attach per-vocabulary helpers
# ('update_<name>', 'fuzzy_search_<name>', ...) to `current_module`.
_process_config()
31 changes: 31 additions & 0 deletions pythesint/vocabulary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from collections import OrderedDict

from rapidfuzz.fuzz import token_set_ratio
from rapidfuzz.process import extract
from rapidfuzz.utils import default_process


class Vocabulary(object):
def __init__(self, name, **kwargs):
self.name = name
Expand Down Expand Up @@ -104,3 +109,29 @@ def sort_list(self, list):
def get_list(self):
    """Return the terms of the vocabulary.

    This implementation is only a placeholder: it always raises
    NotImplementedError.
    """
    raise NotImplementedError()

def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process,
                  results_limit=10, min_score=50.0):
    """Perform a fuzzy search on the vocabulary.

    Fully parameterized, meant to be called by self.fuzzy_search().

    Every term returned by self.get_list() is flattened into a single
    lowercase string (its values joined by spaces) and compared to the
    lowercased search string using rapidfuzz's extract().

    Parameters:
        search_string: text to look for in the vocabulary terms.
        scorer: scoring function forwarded to extract().
        processor: pre-processing function forwarded to extract().
        results_limit: maximum number of candidates requested from
            extract() (forwarded as `limit`).
        min_score: candidates scoring below this value are discarded.

    Returns:
        A list of items taken from self.get_list(), in the order in
        which extract() returned them, containing only the leading
        results whose score is at least `min_score`.
    """
    terms_list = self.get_list()
    # Built as a list (not a generator) so that the index reported by
    # extract() is unambiguously a position in terms_list.
    choices = [' '.join(term.values()).lower() for term in terms_list]
    # extract() returns a list of tuples (choice, similarity, index);
    # similarity is a float in [0.0, 100.0], 100.0 meaning the
    # search string is a subset of the choice string
    results = extract(search_string.lower(), choices,
                      scorer=scorer, processor=processor, limit=results_limit)

    # Keep the results matching the minimum similarity score.
    # The results list is sorted by decreasing similarity score, so we
    # can stop at the first one below the threshold. Collecting matches
    # directly (instead of tracking a cut-off index) ensures the last
    # qualifying result is not dropped.
    matches = []
    for _choice, score, index in results:
        if score < min_score:
            break
        matches.append(terms_list[index])
    return matches

def fuzzy_search(self, search_string):
    """Fuzzy-search the vocabulary terms for `search_string`.

    Delegates to self._fuzzy_search() with its default settings and
    returns whatever that call returns.
    """
    matches = self._fuzzy_search(search_string)
    return matches

0 comments on commit 8a9bac3

Please sign in to comment.