Skip to content

Commit

Permalink
PB-1167: create a custom query string for the fuzzy search
Browse files Browse the repository at this point in the history
when searching in the metaphone index a simple searchText is sufficient,
there is no need for all the wildcard variants.

digit/text combinations will be expanded to (digit|digit/text) for a
better support of typos.

e.g.
['seestrasse', '78b', 'uster'] will be converted into
@detail seestrasse (78|78b) uster

the quorum operator will be used to support fuzzy matching. as a
starting value the quorum operator matches documents with 70% overlap
( 70% of the search text keywords are present in the document )
  • Loading branch information
ltclm committed Dec 9, 2024
1 parent 116baac commit 7d6f65c
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions app/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ def _fuzzy_search(self, searchTextFinal):
# For ranking modes, see http://sphinxsearch.com/docs/current.html#weighting
self.sphinx.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
# Only include results with a certain weight. This might need tweaking
self.sphinx.SetFilterRange('@weight', 5000, 2**32 - 1)
# with the quorum operator, results with lower weights should also be included
# to better support fuzziness
self.sphinx.SetFilterRange('@weight', 2500, 2**32 - 1)
try:
if self.typeInfo in ('locations'):
results = self.sphinx.Query(searchTextFinal, index='swisssearch_fuzzy')
Expand Down Expand Up @@ -285,9 +287,10 @@ def _swiss_search(self): # pylint: disable=too-many-branches, too-many-statemen
results.append(d)
seen.append(d['id'])

# if standard index did not find anything, use soundex/metaphon indices
# if standard index did not find anything, use metaphone indices
# which should be more fuzzy in its results
if len(results) <= 0:
searchTextFinal = self._query_fields('@detail', True)
results = self._fuzzy_search(searchTextFinal)
else:
results = []
Expand Down Expand Up @@ -451,7 +454,7 @@ def _get_geoanchor_from_bbox(self):
center = center_from_box2d(self.bbox)
return transformer.transform(center[0], center[1])

def _query_fields(self, fields): # pylint: disable=too-many-locals
def _query_fields(self, fields, fuzzySearch=False): # pylint: disable=too-many-locals
# 10a, 10b needs to be interpreted as digit
q = []
isdigit = lambda x: bool(re.match('^[0-9]', x))
Expand Down Expand Up @@ -494,6 +497,21 @@ def _query_fields(self, fields): # pylint: disable=too-many-locals
f'{fields} "{preNonDigitAndPreDigit}"~5',
f'{fields} "{infNonDigitAndPreDigit}"'
]

# to improve the fuzziness of the search result the search text has to be changed
# digit/text combinations are replaced with (digit|digit/text) e.g. 4a -> (4|4a)
# quorum operator is used for query text fuzzy matching
if fuzzySearch:
def convert_if_digit(text):
if isdigit(text):
digit = re.findall(r'^\d+', text)[0]
return f'({str(digit)}|{text})'
return text
# add quorum matching operator with a 70% (0.7) threshold for fuzziness, might need some tweaking
# together with search.py#L171
quorum = '"%s"/0.7' % ' '.join([convert_if_digit(w) for w in self.searchText])
q = [f'{fields} {quorum}']
#q = q + [f'{fields} {generate_prefixes(self.searchText)}']
finalQuery = ' | '.join(q)
return finalQuery

Expand Down

0 comments on commit 7d6f65c

Please sign in to comment.