Skip to content

Commit

Permalink
improve spelling correction, add Python 3.13 to actions
Browse files Browse the repository at this point in the history
  • Loading branch information
capjamesg committed Oct 14, 2024
1 parent abab21d commit bf2c143
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest", "macos-latest"]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- name: 🛎️ Checkout
uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
python-version: [3.12]
steps:
- name: 🛎️ Checkout
uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest", "macos-latest"]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- name: 🛎️ Checkout
uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
matrix:
os: ["windows-latest"]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- name: 🛎️ Checkout
uses: actions/checkout@v4
Expand Down
57 changes: 44 additions & 13 deletions jamesql/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import hashlib
from BTrees.OOBTree import OOBTree
from lark import Lark

from nltk.corpus import stopwords
import math

from jamesql.rewriter import string_query_to_jamesql, grammar as rewriter_grammar
Expand All @@ -39,6 +39,7 @@
# that can be run in a single query
MAXIMUM_QUERY_STATEMENTS = 20

stop_words = set(stopwords.words("english"))

class GSI_INDEX_STRATEGIES(Enum):
PREFIX = "prefix"
Expand Down Expand Up @@ -291,6 +292,7 @@ def _compute_string_query(
default_strategies=indexing_strategies,
boosts=boosts,
fuzzy=fuzzy,
correct_spelling_index=self
)

return query
Expand Down Expand Up @@ -501,6 +503,34 @@ def remove(self, uuid: str) -> None:
with open(JOURNAL_FILE, "w") as f:
f.write("")

@lru_cache()
def spelling_correction(self, query: str) -> str:
    """
    Accepts a query and returns a spelling corrected query.

    The word is returned unchanged when it already appears more than once
    in ``self.word_counts``. Otherwise every candidate produced by
    ``self._turn_query_into_fuzzy_options`` (one edit away) is checked
    against ``self.word_counts`` and the most frequent known candidate is
    returned. If no one-edit candidate is known, the same expansion is
    applied to each one-edit candidate (two edits away). If nothing is
    known at either distance, the original query is returned.

    NOTE(review): ``lru_cache`` on a method keys on ``self`` and keeps
    earlier answers even if ``word_counts`` later changes -- confirm the
    cache is cleared (or acceptable to be stale) when documents are added
    or removed.
    """
    word_counts = self.word_counts

    # A word seen more than once is treated as correctly spelled.
    if word_counts.get(query, 0) > 1:
        return query

    # Keep the *unfiltered* one-edit candidates: they are the seeds for the
    # two-edit expansion below. Filtering them in place (as before) left
    # nothing to expand, so two-edit corrections could never be found.
    one_edit_candidates = list(self._turn_query_into_fuzzy_options(query))

    known_one_edit = [word for word in one_edit_candidates if word in word_counts]

    if known_one_edit:
        return max(known_one_edit, key=word_counts.get)

    known_two_edits = [
        candidate
        for word in one_edit_candidates
        for candidate in self._turn_query_into_fuzzy_options(word)
        if candidate in word_counts
    ]

    if known_two_edits:
        return max(known_two_edits, key=word_counts.get)

    return query

def create_gsi(
self,
index_by: str | List[str],
Expand Down Expand Up @@ -839,23 +869,24 @@ def _turn_query_into_fuzzy_options(self, query_term: str) -> dict:
for c in string.ascii_lowercase
]
)

# add letter to end of query
query_terms.extend([query_term + c for c in string.ascii_lowercase])

# remove a letter from every possible position
query_terms.extend(
[query_term[:i] + query_term[i + 1 :] for i in range(len(query_term))]
)

# get unique words
final_query_terms = []

log_doc_count = math.log(len(self.global_index))

word_counts = {word: self.word_counts.get(word, 0) for word in query_terms}

for word, count in word_counts.items():
if count > log_doc_count or word == query_term:
final_query_terms.append(word)
# swap every letter with the next letter
query_terms.extend(
[
query_term[:i] + query_term[i + 1] + query_term[i] + query_term[i + 2 :]
for i in range(len(query_term) - 1)
]
)

return final_query_terms
return query_terms

def _run(self, query: dict, query_field: str) -> List[str]:
"""
Expand Down Expand Up @@ -905,7 +936,7 @@ def _run(self, query: dict, query_field: str) -> List[str]:
final_query_terms = []

for query_term in query_terms:
final_query_terms.extend(self._turn_query_into_fuzzy_options(query_term))
final_query_terms.extend(list(self._turn_query_into_fuzzy_options(query_term).keys()))

query_terms = final_query_terms

Expand Down
14 changes: 11 additions & 3 deletions jamesql/rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def WORD(self, items):
return items.value


def simplify_string_query(parser, query):
def simplify_string_query(parser, query, correct_spelling_index = None):
# remove punctuation not in grammar
query = re.sub(r"[^a-zA-Z0-9_.,!?^*:\-'<>=\[\] ]", "", query)

Expand All @@ -234,11 +234,19 @@ def simplify_string_query(parser, query):
query = simplifier(result.terms)
query = " ".join(query).strip()

if correct_spelling_index is not None:
final_query = ""

for word in query.split():
final_query += correct_spelling_index.spelling_correction(word) + " "

query = final_query.strip()

return query


def string_query_to_jamesql(parser, query, query_keys, default_strategies={}, boosts={}, fuzzy = False):
query = simplify_string_query(parser, query)
def string_query_to_jamesql(parser, query, query_keys, default_strategies={}, boosts={}, fuzzy = False, correct_spelling_index = None):
query = simplify_string_query(parser, query, correct_spelling_index)

if query.strip() == "":
return {"query": {}}
Expand Down

0 comments on commit bf2c143

Please sign in to comment.