Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

acronyms and number changes #4

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions SymSpellCppPy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,16 @@ PYBIND11_MODULE(SymSpellCppPy, m) {
" 3. multiple independent input terms with/without spelling errors",
py::arg("input"),
py::arg("max_edit_distance"))
.def("lookup_compound", py::overload_cast<const xstring &, int, bool>(
.def("lookup_compound", py::overload_cast<const xstring &, int, bool,bool>(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add this as a separate overload instead of changing the existing one; otherwise the existing test cases will fail. Also, run the formatter over the change once.

&symspellcpppy::SymSpell::LookupCompound),
" LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:\n"
" 1. mistakenly inserted space into a correct word led to two incorrect terms \n"
" 2. mistakenly omitted space between two correct words led to one incorrect combined term\n"
" 3. multiple independent input terms with/without spelling errors",
py::arg("input"),
py::arg("max_edit_distance"),
py::arg("transfer_casing"))
py::arg("transfer_casing"),
py::arg("ignore_non_words"))
.def("word_segmentation", py::overload_cast<const xstring &>(
&symspellcpppy::SymSpell::WordSegmentation),
" WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions\n"
Expand Down
15 changes: 15 additions & 0 deletions include/Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,21 @@ class Helpers {

return response_string;
}

/// Reports whether a token looks like an acronym or a digit-bearing term.
///
/// @param word  Token to classify.
/// @param match_any_term_with_digit  When true, a token containing at least
///        one decimal digit anywhere is accepted immediately.
/// @return true if the digit shortcut fires, or if the entire token consists
///         of three or more characters drawn from A-Z and 0-9; false otherwise.
static bool is_acronym(xstring word, bool match_any_term_with_digit) {
    if (match_any_term_with_digit) {
        for (const char ch : word) {
            // std::isdigit has undefined behaviour for negative char values
            // (e.g. non-ASCII bytes), so convert through unsigned char first.
            if (std::isdigit(static_cast<unsigned char>(ch))) {
                return true;
            }
        }
    }
    // Compiled once and reused: building a std::regex on every call is costly.
    static const std::regex accr_regex("[A-Z0-9]{3,}");
    // regex_match requires the whole token to match, not just a substring.
    return std::regex_match(word, accr_regex);
}
};

template<class T>
Expand Down
30 changes: 22 additions & 8 deletions library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ namespace symspellcpppy {
xstring line;
auto staging = std::make_shared<SuggestionStage>(16384);
while (getline(corpusStream, line)) {
for (const xstring &key : ParseWords(line)) {
for (const xstring &key : ParseWords(line,true)) {
CreateDictionaryEntry(key, 1, staging);
}

Expand Down Expand Up @@ -455,14 +455,19 @@ namespace symspellcpppy {
return true;
}

std::vector<xstring> SymSpell::ParseWords(const xstring &text) {
std::vector<xstring> SymSpell::ParseWords(const xstring &text,bool lower_casing=true) {
xregex r(XL("['’\\w-\\[_\\]]+"));
xsmatch m;
std::vector<xstring> matches;
xstring::const_iterator ptr(text.cbegin());
while (regex_search(ptr, text.cend(), m, r)) {
xstring matchLower = Helpers::string_lower(m[0]);
matches.push_back(matchLower);
if(lower_casing){
xstring matchLower = Helpers::string_lower(m[0]);
matches.push_back(matchLower);
}
else{
matches.push_back(m[0]);
}
ptr = m.suffix().first;
}
return matches;
Expand Down Expand Up @@ -511,22 +516,31 @@ namespace symspellcpppy {
}

std::vector<SuggestItem> SymSpell::LookupCompound(const xstring &input) {
return LookupCompound(input, maxDictionaryEditDistance, false);
return LookupCompound(input, maxDictionaryEditDistance, false,true);
}

std::vector<SuggestItem> SymSpell::LookupCompound(const xstring &input, int editDistanceMax) {
return LookupCompound(input, editDistanceMax, false);
return LookupCompound(input, editDistanceMax, false,true);
}

std::vector<SuggestItem> SymSpell::LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing) {
std::vector<xstring> termList1 = ParseWords(input);
std::vector<SuggestItem> SymSpell::LookupCompound(const xstring &input, int editDistanceMax,bool transferCasing,bool ignore_non_words) {
std::vector<xstring> termList1 = ParseWords(input,false);

std::vector<SuggestItem> suggestions; //suggestions for a single term
std::vector<SuggestItem> suggestionParts; //1 line with separate parts
auto distanceComparer = EditDistance(distanceAlgorithm);

bool lastCombi = false;
for (int i = 0; i < termList1.size(); i++) {
if(ignore_non_words == true){
if(Helpers::is_acronym(termList1[i],true)){
SuggestItem temp = SuggestItem(termList1[i],0,0);
suggestionParts.push_back(temp);
continue;
}

}

suggestions = Lookup(termList1[i], Top, editDistanceMax);

if ((i > 0) && !lastCombi) {
Expand Down
4 changes: 2 additions & 2 deletions library.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ namespace symspellcpppy {
bool
DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen) const;

static std::vector<xstring> ParseWords(const xstring &text);
static std::vector<xstring> ParseWords(const xstring &text, bool lower_casing);

std::shared_ptr<std::unordered_set<xstring>>
Edits(const xstring &word, int editDistance, std::shared_ptr<std::unordered_set<xstring>> deleteWords);
Expand Down Expand Up @@ -259,7 +259,7 @@ namespace symspellcpppy {
/// <param name="input">The string being spell checked.</param>
/// <param name="maxEditDistance">The maximum edit distance between input and suggested words.</param>
/// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
std::vector<SuggestItem> LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing);
std::vector<SuggestItem> LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing, bool ignore_non_words);

//######

Expand Down
21 changes: 20 additions & 1 deletion tests/CatchMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ TEST_CASE("Testing English", "[english]") {
auto results = symSpell.LookupCompound(typo, 2);
REQUIRE(results[0].term == correction);
}

SECTION("Lookup transfer casing") {
SymSpell symSpell(maxEditDistance, prefixLength);
symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' '));
Expand All @@ -181,4 +181,23 @@ TEST_CASE("Testing English", "[english]") {
auto results = symSpell.Lookup(typo, Verbosity::Top, 2, false, true);
REQUIRE(results[0].term == correction);
}

// Compound lookup with default arguments: the misspelled words are corrected
// while the upper-case, digit-bearing token "PNR9" is expected back unchanged.
SECTION("Lookup compound accronyms and numbers") {
    SymSpell checker(maxEditDistance, prefixLength);
    checker.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' '));

    const xstring misspelled = "whera is the PNR9 locaited";
    const xstring expected = "where is the PNR9 located";

    const auto suggestions = checker.LookupCompound(misspelled);
    REQUIRE(suggestions[0].term == expected);
}

// Compound lookup with default arguments: a purely numeric token ("999") is
// expected back unchanged while the words around it are corrected.
SECTION("Lookup compound with just numbers") {
    SymSpell checker(maxEditDistance, prefixLength);
    checker.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' '));

    const xstring misspelled = "whera is the 999 locaited";
    const xstring expected = "where is the 999 located";

    const auto suggestions = checker.LookupCompound(misspelled);
    REQUIRE(suggestions[0].term == expected);
}

}
13 changes: 9 additions & 4 deletions tests/SymSpellCppPyTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ def test_lookup_compound_only_combi(self):
results = sym_spell.lookup_compound(typo, edit_distance_max)
self.assertEqual(1, len(results))
self.assertEqual(correction, results[0].term)

def test_lookup_compound_no_suggestion(self):
edit_distance_max = 2
prefix_length = 7
Expand Down Expand Up @@ -615,7 +615,7 @@ def test_lookup_compound_transfer_casing(self):
"who couldn't read in sixth grade AND inspired him")

results = sym_spell.lookup_compound(typo, edit_distance_max,
transfer_casing=True)
transfer_casing=True,ignore_non_words=True)
self.assertEqual(correction, results[0].term)

def test_lookup_compound_transfer_casing_no_bigram(self):
Expand All @@ -630,7 +630,7 @@ def test_lookup_compound_transfer_casing_no_bigram(self):
"who couldn't read in sixth grade AND inspired him")

results = sym_spell.lookup_compound(typo, edit_distance_max,
transfer_casing=True)
True,True)
self.assertEqual(correction, results[0].term)

# TODO: test_create_dictionary_entry_below_threshold
Expand Down Expand Up @@ -718,7 +718,12 @@ def test_lookup_transfer_casing(self):
result = sym_spell.lookup("I", Verbosity.TOP, 2,
transfer_casing=True)
self.assertEqual("I", result[0].term)


def test_lookup_compund_acr(self):
    """lookup_compound with default arguments corrects the misspelled
    words and is expected to return the upper-case token "PNR" unchanged."""
    checker = SymSpell()
    checker.load_dictionary("resources/frequency_dictionary_en_82_765.txt", 0, 1, " ")
    suggestions = checker.lookup_compound("Whate is yur PNR numbir")
    best_term = suggestions[0].term
    self.assertEqual("What is your PNR number", best_term)

if __name__ == '__main__':
unittest.main()