Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Affix class cleanup #1329

Merged
merged 24 commits into from
Jul 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d2f68f3
Fix a "readability-misleading-indentation" warning
ampli Jul 12, 2022
dca35e3
Move the list of strippable affix classes to dict-affix.h
ampli Jul 14, 2022
2be9d49
split_order(): Fix a misleading variable notation
ampli Jul 15, 2022
ee685ea
read_entry(): Avoid memleak on error
ampli Jul 15, 2022
1ec10c5
insert_length_limit(): Fix warning [readability-misleading-indentation]
ampli Jul 15, 2022
8022b0b
afdict_find: Move up
ampli Jul 15, 2022
ba529fa
is_afdict_punc(): Allow afdict_classnum to start with 0
ampli Jul 18, 2022
2e85ae3
afdict_classnum: Start with 0
ampli Jul 18, 2022
2c619fb
LPUNC,MPUNCT: Fix subscript mishandling
ampli Jul 22, 2022
79edc7b
determine_word_expressions(): Debug print the word status too
ampli Jul 23, 2022
7673701
Move afdict_classnames[] to dict-affix.h
ampli Jul 24, 2022
68c2398
afdict_init(): Warn on unknown strippable affix tokens
ampli Jul 25, 2022
151bcc3
afdict_init(): Don't stop on first error
ampli Jul 25, 2022
59d9138
he/4.0.affix: Remove a comment that is too far ahead of its time
ampli Jul 25, 2022
7159378
he: Revise list of strippable affixes
ampli Jul 25, 2022
6dcfef4
vn: Revise list of strippable affixes
ampli Jul 25, 2022
c972367
de: Revise list of strippable affixes
ampli Jul 25, 2022
f4cd18d
tr: Revise list of strippable affixes
ampli Jul 25, 2022
30aa3a9
kz: Revise list of strippable affixes
ampli Jul 26, 2022
753c435
id: Revise list of strippable affixes
ampli Jul 26, 2022
fe0af9b
fa: Revise list of strippable affixes
ampli Jul 26, 2022
3a162fe
ru: Revise list of strippable affixes
ampli Jul 26, 2022
56fb725
ru/4.0.dict: Fix typo in panic_max-disjunct-cost
ampli Jul 26, 2022
900a4ba
tests.py ZTHLangTestCase: Filter out errors on nonexistent affixes
ampli Jul 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions bindings/python-examples/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ def test_open_nonexistent_dictionary(self):

save_stderr = divert_start(2)
self.assertRaises(LG_DictionaryError, Dictionary, dummy_lang + '1')
self.assertIn(dummy_lang + '1', save_stderr.divert_end())
self.assertIn(dummy_lang + '1', str(save_stderr.divert_end()))

save_stderr = divert_start(2)
self.assertRaises(LG_Error, Dictionary, dummy_lang + '2')
self.assertIn(dummy_lang + '2', save_stderr.divert_end())
self.assertIn(dummy_lang + '2', str(save_stderr.divert_end()))

# Check absolute and relative dictionary access.
# Check also that the dictionary language is set correctly.
Expand Down Expand Up @@ -506,7 +506,7 @@ def test_22_defaut_handler_param(self):
dummy_lang = "a dummy dict name (bad param test)"
self.assertRaises(LG_Error, Dictionary, dummy_lang)
LG_Error.printall(self.error_handler_test, self) # grab a valid errinfo
#self.assertIn(dummy_lang, save_stderr.divert_end())
#self.assertIn(dummy_lang, str(save_stderr.divert_end()))
self.assertRaisesRegexp(TypeError, "must be an integer",
self.__class__.handler["default"],
self.errinfo, "bad param")
Expand All @@ -520,7 +520,7 @@ def test_22_defaut_handler_param(self):
self.param_ok = False
save_stdout = divert_start(1) # Note: Handler parameter is stdout
self.__class__.handler["default"](self.errinfo, 1)
self.assertIn(dummy_lang, save_stdout.divert_end())
self.assertIn(dummy_lang, str(save_stdout.divert_end()))
self.param_ok = True
except (TypeError, ValueError):
self.assertTrue(self.param_ok)
Expand Down Expand Up @@ -598,7 +598,7 @@ def test_50_set_orig_error_handler(self):
dummy_lang = "a dummy dict name (default handler test)"
save_stderr = divert_start(2)
self.assertRaises(LG_Error, Dictionary, dummy_lang)
self.assertIn(dummy_lang, save_stderr.divert_end())
self.assertIn(dummy_lang, str(save_stderr.divert_end()))
self.assertEqual(self.errinfo, "dummy")

class FSATsolverTestCase(unittest.TestCase):
Expand Down Expand Up @@ -1281,9 +1281,16 @@ def test_d_morphology(self):
'.', 'RIGHT-WALL'])


# The Thai 4.0.affix files currently contain strippable affixes that are
# not in the dict. This causes annoying multiline error output, which is
# filtered out here using divert().
class ZTHLangTestCase(unittest.TestCase):
def test_thai(self):
save_stderr = divert_start(2)
linkage_testfile(self, Dictionary(lang='th'), ParseOptions())
for line in save_stderr.divert_end().decode().split("\n"):
if 'Token(s) not in the dictionary' not in line:
print(line)


class ZXDictDialectTestCase(unittest.TestCase):
Expand Down Expand Up @@ -1501,7 +1508,7 @@ def divert_end(self):
os.close(self.savedfd)
os.unlink(self.filename)
self.filename = None
return str(content)
return content

__del__ = divert_end

Expand Down
4 changes: 2 additions & 2 deletions data/de/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

")" "%" "," "." ":" ";" "?" "!" "''" "'" "'s": RPUNC+;
"(" "$" "``": LPUNC+;
")" "," "." ":" ";" "?" "!" "'" „ « ...: RPUNC+;
"(" “ » ...: LPUNC+;

%e en er es em: SUF+;
.=: STEMSUBSCR+;
Expand Down
6 changes: 5 additions & 1 deletion data/de/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
% %
%***************************************************************************%

#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale de_DE.UTF-8;

% NOUNS, PRONOUNS
Expand Down Expand Up @@ -521,3 +521,7 @@ PL-CAPITALIZED-WORDS: BZZT+;
% will output an error for any unknown words.
<UNKNOWN-WORD>: NO+;

% Unhandled, but getting split according to 4.0.affix.
")" ":" ";" "'" „ « ....y: <UNKNOWN-WORD>; % RPUNC
"(" “ » ....x: <UNKNOWN-WORD>; % LPUNC

4 changes: 2 additions & 2 deletions data/fa/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@

")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+;
"(" "$" "``": LPUNC+;
")" "%" "," "." ":" ";" "?" "!" "'" : RPUNC+;
"(" "$": LPUNC+;
8 changes: 4 additions & 4 deletions data/fa/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
%
%*****************************************************************************

#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale fa_IR.UTF-8;

% PERSIAN SYNTAX NOTES:
Expand Down Expand Up @@ -835,11 +835,11 @@ or

;

% Unhandled, but getting split according to 4.0.affix.
")" "%" ";" "'": (); % RPUNC
"(" "$": (); % LPUNC


%":" ";" : % Colon and Semicolon
%;

%NUMBERS: ;
LEFT-WALL: ( Wcc+ or [Wi+] or [[Wd+]] ) & {Xp+}; % Connects to the Subject in declarative sentence, or Verb in subjectless or imperative sentence
RIGHT-WALL: RW- ;
Expand Down
11 changes: 2 additions & 9 deletions data/he/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
% need to add Hebrew punctuations
")" """ "," "." "–" "‐" ... ":" ";" "?" "!": RPUNC+;

")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ''.y '.y
"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! ….y ....y "”" "–" "‐" 、 ~
¢ ₵ ™ ℠ : RPUNC+;

"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 `` „ “ ‘ ''.x '.x ….x ....x
¿ ¡ "$"
£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
† †† ‡ § ¶ © ® ℗ № "#": LPUNC+;
"(" „ “ """ ... ₪: LPUNC+;

=: INFIXMARK+;
% The order is preserved.
Expand Down
6 changes: 5 additions & 1 deletion data/he/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
% Among numerous other things, changes to handle count/uncountable changes
% have not been done yet. The created infrastructure for that may still need changes.

#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale he_IL.UTF-8;

% For now.
Expand Down Expand Up @@ -562,3 +562,7 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ &
% skip over (null-link) unknown words. If you remove it, the parser
% will output an error for any unknown words.
<UNKNOWN-WORD>: XXX+;

% Punctuation that gets stripped but is not yet handled.
"." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" ₪: <UNKNOWN-WORD>; % RPUNC
"(" „ “ "".x" ....x: <UNKNOWN-WORD>; % LPUNC
11 changes: 7 additions & 4 deletions data/he/4.0.dict.m4
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ changecom(`%')dnl
% Among numerous other things, changes to handle count/uncountable changes
% have not been done yet. The created infrastructure for that may still need changes.

% Dictionary version number is 5.3.15 (formatted as V5v3v15+)
<dictionary-version-number>: V5v3v15+;
<dictionary-locale>: HE4il+;
#define dictionary-version-number 5.11.0;
#define dictionary-locale he_IL.UTF-8;

% For now.
LEFT-WALL: {Wa+} or {Wd+} or ();
Expand Down Expand Up @@ -458,4 +457,8 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ &
% With the following line in the dictionary, the parser will simply
% skip over (null-link) unknown words. If you remove it, the parser
% will output an error for any unknown words.
UNKNOWN-WORD: XXX+;
<UNKNOWN-WORD>: XXX+;

% Punctuation that gets stripped but is not yet handled.
"." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" ₪: <UNKNOWN-WORD>; % RPUNC
"(" „ “ "".x" ....x: <UNKNOWN-WORD>; % LPUNC
34 changes: 2 additions & 32 deletions data/id/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,30 +1,11 @@
%
% Affixes get stripped off the left and right side of words
% i.e. spaces are inserted between the affix and the word itself.
%
% Some of the funky UTF-8 parenthesis are used in Asian texts.
% In order to allow single straight quote ' and double straight quote ''
% to be stripped off from both the left and the right, they are
% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y)
%
% 。is an end-of-sentence marker used in Japanese texts.

% Punctuation appearing on the right-side of words.
")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ''.y '.y ' `
"%" "," "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ? ! ….y ....y "”" ━.y –.y ー.y ‐.y 、.y
~ ¢ ₵ ™ ℠ : RPUNC+;
"," : RPUNC+; % Only this punctuation appears in the dict.

% Punctuation appearing on the left-side of words.
"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 、.x ` `` „ “ ‘ ''.x '.x ….x ....x
¿ ¡ "$" US$ USD C$
£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
† †† ‡ § ¶ © ® ℗ № "#"
* • ⁂ ❧ ☞ ◊ ※ ○ 。.x ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎
–.x ━.x ー.x -- - ‧.x
: LPUNC+;

% Suffixes
's 're 've 'd 'll 'm ’s ’re ’ve ’d ’ll ’m: SUF+;
% : LPUNC+; % none.

% The below is a quoted list, used during tokenization. Do NOT put
% spaces in between the various quotation marks!!
Expand All @@ -33,14 +14,3 @@
% The below is a quoted list, used during tokenization. Do NOT put
% spaces in between the various symbols!!
"()¿¡†‡§¶©®℗№#*•⁂❧☞◊※○。゜✿☆*◕●∇□◇@◎–━ー---‧": BULLETS+;

/en/words/units.1: UNITS+;
/en/words/units.1.dot: UNITS+;
/en/words/units.3: UNITS+;
/en/words/units.4: UNITS+;
/en/words/units.4.dot: UNITS+;
/en/words/units.5: UNITS+;
%
% units.6 contains just a single, sole slash in it. This allows units
% such as mL/s to be split at the slash.
/en/words/units.6: UNITS+;
2 changes: 1 addition & 1 deletion data/id/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
% %
%***************************************************************************%

#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale id_ID.UTF-8;

anjing kucing wanita cewek pria cowok lelaki laki-laki taman lapangan tulang
Expand Down
1 change: 1 addition & 0 deletions data/kz/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
% Of these affixes, only "," is handled in the dictionary.
")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+;
"(" "$" "``": LPUNC+;

Expand Down
6 changes: 5 additions & 1 deletion data/kz/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
% Kazakh dictionary -- experimental -- 2016
% -- Tatiana Batura, Aigerim Bakiyeva, Aigerim Yerimbetova
%
#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale kk_KZ.UTF-8;

мен.pron: {W-} & S1s+;
Expand Down Expand Up @@ -133,3 +133,7 @@ LATIN-ADJ-P-NOUN-WORDS: XXX-;
LATIN-ADJ-S-NOUN-WORDS: XXX-;
HYPHENATED-WORDS: XXX-;
<UNKNOWN-WORD>: XXX-;

% Unhandled, but getting split according to 4.0.affix.
")" "%" "." ":" ";" "?" "!" "''" "'" : <UNKNOWN-WORD>; % RPUNC
"(" "$" "``": <UNKNOWN-WORD>; % LPUNC
4 changes: 2 additions & 2 deletions data/ru/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+;
"(" "$" "``": LPUNC+;
")" "%" "," "." ":" ";" "?" "!" "'": RPUNC+;
"(" "$": LPUNC+;

""«»《》【】『』`„“": QUOTES+;

Expand Down
8 changes: 6 additions & 2 deletions data/ru/4.0.dict
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
%%
%% This file uses the utf8 encoding

#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale ru_RU.UTF-8;
#define max-disjunct-cost 2.7;
#define panic_max-disjunct-cost 4.0;
#define panic-max-disjunct-cost 4.0;

<costly-null>: [[[[]]]];

Expand Down Expand Up @@ -1096,3 +1096,7 @@ LENGTH-LIMIT-1: LL*+;
% skip over (null-link) unknown words. If you remove it, the parser
% will output an error for any unknown words.
<UNKNOWN-WORD>: XXX+;

% Unhandled, but getting split according to 4.0.affix.
"%" "'": <UNKNOWN-WORD>; % RPUNC
"$": <UNKNOWN-WORD>; % LPUNC
5 changes: 3 additions & 2 deletions data/tr/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+;
"(" "$" "``": LPUNC+;
% Of these affixes, only "," is handled in the dictionary.
")" "%" "," "." ":" ";" "?" "!" "'": RPUNC+;
"(" "$" : LPUNC+;

dakileri ki ler leri kileri düm m di lar diler um yor mi ni diğ diğini ini yorum miyorum ın in un ün nın nin nun nün a e ya ye i ı u ü yi yı yu yü yla ile yle da de ta te den ten dan tan sız: SUF+;

Expand Down
6 changes: 5 additions & 1 deletion data/tr/4.0.dict
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
%
% Turkish dictionary -- experimental -- 2016 -- Tatiana Batura, Maria Mitkovskaya, Natalya Semenova
%
#define dictionary-version-number 5.9.0;
#define dictionary-version-number 5.11.0;
#define dictionary-locale tr_TR.UTF-8;

% adjectives
Expand Down Expand Up @@ -119,3 +119,7 @@ PL-GREEK-LETTER-AND-NUMBER: XXX-;
CAPITALIZED-WORDS: XXX-;
HYPHENATED-WORDS: XXX-;
<UNKNOWN-WORD>: XXX-;

% Unhandled, but getting split according to 4.0.affix.
")" "%" "." ":" ";" "?" "!" "'": <UNKNOWN-WORD>; % RPUNC
"(" "$": <UNKNOWN-WORD>; % LPUNC
24 changes: 2 additions & 22 deletions data/vn/4.0.affix
Original file line number Diff line number Diff line change
@@ -1,31 +1,11 @@
%
% Affixes get stripped off the left and right side of words
% i.e. spaces are inserted between the affix and the word itself.
%
% Some of the funky UTF-8 parenthesis are used in Asian texts.
% In order to allow single straight quote ' and double straight quote ''
% to be stripped off from both the left and the right, they are
% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y)
%
% 。is an end-of-sentence marker used in Japanese texts.

% Punctuation appearing on the right-side of words.
% Note: the ellipsis ....y must appear *before* the dot ".", else the
% splitting won't work right.
")" "}" "]" ">" "".y" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" “ ''.y '.y `.y
"%" "," ....y "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ? ! ….y "”" ━.y –.y ー.y ‐.y 、.y
~ ¢ ₵ ™ ℠
: RPUNC+;
"," : RPUNC+; % Only this punctuation appears in the dict.

% Punctuation appearing on the left-side of words.
"(" "{" "[" "<" "".x" « 〈 ( 〔 《 【 [ 『 「 、.x `.x `` „ ‘ ''.x '.x ….x ....x
¿ ¡ "$" US$ USD C$
£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점
† †† ‡ § ¶ © ® ℗ № "#"
* • ⁂ ❧ ☞ ◊ ※ ○ 。.x ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎
–.x ━.x ー.x -- - ‧.x
: LPUNC+;

% : LPUNC+; % None.

% The below is a quoted list, used during tokenization. Do NOT put
% spaces in between the various quotation marks!!
Expand Down
2 changes: 1 addition & 1 deletion data/vn/4.0.dict
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
% Vietnamese Dictionary

#define dictionary-version-number 5.9.2;
#define dictionary-version-number 5.11.0;
#define dictionary-locale vi_VN.UTF-8;

% See https://bitbucket.org/ngocminh/lienkate for the master
Expand Down
Loading