From d2f68f339823b2043416d221e82eaca04a7c1944 Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 13 Jul 2022 01:50:47 +0300 Subject: [PATCH 01/24] Fix a "readability-misleading-indentation" warning --- link-grammar/dict-common/dict-impl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index 833ebaef18..258df088a6 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -551,8 +551,7 @@ static void get_dict_affixes(Dictionary dict, Dict_node * dn, { affix_list_add(afdict, &afdict->afdict_class[AFDICT_SUF], wtrunc+1); } - else - if (infix_mark == w[w_len-1]) + else if (infix_mark == w[w_len-1]) { wtrunc[w_len-1] = '\0'; affix_list_add(afdict, &afdict->afdict_class[AFDICT_PRE], wtrunc); From dca35e3192cc82a18e22726452bf2a66bcf7b591 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 15 Jul 2022 00:42:37 +0300 Subject: [PATCH 02/24] Move the list of strippable affix classes to dict-affix.h --- link-grammar/dict-common/dict-affix.h | 3 +++ link-grammar/dict-common/dict-impl.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h index 0249d772f3..48ce5e78ca 100644 --- a/link-grammar/dict-common/dict-affix.h +++ b/link-grammar/dict-common/dict-affix.h @@ -83,4 +83,7 @@ typedef enum { Afdict_class * afdict_find(Dictionary, const char *, bool); +static const afdict_classnum affix_strippable[] = + {AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC}; + #endif /* _LG_DICT_AFFIX_H_ */ diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index 258df088a6..520124f62a 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -732,10 +732,9 @@ bool afdict_init(Dictionary dict) * up. e.g. split 7gram before 7am before 7m. * Another example: The ellipsis "..." 
must appear before the dot ".". */ - afdict_classnum af[] = {AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC}; - for (size_t i = 0; i < ARRAY_SIZE(af); i++) + for (size_t i = 0; i < ARRAY_SIZE(affix_strippable); i++) { - ac = AFCLASS(afdict, af[i]); + ac = AFCLASS(afdict, affix_strippable[i]); if (0 < ac->length) { qsort(ac->string, ac->length, sizeof(char *), split_order); From 2be9d49124fa8f1b9a1db1ba22063d51ebe50498 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 15 Jul 2022 03:11:41 +0300 Subject: [PATCH 03/24] split_order(): Fix a misleading variable notation --- link-grammar/dict-common/dict-impl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index 520124f62a..fdc6da6ae2 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -608,10 +608,10 @@ static int split_order(const void *a, const void *b) const char * const *sa = a; const char * const *sb = b; - size_t len_a = strcspn(*sb, subscript_mark_str()); - size_t len_b = strcspn(*sa, subscript_mark_str()); + size_t len_a = strcspn(*sa, subscript_mark_str()); + size_t len_b = strcspn(*sb, subscript_mark_str()); - int len_order = (int)(len_a - len_b); + int len_order = (int)(len_b - len_a); if (0 == len_order) return strncmp(*sa, *sb, len_a); return len_order; From ee685ea683fc6f122c8be774a703b832e93a59b9 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 15 Jul 2022 05:39:44 +0300 Subject: [PATCH 04/24] read_entry(): Avoid memleak on error --- link-grammar/dict-file/read-dict.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/link-grammar/dict-file/read-dict.c b/link-grammar/dict-file/read-dict.c index cb0ac67946..3e60b0b552 100644 --- a/link-grammar/dict-file/read-dict.c +++ b/link-grammar/dict-file/read-dict.c @@ -980,12 +980,14 @@ static bool read_entry(Dictionary dict) * used in equations (.v means verb-like) */ if ((dict->token[0] == 
'/') && (dict->token[1] != '.')) { - dn = read_word_file(dict, dn, dict->token); - if (dn == NULL) + Dict_node *new_dn = read_word_file(dict, dn, dict->token); + if (new_dn == NULL) { prt_error("Error: Cannot open word file \"%s\".\n", dict->token); - return false; + + goto syntax_error; /* not a syntax error, but need to free dn */ } + dn = new_dn; } else if (0 == strcmp(dict->token, "#include")) { From 1ec10c53bdd725c03f74908b143fc84241b1fa67 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 15 Jul 2022 05:56:47 +0300 Subject: [PATCH 05/24] insert_length_limit(): Fix warning [readability-misleading-indentation] --- link-grammar/dict-file/read-dict.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/link-grammar/dict-file/read-dict.c b/link-grammar/dict-file/read-dict.c index 3e60b0b552..44e01dcd94 100644 --- a/link-grammar/dict-file/read-dict.c +++ b/link-grammar/dict-file/read-dict.c @@ -857,8 +857,7 @@ static void insert_length_limit(Dictionary dict, Dict_node *dn) { length_limit = UNLIMITED_LEN; } - else - if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string, + else if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string, sizeof(LIMITED_CONNECTORS_WORD)-1)) { char *endp; From 8022b0ba9e6d69969148ddea8d16376417c83211 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 15 Jul 2022 14:29:37 +0300 Subject: [PATCH 06/24] afdict_find: Move up --- link-grammar/dict-common/dict-affix.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h index 48ce5e78ca..4d333844c1 100644 --- a/link-grammar/dict-common/dict-affix.h +++ b/link-grammar/dict-common/dict-affix.h @@ -72,6 +72,8 @@ typedef enum { #define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 "last classname" #define AFCLASS(afdict, class) (&afdict->afdict_class[class]) +Afdict_class * afdict_find(Dictionary, const char *, bool); + /* Suffixes start with it. 
* This is needed to distinguish suffixes that were stripped off from * ordinary words that just happen to be the same as the suffix. @@ -81,8 +83,6 @@ typedef enum { #define INFIX_MARK(afdict) \ ((NULL == afdict) ? '\0' : (AFCLASS(afdict, AFDICT_INFIXMARK)->string[0][0])) -Afdict_class * afdict_find(Dictionary, const char *, bool); - static const afdict_classnum affix_strippable[] = {AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC}; From ba529fa1a6053b89bfc65d72b75a977a6cc7498f Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 18 Jul 2022 23:37:00 +0300 Subject: [PATCH 07/24] is_afdict_punc(): Allow afdict_classnum to start with 0 --- link-grammar/tokenize/tokenize.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c index b9191676e8..c7ce6dfe4d 100644 --- a/link-grammar/tokenize/tokenize.c +++ b/link-grammar/tokenize/tokenize.c @@ -371,11 +371,11 @@ static bool is_contraction_word(Dictionary dict, const char *s) static bool is_afdict_punc(const Dictionary afdict, const char *word) { if (NULL == afdict) return false; - int punc_types[] = { AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC, 0 }; + int punc_type[] = {AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC}; - for (int affix_punc = 0; punc_types[affix_punc] != 0; affix_punc++) + for (size_t affix_punc = 0; affix_punc < ARRAY_SIZE(punc_type); affix_punc++) { - const Afdict_class * punc_list = AFCLASS(afdict, punc_types[affix_punc]); + const Afdict_class * punc_list = AFCLASS(afdict, punc_type[affix_punc]); size_t l_strippable = punc_list->length; const char * const * punc = punc_list->string; From 2e85ae37cda594916d6be3098c612b2792627307 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 18 Jul 2022 23:37:28 +0300 Subject: [PATCH 08/24] afdict_classnum: Start with 0 --- link-grammar/dict-common/dict-affix.h | 7 ++----- link-grammar/dict-common/dict-common.c | 7 +++---- link-grammar/tokenize/tokenize.c | 11 ++++------- 3 files changed, 9 
insertions(+), 16 deletions(-) diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h index 4d333844c1..ff64a7fbf1 100644 --- a/link-grammar/dict-common/dict-affix.h +++ b/link-grammar/dict-common/dict-affix.h @@ -21,7 +21,7 @@ /* Connector names for the affix class lists in the affix file */ typedef enum { - AFDICT_RPUNC=1, + AFDICT_RPUNC, AFDICT_LPUNC, AFDICT_MPUNC, AFDICT_UNITS, @@ -41,13 +41,10 @@ typedef enum { AFDICT_REGALTS, AFDICT_REGPARTS, - /* Have to have one last entry, to get the array size correct */ - AFDICT_LAST_ENTRY, AFDICT_NUM_ENTRIES } afdict_classnum; #define AFDICT_CLASSNAMES1 \ - "invalid classname", \ "RPUNC", \ "LPUNC", \ "MPUNC", \ @@ -69,7 +66,7 @@ typedef enum { "REGALTS", /* Min&max number of alternatives to issue for a word */\ "REGPARTS", /* Max number of word partitions */ -#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 "last classname" +#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 #define AFCLASS(afdict, class) (&afdict->afdict_class[class]) Afdict_class * afdict_find(Dictionary, const char *, bool); diff --git a/link-grammar/dict-common/dict-common.c b/link-grammar/dict-common/dict-common.c index 327f817cfa..af8956fb74 100644 --- a/link-grammar/dict-common/dict-common.c +++ b/link-grammar/dict-common/dict-common.c @@ -304,11 +304,10 @@ static void affix_list_delete(Dictionary dict) { if (NULL == dict->afdict_class) return; - int i; - Afdict_class * atc; - for (i=0, atc = dict->afdict_class; i < AFDICT_NUM_ENTRIES; i++, atc++) + Afdict_class * atc = dict->afdict_class; + for (size_t i = 0; i < AFDICT_NUM_ENTRIES; i++) { - if (atc->string) free(atc->string); + if (atc[i].length > 0) free(atc[i].string); } free(dict->afdict_class); dict->afdict_class = NULL; diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c index c7ce6dfe4d..2b298620a9 100644 --- a/link-grammar/tokenize/tokenize.c +++ b/link-grammar/tokenize/tokenize.c @@ -371,16 
+371,13 @@ static bool is_contraction_word(Dictionary dict, const char *s) static bool is_afdict_punc(const Dictionary afdict, const char *word) { if (NULL == afdict) return false; - int punc_type[] = {AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC}; - for (size_t affix_punc = 0; affix_punc < ARRAY_SIZE(punc_type); affix_punc++) + for (size_t punc = 0; punc < ARRAY_SIZE(affix_strippable); punc++) { - const Afdict_class * punc_list = AFCLASS(afdict, punc_type[affix_punc]); - size_t l_strippable = punc_list->length; - const char * const * punc = punc_list->string; + const Afdict_class * punc_list = AFCLASS(afdict, affix_strippable[punc]); - for (size_t i = 0; i < l_strippable; i++) - if (0 == strcmp(word, punc[i])) return true; + for (size_t i = 0; i < punc_list->length; i++) + if (0 == strcmp(word, punc_list->string[i])) return true; } return false; From 2c619fbb94fc39d3e54bea337a01d75c270e916e Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 22 Jul 2022 15:43:30 +0300 Subject: [PATCH 09/24] LPUNC,MPUNC: Fix subscript mishandling --- link-grammar/tokenize/tokenize.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c index 2b298620a9..357a993702 100644 --- a/link-grammar/tokenize/tokenize.c +++ b/link-grammar/tokenize/tokenize.c @@ -374,10 +374,17 @@ static bool is_afdict_punc(const Dictionary afdict, const char *word) for (size_t punc = 0; punc < ARRAY_SIZE(affix_strippable); punc++) { - const Afdict_class * punc_list = AFCLASS(afdict, affix_strippable[punc]); + if (AFDICT_UNITS == affix_strippable[punc]) continue; + const Afdict_class *punc_list = AFCLASS(afdict, affix_strippable[punc]); for (size_t i = 0; i < punc_list->length; i++) - if (0 == strcmp(word, punc_list->string[i])) return true; + { + /* If the word is subscripted, the affix must be too. 
*/ + const char *p = punc_list->string[i]; + const char *w = word; + while ((*w == *p) && (*w != '\0')) { w++; p++; } + if (*w == *p) return true; + } } return false; @@ -1770,7 +1777,8 @@ static int split_mpunc(Sentence sent, const char *word, char *w, { for (size_t i = 0; i < l_strippable; i++) { - size_t sz = strlen(mpunc[i]); + /* Find the token length, but stop at the subscript mark if exists. */ + size_t sz = strcspn(mpunc[i], subscript_mark_str()); if (0 == strncmp(sep, mpunc[i], sz)) { if ('\0' == sep[sz]) continue; // mpunc in end position @@ -1826,7 +1834,8 @@ static const char *strip_left(Sentence sent, const char * w, { for (i=0; i Date: Sat, 23 Jul 2022 11:00:49 +0300 Subject: [PATCH 10/24] determine_word_expressions(): Debug print the word status too --- link-grammar/tokenize/tokenize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c index 357a993702..611dac406c 100644 --- a/link-grammar/tokenize/tokenize.c +++ b/link-grammar/tokenize/tokenize.c @@ -3225,8 +3225,8 @@ static bool determine_word_expressions(Sentence sent, Gword *w, w->regex_name ? 
w->regex_name : ""); while (we) { - prt_error("Debug: string='%s' expr=%s\n", - we->string, exp_stringify(we->exp)); + prt_error("Debug: string='%s' status=%s expr=%s\n", + we->string, gword_status(sent, w), exp_stringify(we->exp)); we = we->next; } } From 76737019dd9e80b50d65bef5fa1a3b867648a3f4 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 25 Jul 2022 01:50:31 +0300 Subject: [PATCH 11/24] Move afdict_classnames[] to dict-affix.h --- link-grammar/dict-common/dict-affix.h | 4 +++- link-grammar/dict-common/dict-impl.c | 5 +---- link-grammar/tokenize/anysplit.c | 2 -- link-grammar/tokenize/tokenize.c | 1 - 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h index ff64a7fbf1..f3ad986c8b 100644 --- a/link-grammar/dict-common/dict-affix.h +++ b/link-grammar/dict-common/dict-affix.h @@ -66,7 +66,9 @@ typedef enum { "REGALTS", /* Min&max number of alternatives to issue for a word */\ "REGPARTS", /* Max number of word partitions */ -#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 +static const char * const afdict_classname[] = + {AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2}; + #define AFCLASS(afdict, class) (&afdict->afdict_class[class]) Afdict_class * afdict_find(Dictionary, const char *, bool); diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index fdc6da6ae2..82952ac0e9 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -449,7 +449,6 @@ bool dictionary_setup_defines(Dictionary dict) return true; } -/* ======================================================================= */ /* ======================================================================= */ /* The affix dictionary is represented as a dynamically allocated array with @@ -457,8 +456,6 @@ bool dictionary_setup_defines(Dictionary dict) * has a pointer to an array of strings which are the punctuation/affix * names. 
*/ -const char * afdict_classname[] = { AFDICT_CLASSNAMES }; - /** initialize the affix class table */ void afclass_init(Dictionary dict) { @@ -480,7 +477,7 @@ void afclass_init(Dictionary dict) */ Afdict_class * afdict_find(Dictionary afdict, const char * con, bool notify_err) { - const char ** ac; + const char * const * ac; for (ac = afdict_classname; ac < &afdict_classname[ARRAY_SIZE(afdict_classname)]; ac++) diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c index 53803f8b04..4d7e06e21f 100644 --- a/link-grammar/tokenize/anysplit.c +++ b/link-grammar/tokenize/anysplit.c @@ -43,8 +43,6 @@ #define MAX_WORD_TO_SPLIT 63 /* in codepoints */ -extern const char * const afdict_classname[]; - typedef int p_start; /* partition start in a word */ typedef p_start *p_list; /* list of partitions in a word */ diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c index 611dac406c..051475085d 100644 --- a/link-grammar/tokenize/tokenize.c +++ b/link-grammar/tokenize/tokenize.c @@ -1929,7 +1929,6 @@ static const char *strip_left(Sentence sent, const char * w, * * p is a mark of the invocation position, for debugging. */ -extern const char *const afdict_classname[]; /* For debug message only */ static bool strip_right(Sentence sent, const char *w, const char **wend, From 68c23981510e2146f126b8693f9353eedaf8c4e2 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 25 Jul 2022 23:13:40 +0300 Subject: [PATCH 12/24] afdict_init(): Warn on unknown strippable affix tokens This check cannot be done in affix_list_add() because it doesn't have access to the dict handle (no point to make an intensive code restructure for that). 
--- link-grammar/dict-common/dict-impl.c | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index 82952ac0e9..d5f9cf670d 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -18,6 +18,7 @@ #include "dict-api.h" #include "dict-defines.h" #include "dict-impl.h" +#include "print/print-util.h" // patch_subscript_mark #include "regex-morph.h" #include "dict-structures.h" #include "string-set.h" @@ -723,6 +724,42 @@ bool afdict_init(Dictionary dict) afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern); } + if (!IS_DYNAMIC_DICT(dict)) + { + /* Validate that the strippable tokens are in the dict. + * UNITS are assumed to be from the dict only. + * Possible FIXME: Allow also tokens that match a regex (a tokenizer + * change is needed to recognize them). */ + for (size_t i = 0; i < ARRAY_SIZE(affix_strippable); i++) + { + if (AFDICT_UNITS != affix_strippable[i]) + { + ac = AFCLASS(afdict, affix_strippable[i]); + bool not_in_dict = false; + + for (size_t n = 0; n < ac->length; n++) + { + if (!dict_has_word(dict, ac->string[n])) + { + if (!not_in_dict) + { + not_in_dict = true; + prt_error("Warning: afdict_init: Class %s in file %s: " + "Token(s) not in the dictionary:", + afdict_classname[affix_strippable[i]], + afdict->name); + } + + char *s = strdupa(ac->string[n]); + patch_subscript_mark(s); + prt_error(" \"%s\"", s); + } + } + if (not_in_dict) prt_error("\n"); + } + } + } + /* Sort the affix-classes of tokens to be stripped. */ /* Longer unit names must get split off before shorter ones. 
* This prevents single-letter splits from screwing things From 151bcc397fd5a91df0c6a462c2df22bfcf176473 Mon Sep 17 00:00:00 2001 From: ampli Date: Mon, 25 Jul 2022 23:49:55 +0300 Subject: [PATCH 13/24] afdict_init(): Don't stop on first error --- link-grammar/dict-common/dict-impl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c index d5f9cf670d..3674d14ef8 100644 --- a/link-grammar/dict-common/dict-impl.c +++ b/link-grammar/dict-common/dict-impl.c @@ -627,6 +627,7 @@ bool afdict_init(Dictionary dict) { Afdict_class * ac; Dictionary afdict = dict->affix_table; + bool error_found = false; /* FIXME: read_entry() builds word lists in reverse order (can we * just create the list top-down without breaking anything?). Unless @@ -718,10 +719,13 @@ bool afdict_init(Dictionary dict) prt_error("Error: afdict_init: Failed to compile " "regex /%s/ in file \"%s\"\n", afdict_classname[AFDICT_SANEMORPHISM], afdict->name); - return false; + error_found = true; + } + else + { + lgdebug(+D_AI, "%s regex %s\n", + afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern); } - lgdebug(+D_AI, "%s regex %s\n", - afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern); } if (!IS_DYNAMIC_DICT(dict)) @@ -807,6 +811,6 @@ bool afdict_init(Dictionary dict) lg_error_flush(); } - return true; + return !error_found; } #undef D_AI From 59d9138e6e84624ae13c065c7a9989f8feca259c Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 00:10:17 +0300 Subject: [PATCH 14/24] he/4.0.affix: Remove a comment that is too far ahead of its time --- data/he/4.0.affix | 2 -- 1 file changed, 2 deletions(-) diff --git a/data/he/4.0.affix b/data/he/4.0.affix index 70d0f2555a..403b8bc23a 100644 --- a/data/he/4.0.affix +++ b/data/he/4.0.affix @@ -1,5 +1,3 @@ -% need to add Hebrew punctuations - ")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ''.y '.y "%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! 
….y ....y "”" "–" "‐" 、 ~ ¢ ₵ ™ ℠ : RPUNC+; From 715937818efe27710536d015ae15ffa23325933e Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 00:34:57 +0300 Subject: [PATCH 15/24] he: Revise list of strippable affixes Also update to "#define version/locale" and in 4.0.dict.m4 (they are now only in 4.0.dict). --- data/he/4.0.affix | 9 ++------- data/he/4.0.dict | 6 +++++- data/he/4.0.dict.m4 | 11 +++++++---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/data/he/4.0.affix b/data/he/4.0.affix index 403b8bc23a..fdd081f40a 100644 --- a/data/he/4.0.affix +++ b/data/he/4.0.affix @@ -1,11 +1,6 @@ -")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ''.y '.y -"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ?! ….y ....y "”" "–" "‐" 、 ~ -¢ ₵ ™ ℠ : RPUNC+; +")" """ "," "." "–" "‐" ... ":" ";" "?" "!": RPUNC+; -"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 `` „ “ ‘ ''.x '.x ….x ....x -¿ ¡ "$" -£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 -† †† ‡ § ¶ © ® ℗ № "#": LPUNC+; +"(" „ “ """ ... ₪: LPUNC+; =: INFIXMARK+; % The order is preserved. diff --git a/data/he/4.0.dict b/data/he/4.0.dict index 206db9becf..299a2799a2 100644 --- a/data/he/4.0.dict +++ b/data/he/4.0.dict @@ -40,7 +40,7 @@ % Among numerous other things, changes to handle count/uncountable changes % have not been done yet. The created infrastructure for that may still need changes. -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale he_IL.UTF-8; % For now. @@ -562,3 +562,7 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ & % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. : XXX+; + +% Punctuations that get strip but are yet unhandled. + "." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" 
₪: ; % RPUNC +"(" „ “ "".x" ....x: ; % LPUNC diff --git a/data/he/4.0.dict.m4 b/data/he/4.0.dict.m4 index da0cf81d21..6724318cbd 100644 --- a/data/he/4.0.dict.m4 +++ b/data/he/4.0.dict.m4 @@ -45,9 +45,8 @@ changecom(`%')dnl % Among numerous other things, changes to handle count/uncountable changes % have not been done yet. The created infrastructure for that may still need changes. -% Dictionary version number is 5.3.15 (formatted as V5v3v15+) -<dictionary-version-number>: V5v3v15+; -<dictionary-locale>: HE4il+; +#define dictionary-version-number 5.11.0; +#define dictionary-locale he_IL.UTF-8; % For now. LEFT-WALL: {Wa+} or {Wd+} or (); @@ -458,4 +457,8 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ & % With the following line in the dictionary, the parser will simply % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. -UNKNOWN-WORD: XXX+; +<UNKNOWN-WORD>: XXX+; + +% Punctuations that get strip but are yet unhandled. + "." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" ₪: ; % RPUNC +"(" „ “ "".x" ....x: ; % LPUNC From 6dcfef434b85027188adf7f997206b839995f68b Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 00:52:33 +0300 Subject: [PATCH 16/24] vn: Revise list of strippable affixes Only comma is defined in the dict, so leave only it. This is just a minimum fix to prevent error due to strippable affixes that are not found in the dict. --- data/vn/4.0.affix | 24 ++---------------------- data/vn/4.0.dict | 2 +- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/data/vn/4.0.affix b/data/vn/4.0.affix index 4c56eb052e..d3c445448f 100644 --- a/data/vn/4.0.affix +++ b/data/vn/4.0.affix @@ -1,31 +1,11 @@ -% % Affixes get stripped off the left and right side of words % i.e. spaces are inserted between the affix and the word itself. -% -% Some of the funky UTF-8 parenthesis are used in Asian texts. 
-% In order to allow single straight quote ' and double straight quote '' -% to be stripped off from both the left and the right, they are -% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y) -% -% 。is an end-of-sentence marker used in Japanese texts. % Punctuation appearing on the right-side of words. -% Note: the ellipsis ....y must appear *before* the dot ".", else the -% splitting won't work right. -")" "}" "]" ">" "".y" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" “ ''.y '.y `.y -"%" "," ....y "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ? ! ….y "”" ━.y –.y ー.y ‐.y 、.y -~ ¢ ₵ ™ ℠ - : RPUNC+; +"," : RPUNC+; % Only this punctuation appears in the dict. % Punctuation appearing on the left-side of words. -"(" "{" "[" "<" "".x" « 〈 ( 〔 《 【 [ 『 「 、.x `.x `` „ ‘ ''.x '.x ….x ....x -¿ ¡ "$" US$ USD C$ -£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 -† †† ‡ § ¶ © ® ℗ № "#" -* • ⁂ ❧ ☞ ◊ ※ ○ 。.x ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎ -–.x ━.x ー.x -- - ‧.x - : LPUNC+; - +% : LPUNC+; % None. % The below is a quoted list, used during tokenization. Do NOT put % spaces in between the various quotation marks!! diff --git a/data/vn/4.0.dict b/data/vn/4.0.dict index 3d49aa58f9..c65ae7b06b 100644 --- a/data/vn/4.0.dict +++ b/data/vn/4.0.dict @@ -1,6 +1,6 @@ % Vietnamese Dictionary -#define dictionary-version-number 5.9.2; +#define dictionary-version-number 5.11.0; #define dictionary-locale vi_VN.UTF-8; % See https://bitbucket.org/ngocminh/lienkate for the master From c97236772b41efdc62364aa57464558cc230213e Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 01:06:54 +0300 Subject: [PATCH 17/24] de: Revise list of strippable affixes --- data/de/4.0.affix | 4 ++-- data/de/4.0.dict | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/data/de/4.0.affix b/data/de/4.0.affix index 7b122443da..90a224758e 100644 --- a/data/de/4.0.affix +++ b/data/de/4.0.affix @@ -1,6 +1,6 @@ -")" "%" "," "." ":" ";" "?" "!" "''" "'" "'s": RPUNC+; -"(" "$" "``": LPUNC+; +")" "," "." 
":" ";" "?" "!" "'" „ « ...: RPUNC+; +"(" “ » ...: LPUNC+; %e en er es em: SUF+; .=: STEMSUBSCR+; diff --git a/data/de/4.0.dict b/data/de/4.0.dict index ed2608b791..8ab9c054ee 100644 --- a/data/de/4.0.dict +++ b/data/de/4.0.dict @@ -7,7 +7,7 @@ % % %***************************************************************************% -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale de_DE.UTF-8; % NOUNS, PRONOUNS @@ -521,3 +521,7 @@ PL-CAPITALIZED-WORDS: BZZT+; % will output an error for any unknown words. : NO+; +% Unhandled, but getting split according to 4.0.affix. +")" ":" ";" "'" „ « ....y: ; % RPUNC +"(" “ » ....x: ; % LPUNC + From f4cd18d1ac50cd5c79e7838c6a45a462074e6957 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 01:21:34 +0300 Subject: [PATCH 18/24] tr: Revise list of strippable affixes --- data/tr/4.0.affix | 5 +++-- data/tr/4.0.dict | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/data/tr/4.0.affix b/data/tr/4.0.affix index 5cebea5832..fcd866cf78 100644 --- a/data/tr/4.0.affix +++ b/data/tr/4.0.affix @@ -1,5 +1,6 @@ -")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+; -"(" "$" "``": LPUNC+; +% From these affixes, only "," is handled in the dictionary. +")" "%" "," "." ":" ";" "?" "!" 
"'": RPUNC+; +"(" "$" : LPUNC+; dakileri ki ler leri kileri düm m di lar diler um yor mi ni diğ diğini ini yorum miyorum ın in un ün nın nin nun nün a e ya ye i ı u ü yi yı yu yü yla ile yle da de ta te den ten dan tan sız: SUF+; diff --git a/data/tr/4.0.dict b/data/tr/4.0.dict index 099b3a868f..43798fbdeb 100644 --- a/data/tr/4.0.dict +++ b/data/tr/4.0.dict @@ -1,7 +1,7 @@ % % Turkish dictionary -- experimental -- 2016 -- Tatiana Batura, Maria Mitkovskaya, Natalya Semenova % -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale tr_TR.UTF-8; % adjectives @@ -119,3 +119,7 @@ PL-GREEK-LETTER-AND-NUMBER: XXX-; CAPITALIZED-WORDS: XXX-; HYPHENATED-WORDS: XXX-; : XXX-; + +% Unhandled, but getting split according to 4.0.affix. +")" "%" "." ":" ";" "?" "!" "'": ; % RPUNC +"(" "$": ; % LPUNC From 30aa3a940420ba9471402e0ba84365d8ea7b3fec Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 13:28:16 +0300 Subject: [PATCH 19/24] kz: Revise list of strippable affixes --- data/kz/4.0.affix | 1 + data/kz/4.0.dict | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/data/kz/4.0.affix b/data/kz/4.0.affix index f7795ee5c1..381a784079 100644 --- a/data/kz/4.0.affix +++ b/data/kz/4.0.affix @@ -1,3 +1,4 @@ +% From these affixes, only "," is handled in the dictionary. ")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+; "(" "$" "``": LPUNC+; diff --git a/data/kz/4.0.dict b/data/kz/4.0.dict index 688c027282..a1eccb6b73 100644 --- a/data/kz/4.0.dict +++ b/data/kz/4.0.dict @@ -2,7 +2,7 @@ % Kazakh dictionary -- experimental -- 2016 % -- Tatiana Batura, Aigerim Bakiyeva, Aigerim Yerimbetova % -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale kk_KZ.UTF-8; мен.pron: {W-} & S1s+; @@ -133,3 +133,7 @@ LATIN-ADJ-P-NOUN-WORDS: XXX-; LATIN-ADJ-S-NOUN-WORDS: XXX-; HYPHENATED-WORDS: XXX-; : XXX-; + +% Unhandled, but getting split according to 4.0.affix. 
+")" "%" "." ":" ";" "?" "!" "''" "'" : ; % RPUNC +"(" "$" "``": ; % LPUNC From 753c435920e40cf22460236d65f22b8b98926d3a Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 17:26:51 +0300 Subject: [PATCH 20/24] id: Revise list of strippable affixes --- data/id/4.0.affix | 34 ++-------------------------------- data/id/4.0.dict | 2 +- 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/data/id/4.0.affix b/data/id/4.0.affix index 146d905919..1e709e1f6e 100644 --- a/data/id/4.0.affix +++ b/data/id/4.0.affix @@ -1,30 +1,11 @@ -% % Affixes get stripped off the left and right side of words % i.e. spaces are inserted between the affix and the word itself. -% -% Some of the funky UTF-8 parenthesis are used in Asian texts. -% In order to allow single straight quote ' and double straight quote '' -% to be stripped off from both the left and the right, they are -% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y) -% -% 。is an end-of-sentence marker used in Japanese texts. % Punctuation appearing on the right-side of words. -")" "}" "]" ">" » 〉 ) 〕 》 】 ] 』 」 "’’" "’" ''.y '.y ' ` -"%" "," "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ? ! ….y ....y "”" ━.y –.y ー.y ‐.y 、.y -~ ¢ ₵ ™ ℠ : RPUNC+; +"," : RPUNC+; % Only this punctuation appears in the dict. % Punctuation appearing on the left-side of words. -"(" "{" "[" "<" « 〈 ( 〔 《 【 [ 『 「 、.x ` `` „ “ ‘ ''.x '.x ….x ....x -¿ ¡ "$" US$ USD C$ -£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺ ℳ ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼ ₸ ₮ ₩ ¥ ៛ 호점 -† †† ‡ § ¶ © ® ℗ № "#" -* • ⁂ ❧ ☞ ◊ ※ ○ 。.x ゜ ✿ ☆ * ◕ ● ∇ □ ◇ @ ◎ -–.x ━.x ー.x -- - ‧.x - : LPUNC+; - -% Suffixes -'s 're 've 'd 'll 'm ’s ’re ’ve ’d ’ll ’m: SUF+; +% : LPUNC+; % none. % The below is a quoted list, used during tokenization. Do NOT put % spaces in between the various quotation marks!! @@ -33,14 +14,3 @@ % The below is a quoted list, used during tokenization. Do NOT put % spaces in between the various symbols!! 
"()¿¡†‡§¶©®℗№#*•⁂❧☞◊※○。゜✿☆*◕●∇□◇@◎–━ー---‧": BULLETS+; - -/en/words/units.1: UNITS+; -/en/words/units.1.dot: UNITS+; -/en/words/units.3: UNITS+; -/en/words/units.4: UNITS+; -/en/words/units.4.dot: UNITS+; -/en/words/units.5: UNITS+; -% -% units.6 contains just a single, sole slash in it. This allows units -% such as mL/s to be split at the slash. -/en/words/units.6: UNITS+; diff --git a/data/id/4.0.dict b/data/id/4.0.dict index 4f39a38f94..41d138ba35 100644 --- a/data/id/4.0.dict +++ b/data/id/4.0.dict @@ -7,7 +7,7 @@ % % %***************************************************************************% -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale id_ID.UTF-8; anjing kucing wanita cewek pria cowok lelaki laki-laki taman lapangan tulang From fe0af9bb70437bde58b53775aa37e3f1daa5ec4f Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 17:15:50 +0300 Subject: [PATCH 21/24] fa: Revise list of strippable affixes --- data/fa/4.0.affix | 4 ++-- data/fa/4.0.dict | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data/fa/4.0.affix b/data/fa/4.0.affix index 9f42f51e41..b726baa24f 100644 --- a/data/fa/4.0.affix +++ b/data/fa/4.0.affix @@ -1,3 +1,3 @@ -")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+; -"(" "$" "``": LPUNC+; +")" "%" "," "." ":" ";" "?" "!" "'" : RPUNC+; +"(" "$": LPUNC+; diff --git a/data/fa/4.0.dict b/data/fa/4.0.dict index f13ef9490b..3b41321378 100644 --- a/data/fa/4.0.dict +++ b/data/fa/4.0.dict @@ -10,7 +10,7 @@ % %***************************************************************************** -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale fa_IR.UTF-8; % PERSIAN SYNTAX NOTES: @@ -835,11 +835,11 @@ or ; +% Unhandled, but getting split according to 4.0.affix. 
+")" "%" ";" "'": (); % RPUNC +"(" "$": (); % LPUNC -%":" ";" : % Colon and Semicolon -%; - %NUMBERS: ; LEFT-WALL: ( Wcc+ or [Wi+] or [[Wd+]] ) & {Xp+}; % Connects to the Subject in declarative sentence, or Verb in subjectless or imperative sentence RIGHT-WALL: RW- ; From 3a162fec479cf22469c47b292d0b4e8716ae3985 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 17:42:21 +0300 Subject: [PATCH 22/24] ru: Revise list of strippable affixes --- data/ru/4.0.affix | 4 ++-- data/ru/4.0.dict | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/data/ru/4.0.affix b/data/ru/4.0.affix index 324f50e716..1249897269 100644 --- a/data/ru/4.0.affix +++ b/data/ru/4.0.affix @@ -1,5 +1,5 @@ -")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+; -"(" "$" "``": LPUNC+; +")" "%" "," "." ":" ";" "?" "!" "'": RPUNC+; +"(" "$": LPUNC+; ""«»《》【】『』`„“": QUOTES+; diff --git a/data/ru/4.0.dict b/data/ru/4.0.dict index ca0042b581..3c12e6e02b 100644 --- a/data/ru/4.0.dict +++ b/data/ru/4.0.dict @@ -7,7 +7,7 @@ %% %% This file uses the utf8 encoding -#define dictionary-version-number 5.9.0; +#define dictionary-version-number 5.11.0; #define dictionary-locale ru_RU.UTF-8; #define max-disjunct-cost 2.7; #define panic_max-disjunct-cost 4.0; @@ -1096,3 +1096,7 @@ LENGTH-LIMIT-1: LL*+; % skip over (null-link) unknown words. If you remove it, the parser % will output an error for any unknown words. : XXX+; + +% Unhandled, but getting split according to 4.0.affix. 
+"%" "'": ; % RPUNC +"$": ; % LPUNC From 56fb7251dff1910fa9110cda077683c41a265660 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 17:44:12 +0300 Subject: [PATCH 23/24] ru/4.0.dict: Fix typo in panic_max-disjunct-cost --- data/ru/4.0.dict | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/ru/4.0.dict b/data/ru/4.0.dict index 3c12e6e02b..1501c9c2e6 100644 --- a/data/ru/4.0.dict +++ b/data/ru/4.0.dict @@ -10,7 +10,7 @@ #define dictionary-version-number 5.11.0; #define dictionary-locale ru_RU.UTF-8; #define max-disjunct-cost 2.7; -#define panic_max-disjunct-cost 4.0; +#define panic-max-disjunct-cost 4.0; : [[[[]]]]; From 900a4ba79827950e0774d63ddd6a86ccb27cfd82 Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 26 Jul 2022 19:23:00 +0300 Subject: [PATCH 24/24] tests.py ZTHLangTestCase: Filter out errors on nonexistent affixes --- bindings/python-examples/tests.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bindings/python-examples/tests.py b/bindings/python-examples/tests.py index d419a6090b..7cfa01cf00 100755 --- a/bindings/python-examples/tests.py +++ b/bindings/python-examples/tests.py @@ -77,11 +77,11 @@ def test_open_nonexistent_dictionary(self): save_stderr = divert_start(2) self.assertRaises(LG_DictionaryError, Dictionary, dummy_lang + '1') - self.assertIn(dummy_lang + '1', save_stderr.divert_end()) + self.assertIn(dummy_lang + '1', str(save_stderr.divert_end())) save_stderr = divert_start(2) self.assertRaises(LG_Error, Dictionary, dummy_lang + '2') - self.assertIn(dummy_lang + '2', save_stderr.divert_end()) + self.assertIn(dummy_lang + '2', str(save_stderr.divert_end())) # Check absolute and relative dictionary access. # Check also that the dictionary language is set correctly. 
@@ -506,7 +506,7 @@ def test_22_defaut_handler_param(self): dummy_lang = "a dummy dict name (bad param test)" self.assertRaises(LG_Error, Dictionary, dummy_lang) LG_Error.printall(self.error_handler_test, self) # grab a valid errinfo - #self.assertIn(dummy_lang, save_stderr.divert_end()) + #self.assertIn(dummy_lang, str(save_stderr.divert_end())) self.assertRaisesRegexp(TypeError, "must be an integer", self.__class__.handler["default"], self.errinfo, "bad param") @@ -520,7 +520,7 @@ def test_22_defaut_handler_param(self): self.param_ok = False save_stdout = divert_start(1) # Note: Handler parameter is stdout self.__class__.handler["default"](self.errinfo, 1) - self.assertIn(dummy_lang, save_stdout.divert_end()) + self.assertIn(dummy_lang, str(save_stdout.divert_end())) self.param_ok = True except (TypeError, ValueError): self.assertTrue(self.param_ok) @@ -598,7 +598,7 @@ def test_50_set_orig_error_handler(self): dummy_lang = "a dummy dict name (default handler test)" save_stderr = divert_start(2) self.assertRaises(LG_Error, Dictionary, dummy_lang) - self.assertIn(dummy_lang, save_stderr.divert_end()) + self.assertIn(dummy_lang, str(save_stderr.divert_end())) self.assertEqual(self.errinfo, "dummy") class FSATsolverTestCase(unittest.TestCase): @@ -1281,9 +1281,16 @@ def test_d_morphology(self): '.', 'RIGHT-WALL']) +# The Thai 4.0.affix files currently contain strippable affixes that are +# not in the dict. This causes an annoying multiline error output that is +# filtered out here using divert(). 
class ZTHLangTestCase(unittest.TestCase): def test_thai(self): + save_stderr = divert_start(2) linkage_testfile(self, Dictionary(lang='th'), ParseOptions()) + for line in save_stderr.divert_end().decode().split("\n"): + if 'Token(s) not in the dictionary' not in line: + print(line) class ZXDictDialectTestCase(unittest.TestCase): @@ -1501,7 +1508,7 @@ def divert_end(self): os.close(self.savedfd) os.unlink(self.filename) self.filename = None - return str(content) + return content __del__ = divert_end