From d2f68f339823b2043416d221e82eaca04a7c1944 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Wed, 13 Jul 2022 01:50:47 +0300
Subject: [PATCH 01/24] Fix a "readability-misleading-indentation" warning

---
 link-grammar/dict-common/dict-impl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index 833ebaef18..258df088a6 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -551,8 +551,7 @@ static void get_dict_affixes(Dictionary dict, Dict_node * dn,
 		{
 			affix_list_add(afdict, &afdict->afdict_class[AFDICT_SUF], wtrunc+1);
 		}
-		else
-		if (infix_mark == w[w_len-1])
+		else if (infix_mark == w[w_len-1])
 		{
 			wtrunc[w_len-1] = '\0';
 			affix_list_add(afdict, &afdict->afdict_class[AFDICT_PRE], wtrunc);

From dca35e3192cc82a18e22726452bf2a66bcf7b591 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 15 Jul 2022 00:42:37 +0300
Subject: [PATCH 02/24] Move the list of strippable affix classes to
 dict-affix.h

---
 link-grammar/dict-common/dict-affix.h | 3 +++
 link-grammar/dict-common/dict-impl.c  | 5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h
index 0249d772f3..48ce5e78ca 100644
--- a/link-grammar/dict-common/dict-affix.h
+++ b/link-grammar/dict-common/dict-affix.h
@@ -83,4 +83,7 @@ typedef enum {
 
 Afdict_class * afdict_find(Dictionary, const char *, bool);
 
+static const afdict_classnum affix_strippable[] =
+	{AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC};
+
 #endif /* _LG_DICT_AFFIX_H_ */
diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index 258df088a6..520124f62a 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -732,10 +732,9 @@ bool afdict_init(Dictionary dict)
 	 * up. e.g. split 7gram before 7am before 7m.
 	 * Another example: The ellipsis "..." must appear before the dot ".".
 	 */
-	afdict_classnum af[] = {AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC};
-	for (size_t i = 0; i < ARRAY_SIZE(af); i++)
+	for (size_t i = 0; i < ARRAY_SIZE(affix_strippable); i++)
 	{
-		ac = AFCLASS(afdict, af[i]);
+		ac = AFCLASS(afdict, affix_strippable[i]);
 		if (0 < ac->length)
 		{
 			qsort(ac->string, ac->length, sizeof(char *), split_order);

From 2be9d49124fa8f1b9a1db1ba22063d51ebe50498 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 15 Jul 2022 03:11:41 +0300
Subject: [PATCH 03/24] split_order(): Fix a misleading variable notation

---
 link-grammar/dict-common/dict-impl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index 520124f62a..fdc6da6ae2 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -608,10 +608,10 @@ static int split_order(const void *a, const void *b)
 	const char * const *sa = a;
 	const char * const *sb = b;
 
-	size_t len_a = strcspn(*sb, subscript_mark_str());
-	size_t len_b = strcspn(*sa, subscript_mark_str());
+	size_t len_a = strcspn(*sa, subscript_mark_str());
+	size_t len_b = strcspn(*sb, subscript_mark_str());
 
-	int len_order = (int)(len_a - len_b);
+	int len_order = (int)(len_b - len_a);
 	if (0 == len_order) return strncmp(*sa, *sb, len_a);
 
 	return len_order;

From ee685ea683fc6f122c8be774a703b832e93a59b9 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 15 Jul 2022 05:39:44 +0300
Subject: [PATCH 04/24] read_entry(): Avoid memleak on error

---
 link-grammar/dict-file/read-dict.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/link-grammar/dict-file/read-dict.c b/link-grammar/dict-file/read-dict.c
index cb0ac67946..3e60b0b552 100644
--- a/link-grammar/dict-file/read-dict.c
+++ b/link-grammar/dict-file/read-dict.c
@@ -980,12 +980,14 @@ static bool read_entry(Dictionary dict)
 		 * used in equations (.v means verb-like) */
 		if ((dict->token[0] == '/') && (dict->token[1] != '.'))
 		{
-			dn = read_word_file(dict, dn, dict->token);
-			if (dn == NULL)
+			Dict_node *new_dn = read_word_file(dict, dn, dict->token);
+			if (new_dn == NULL)
 			{
 				prt_error("Error: Cannot open word file \"%s\".\n", dict->token);
-				return false;
+
+				goto syntax_error; /* not a syntax error, but need to free dn */
 			}
+			dn = new_dn;
 		}
 		else if (0 == strcmp(dict->token, "#include"))
 		{

From 1ec10c53bdd725c03f74908b143fc84241b1fa67 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 15 Jul 2022 05:56:47 +0300
Subject: [PATCH 05/24] insert_length_limit(): Fix warning
 [readability-misleading-indentation]

---
 link-grammar/dict-file/read-dict.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/link-grammar/dict-file/read-dict.c b/link-grammar/dict-file/read-dict.c
index 3e60b0b552..44e01dcd94 100644
--- a/link-grammar/dict-file/read-dict.c
+++ b/link-grammar/dict-file/read-dict.c
@@ -857,8 +857,7 @@ static void insert_length_limit(Dictionary dict, Dict_node *dn)
 	{
 		length_limit = UNLIMITED_LEN;
 	}
-	else
-	if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string,
+	else if (0 == strncmp(LIMITED_CONNECTORS_WORD, dn->string,
 	                 sizeof(LIMITED_CONNECTORS_WORD)-1))
 	{
 		char *endp;

From 8022b0ba9e6d69969148ddea8d16376417c83211 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 15 Jul 2022 14:29:37 +0300
Subject: [PATCH 06/24] afdict_find: Move up

---
 link-grammar/dict-common/dict-affix.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h
index 48ce5e78ca..4d333844c1 100644
--- a/link-grammar/dict-common/dict-affix.h
+++ b/link-grammar/dict-common/dict-affix.h
@@ -72,6 +72,8 @@ typedef enum {
 #define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 "last classname"
 #define AFCLASS(afdict, class) (&afdict->afdict_class[class])
 
+Afdict_class * afdict_find(Dictionary, const char *, bool);
+
 /* Suffixes start with it.
  * This is needed to distinguish suffixes that were stripped off from
  * ordinary words that just happen to be the same as the suffix.
@@ -81,8 +83,6 @@ typedef enum {
 #define INFIX_MARK(afdict) \
 	((NULL == afdict) ? '\0' : (AFCLASS(afdict, AFDICT_INFIXMARK)->string[0][0]))
 
-Afdict_class * afdict_find(Dictionary, const char *, bool);
-
 static const afdict_classnum affix_strippable[] =
 	{AFDICT_UNITS, AFDICT_LPUNC, AFDICT_RPUNC, AFDICT_MPUNC};
 

From ba529fa1a6053b89bfc65d72b75a977a6cc7498f Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 18 Jul 2022 23:37:00 +0300
Subject: [PATCH 07/24] is_afdict_punc(): Allow afdict_classnum to start with 0

---
 link-grammar/tokenize/tokenize.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c
index b9191676e8..c7ce6dfe4d 100644
--- a/link-grammar/tokenize/tokenize.c
+++ b/link-grammar/tokenize/tokenize.c
@@ -371,11 +371,11 @@ static bool is_contraction_word(Dictionary dict, const char *s)
 static bool is_afdict_punc(const Dictionary afdict, const char *word)
 {
 	if (NULL == afdict) return false;
-	int punc_types[] = { AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC, 0 };
+	int punc_type[] = {AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC};
 
-	for (int affix_punc = 0; punc_types[affix_punc] != 0; affix_punc++)
+	for (size_t affix_punc = 0; affix_punc < ARRAY_SIZE(punc_type); affix_punc++)
 	{
-		const Afdict_class * punc_list = AFCLASS(afdict, punc_types[affix_punc]);
+		const Afdict_class * punc_list = AFCLASS(afdict, punc_type[affix_punc]);
 		size_t l_strippable = punc_list->length;
 		const char * const * punc = punc_list->string;
 

From 2e85ae37cda594916d6be3098c612b2792627307 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 18 Jul 2022 23:37:28 +0300
Subject: [PATCH 08/24] afdict_classnum: Start with 0

---
 link-grammar/dict-common/dict-affix.h  |  7 ++-----
 link-grammar/dict-common/dict-common.c |  7 +++----
 link-grammar/tokenize/tokenize.c       | 11 ++++-------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h
index 4d333844c1..ff64a7fbf1 100644
--- a/link-grammar/dict-common/dict-affix.h
+++ b/link-grammar/dict-common/dict-affix.h
@@ -21,7 +21,7 @@
 /* Connector names for the affix class lists in the affix file */
 
 typedef enum {
-	AFDICT_RPUNC=1,
+	AFDICT_RPUNC,
 	AFDICT_LPUNC,
 	AFDICT_MPUNC,
 	AFDICT_UNITS,
@@ -41,13 +41,10 @@ typedef enum {
 	AFDICT_REGALTS,
 	AFDICT_REGPARTS,
 
-	/* Have to have one last entry, to get the array size correct */
-	AFDICT_LAST_ENTRY,
 	AFDICT_NUM_ENTRIES
 } afdict_classnum;
 
 #define AFDICT_CLASSNAMES1 \
-	"invalid classname", \
 	"RPUNC", \
 	"LPUNC", \
 	"MPUNC", \
@@ -69,7 +66,7 @@ typedef enum {
 	"REGALTS",     /* Min&max number of alternatives to issue for a word */\
 	"REGPARTS",    /* Max number of word partitions */
 
-#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2 "last classname"
+#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2
 #define AFCLASS(afdict, class) (&afdict->afdict_class[class])
 
 Afdict_class * afdict_find(Dictionary, const char *, bool);
diff --git a/link-grammar/dict-common/dict-common.c b/link-grammar/dict-common/dict-common.c
index 327f817cfa..af8956fb74 100644
--- a/link-grammar/dict-common/dict-common.c
+++ b/link-grammar/dict-common/dict-common.c
@@ -304,11 +304,10 @@ static void affix_list_delete(Dictionary dict)
 {
 	if (NULL == dict->afdict_class) return;
 
-	int i;
-	Afdict_class * atc;
-	for (i=0, atc = dict->afdict_class; i < AFDICT_NUM_ENTRIES; i++, atc++)
+	Afdict_class * atc = dict->afdict_class;
+	for (size_t i = 0;  i < AFDICT_NUM_ENTRIES; i++)
 	{
-		if (atc->string) free(atc->string);
+		if (atc[i].length > 0) free(atc[i].string);
 	}
 	free(dict->afdict_class);
 	dict->afdict_class = NULL;
diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c
index c7ce6dfe4d..2b298620a9 100644
--- a/link-grammar/tokenize/tokenize.c
+++ b/link-grammar/tokenize/tokenize.c
@@ -371,16 +371,13 @@ static bool is_contraction_word(Dictionary dict, const char *s)
 static bool is_afdict_punc(const Dictionary afdict, const char *word)
 {
 	if (NULL == afdict) return false;
-	int punc_type[] = {AFDICT_RPUNC, AFDICT_MPUNC, AFDICT_LPUNC};
 
-	for (size_t affix_punc = 0; affix_punc < ARRAY_SIZE(punc_type); affix_punc++)
+	for (size_t punc = 0; punc < ARRAY_SIZE(affix_strippable); punc++)
 	{
-		const Afdict_class * punc_list = AFCLASS(afdict, punc_type[affix_punc]);
-		size_t l_strippable = punc_list->length;
-		const char * const * punc = punc_list->string;
+		const Afdict_class * punc_list = AFCLASS(afdict, affix_strippable[punc]);
 
-		for (size_t i = 0; i < l_strippable; i++)
-			if (0 == strcmp(word, punc[i])) return true;
+		for (size_t i = 0; i < punc_list->length; i++)
+			if (0 == strcmp(word, punc_list->string[i])) return true;
 	}
 
 	return false;

From 2c619fbb94fc39d3e54bea337a01d75c270e916e Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Fri, 22 Jul 2022 15:43:30 +0300
Subject: [PATCH 09/24] LPUNC,MPUNC: Fix subscript mishandling

---
 link-grammar/tokenize/tokenize.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c
index 2b298620a9..357a993702 100644
--- a/link-grammar/tokenize/tokenize.c
+++ b/link-grammar/tokenize/tokenize.c
@@ -374,10 +374,17 @@ static bool is_afdict_punc(const Dictionary afdict, const char *word)
 
 	for (size_t punc = 0; punc < ARRAY_SIZE(affix_strippable); punc++)
 	{
-		const Afdict_class * punc_list = AFCLASS(afdict, affix_strippable[punc]);
+		if (AFDICT_UNITS == affix_strippable[punc]) continue;
+		const Afdict_class *punc_list = AFCLASS(afdict, affix_strippable[punc]);
 
 		for (size_t i = 0; i < punc_list->length; i++)
-			if (0 == strcmp(word, punc_list->string[i])) return true;
+		{
+			/* Exact match, subscript included: a subscripted word needs a subscripted affix. */
+			const char *p = punc_list->string[i];
+			const char *w = word;
+			while ((*w == *p) && (*w != '\0')) { w++; p++; }
+			if (*w == *p) return true;
+		}
 	}
 
 	return false;
@@ -1770,7 +1777,8 @@ static int split_mpunc(Sentence sent, const char *word, char *w,
 	{
 		for (size_t i = 0; i < l_strippable; i++)
 		{
-			size_t sz = strlen(mpunc[i]);
+			/* Find the token length, but stop at the subscript mark if one exists. */
+			size_t sz = strcspn(mpunc[i], subscript_mark_str());
 			if (0 == strncmp(sep, mpunc[i], sz))
 			{
 				if ('\0' == sep[sz]) continue; // mpunc in end position
@@ -1826,7 +1834,8 @@ static const char *strip_left(Sentence sent, const char * w,
 	{
 		for (i=0; i<l_strippable; i++)
 		{
-			size_t sz = strlen(lpunc[i]);
+			/* Find the token length, but stop at the subscript mark if one exists. */
+			size_t sz = strcspn(lpunc[i], subscript_mark_str());
 
 			if (strncmp(w, lpunc[i], sz) == 0)
 			{
@@ -1961,8 +1970,7 @@ static bool strip_right(Sentence sent,
 		{
 			const char *t = rword[i];
 
-			/* Units contain a subscript mark. Punctuation do not contain it.
-			 * Find the token length, but stop at the subscript mark if exists. */
+			/* Find the token length, but stop at the subscript mark if one exists. */
 			len = strcspn(t, subscript_mark_str());
 
 			/* The remaining word is too short for a possible match */

From 79edc7bff238b194bf67310ac7f8230717236702 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Sat, 23 Jul 2022 11:00:49 +0300
Subject: [PATCH 10/24] determine_word_expressions(): Debug print the word
 status too

---
 link-grammar/tokenize/tokenize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c
index 357a993702..611dac406c 100644
--- a/link-grammar/tokenize/tokenize.c
+++ b/link-grammar/tokenize/tokenize.c
@@ -3225,8 +3225,8 @@ static bool determine_word_expressions(Sentence sent, Gword *w,
 				 w->regex_name ? w->regex_name : "");
 		while (we)
 		{
-			prt_error("Debug:  string='%s' expr=%s\n",
-			          we->string, exp_stringify(we->exp));
+			prt_error("Debug:  string='%s' status=%s expr=%s\n",
+			          we->string, gword_status(sent, w), exp_stringify(we->exp));
 			we = we->next;
 		}
 	}

From 76737019dd9e80b50d65bef5fa1a3b867648a3f4 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 25 Jul 2022 01:50:31 +0300
Subject: [PATCH 11/24] Move afdict_classnames[] to dict-affix.h

---
 link-grammar/dict-common/dict-affix.h | 4 +++-
 link-grammar/dict-common/dict-impl.c  | 5 +----
 link-grammar/tokenize/anysplit.c      | 2 --
 link-grammar/tokenize/tokenize.c      | 1 -
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/link-grammar/dict-common/dict-affix.h b/link-grammar/dict-common/dict-affix.h
index ff64a7fbf1..f3ad986c8b 100644
--- a/link-grammar/dict-common/dict-affix.h
+++ b/link-grammar/dict-common/dict-affix.h
@@ -66,7 +66,9 @@ typedef enum {
 	"REGALTS",     /* Min&max number of alternatives to issue for a word */\
 	"REGPARTS",    /* Max number of word partitions */
 
-#define AFDICT_CLASSNAMES AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2
+static const char * const afdict_classname[] =
+	{AFDICT_CLASSNAMES1 AFDICT_CLASSNAMES2};
+
 #define AFCLASS(afdict, class) (&afdict->afdict_class[class])
 
 Afdict_class * afdict_find(Dictionary, const char *, bool);
diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index fdc6da6ae2..82952ac0e9 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -449,7 +449,6 @@ bool dictionary_setup_defines(Dictionary dict)
 	return true;
 }
 
-/* ======================================================================= */
 /* ======================================================================= */
 
 /* The affix dictionary is represented as a dynamically allocated array with
@@ -457,8 +456,6 @@ bool dictionary_setup_defines(Dictionary dict)
  * has a pointer to an array of strings which are the punctuation/affix
  * names. */
 
-const char * afdict_classname[] = { AFDICT_CLASSNAMES };
-
 /** initialize the affix class table */
 void afclass_init(Dictionary dict)
 {
@@ -480,7 +477,7 @@ void afclass_init(Dictionary dict)
  */
 Afdict_class * afdict_find(Dictionary afdict, const char * con, bool notify_err)
 {
-	const char ** ac;
+	const char * const * ac;
 
 	for (ac = afdict_classname;
 	     ac < &afdict_classname[ARRAY_SIZE(afdict_classname)]; ac++)
diff --git a/link-grammar/tokenize/anysplit.c b/link-grammar/tokenize/anysplit.c
index 53803f8b04..4d7e06e21f 100644
--- a/link-grammar/tokenize/anysplit.c
+++ b/link-grammar/tokenize/anysplit.c
@@ -43,8 +43,6 @@
 
 #define MAX_WORD_TO_SPLIT 63 /* in codepoints */
 
-extern const char * const afdict_classname[];
-
 typedef int p_start;     /* partition start in a word */
 typedef p_start *p_list; /* list of partitions in a word */
 
diff --git a/link-grammar/tokenize/tokenize.c b/link-grammar/tokenize/tokenize.c
index 611dac406c..051475085d 100644
--- a/link-grammar/tokenize/tokenize.c
+++ b/link-grammar/tokenize/tokenize.c
@@ -1929,7 +1929,6 @@ static const char *strip_left(Sentence sent, const char * w,
  *
  * p is a mark of the invocation position, for debugging.
  */
-extern const char *const afdict_classname[]; /* For debug message only */
 static bool strip_right(Sentence sent,
                         const char *w,
                         const char **wend,

From 68c23981510e2146f126b8693f9353eedaf8c4e2 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 25 Jul 2022 23:13:40 +0300
Subject: [PATCH 12/24] afdict_init(): Warn on unknown strippable affix tokens

This check cannot be done in affix_list_add() because it doesn't have
access to the dict handle (there is no point in an extensive code
restructuring just for that).
---
 link-grammar/dict-common/dict-impl.c | 37 ++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index 82952ac0e9..d5f9cf670d 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -18,6 +18,7 @@
 #include "dict-api.h"
 #include "dict-defines.h"
 #include "dict-impl.h"
+#include "print/print-util.h"           // patch_subscript_mark
 #include "regex-morph.h"
 #include "dict-structures.h"
 #include "string-set.h"
@@ -723,6 +724,42 @@ bool afdict_init(Dictionary dict)
 		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
 	}
 
+	if (!IS_DYNAMIC_DICT(dict))
+	{
+		/* Validate that the strippable tokens are in the dict.
+		 * UNITS are assumed to be from the dict only.
+		 * Possible FIXME: Allow also tokens that match a regex (a tokenizer
+		 * change is needed to recognize them). */
+		for (size_t i = 0; i < ARRAY_SIZE(affix_strippable); i++)
+		{
+			if (AFDICT_UNITS != affix_strippable[i])
+			{
+				ac = AFCLASS(afdict, affix_strippable[i]);
+				bool not_in_dict = false;
+
+				for (size_t n = 0;  n < ac->length; n++)
+				{
+					if (!dict_has_word(dict, ac->string[n]))
+					{
+						if (!not_in_dict)
+						{
+							not_in_dict = true;
+							prt_error("Warning: afdict_init: Class %s in file %s: "
+							          "Token(s) not in the dictionary:",
+							          afdict_classname[affix_strippable[i]],
+							          afdict->name);
+						}
+
+						char *s = strdupa(ac->string[n]);
+						patch_subscript_mark(s);
+						prt_error(" \"%s\"", s);
+					}
+				}
+				if (not_in_dict) prt_error("\n");
+			}
+		}
+	}
+
 	/* Sort the affix-classes of tokens to be stripped. */
 	/* Longer unit names must get split off before shorter ones.
 	 * This prevents single-letter splits from screwing things

From 151bcc397fd5a91df0c6a462c2df22bfcf176473 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Mon, 25 Jul 2022 23:49:55 +0300
Subject: [PATCH 13/24] afdict_init(): Don't stop on first error

---
 link-grammar/dict-common/dict-impl.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/link-grammar/dict-common/dict-impl.c b/link-grammar/dict-common/dict-impl.c
index d5f9cf670d..3674d14ef8 100644
--- a/link-grammar/dict-common/dict-impl.c
+++ b/link-grammar/dict-common/dict-impl.c
@@ -627,6 +627,7 @@ bool afdict_init(Dictionary dict)
 {
 	Afdict_class * ac;
 	Dictionary afdict = dict->affix_table;
+	bool error_found = false;
 
 	/* FIXME: read_entry() builds word lists in reverse order (can we
 	 * just create the list top-down without breaking anything?). Unless
@@ -718,10 +719,13 @@ bool afdict_init(Dictionary dict)
 			prt_error("Error: afdict_init: Failed to compile "
 			          "regex /%s/ in file \"%s\"\n",
 			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name);
-			return false;
+			error_found = true;
+		}
+		else
+		{
+			lgdebug(+D_AI, "%s regex %s\n",
+			        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
 		}
-		lgdebug(+D_AI, "%s regex %s\n",
-		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
 	}
 
 	if (!IS_DYNAMIC_DICT(dict))
@@ -807,6 +811,6 @@ bool afdict_init(Dictionary dict)
 		lg_error_flush();
 	}
 
-	return true;
+	return !error_found;
 }
 #undef D_AI

From 59d9138e6e84624ae13c065c7a9989f8feca259c Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 00:10:17 +0300
Subject: [PATCH 14/24] he/4.0.affix: Remove a comment that is too far ahead of
 its time

---
 data/he/4.0.affix | 2 --
 1 file changed, 2 deletions(-)

diff --git a/data/he/4.0.affix b/data/he/4.0.affix
index 70d0f2555a..403b8bc23a 100644
--- a/data/he/4.0.affix
+++ b/data/he/4.0.affix
@@ -1,5 +1,3 @@
-% need to add Hebrew punctuations
-
 ")" "}" "]" ">" » 〉 ） 〕 》 】 ］ 』 」 "’’" "’" ''.y '.y
 "%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ？！ ….y ....y "”" "–" "‐" 、 ～
 ¢ ₵ ™ ℠ : RPUNC+;

From 715937818efe27710536d015ae15ffa23325933e Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 00:34:57 +0300
Subject: [PATCH 15/24] he: Revise list of strippable affixes

Also switch 4.0.dict.m4 to "#define version/locale" and <UNKNOWN-WORD>
(until now they were used only in 4.0.dict).
---
 data/he/4.0.affix   |  9 ++-------
 data/he/4.0.dict    |  6 +++++-
 data/he/4.0.dict.m4 | 11 +++++++----
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/data/he/4.0.affix b/data/he/4.0.affix
index 403b8bc23a..fdd081f40a 100644
--- a/data/he/4.0.affix
+++ b/data/he/4.0.affix
@@ -1,11 +1,6 @@
-")" "}" "]" ">" » 〉 ） 〕 》 】 ］ 』 」 "’’" "’" ''.y '.y
-"%" "," "." 。 ":" ";" "?" "!" ‽ ؟ ？！ ….y ....y "”" "–" "‐" 、 ～
-¢ ₵ ™ ℠ : RPUNC+;
+")" """ "," "." "–" "‐" ... ":" ";" "?" "!": RPUNC+;
 
-"(" "{" "[" "<" « 〈 （ 〔 《 【 ［ 『 「 `` „ “ ‘ ''.x '.x ….x ....x
-¿ ¡ "$" 
-£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺  ℳ  ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼  ₸ ₮ ₩ ¥ ៛ 호점
-† †† ‡ § ¶ © ® ℗ № "#": LPUNC+;
+"(" „ “ """ ... ₪: LPUNC+;
 
 =: INFIXMARK+;
 % The order is preserved.
diff --git a/data/he/4.0.dict b/data/he/4.0.dict
index 206db9becf..299a2799a2 100644
--- a/data/he/4.0.dict
+++ b/data/he/4.0.dict
@@ -40,7 +40,7 @@
 % Among numerous other things, changes to handle count/uncountable changes
 % have not been done yet. The created infrastructure for that may still need changes.
 
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         he_IL.UTF-8;
 
 % For now.
@@ -562,3 +562,7 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ &
 % skip over (null-link) unknown words. If you remove it, the parser
 % will output an error for any unknown words.
 <UNKNOWN-WORD>: XXX+;
+
+% Punctuation that gets stripped but is not yet handled.
+ "." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" ₪: <UNKNOWN-WORD>;    % RPUNC
+"(" „ “ "".x" ....x: <UNKNOWN-WORD>;                                 % LPUNC
diff --git a/data/he/4.0.dict.m4 b/data/he/4.0.dict.m4
index da0cf81d21..6724318cbd 100644
--- a/data/he/4.0.dict.m4
+++ b/data/he/4.0.dict.m4
@@ -45,9 +45,8 @@ changecom(`%')dnl
 % Among numerous other things, changes to handle count/uncountable changes
 % have not been done yet. The created infrastructure for that may still need changes.
 
-% Dictionary version number is 5.3.15 (formatted as V5v3v15+)
-<dictionary-version-number>: V5v3v15+;
-<dictionary-locale>: HE4il+;
+#define dictionary-version-number 5.11.0;
+#define dictionary-locale         he_IL.UTF-8;
 
 % For now.
 LEFT-WALL: {Wa+} or {Wd+} or ();
@@ -458,4 +457,8 @@ or Ds+ or (R- & (C+ or RS+)) or SIs- or (Ss+ &
 % With the following line in the dictionary, the parser will simply
 % skip over (null-link) unknown words. If you remove it, the parser
 % will output an error for any unknown words.
-UNKNOWN-WORD: XXX+;
+<UNKNOWN-WORD>: XXX+;
+
+% Punctuation that gets stripped but is not yet handled.
+ "." "–" "‐" ")" "".y" "....y" ":" ";" "?" "!" ₪: <UNKNOWN-WORD>;    % RPUNC
+"(" „ “ "".x" ....x: <UNKNOWN-WORD>;                                 % LPUNC

From 6dcfef434b85027188adf7f997206b839995f68b Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 00:52:33 +0300
Subject: [PATCH 16/24] vn: Revise list of strippable affixes

Only the comma is defined in the dict, so keep only that.
This is just a minimal fix to prevent errors due to strippable affixes
that are not found in the dict.
---
 data/vn/4.0.affix | 24 ++----------------------
 data/vn/4.0.dict  |  2 +-
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/data/vn/4.0.affix b/data/vn/4.0.affix
index 4c56eb052e..d3c445448f 100644
--- a/data/vn/4.0.affix
+++ b/data/vn/4.0.affix
@@ -1,31 +1,11 @@
-% 
 % Affixes get stripped off the left and right side of words 
 % i.e. spaces are inserted between the affix and the word itself.
-%
-% Some of the funky UTF-8 parenthesis are used in Asian texts.
-% In order to allow single straight quote ' and double straight quote ''
-% to be stripped off from both the left and the right, they are
-% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y)
-% 
-% 。is an end-of-sentence marker used in Japanese texts.
 
 % Punctuation appearing on the right-side of words.
-% Note: the ellipsis ....y must appear *before* the dot ".", else the
-% splitting won't work right.
-")" "}" "]" ">" "".y" » 〉 ） 〕 》 】 ］ 』 」 "’’" "’" “ ''.y '.y `.y
-"%" "," ....y "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ？ ！ ….y "”" ━.y –.y ー.y ‐.y 、.y
-～ ¢ ₵ ™ ℠ 
-  : RPUNC+; 
+"," : RPUNC+; % Only this punctuation appears in the dict.
 
 % Punctuation appearing on the left-side of words.
-"(" "{" "[" "<" "".x" « 〈 （ 〔 《 【 ［ 『 「 、.x `.x `` „ ‘ ''.x '.x ….x ....x
-¿ ¡ "$" US$ USD C$ 
-£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺  ℳ  ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼  ₸ ₮ ₩ ¥ ៛ 호점
-† †† ‡ § ¶ © ® ℗ № "#"
-* • ⁂ ❧ ☞ ◊ ※  ○  。.x ゜ ✿ ☆ ＊ ◕ ● ∇ □ ◇ ＠ ◎ 
-–.x ━.x ー.x -- - ‧.x
-  : LPUNC+;
-
+%  : LPUNC+; % None.
 
 % The below is a quoted list, used during tokenization. Do NOT put
 % spaces in between the various quotation marks!!
diff --git a/data/vn/4.0.dict b/data/vn/4.0.dict
index 3d49aa58f9..c65ae7b06b 100644
--- a/data/vn/4.0.dict
+++ b/data/vn/4.0.dict
@@ -1,6 +1,6 @@
 % Vietnamese Dictionary
 
-#define dictionary-version-number 5.9.2;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         vi_VN.UTF-8;
 
 % See https://bitbucket.org/ngocminh/lienkate for the master

From c97236772b41efdc62364aa57464558cc230213e Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 01:06:54 +0300
Subject: [PATCH 17/24] de: Revise list of strippable affixes

---
 data/de/4.0.affix | 4 ++--
 data/de/4.0.dict  | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/data/de/4.0.affix b/data/de/4.0.affix
index 7b122443da..90a224758e 100644
--- a/data/de/4.0.affix
+++ b/data/de/4.0.affix
@@ -1,6 +1,6 @@
 
-")" "%" "," "." ":" ";" "?" "!" "''" "'" "'s": RPUNC+;
-"(" "$" "``": LPUNC+;
+")" "," "." ":" ";" "?" "!" "'" „ « ...: RPUNC+;
+"(" “ » ...: LPUNC+;
 
 %e en er es em: SUF+;
 .=: STEMSUBSCR+;
diff --git a/data/de/4.0.dict b/data/de/4.0.dict
index ed2608b791..8ab9c054ee 100644
--- a/data/de/4.0.dict
+++ b/data/de/4.0.dict
@@ -7,7 +7,7 @@
  %                                                                           %
  %***************************************************************************%
 
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         de_DE.UTF-8;
 
 % NOUNS, PRONOUNS
@@ -521,3 +521,7 @@ PL-CAPITALIZED-WORDS:    BZZT+;
 % will output an error for any unknown words.
 <UNKNOWN-WORD>: NO+;
 
+% Unhandled, but getting split according to 4.0.affix.
+")" ":" ";" "'" „ « ....y: <UNKNOWN-WORD>;     % RPUNC
+"(" “ » ....x: <UNKNOWN-WORD>;                 % LPUNC
+

From f4cd18d1ac50cd5c79e7838c6a45a462074e6957 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 01:21:34 +0300
Subject: [PATCH 18/24] tr: Revise list of strippable affixes

---
 data/tr/4.0.affix | 5 +++--
 data/tr/4.0.dict  | 6 +++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/data/tr/4.0.affix b/data/tr/4.0.affix
index 5cebea5832..fcd866cf78 100644
--- a/data/tr/4.0.affix
+++ b/data/tr/4.0.affix
@@ -1,5 +1,6 @@
-")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+;
-"(" "$" "``": LPUNC+;
+% Of these affixes, only "," is handled in the dictionary.
+")" "%" "," "." ":" ";" "?" "!" "'": RPUNC+;
+"(" "$" : LPUNC+;
 
 dakileri ki ler leri kileri düm m di lar diler um yor mi ni diğ diğini ini yorum miyorum ın in un ün nın nin nun nün a e ya ye i ı u ü yi yı yu yü yla ile yle da de ta te den ten dan tan sız: SUF+;
 
diff --git a/data/tr/4.0.dict b/data/tr/4.0.dict
index 099b3a868f..43798fbdeb 100644
--- a/data/tr/4.0.dict
+++ b/data/tr/4.0.dict
@@ -1,7 +1,7 @@
 %
 % Turkish dictionary -- experimental -- 2016 -- Tatiana Batura, Maria Mitkovskaya, Natalya Semenova
 %
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         tr_TR.UTF-8;
 
 % adjectives
@@ -119,3 +119,7 @@ PL-GREEK-LETTER-AND-NUMBER: XXX-;
 CAPITALIZED-WORDS: XXX-;
 HYPHENATED-WORDS: XXX-;
 <UNKNOWN-WORD>: XXX-;
+
+% Unhandled, but getting split according to 4.0.affix.
+")" "%" "." ":" ";" "?" "!" "'": <UNKNOWN-WORD>;      % RPUNC
+"(" "$": <UNKNOWN-WORD>;                              % LPUNC

From 30aa3a940420ba9471402e0ba84365d8ea7b3fec Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 13:28:16 +0300
Subject: [PATCH 19/24] kz: Revise list of strippable affixes

---
 data/kz/4.0.affix | 1 +
 data/kz/4.0.dict  | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/data/kz/4.0.affix b/data/kz/4.0.affix
index f7795ee5c1..381a784079 100644
--- a/data/kz/4.0.affix
+++ b/data/kz/4.0.affix
@@ -1,3 +1,4 @@
+% Of these affixes, only "," is handled in the dictionary.
 ")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+;
 "(" "$" "``": LPUNC+;
 
diff --git a/data/kz/4.0.dict b/data/kz/4.0.dict
index 688c027282..a1eccb6b73 100644
--- a/data/kz/4.0.dict
+++ b/data/kz/4.0.dict
@@ -2,7 +2,7 @@
 % Kazakh dictionary -- experimental -- 2016
 % -- Tatiana Batura, Aigerim Bakiyeva, Aigerim Yerimbetova
 %
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         kk_KZ.UTF-8;
 
 мен.pron: {W-} & S1s+;
@@ -133,3 +133,7 @@ LATIN-ADJ-P-NOUN-WORDS: XXX-;
 LATIN-ADJ-S-NOUN-WORDS: XXX-;
 HYPHENATED-WORDS: XXX-;
 <UNKNOWN-WORD>: XXX-;
+
+% Unhandled, but getting split according to 4.0.affix.
+")" "%" "." ":" ";" "?" "!" "''" "'" : <UNKNOWN-WORD>;    % RPUNC
+"(" "$" "``": <UNKNOWN-WORD>;                             % LPUNC

From 753c435920e40cf22460236d65f22b8b98926d3a Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 17:26:51 +0300
Subject: [PATCH 20/24] id: Revise list of strippable affixes

---
 data/id/4.0.affix | 34 ++--------------------------------
 data/id/4.0.dict  |  2 +-
 2 files changed, 3 insertions(+), 33 deletions(-)

diff --git a/data/id/4.0.affix b/data/id/4.0.affix
index 146d905919..1e709e1f6e 100644
--- a/data/id/4.0.affix
+++ b/data/id/4.0.affix
@@ -1,30 +1,11 @@
-% 
 % Affixes get stripped off the left and right side of words 
 % i.e. spaces are inserted between the affix and the word itself.
-%
-% Some of the funky UTF-8 parenthesis are used in Asian texts.
-% In order to allow single straight quote ' and double straight quote ''
-% to be stripped off from both the left and the right, they are
-% distinguished by the suffix .x and .y (as as Mr.x Mrs.x or Jr.y Sr.y)
-% 
-% 。is an end-of-sentence marker used in Japanese texts.
 
 % Punctuation appearing on the right-side of words.
-")" "}" "]" ">" » 〉 ） 〕 》 】 ］ 』 」 "’’" "’" ''.y '.y ' `
-"%" "," "." 。.y ‧ ":" ";" "?" "!" ‽ ؟ ？ ！ ….y ....y "”" ━.y –.y ー.y ‐.y 、.y
-～ ¢ ₵ ™ ℠ : RPUNC+; 
+"," : RPUNC+; % Only this punctuation appears in the dict.
 
 % Punctuation appearing on the left-side of words.
-"(" "{" "[" "<" « 〈 （ 〔 《 【 ［ 『 「 、.x ` `` „ “ ‘ ''.x '.x ….x ....x
-¿ ¡ "$" US$ USD C$ 
-£ ₤ € ¤ ₳ ฿ ₡ ₢ ₠ ₫ ৳ ƒ ₣ ₲ ₴ ₭ ₺  ℳ  ₥ ₦ ₧ ₱ ₰ ₹ ₨ ₪ ﷼  ₸ ₮ ₩ ¥ ៛ 호점
-† †† ‡ § ¶ © ® ℗ № "#"
-* • ⁂ ❧ ☞ ◊ ※  ○  。.x ゜ ✿ ☆ ＊ ◕ ● ∇ □ ◇ ＠ ◎ 
-–.x ━.x ー.x -- - ‧.x
-  : LPUNC+;
-
-% Suffixes
-'s 're 've 'd 'll 'm ’s ’re ’ve ’d ’ll ’m: SUF+;
+%  : LPUNC+; % none.
 
 % The below is a quoted list, used during tokenization. Do NOT put
 % spaces in between the various quotation marks!!
@@ -33,14 +14,3 @@
 % The below is a quoted list, used during tokenization. Do NOT put
 % spaces in between the various symbols!!
 "()¿¡†‡§¶©®℗№#*•⁂❧☞◊※○。゜✿☆＊◕●∇□◇＠◎–━ー---‧": BULLETS+;
-
-/en/words/units.1: UNITS+;
-/en/words/units.1.dot: UNITS+;
-/en/words/units.3: UNITS+;
-/en/words/units.4: UNITS+;
-/en/words/units.4.dot: UNITS+;
-/en/words/units.5: UNITS+;
-%
-% units.6 contains just a single, sole slash in it. This allows units
-% such as mL/s to be split at the slash.
-/en/words/units.6: UNITS+;
diff --git a/data/id/4.0.dict b/data/id/4.0.dict
index 4f39a38f94..41d138ba35 100644
--- a/data/id/4.0.dict
+++ b/data/id/4.0.dict
@@ -7,7 +7,7 @@
  %                                                                           %
  %***************************************************************************%
 
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         id_ID.UTF-8;
 
 anjing kucing wanita cewek pria cowok lelaki laki-laki taman lapangan tulang

From fe0af9bb70437bde58b53775aa37e3f1daa5ec4f Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 17:15:50 +0300
Subject: [PATCH 21/24] fa: Revise list of strippable affixes

---
 data/fa/4.0.affix | 4 ++--
 data/fa/4.0.dict  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/data/fa/4.0.affix b/data/fa/4.0.affix
index 9f42f51e41..b726baa24f 100644
--- a/data/fa/4.0.affix
+++ b/data/fa/4.0.affix
@@ -1,3 +1,3 @@
 
-")" "%" "," "." ":" ";" "?" "!" "''" "'" : RPUNC+;
-"(" "$" "``": LPUNC+;
+")" "%" "," "." ":" ";" "?" "!" "'" : RPUNC+;
+"(" "$": LPUNC+;
diff --git a/data/fa/4.0.dict b/data/fa/4.0.dict
index f13ef9490b..3b41321378 100644
--- a/data/fa/4.0.dict
+++ b/data/fa/4.0.dict
@@ -10,7 +10,7 @@
 %
 %*****************************************************************************
 
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         fa_IR.UTF-8;
 
 % PERSIAN SYNTAX NOTES:
@@ -835,11 +835,11 @@ or
 
 ;
 
+% Unhandled, but getting split according to 4.0.affix.
+")" "%" ";" "'": ();                      % RPUNC
+"(" "$": ();                              % LPUNC
 
 
-%":" ";" :	% Colon and Semicolon
-%;
-
 %NUMBERS:   ;
 LEFT-WALL: ( Wcc+ or [Wi+] or [[Wd+]] ) & {Xp+}; % Connects to the Subject in declarative sentence, or Verb in subjectless or imperative sentence
 RIGHT-WALL: RW- ;

From 3a162fec479cf22469c47b292d0b4e8716ae3985 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 17:42:21 +0300
Subject: [PATCH 22/24] ru: Revise list of strippable affixes

---
 data/ru/4.0.affix | 4 ++--
 data/ru/4.0.dict  | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/data/ru/4.0.affix b/data/ru/4.0.affix
index 324f50e716..1249897269 100644
--- a/data/ru/4.0.affix
+++ b/data/ru/4.0.affix
@@ -1,5 +1,5 @@
-")" "%" "," "." ":" ";" "?" "!" "''" "'": RPUNC+;
-"(" "$" "``": LPUNC+;
+")" "%" "," "." ":" ";" "?" "!" "'": RPUNC+;
+"(" "$": LPUNC+;
 
 ""«»《》【】『』`„“": QUOTES+;
 
diff --git a/data/ru/4.0.dict b/data/ru/4.0.dict
index ca0042b581..3c12e6e02b 100644
--- a/data/ru/4.0.dict
+++ b/data/ru/4.0.dict
@@ -7,7 +7,7 @@
 %%
 %% This file uses the utf8 encoding
 
-#define dictionary-version-number 5.9.0;
+#define dictionary-version-number 5.11.0;
 #define dictionary-locale         ru_RU.UTF-8;
 #define max-disjunct-cost         2.7;
 #define panic_max-disjunct-cost   4.0;
@@ -1096,3 +1096,7 @@ LENGTH-LIMIT-1: LL*+;
 % skip over (null-link) unknown words. If you remove it, the parser
 % will output an error for any unknown words.
 <UNKNOWN-WORD>: XXX+;
+
+% Unhandled, but getting split according to 4.0.affix.
+"%" "'": <UNKNOWN-WORD>;        % RPUNC
+"$": <UNKNOWN-WORD>;            % LPUNC

From 56fb7251dff1910fa9110cda077683c41a265660 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 17:44:12 +0300
Subject: [PATCH 23/24] ru/4.0.dict: Fix typo in panic_max-disjunct-cost

---
 data/ru/4.0.dict | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/ru/4.0.dict b/data/ru/4.0.dict
index 3c12e6e02b..1501c9c2e6 100644
--- a/data/ru/4.0.dict
+++ b/data/ru/4.0.dict
@@ -10,7 +10,7 @@
 #define dictionary-version-number 5.11.0;
 #define dictionary-locale         ru_RU.UTF-8;
 #define max-disjunct-cost         2.7;
-#define panic_max-disjunct-cost   4.0;
+#define panic-max-disjunct-cost   4.0;
 
 <costly-null>: [[[[]]]];
 

From 900a4ba79827950e0774d63ddd6a86ccb27cfd82 Mon Sep 17 00:00:00 2001
From: ampli <amirpli@gmail.com>
Date: Tue, 26 Jul 2022 19:23:00 +0300
Subject: [PATCH 24/24] tests.py ZTHLangTestCase: Filter out errors on
 nonexistent affixes

---
 bindings/python-examples/tests.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/bindings/python-examples/tests.py b/bindings/python-examples/tests.py
index d419a6090b..7cfa01cf00 100755
--- a/bindings/python-examples/tests.py
+++ b/bindings/python-examples/tests.py
@@ -77,11 +77,11 @@ def test_open_nonexistent_dictionary(self):
 
         save_stderr = divert_start(2)
         self.assertRaises(LG_DictionaryError, Dictionary, dummy_lang + '1')
-        self.assertIn(dummy_lang + '1', save_stderr.divert_end())
+        self.assertIn(dummy_lang + '1', str(save_stderr.divert_end()))
 
         save_stderr = divert_start(2)
         self.assertRaises(LG_Error, Dictionary, dummy_lang + '2')
-        self.assertIn(dummy_lang + '2', save_stderr.divert_end())
+        self.assertIn(dummy_lang + '2', str(save_stderr.divert_end()))
 
 # Check absolute and relative dictionary access.
 # Check also that the dictionary language is set correctly.
@@ -506,7 +506,7 @@ def test_22_defaut_handler_param(self):
         dummy_lang = "a dummy dict name (bad param test)"
         self.assertRaises(LG_Error, Dictionary, dummy_lang)
         LG_Error.printall(self.error_handler_test, self) # grab a valid errinfo
-        #self.assertIn(dummy_lang, save_stderr.divert_end())
+        #self.assertIn(dummy_lang, str(save_stderr.divert_end()))
         self.assertRaisesRegexp(TypeError, "must be an integer",
                                 self.__class__.handler["default"],
                                 self.errinfo, "bad param")
@@ -520,7 +520,7 @@ def test_22_defaut_handler_param(self):
             self.param_ok = False
             save_stdout = divert_start(1) # Note: Handler parameter is stdout
             self.__class__.handler["default"](self.errinfo, 1)
-            self.assertIn(dummy_lang, save_stdout.divert_end())
+            self.assertIn(dummy_lang, str(save_stdout.divert_end()))
             self.param_ok = True
         except (TypeError, ValueError):
             self.assertTrue(self.param_ok)
@@ -598,7 +598,7 @@ def test_50_set_orig_error_handler(self):
         dummy_lang = "a dummy dict name (default handler test)"
         save_stderr = divert_start(2)
         self.assertRaises(LG_Error, Dictionary, dummy_lang)
-        self.assertIn(dummy_lang, save_stderr.divert_end())
+        self.assertIn(dummy_lang, str(save_stderr.divert_end()))
         self.assertEqual(self.errinfo, "dummy")
 
 class FSATsolverTestCase(unittest.TestCase):
@@ -1281,9 +1281,16 @@ def test_d_morphology(self):
              '.', 'RIGHT-WALL'])
 
 
+# The Thai 4.0.affix files currently contain strippable affixes that are
+# not in the dict. This causes annoying multiline error output, which is
+# filtered out here using divert_start()/divert_end().
 class ZTHLangTestCase(unittest.TestCase):
     def test_thai(self):
+        save_stderr = divert_start(2)
         linkage_testfile(self, Dictionary(lang='th'), ParseOptions())
+        for line in save_stderr.divert_end().decode().split("\n"):
+           if 'Token(s) not in the dictionary' not in line:
+              print(line)
 
 
 class ZXDictDialectTestCase(unittest.TestCase):
@@ -1501,7 +1508,7 @@ def divert_end(self):
         os.close(self.savedfd)
         os.unlink(self.filename)
         self.filename = None
-        return str(content)
+        return content
 
     __del__ = divert_end