add japanese, finally fix book lang stuff, format lang everything

eerussianguy · Jan 28, 2024 · 12d44f0 · 12d44f0
1 parent a60bfda
commit 12d44f0
Show file tree

Hide file tree

Showing 78 changed files with 4,827 additions and 1,573 deletions.
diff --git a/resources/format_lang.py b/resources/format_lang.py
@@ -0,0 +1,89 @@
+import difflib
+import json
+
+from typing import Tuple
+
+
+def main(validate: bool, namespace: str, langs: Tuple[str, ...]):
+    """ Formats (or validates) every translation of a namespace against its en_us file.
+
+    :param validate: passed through to format_lang for each language
+    :param namespace: the asset namespace whose lang files are processed
+    :param langs: the language codes to process; 'en_us' itself is skipped
+    """
+    reference = load(namespace, 'en_us')
+    for code in (candidate for candidate in langs if candidate != 'en_us'):
+        format_lang(namespace, reference, code, validate)
+
+
+def update(namespace: str, langs: Tuple[str, ...]):
+    """ Interactively re-formats translations after en_us values have been modified.
+
+    The current en_us file is compared against the old copy read by load_old. Every key present in
+    both with a different value is listed, and if the user answers exactly 'yes', those keys are
+    removed from the in-memory en_us before each other language is passed to format_lang.
+
+    :param namespace: the asset namespace whose lang files are processed
+    :param langs: the language codes to re-format; 'en_us' itself is skipped
+    """
+    current = load(namespace, 'en_us')
+    previous = load_old(namespace, 'en_us')
+    updated_keys = {key for key, value in current.items() if key in previous and value != previous[key]}
+
+    if not updated_keys:
+        print('No differences found')
+        return
+
+    print('Found %d modified values:' % len(updated_keys))
+    for key in updated_keys:
+        print('Modified: %s : "%s" -> "%s"' % (key, previous[key], current[key]))
+
+    answer = input('Remove these keys from other translations?\n(yes|no) >')
+    print('Answer: %s' % answer)
+    if answer != 'yes':
+        return
+
+    # Strip these keys from en_us, so they don't show up in translations
+    for key in updated_keys:
+        del current[key]
+    for lang in langs:
+        if lang == 'en_us':
+            continue
+        format_lang(namespace, current, lang, False)
+
+
+def format_lang(namespace: str, en_us, lang: str, validate: bool):
+    """ Rebuilds one translation so that it follows the keys (and key order) of en_us.
+
+    The result starts with the translation's own comment entries, followed by every non-comment
+    en_us key, using the translated value where one exists and differs from en_us, and the en_us
+    value otherwise. The result is then handed to save, and a progress line is printed.
+
+    :param namespace: the asset namespace of the lang file
+    :param en_us: the parsed en_us lang data used as the reference
+    :param lang: the language code of the translation to rebuild
+    :param validate: passed through to save
+    """
+    lang_data = load(namespace, lang)
+    lang_comments = {k: v for k, v in lang_data.items() if '__comment' in k and v != 'This file was automatically created by mcresources'}
+    lang_data = {k: v for k, v in lang_data.items() if '__comment' not in k}
+
+    # Comments of the translation itself always come first
+    formatted_lang_data = dict(lang_comments)
+
+    translated = 0
+    total = 0  # Only translatable keys are counted, so that comments in en_us don't skew the progress
+    for k, v in en_us.items():
+        if '__comment' in k:
+            continue  # Exclude comments in en_us
+        total += 1
+        if k in lang_data and lang_data[k] != v:
+            translated += 1
+            formatted_lang_data[k] = lang_data[k]
+        else:
+            formatted_lang_data[k] = v
+
+    # Keys unique to this language are only allowed in the default vanilla overrides. It makes no sense for a language to have uniquely named TFC keys.
+    # But, for vanilla minecraft, we may have to override vanilla items that we rename without renaming the key.
+    # e.g. we use 'Egg', but if a translation is 'Chicken Egg', that might be renamed for other languages only.
+    if namespace == 'minecraft':
+        for k, v in lang_data.items():
+            if k not in en_us:
+                formatted_lang_data[k] = v
+
+    # Guard against an en_us without any translatable keys, which would otherwise divide by zero
+    progress = 100 * translated / total if total else 0.0
+    print('Translation progress for %s (%s): %d / %d (%.1f%%)' % (lang, namespace, translated, total, progress))
+    save(namespace, lang, formatted_lang_data, validate)
+
+
+def load(namespace: str, lang: str):
+    """ Reads and parses the lang file of the given namespace and language code from the assets directory. """
+    path = './src/main/resources/assets/%s/lang/%s.json' % (namespace, lang)
+    with open(path, 'r', encoding='utf-8') as lang_file:
+        data = json.load(lang_file)
+    return data
+
+
+def load_old(namespace: str, lang: str):
+    """ Reads and parses an old copy of a lang file.
+
+    The old lang file needs to be placed manually under the project root, and be named
+    exactly `<lang>.<namespace>.old.json`, where <lang> is the language code, and
+    <namespace> is usually either 'minecraft' or 'tfc'.
+    """
+    old_path = './%s.%s.old.json' % (lang, namespace)
+    with open(old_path, 'r', encoding='utf-8') as old_file:
+        contents = json.load(old_file)
+    return contents
+
+
+def save(namespace: str, lang: str, lang_data, validate: bool):
+    """ Writes the lang data to its lang file, or validates the file against it.
+
+    :param namespace: the asset namespace of the lang file
+    :param lang: the language code of the lang file
+    :param lang_data: the expected content of the lang file
+    :param validate: if true, nothing is written; the existing file is compared with lang_data instead
+    :raises AssertionError: if validating, and the file on disk differs from lang_data
+    """
+    path = './src/main/resources/assets/%s/lang/%s.json' % (namespace, lang)
+    if not validate:
+        with open(path, 'w', encoding='utf-8') as f:
+            json.dump(lang_data, f, ensure_ascii=False, indent=2)
+        return
+
+    with open(path, 'r', encoding='utf-8') as f:
+        old_lang_data = json.load(f)
+    if old_lang_data != lang_data:
+        expected = json.dumps(lang_data, ensure_ascii=False, indent=2).split('\n')
+        actual = json.dumps(old_lang_data, ensure_ascii=False, indent=2).split('\n')
+        # Raised explicitly rather than through `assert`, which is stripped when running with `python -O`
+        raise AssertionError('Validation error in mod localization for %s:\n\n=== Diff (expected vs. actual) ===\n\n%s' % (lang, '\n'.join(difflib.unified_diff(expected, actual))))
diff --git a/resources/generate_book.py b/resources/generate_book.py
diff --git a/resources/i18n.py b/resources/i18n.py
@@ -1,36 +1,27 @@
-import os
 import json
+import os
 
+import Levenshtein
 
-class I18n:
 
-    @staticmethod
-    def create(lang: str):
-        return I18n(lang) if lang == 'en_us' else ForLanguage(lang)
+class I18n:
 
     lang: str
 
-    def __init__(self, lang: str):
+    def __init__(self, lang: str, validate: bool = False):
         self.lang = lang
-
-    def translate(self, text: str) -> str:
-        """ Translates the string into the current domain """
-        return text
-
-    def flush(self):
-        """ Updates the local translation file, if needed """
-        pass
-
-
-class ForLanguage(I18n):
-    def __init__(self, lang: str):
-        super().__init__(lang)
         self.before = {}
         self.after = {}
-        self.lang_path = './lang/%s.json' % lang
+        self.validate = validate
+        self.lang_path = './resources/lang/%s.json' % lang
+
+        self.fuzzy_matches = 0
+        self.fuzzy_non_matches = 0
 
         # Default translation
         if not os.path.isfile(self.lang_path):
+            if validate:
+                raise ValueError('Cannot validate book for lang %s, as resources/lang/%s.json does not exist' % (lang, lang))
             print('Writing default translation for language %s to %s' % (self.lang, self.lang_path))
             with open(self.lang_path, 'w', encoding='utf-8') as f:
                 f.write('{}\n')
@@ -47,17 +38,48 @@ def __init__(self, lang: str):
                 exit(-1)
             self.before[key] = value
 
+    def is_root(self) -> bool:
+        """ Whether this instance is for the root language, en_us. """
+        root_lang = 'en_us'
+        return self.lang == root_lang
+
     def translate(self, text: str) -> str:
-        if text in self.before:
+        """ Translates the string into the current domain.
+
+        For en_us, the text is returned as is. Otherwise an exact match in the existing translations
+        is used if present, then a sufficiently close fuzzy match of the key, and finally the text
+        itself. The result is always recorded in self.after under the original text.
+
+        :param text: the en_us text to translate
+        :return: the translated text, or the text itself if no translation was found
+        """
+        if self.is_root():
+            # For en_us, always keep the current text (read only)
+            translated = text
+        elif text in self.before:
             translated = self.before[text]  # Translate if available
         else:
-            translated = text  # Not available, but record and output anyway
+            # Try a fuzzy matcher (if we're not in en_us)
+            # Use the lowercase of both keys, as difference in capitalization is almost surely not a translation issue
+            # There is nothing to match against if no translations exist yet, and the relative distance is undefined for an empty text
+            match = None
+            if text and self.before:
+                distance, closest = min((Levenshtein.distance(text.lower(), key.lower()), key) for key in self.before)
+                if distance / len(text) < 0.1 and distance < 20:  # Heuristic: < 10% of text, and < 20 overall distance
+                    match = closest
+
+            if match is not None and self.before[match] != match:
+                # Use the fuzzy match
+                self.fuzzy_matches += 1
+                translated = self.before[match]
+            else:
+                # Either nothing was close enough, or this has just matched a default key that was inserted in the translated files
+                # In the latter case, if we slightly modify the en_us default, we should change this value as well
+                # Not available, but record and output anyway
+                self.fuzzy_non_matches += 1
+                translated = text
 
         self.after[text] = translated
         return translated
 
     def flush(self):
+        """ Updates the local translation file, if needed.
+
+        Prints the fuzzy matching statistics for non-root languages, then writes self.after to
+        self.lang_path. When validating, the file is only written if the recorded translations
+        are identical to the ones that were loaded.
+
+        :raises AssertionError: if validating, and self.before differs from self.after
+        """
+        if not self.is_root() and self.fuzzy_matches + self.fuzzy_non_matches > 0:
+            print('Matched %d / %d entries (%.1f%%). Updated %d entries for lang %s.' % (self.fuzzy_matches, self.fuzzy_matches + self.fuzzy_non_matches, 100 * self.fuzzy_matches / (self.fuzzy_matches + self.fuzzy_non_matches), self.fuzzy_non_matches, self.lang))
+        if self.validate and self.before != self.after:
+            # Raised explicitly rather than through `assert`, which is stripped when running with `python -O`
+            raise AssertionError('Validation error translating book to lang \'%s\'' % self.lang)
         with open(self.lang_path, 'w', encoding='utf-8') as f:
-            print('Writing updated translation for language %s' % self.lang)
+            # For en_us every entry counts; for other languages only entries that differ from their key
+            unique_count = len(self.after) if self.is_root() else sum(k != v for k, v in self.after.items())
+            if unique_count > 0:
+                print('Writing updated translation for language %s: %d / %d (%.2f%%)' % (self.lang, unique_count, len(self.after), 100 * unique_count / len(self.after)))
             json.dump(self.after, f, indent=2, ensure_ascii=False)