Skip to content

Commit

Permalink
Don't generate map.simplify, add cp852.toml
Browse files Browse the repository at this point in the history
  • Loading branch information
insolor committed Mar 4, 2024
1 parent ddd3eed commit 9a9a3e7
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 28 deletions.
28 changes: 0 additions & 28 deletions automation/generate_encoding_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Iterable, Iterator, Mapping, NamedTuple, Optional
from unidecode import unidecode
from collections import defaultdict


Expand Down Expand Up @@ -77,23 +76,6 @@ def group_mapping(mapping: Iterable[tuple[str, str]], encoding: str) -> Iterator
yield format_result(state_start, prev_state)


def get_simplified_map(letters: Iterable[str]) -> Iterator[tuple[str, str]]:
    """
    Yield ``(letter, simplified)`` pairs for letters that have a plain counterpart.

    Each letter is transliterated with ``unidecode``; a pair is produced only when
    the transliteration differs from the letter and is itself found in *letters*.

    :param letters: letters to inspect; also used as the collection of allowed
        transliteration targets.
    :return: iterator of ``(original letter, transliterated letter)`` tuples.
    """
    # The membership test below walks `letters` again. A one-shot iterator
    # (e.g. a generator) would be drained by that test and the loop would
    # silently skip items, so snapshot it first. Re-iterable inputs (list, str,
    # set, ...) are left untouched so their own `in` semantics are preserved.
    if iter(letters) is letters:
        letters = list(letters)

    for letter in letters:
        simplified = unidecode(letter)
        if simplified != letter and simplified in letters:
            yield letter, simplified


def get_grouped_simplified(letters: Iterable[str]) -> Mapping[str, list[str]]:
    """
    Group letters by the lower-cased form of their transliteration.

    :param letters: letters passed on to ``get_simplified_map``.
    :return: mapping from the lower-cased simplified letter to the list of
        original letters that simplify to it, in the order they were produced.
    """
    grouped_simplified: defaultdict[str, list[str]] = defaultdict(list)
    for letter, simplified in get_simplified_map(letters):
        # Lower-case the key so upper- and lower-case variants share one group.
        grouped_simplified[simplified.lower()].append(letter)

    return grouped_simplified


def main(encoding: str):
print("[metadata]")
print(f"encoding = \"{encoding}\"")
Expand All @@ -115,16 +97,6 @@ def main(encoding: str):

print()

print("[maps.simplify]")
simplified = get_grouped_simplified(letters)
simplified = {"".join(from_letters): to_letter for to_letter, from_letters in simplified.items()}
for from_letters, to_letter in simplified.items():
from_codes = "|".join([str(letter.encode(encoding)[0]) for letter in from_letters])
to_code = to_letter.encode(encoding)[0]
print(f""""{from_codes}" = {to_code} # {from_letters} -> {to_letter}""")

print()

print("[maps.utf]")
letters = get_letters(encoding, 128)
for letter in letters:
Expand Down
182 changes: 182 additions & 0 deletions store/encodings/cp852.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
[metadata]
encoding = "cp852"

# Lower-case -> upper-case conversion for cp852 byte codes.
# Key: a single byte code, or a quoted "start:end" range of codes
# (presumably inclusive, given the "a-z" comment on the first entry).
# Value: signed offset added to the code to reach the upper-case code,
# e.g. 129 + 25 = 154, which [maps.utf] lists as Ü.
[maps.capitalize]
"97:122" = -32 # a-z -> A-Z
129 = 25 # ü -> Ü
130 = 14 # é -> É
131 = 51 # â -> Â
132 = 10 # ä -> Ä
133 = 89 # ů -> Ů
134 = 9 # ć -> Ć
135 = -7 # ç -> Ç
136 = 21 # ł -> Ł
137 = 74 # ë -> Ë
139 = -1 # ő -> Ő
140 = 75 # î -> Î
146 = -1 # ĺ -> Ĺ
147 = 79 # ô -> Ô
148 = 5 # ö -> Ö
150 = -1 # ľ -> Ľ
152 = -1 # ś -> Ś
156 = -1 # ť -> Ť
159 = 13 # č -> Č
160 = 21 # á -> Á
161 = 53 # í -> Í
162 = 62 # ó -> Ó
163 = 70 # ú -> Ú
165 = -1 # ą -> Ą
167 = -1 # ž -> Ž
169 = -1 # ę -> Ę
171 = -30 # ź -> Ź
173 = 11 # ş -> Ş
190 = -1 # ż -> Ż
199 = -1 # ă -> Ă
208 = 1 # đ -> Đ
212 = -2 # ď -> Ď
216 = -33 # ě -> Ě
228 = -1 # ń -> Ń
229 = -16 # ň -> Ň
231 = -1 # š -> Š
234 = -2 # ŕ -> Ŕ
236 = 1 # ý -> Ý
238 = -17 # ţ -> Ţ
251 = -16 # ű -> Ű
253 = -1 # ř -> Ř

# Upper-case -> lower-case conversion for cp852 byte codes; the inverse of
# [maps.capitalize].
# Key: a single byte code, or a quoted "start:end" range of codes.
# Value: signed offset added to the code to reach the lower-case code,
# e.g. 128 + 7 = 135, which [maps.utf] lists as ç.
# NOTE(review): the table name is spelled "lowercast"; presumably the consumer
# looks up this exact key — confirm before renaming.
[maps.lowercast]
"65:90" = 32 # A-Z -> a-z
128 = 7 # Ç -> ç
138 = 1 # Ő -> ő
141 = 30 # Ź -> ź
142 = -10 # Ä -> ä
143 = -9 # Ć -> ć
144 = -14 # É -> é
145 = 1 # Ĺ -> ĺ
149 = 1 # Ľ -> ľ
151 = 1 # Ś -> ś
153 = -5 # Ö -> ö
154 = -25 # Ü -> ü
155 = 1 # Ť -> ť
157 = -21 # Ł -> ł
164 = 1 # Ą -> ą
166 = 1 # Ž -> ž
168 = 1 # Ę -> ę
172 = -13 # Č -> č
181 = -21 # Á -> á
182 = -51 # Â -> â
183 = 33 # Ě -> ě
184 = -11 # Ş -> ş
189 = 1 # Ż -> ż
198 = 1 # Ă -> ă
209 = -1 # Đ -> đ
210 = 2 # Ď -> ď
211 = -74 # Ë -> ë
213 = 16 # Ň -> ň
214 = -53 # Í -> í
215 = -75 # Î -> î
221 = 17 # Ţ -> ţ
222 = -89 # Ů -> ů
224 = -62 # Ó -> ó
226 = -79 # Ô -> ô
227 = 1 # Ń -> ń
230 = 1 # Š -> š
232 = 2 # Ŕ -> ŕ
233 = -70 # Ú -> ú
235 = 16 # Ű -> ű
237 = -1 # Ý -> ý
252 = 1 # Ř -> ř

# Maps groups of accented-letter byte codes ("a|b|c") to the byte code of a
# plain ASCII letter.
# NOTE(review): the codes and trailing comments here do not agree with the
# cp852 layout given in [maps.utf] in this same file: e.g. 150 is ľ and 151 is
# Ś (not û/ù), 152 is ś (not ÿ), 164/165 are Ą/ą (not ñ/Ñ). This section looks
# like it was copied from a different code page (presumably cp437) — confirm
# whether that is intentional or whether it must be rewritten for cp852.
[maps.simplify]
"129|150|151|154|163" = 117 # üûùÜú -> u
"152" = 121 # ÿ -> y
"164|165" = 110 # ñÑ -> n
"131|132|133|134|142|143|145|146|160" = 97 # âäàåÄÅæÆá -> a
"130|136|137|138|144" = 101 # éêëèÉ -> e
"139|140|141|161" = 105 # ïîìí -> i
"147|148|149|153|162" = 111 # ôöòÖó -> o
"128|135" = 99 # Çç -> c

# Maps non-ASCII characters to their cp852 byte codes.
# Key: presumably the character's two UTF-8 bytes read as a little-endian
# integer (e.g. Ç is C3 87 in UTF-8 -> 0x87C3 = 34755) — confirm against the
# code that consumes this table.
# Value: cp852 byte code of the character shown in the trailing comment.
[maps.utf]
34755 = 128 # Ç
48323 = 129 # ü
43459 = 130 # é
41667 = 131 # â
42179 = 132 # ä
44997 = 133 # ů
34756 = 134 # ć
42947 = 135 # ç
33477 = 136 # ł
43971 = 137 # ë
37061 = 138 # Ő
37317 = 139 # ő
44739 = 140 # î
47557 = 141 # Ź
33987 = 142 # Ä
34500 = 143 # Ć
35267 = 144 # É
47556 = 145 # Ĺ
47812 = 146 # ĺ
46275 = 147 # ô
46787 = 148 # ö
48580 = 149 # Ľ
48836 = 150 # ľ
39621 = 151 # Ś
39877 = 152 # ś
38595 = 153 # Ö
40131 = 154 # Ü
42181 = 155 # Ť
42437 = 156 # ť
33221 = 157 # Ł
36292 = 159 # č
41411 = 160 # á
44483 = 161 # í
46019 = 162 # ó
47811 = 163 # ú
33988 = 164 # Ą
34244 = 165 # ą
48581 = 166 # Ž
48837 = 167 # ž
39108 = 168 # Ę
39364 = 169 # ę
47813 = 171 # ź
36036 = 172 # Č
40901 = 173 # ş
33219 = 181 # Á
33475 = 182 # Â
39620 = 183 # Ě
40645 = 184 # Ş
48069 = 189 # Ż
48325 = 190 # ż
33476 = 198 # Ă
33732 = 199 # ă
37316 = 208 # đ
37060 = 209 # Đ
36548 = 210 # Ď
35779 = 211 # Ë
36804 = 212 # ď
34757 = 213 # Ň
36291 = 214 # Í
36547 = 215 # Î
39876 = 216 # ě
41669 = 221 # Ţ
44741 = 222 # Ů
37827 = 224 # Ó
40899 = 225 # ß
38083 = 226 # Ô
33733 = 227 # Ń
33989 = 228 # ń
35013 = 229 # ň
41157 = 230 # Š
41413 = 231 # š
38085 = 232 # Ŕ
39619 = 233 # Ú
38341 = 234 # ŕ
45253 = 235 # Ű
48579 = 236 # ý
40387 = 237 # Ý
41925 = 238 # ţ
34763 = 243 # ˇ
45509 = 251 # ű
39109 = 252 # Ř
39365 = 253 # ř

0 comments on commit 9a9a3e7

Please sign in to comment.