Skip to content

Commit

Permalink
Support Unicode 16 (#74)
Browse files Browse the repository at this point in the history
* Unicode 16: Initial support

Includes Kirat Rai normalization behavior.

* Support Unicode 16 variation seqs for quotation mark width

* Update emoji-test.txt

* Remove workaround for Unicode bug fixed in 16.0
  • Loading branch information
Jules-Bertholet authored Jan 15, 2025
1 parent 82d7136 commit 7a7fcdc
Show file tree
Hide file tree
Showing 5 changed files with 1,349 additions and 198 deletions.
129 changes: 105 additions & 24 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from itertools import batched
from typing import Callable, Iterable

UNICODE_VERSION = "15.1.0"
UNICODE_VERSION = "16.0.0"
"""The version of the Unicode data files to download."""

NUM_CODEPOINTS = 0x110000
Expand Down Expand Up @@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
- 4th bit: whether to set top bit on emoji presentation.
If this is set but 3rd is not, the width mode is related to zwj sequences
- 5th from top: whether this is unaffected by ligature-transparent
(if set, should also set 3rd and 4th)
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
where no ZWJ has been encountered yet; encountering one flips this on"""
where no ZWJ has been encountered yet; encountering one flips this on
- 7th bit: whether this is VS1 (if CJK) or VS2 (if not CJK)
"""

# BASIC WIDTHS

Expand Down Expand Up @@ -264,8 +267,17 @@ class WidthState(enum.IntEnum):
TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
"(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"

# Kirat Rai
KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
"\\u16D67 (\\u16D67 \\u16D67)+ and canonical equivalents"
KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
"(\\u16D68)+ and canonical equivalents"

# VARIATION SELECTORS

VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
"\\uFE00 if CJK, or \\uFE01 otherwise"

# Text presentation sequences (not CJK)
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
"\\uFE0E (text presentation sequences)"
Expand Down Expand Up @@ -361,6 +373,7 @@ def width_alone(self) -> int:
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
| WidthState.VARIATION_SELECTOR_15
| WidthState.VARIATION_SELECTOR_16
| WidthState.VARIATION_SELECTOR_1_OR_2
):
return 0
case (
Expand Down Expand Up @@ -493,12 +506,6 @@ def load_zero_widths() -> list[bool]:
lambda cp: operator.setitem(zw_map, cp, True),
)

# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
# as they canonically decompose to two characters with this property,
# but they aren't.
for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
zw_map[c] = True

# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
# as zero-width. This matches the behavior of glibc `wcwidth`.
#
Expand Down Expand Up @@ -639,6 +646,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
([0xFE0F], WidthState.VARIATION_SELECTOR_16),
([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
([0x16D67], WidthState.KIRAT_RAI_VOWEL_SIGN_E),
([0x16D68], WidthState.KIRAT_RAI_VOWEL_SIGN_AI),
(emoji_presentation, WidthState.EMOJI_PRESENTATION),
(emoji_modifiers, WidthState.EMOJI_MODIFIER),
(regional_indicators, WidthState.REGIONAL_INDICATOR),
Expand All @@ -648,9 +657,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
ea[cp] = width

# East-Asian only
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY

# Not East Asian only
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15

return (not_ea, ea)
Expand Down Expand Up @@ -716,7 +727,7 @@ def load_solidus_transparent(
cjk_width_map: list[WidthState],
) -> list[tuple[Codepoint, Codepoint]]:
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
"""

ccc_above_1 = set()
Expand Down Expand Up @@ -748,7 +759,7 @@ def load_solidus_transparent(
num_chars = len(ccc_above_1)

for cp in ccc_above_1:
if cp != 0xFE0F:
if cp not in [0xFE00, 0xFE0F]:
assert (
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
), f"U+{cp:X}"
Expand Down Expand Up @@ -1304,8 +1315,17 @@ def lookup_fns(
return (0, next_info.set_emoji_presentation());
}"""

if not is_cjk:
if is_cjk:
s += """
if c == '\\u{FE00}' {
return (0, next_info.set_vs1_2());
}
"""
else:
s += """
if c == '\\u{FE01}' {
return (0, next_info.set_vs1_2());
}
if c == '\\u{FE0E}' {
return (0, next_info.set_text_presentation());
}
Expand All @@ -1315,9 +1335,19 @@ def lookup_fns(
} else {
next_info = next_info.unset_text_presentation();
}
}"""
} else """

s += """
s += """if next_info.is_vs1_2() {
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
return ("""

s += str(2 - is_cjk)

s += """, WidthInfo::DEFAULT);
} else {
next_info = next_info.unset_vs1_2();
}
}
if next_info.is_ligature_transparent() {
if c == '\\u{200D}' {
return (0, next_info.set_zwj_bit());
Expand Down Expand Up @@ -1496,6 +1526,22 @@ def lookup_fns(
return (0, WidthInfo::EMOJI_PRESENTATION)
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D63}}') => {{
return (0, WidthInfo::DEFAULT);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D67}}') => {{
return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D68}}') => {{
return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D69}}') => {{
return (0, WidthInfo::DEFAULT);
}}
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\u{{16D63}}') => {{
return (0, WidthInfo::DEFAULT);
}}
// Fallback
_ => {{}}
}}
Expand Down Expand Up @@ -1562,6 +1608,8 @@ def emit_module(
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct WidthInfo(u16);
const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
impl WidthInfo {
/// No special handling necessary
const DEFAULT: Self = Self(0);
Expand Down Expand Up @@ -1591,51 +1639,84 @@ def emit_module(
/// Has top bit set
fn is_emoji_presentation(self) -> bool {{
(self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_16.0) == WidthInfo::VARIATION_SELECTOR_16.0
}}
/// Has top bit set
fn is_zwj_emoji_presentation(self) -> bool {{
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
}}
/// Set top bit
fn set_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
{{
Self(self.0 | 0b1000_0000_0000_0000)
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self::VARIATION_SELECTOR_16
}}
}}
/// Clear top bit
fn unset_emoji_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 & 0b0111_1111_1111_1111)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0)
}} else {{
Self::DEFAULT
}}
}}
/// Has 2nd bit set
fn is_text_presentation(self) -> bool {{
(self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
(self.0 & WidthInfo::VARIATION_SELECTOR_15.0) == WidthInfo::VARIATION_SELECTOR_15.0
}}
/// Set 2nd bit
fn set_text_presentation(self) -> Self {{
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
Self(self.0 | 0b0100_0000_0000_0000)
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
)
}} else {{
Self(0b0100_0000_0000_0000)
Self(WidthInfo::VARIATION_SELECTOR_15.0)
}}
}}
/// Clear 2nd bit
fn unset_text_presentation(self) -> Self {{
Self(self.0 & 0b1011_1111_1111_1111)
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
}}
/// Has 7th bit set
fn is_vs1_2(self) -> bool {{
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
}}
/// Set 7th bit
fn set_vs1_2(self) -> Self {{
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
Self(
self.0
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
& !WidthInfo::VARIATION_SELECTOR_15.0
& !WidthInfo::VARIATION_SELECTOR_16.0,
)
}} else {{
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}
/// Clear 7th bit
fn unset_vs1_2(self) -> Self {{
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
}}
}}
Expand Down
46 changes: 21 additions & 25 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
//! - Has the [`Emoji_Presentation`] property, and
//! - Is not in the [Enclosed Ideographic Supplement] block.
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
//! and width 2 when followed by `'\u{FE01}'`.
//! - Script-specific ligatures:
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G23126)s
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G23126)s
//! will not affect the width.
//! - **[Arabic]**: A character sequence consisting of one character with [`Joining_Group`]`=Lam`,
//! followed by any number of characters with [`Joining_Type`]`=Transparent`, followed by one character
Expand All @@ -75,6 +77,7 @@
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
//! have width 0.
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
Expand All @@ -96,15 +99,6 @@
//! with the [`Default_Ignorable_Code_Point`] property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
//! with the [`Grapheme_Extend`] property.
//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
//! - [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0),
//! - [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7),
//! - [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8),
//! - [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA),
//! - [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB),
//! - [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B),
//! - [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - The following [`Prepended_Concatenation_Mark`]s:
Expand All @@ -130,18 +124,18 @@
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
//!
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50313
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G50313
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40095
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
//! [`General_Category`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G124142
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G52443
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G45593
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G50009
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G37908
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
//!
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
Expand All @@ -150,22 +144,24 @@
//!
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
//!
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
//! [combining marks]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
//!
//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence
//! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
//!
//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
//!
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
//! [Buginese]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-17/#G26743
//! [Hebrew]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G6528
//! [Khmer]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-16/#G64642
//! [Kirat Rai]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-13/#G746409
//! [Lisu]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-18/#G44587
//! [Old Turkic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-14/#G41975
//! [Tifinagh]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-19/#G43184
//!
//!
//! ## Canonical equivalence
Expand Down
Loading

0 comments on commit 7a7fcdc

Please sign in to comment.