From 2517d68723f4a472b2eaa5090e01f74d8af4543e Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 17 Jun 2024 18:59:25 -0400 Subject: [PATCH] Treat ambiguous `Modifier_Letter`s as narrow (#63) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Treat ambiguous `Modifier_Letter`s as narrow This matches the behavior of common fonts. Affects 6 characters: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5B%3AEast_Asian_Width%3DAmbiguous%3A%5D-%5B%5B%3AScript%3D%2FLatin%7CGreek%7CCyrillic%2F%3A%5D-%5B%5B%3ABlock%3DNumber+Forms%3A%5D%26%5B%3Asubhead%3DRoman+numerals%3A%5D%5D%5D%5D%26%5B%3AModifier_Letter%3A%5D * Simplify derivation of ambiguous Use `Letter` general category instead of script and block. Changes `ℓ` to narrow, matching common fonts --- scripts/unicode.py | 17 ++--------------- src/lib.rs | 3 +-- src/tables.rs | 24 +++++++++--------------- tests/tests.rs | 8 ++++++++ 4 files changed, 20 insertions(+), 32 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index e3be355..aa0d86b 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -18,7 +18,6 @@ # - NormalizationTest.txt (for tests only) # - PropList.txt # - ReadMe.txt -# - Scripts.txt # - UnicodeData.txt # - auxiliary/GraphemeBreakProperty.txt # - emoji/emoji-data.txt @@ -430,22 +429,10 @@ def load_east_asian_widths() -> list[EastAsianWidth]: # Catch any leftover codepoints and assign them implicit Neutral/narrow width. 
width_map.append(EastAsianWidth.NARROW) - # Characters from alphabetic scripts are narrow - load_property( - "Scripts.txt", - r"(?:Latin|Greek|Cyrillic)", - lambda cp: ( - operator.setitem(width_map, cp, EastAsianWidth.NARROW) - if width_map[cp] == EastAsianWidth.AMBIGUOUS - and not (0x2160 <= cp <= 0x217F) # Roman numerals remain ambiguous - else None - ), - ) - - # Ambiguous `Modifier_Symbol`s are narrow + # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow load_property( "extracted/DerivedGeneralCategory.txt", - "Sk", + r"(?:Lu|Ll|Lt|Lm|Lo|Sk)", lambda cp: ( operator.setitem(width_map, cp, EastAsianWidth.NARROW) if width_map[cp] == EastAsianWidth.AMBIGUOUS diff --git a/src/lib.rs b/src/lib.rs index 4297e11..71b5d70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,8 +122,7 @@ //! - Has an [`East_Asian_Width`] of [`Ambiguous`], or //! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or //! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and -//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and -//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`. +//! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`. //! 7. All other characters have width 1. //! //! 
[`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338 diff --git a/src/tables.rs b/src/tables.rs index c8a4aba..fa632d6 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1022,7 +1022,7 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([ ], #[cfg(feature = "cjk")] [ - 0x00, 0x9D, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0xA0, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08, + 0x00, 0x9D, 0x02, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x02, 0x02, 0x1E, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x02, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x02, 0x2A, @@ -1030,9 +1030,9 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([ ], #[cfg(feature = "cjk")] [ - 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, - 0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, - 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xB0, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, + 0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, + 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, ], @@ -1042,23 +1042,23 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([ 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x4C, 0x02, 0x02, 0x02, 0x02, 0x02, - 0xB1, 0x4E, 0x4F, 0xB2, + 0xB0, 0x4E, 0x4F, 
0xB1, ], #[cfg(feature = "cjk")] [ 0x85, 0x86, 0x75, 0x02, 0x02, 0x87, 0x02, 0x02, 0x02, 0x88, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x89, 0x8A, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x8B, 0x8C, 0xB3, 0xB4, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, + 0x02, 0x02, 0x8B, 0x8C, 0xB2, 0xB3, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x02, 0x97, 0x02, 0x02, 0x98, 0x99, 0x9A, 0x9B, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, ], ]); #[cfg(feature = "cjk")] -const WIDTH_LEAVES_LEN: usize = 181; +const WIDTH_LEAVES_LEN: usize = 180; #[cfg(not(feature = "cjk"))] const WIDTH_LEAVES_LEN: usize = 157; -/// Autogenerated. 181 sub-table(s). Consult [`lookup_width`] for layout info. +/// Autogenerated. 180 sub-table(s). Consult [`lookup_width`] for layout info. static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ [ 0x55, 0x55, 0x75, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, @@ -1852,12 +1852,6 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], #[cfg(feature = "cjk")] - [ - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x95, 0xA9, 0x59, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, - ], - #[cfg(feature = "cjk")] [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55, @@ -1883,7 +1877,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], #[cfg(feature = "cjk")] [ - 0x95, 0x59, 0x59, 0x55, 0x95, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0x5A, 0x55, ], diff --git a/tests/tests.rs 
b/tests/tests.rs index 8ff0c6b..2940df2 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -78,6 +78,14 @@ fn test_default_ignorable() { assert_width!('\u{E0000}', Some(0), Some(0)); } +#[test] +fn test_ambiguous() { + assert_width!("\u{B7}", 1, 2); + assert_width!("\u{0387}", 1, 2); + assert_width!("\u{A8}", 1, 1); + assert_width!("\u{02C9}", 1, 1); +} + #[test] fn test_jamo() { assert_width!('\u{1100}', Some(2), Some(2));