Skip to content

Commit

Permalink
Assign width 3 to KHMER SIGN BEYYAL
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Jun 2, 2024
1 parent e6ba907 commit 714ddc5
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 16 deletions.
30 changes: 19 additions & 11 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
WIDE = 0x1_0002
"Two columns wide."

THREE = 0x1_0003
"Three columns wide."

# \r\n
LINE_FEED = 0b0000_0000_0000_0001
"\\n (CRLF has width 1)"
Expand Down Expand Up @@ -341,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
case _:
return CharWidthInTable.SPECIAL

def is_carried(self) -> bool:
"""Whether this corresponds to a non-default `WidthInfo`.

Returns True when the integer value of this state is at most 0xFFFF.
States whose value lies above that bound (for example `WIDE = 0x1_0002`
and `THREE = 0x1_0003`) are not carried; the generated lookup code names
`WidthInfo::DEFAULT` for them instead of the state's own name.
"""
return int(self) <= 0xFFFF

def width_alone(self) -> int:
"The width of a character with this type when it appears alone."
match self:
Expand All @@ -357,6 +364,8 @@ def width_alone(self) -> int:
| WidthState.EMOJI_PRESENTATION
):
return 2
case WidthState.THREE:
return 3
case _:
return 1

Expand Down Expand Up @@ -598,6 +607,7 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
(alef_joining, WidthState.JOINING_GROUP_ALEF),
(range(0x1780, 0x17A3), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
([0x17A7, 0x17AB, 0x17AC, 0x17AF], WidthState.KHMER_COENG_ELIGIBLE_LETTER),
([0x17D8], WidthState.THREE),
([0x1A10], WidthState.BUGINESE_LETTER_YA),
(range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
([0x2D6F], WidthState.TIFINAGH_CONSONANT),
Expand Down Expand Up @@ -1196,7 +1206,11 @@ def lookup_fns(
s += f" '\\u{{{lo:X}}}'"
if hi != lo:
s += f"..='\\u{{{hi:X}}}'"
s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
if width.is_carried():
width_info = width.name
else:
width_info = "DEFAULT"
s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"

s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION),
}}
Expand Down Expand Up @@ -1531,7 +1545,7 @@ def emit_module(
)

for variant in WidthState:
if variant.table_width() == CharWidthInTable.SPECIAL:
if variant.is_carried():
if variant.is_cjk_only():
module.write(' #[cfg(feature = "cjk")]\n')
module.write(
Expand Down Expand Up @@ -1925,7 +1939,7 @@ def emit_module(
test_width_variants = []
test_width_variants_cjk = []
for variant in WidthState:
if variant.table_width() == CharWidthInTable.SPECIAL:
if variant.is_carried():
if not variant.is_cjk_only():
test_width_variants.append(variant)
if not variant.is_non_cjk_only():
Expand Down Expand Up @@ -2003,10 +2017,7 @@ def emit_module(
)

for variant in WidthState:
if (
variant.table_width() == CharWidthInTable.SPECIAL
and not variant.is_cjk_only()
):
if variant.is_carried() and not variant.is_cjk_only():
module.write(f" WidthInfo::{variant.name},\n")

module.write(
Expand All @@ -2018,10 +2029,7 @@ def emit_module(
)

for variant in WidthState:
if (
variant.table_width() == CharWidthInTable.SPECIAL
and not variant.is_non_cjk_only()
):
if variant.is_carried() and not variant.is_non_cjk_only():
module.write(f" WidthInfo::{variant.name},\n")

module.write(
Expand Down
9 changes: 5 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@
//! 2. In all other cases, the width of the string equals the sum of its character widths:
//! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 3. The following have width 0:
//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
//! 4. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`] property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -111,15 +112,15 @@
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
//! 4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
//! 6. All other characters have width 1.
//! 7. All other characters have width 1.
//!
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
Expand Down
4 changes: 3 additions & 1 deletion src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
'\u{17D8}' => (3, WidthInfo::DEFAULT),
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
Expand Down Expand Up @@ -444,6 +445,7 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
'\u{17D8}' => (3, WidthInfo::DEFAULT),
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
Expand Down Expand Up @@ -1220,7 +1222,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
[
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xD5, 0xD5, 0xD7, 0x55, 0x10, 0x00,
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
[
Expand Down
6 changes: 6 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,12 @@ fn test_khmer_coeng() {
}
}

// Regression test for the width-3 special case of U+17D8 KHMER SIGN BEYYAL.
// NOTE(review): `assert_width!` is a project macro defined elsewhere; the two
// numeric arguments are presumably the expected non-CJK and CJK widths — confirm.
#[test]
fn test_khmer_sign_beyyal() {
// The single code point U+17D8, which the lookup tables map to width 3.
assert_width!("៘", 3, 3);
// The spelled-out three-character form is expected to have the same total
// width; presumably each of its characters contributes width 1 — confirm.
assert_width!("។ល។", 3, 3);
}

#[test]
fn test_emoji_modifier() {
assert_width!("\u{1F46A}", 2, 2);
Expand Down

0 comments on commit 714ddc5

Please sign in to comment.