Skip to content

Commit

Permalink
Merge pull request #34 from Jules-Bertholet/default-ignorable-code-point
Browse files Browse the repository at this point in the history
Fixes to characters considered zero-width
  • Loading branch information
Manishearth authored Feb 13, 2024
2 parents 8942487 + aae585f commit fda272b
Show file tree
Hide file tree
Showing 5 changed files with 368 additions and 230 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ fn main() {

**NOTE:** The computed width values may not match the actual rendered column
width. For example, the woman scientist emoji comprises a woman emoji, a
zero-width joiner and a microscope emoji.
zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences)
are considered to have the sum of the widths of their constituent parts:

```rust
extern crate unicode_width;
Expand All @@ -39,8 +40,10 @@ fn main() {
}
```

See [Unicode Standard Annex #11][UAX11] for precise details on what is and isn't
covered by this crate.
Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence)
and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may
be rendered with a different width than what this crate says. (This is not an
exhaustive list.)

## features

Expand Down
118 changes: 91 additions & 27 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):

def fetch_open(filename: str):
"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
"""
if not os.path.exists(os.path.basename(filename)):
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
try:
Expand All @@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":

class EffectiveWidth(enum.IntEnum):
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
"""

ZERO = 0
""" Zero columns wide. """
Expand Down Expand Up @@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":

def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if `c` is in general categories
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
character. `c` is considered a zero-width character if
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = []

# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
with fetch_open("UnicodeData.txt") as categories:
zw_map = []
current = 0
for line in categories.readlines():
if len(raw_data := line.split(";")) != 15:
Expand All @@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
raw_data[1],
raw_data[2],
]
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
zero_width = cat_code in ["Cc", "Mn", "Me"]

assert current <= codepoint
while current <= codepoint:
Expand All @@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
zw_map.append(False)

return zw_map
# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
with fetch_open("DerivedCoreProperties.txt") as properties:
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
multiple = re.compile(
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
)

for line in properties.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True

# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
# as zero-width. This matches the behavior of glibc `wcwidth`.
#
# Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
# a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
# into a single wide grapheme. So we treat vowel and trailing jamo as
# 0-width, such that only the width of the leading jamo is counted
# and the resulting grapheme has width 2.
#
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
with fetch_open("HangulSyllableType.txt") as categories:
single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")

for line in categories.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True

# Special case: U+115F HANGUL CHOSEONG FILLER.
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
# (which are considered 0-width on their own) to form a composed Hangul syllable with
# width 2. Therefore, we treat it as having width 2.
zw_map[0x115F] = False
return zw_map


class Bucket:
"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
list overlaps with another's width list, those buckets can be merged via `try_extend`."""
list overlaps with another's width list, those buckets can be merged via `try_extend`.
"""

def __init__(self):
"""Creates an empty bucket."""
Expand Down Expand Up @@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
same bucket. Returns a list of the buckets in increasing order of those bits."""
num_bits = cap_bit - low_bit
assert num_bits > 0
buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
buckets = [Bucket() for _ in range(0, 2**num_bits)]
mask = (1 << num_bits) - 1
for (codepoint, width) in entries:
for codepoint, width in entries:
buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
return buckets

Expand Down Expand Up @@ -269,7 +334,7 @@ def __init__(
buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

for bucket in buckets:
for (i, existing) in enumerate(self.indexed):
for i, existing in enumerate(self.indexed):
if existing.try_extend(bucket):
self.entries.append(i)
break
Expand All @@ -283,7 +348,8 @@ def __init__(

def indices_to_widths(self):
"""Destructively converts the indices in this table to the `EffectiveWidth` values of
their buckets. Assumes that no bucket contains codepoints with different widths."""
their buckets. Assumes that no bucket contains codepoints with different widths.
"""
self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
del self.indexed

Expand Down Expand Up @@ -315,7 +381,7 @@ def make_tables(
to include in the top-level table."""
tables = []
entry_groups = [entries]
for (low_bit, cap_bit, offset_type) in table_cfgs:
for low_bit, cap_bit, offset_type in table_cfgs:
table = Table(entry_groups, low_bit, cap_bit, offset_type)
entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
tables.append(table)
Expand All @@ -326,7 +392,8 @@ def emit_module(
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
):
"""Outputs a Rust module to `out_name` using table data from `tables`.
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
"""
if os.path.exists(out_name):
os.remove(out_name)
with open(out_name, "w", newline="\n", encoding="utf-8") as module:
Expand Down Expand Up @@ -432,7 +499,7 @@ def emit_module(
)

subtable_count = 1
for (i, table) in enumerate(tables):
for i, table in enumerate(tables):
new_subtable_count = len(table.buckets())
if i == len(tables) - 1:
table.indices_to_widths() # for the last table, indices == widths
Expand All @@ -442,7 +509,7 @@ def emit_module(
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
)
for (j, byte) in enumerate(byte_array):
for j, byte in enumerate(byte_array):
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
if j % 15 == 0:
module.write("\n ")
Expand All @@ -458,16 +525,17 @@ def main(module_filename: str):
`module_filename`.
We obey the following rules in decreasing order of importance:
- The soft hyphen (`U+00AD`) is single-width.
- Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
- All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
- Hangul jamo medial vowels & final consonants are zero-width.
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
"""
version = load_unicode_version()
print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

Expand All @@ -482,15 +550,11 @@ def main(module_filename: str):
# Override for soft hyphen
width_map[0x00AD] = EffectiveWidth.NARROW

# Override for Hangul Jamo medial vowels & final consonants
for i in range(0x1160, 0x11FF + 1):
width_map[i] = EffectiveWidth.ZERO

tables = make_tables(TABLE_CFGS, enumerate(width_map))

print("------------------------")
total_size = 0
for (i, table) in enumerate(tables):
for i, table in enumerate(tables):
size_bytes = len(table.to_bytes())
print(f"Table {i} Size: {size_bytes} bytes")
total_size += size_bytes
Expand Down
19 changes: 12 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]

#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(feature = "bench", feature(test))]
#![no_std]

Expand Down Expand Up @@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {

impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> { cw::width(self, false) }
fn width(self) -> Option<usize> {
cw::width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> { cw::width(self, true) }
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
}
}

/// Methods for determining displayed width of Unicode strings.
Expand All @@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
/// non-CJK contexts, or when the context cannot be reliably determined.
fn width<'a>(&'a self) -> usize;
fn width(&self) -> usize;

/// Returns the string's displayed width in columns.
///
Expand All @@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
/// CJK contexts.
fn width_cjk<'a>(&'a self) -> usize;
fn width_cjk(&self) -> usize;
}

impl UnicodeWidthStr for str {
Expand Down
Loading

0 comments on commit fda272b

Please sign in to comment.