Skip to content

Commit 2517d68

Browse files
Treat ambiguous Modifier_Letters as narrow (unicode-rs#63)
* Treat ambiguous `Modifier_Letter`s as narrow. This matches the behavior of common fonts. Affects 6 characters: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5B%3AEast_Asian_Width%3DAmbiguous%3A%5D-%5B%5B%3AScript%3D%2FLatin%7CGreek%7CCyrillic%2F%3A%5D-%5B%5B%3ABlock%3DNumber+Forms%3A%5D%26%5B%3Asubhead%3DRoman+numerals%3A%5D%5D%5D%5D%26%5B%3AModifier_Letter%3A%5D
* Simplify derivation of ambiguous width. Use the `Letter` general category instead of script and block. This changes `ℓ` to narrow, matching common fonts.
1 parent 8e40640 commit 2517d68

File tree

4 files changed

+20
-32
lines changed

4 files changed

+20
-32
lines changed

scripts/unicode.py

+2-15
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
# - NormalizationTest.txt (for tests only)
1919
# - PropList.txt
2020
# - ReadMe.txt
21-
# - Scripts.txt
2221
# - UnicodeData.txt
2322
# - auxiliary/GraphemeBreakProperty.txt
2423
# - emoji/emoji-data.txt
@@ -430,22 +429,10 @@ def load_east_asian_widths() -> list[EastAsianWidth]:
430429
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
431430
width_map.append(EastAsianWidth.NARROW)
432431

433-
# Characters from alphabetic scripts are narrow
434-
load_property(
435-
"Scripts.txt",
436-
r"(?:Latin|Greek|Cyrillic)",
437-
lambda cp: (
438-
operator.setitem(width_map, cp, EastAsianWidth.NARROW)
439-
if width_map[cp] == EastAsianWidth.AMBIGUOUS
440-
and not (0x2160 <= cp <= 0x217F) # Roman numerals remain ambiguous
441-
else None
442-
),
443-
)
444-
445-
# Ambiguous `Modifier_Symbol`s are narrow
432+
# Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
446433
load_property(
447434
"extracted/DerivedGeneralCategory.txt",
448-
"Sk",
435+
r"(:?Lu|Ll|Lt|Lm|Lo|Sk)",
449436
lambda cp: (
450437
operator.setitem(width_map, cp, EastAsianWidth.NARROW)
451438
if width_map[cp] == EastAsianWidth.AMBIGUOUS

src/lib.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,7 @@
122122
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
123123
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
124124
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
125-
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
126-
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
125+
//! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`.
127126
//! 7. All other characters have width 1.
128127
//!
129128
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338

src/tables.rs

+9-15
Original file line numberDiff line numberDiff line change
@@ -1022,17 +1022,17 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
10221022
],
10231023
#[cfg(feature = "cjk")]
10241024
[
1025-
0x00, 0x9D, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0xA0, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08,
1025+
0x00, 0x9D, 0x02, 0x02, 0x02, 0x02, 0x9E, 0x9F, 0x02, 0x04, 0x02, 0x05, 0x06, 0x07, 0x08,
10261026
0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
10271027
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x02, 0x02, 0x1E, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
10281028
0x02, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x02, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x02, 0x2A,
10291029
0x02, 0x02, 0x02, 0x02,
10301030
],
10311031
#[cfg(feature = "cjk")]
10321032
[
1033-
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE,
1034-
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
1035-
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xB0, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
1033+
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
1034+
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
1035+
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10361036
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10371037
0x39, 0x39, 0x39, 0x39,
10381038
],
@@ -1042,23 +1042,23 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
10421042
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10431043
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10441044
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x4C, 0x02, 0x02, 0x02, 0x02, 0x02,
1045-
0xB1, 0x4E, 0x4F, 0xB2,
1045+
0xB0, 0x4E, 0x4F, 0xB1,
10461046
],
10471047
#[cfg(feature = "cjk")]
10481048
[
10491049
0x85, 0x86, 0x75, 0x02, 0x02, 0x87, 0x02, 0x02, 0x02, 0x88, 0x02, 0x02, 0x02, 0x02, 0x02,
10501050
0x02, 0x02, 0x89, 0x8A, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
1051-
0x02, 0x02, 0x8B, 0x8C, 0xB3, 0xB4, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
1051+
0x02, 0x02, 0x8B, 0x8C, 0xB2, 0xB3, 0x8E, 0x02, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
10521052
0x96, 0x02, 0x97, 0x02, 0x02, 0x98, 0x99, 0x9A, 0x9B, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
10531053
0x02, 0x02, 0x02, 0x02,
10541054
],
10551055
]);
10561056

10571057
#[cfg(feature = "cjk")]
1058-
const WIDTH_LEAVES_LEN: usize = 181;
1058+
const WIDTH_LEAVES_LEN: usize = 180;
10591059
#[cfg(not(feature = "cjk"))]
10601060
const WIDTH_LEAVES_LEN: usize = 157;
1061-
/// Autogenerated. 181 sub-table(s). Consult [`lookup_width`] for layout info.
1061+
/// Autogenerated. 180 sub-table(s). Consult [`lookup_width`] for layout info.
10621062
static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
10631063
[
10641064
0x55, 0x55, 0x75, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
@@ -1852,12 +1852,6 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
18521852
0x55, 0x55,
18531853
],
18541854
#[cfg(feature = "cjk")]
1855-
[
1856-
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1857-
0x55, 0x55, 0x95, 0xA9, 0x59, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1858-
0x55, 0x55,
1859-
],
1860-
#[cfg(feature = "cjk")]
18611855
[
18621856
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03,
18631857
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55,
@@ -1883,7 +1877,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
18831877
],
18841878
#[cfg(feature = "cjk")]
18851879
[
1886-
0x95, 0x59, 0x59, 0x55, 0x95, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1880+
0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
18871881
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
18881882
0x5A, 0x55,
18891883
],

tests/tests.rs

+8
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,14 @@ fn test_default_ignorable() {
7878
assert_width!('\u{E0000}', Some(0), Some(0));
7979
}
8080

81+
#[test]
82+
fn test_ambiguous() {
83+
assert_width!("\u{B7}", 1, 2);
84+
assert_width!("\u{0387}", 1, 2);
85+
assert_width!("\u{A8}", 1, 1);
86+
assert_width!("\u{02C9}", 1, 1);
87+
}
88+
8189
#[test]
8290
fn test_jamo() {
8391
assert_width!('\u{1100}', Some(2), Some(2));

0 commit comments

Comments
 (0)