Skip to content

Commit 714ddc5

Browse files
Assign width 3 to KHMER SIGN BEYYAL
See https://unicode.org/charts/nameslist/n_1780.html
1 parent e6ba907 commit 714ddc5

File tree

4 files changed

+33
-16
lines changed

4 files changed

+33
-16
lines changed

scripts/unicode.py

+19 -11
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
184184
WIDE = 0x1_0002
185185
"Two columns wide."
186186

187+
THREE = 0x1_0003
188+
"Three columns wide."
189+
187190
# \r\n
188191
LINE_FEED = 0b0000_0000_0000_0001
189192
"\\n (CRLF has width 1)"
@@ -341,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
341344
case _:
342345
return CharWidthInTable.SPECIAL
343346

347+
def is_carried(self) -> bool:
348+
"Whether this corresponds to a non-default `WidthInfo`."
349+
return int(self) <= 0xFFFF
350+
344351
def width_alone(self) -> int:
345352
"The width of a character with this type when it appears alone."
346353
match self:
@@ -357,6 +364,8 @@ def width_alone(self) -> int:
357364
| WidthState.EMOJI_PRESENTATION
358365
):
359366
return 2
367+
case WidthState.THREE:
368+
return 3
360369
case _:
361370
return 1
362371

@@ -598,6 +607,7 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
598607
(alef_joining, WidthState.JOINING_GROUP_ALEF),
599608
(range(0x1780, 0x17A3), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
600609
([0x17A7, 0x17AB, 0x17AC, 0x17AF], WidthState.KHMER_COENG_ELIGIBLE_LETTER),
610+
([0x17D8], WidthState.THREE),
601611
([0x1A10], WidthState.BUGINESE_LETTER_YA),
602612
(range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
603613
([0x2D6F], WidthState.TIFINAGH_CONSONANT),
@@ -1196,7 +1206,11 @@ def lookup_fns(
11961206
s += f" '\\u{{{lo:X}}}'"
11971207
if hi != lo:
11981208
s += f"..='\\u{{{hi:X}}}'"
1199-
s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
1209+
if width.is_carried():
1210+
width_info = width.name
1211+
else:
1212+
width_info = "DEFAULT"
1213+
s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"
12001214

12011215
s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION),
12021216
}}
@@ -1531,7 +1545,7 @@ def emit_module(
15311545
)
15321546

15331547
for variant in WidthState:
1534-
if variant.table_width() == CharWidthInTable.SPECIAL:
1548+
if variant.is_carried():
15351549
if variant.is_cjk_only():
15361550
module.write(' #[cfg(feature = "cjk")]\n')
15371551
module.write(
@@ -1925,7 +1939,7 @@ def emit_module(
19251939
test_width_variants = []
19261940
test_width_variants_cjk = []
19271941
for variant in WidthState:
1928-
if variant.table_width() == CharWidthInTable.SPECIAL:
1942+
if variant.is_carried():
19291943
if not variant.is_cjk_only():
19301944
test_width_variants.append(variant)
19311945
if not variant.is_non_cjk_only():
@@ -2003,10 +2017,7 @@ def emit_module(
20032017
)
20042018

20052019
for variant in WidthState:
2006-
if (
2007-
variant.table_width() == CharWidthInTable.SPECIAL
2008-
and not variant.is_cjk_only()
2009-
):
2020+
if variant.is_carried() and not variant.is_cjk_only():
20102021
module.write(f" WidthInfo::{variant.name},\n")
20112022

20122023
module.write(
@@ -2018,10 +2029,7 @@ def emit_module(
20182029
)
20192030

20202031
for variant in WidthState:
2021-
if (
2022-
variant.table_width() == CharWidthInTable.SPECIAL
2023-
and not variant.is_non_cjk_only()
2024-
):
2032+
if variant.is_carried() and not variant.is_non_cjk_only():
20252033
module.write(f" WidthInfo::{variant.name},\n")
20262034

20272035
module.write(

src/lib.rs

+5 -4
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,8 @@
8888
//! 2. In all other cases, the width of the string equals the sum of its character widths:
8989
//! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
9090
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
91-
//! 3. The following have width 0:
91+
//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
92+
//! 4. The following have width 0:
9293
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
9394
//! with the [`Default_Ignorable_Code_Point`] property.
9495
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -111,15 +112,15 @@
111112
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
112113
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
113114
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
114-
//! 4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
115+
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
115116
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
116-
//! 5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
117+
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
117118
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
118119
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
119120
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
120121
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
121122
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
122-
//! 6. All other characters have width 1.
123+
//! 7. All other characters have width 1.
123124
//!
124125
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
125126
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F

src/tables.rs

+3 -1
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,7 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
161161
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
162162
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
163163
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
164+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
164165
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
165166
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
166167
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -444,6 +445,7 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
444445
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
445446
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
446447
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
448+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
447449
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
448450
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
449451
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -1220,7 +1222,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
12201222
],
12211223
[
12221224
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xD5, 0xD5, 0xD7, 0x55, 0x10, 0x00,
1223-
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1225+
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
12241226
0x55, 0x55,
12251227
],
12261228
[

tests/tests.rs

+6
Original file line number | Diff line number | Diff line change
@@ -409,6 +409,12 @@ fn test_khmer_coeng() {
409409
}
410410
}
411411

412+
#[test]
413+
fn test_khmer_sign_beyyal() {
414+
assert_width!("៘", 3, 3);
415+
assert_width!("។ល។", 3, 3);
416+
}
417+
412418
#[test]
413419
fn test_emoji_modifier() {
414420
assert_width!("\u{1F46A}", 2, 2);

0 commit comments

Comments
 (0)