Skip to content

Commit 19dc63f

Browse files
Support Khmer
- Support nonspacing coeng signs
- Assign width 2 to KHMER INDEPENDENT VOWEL QAA and 3 to KHMER SIGN BEYYAL (https://unicode.org/charts/nameslist/n_1780.html)
1 parent e9130a8 commit 19dc63f

File tree

5 files changed

+113
-21
lines changed

5 files changed

+113
-21
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ fn main() {
2525
```
2626

2727
**NOTE:** The computed width values may not match the actual rendered column
28-
width. For example, Brahmic scripts like Devanagari have complex rendering rules
28+
width. For example, many Brahmic scripts like Devanagari have complex rendering rules
2929
which this crate does not currently handle (and will never fully handle, because
3030
the exact rendering depends on the font):
3131

scripts/unicode.py

+40-11
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
184184
WIDE = 0x1_0002
185185
"Two columns wide."
186186

187+
THREE = 0x1_0003
188+
"Three columns wide."
189+
187190
# \r\n
188191
LINE_FEED = 0b0000_0000_0000_0001
189192
"\\n (CRLF has width 1)"
@@ -324,6 +327,11 @@ class WidthState(enum.IntEnum):
324327
ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110
325328
"\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)"
326329

330+
# Khmer coeng signs
331+
332+
KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111
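333+
"\\u1780..=\\u1782 | \\u1784..=\\u1787 | \\u1789..=\\u178C | \\u178E..=\\u1793 | \\u1795..=\\u1798 | \\u179B..=\\u179D | \\u17A0 | \\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF"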
334+
327335
def table_width(self) -> CharWidthInTable:
328336
"The width of a character as stored in the lookup tables."
329337
match self:
@@ -336,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
336344
case _:
337345
return CharWidthInTable.SPECIAL
338346

347+
def is_carried(self) -> bool:
348+
"Whether this corresponds to a non-default `WidthInfo`."
349+
return int(self) <= 0xFFFF
350+
339351
def width_alone(self) -> int:
340352
"The width of a character with this type when it appears alone."
341353
match self:
@@ -352,6 +364,8 @@ def width_alone(self) -> int:
352364
| WidthState.EMOJI_PRESENTATION
353365
):
354366
return 2
367+
case WidthState.THREE:
368+
return 3
355369
case _:
356370
return 1
357371

@@ -591,6 +605,18 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
591605
([0x0A], WidthState.LINE_FEED),
592606
([0x05DC], WidthState.HEBREW_LETTER_LAMED),
593607
(alef_joining, WidthState.JOINING_GROUP_ALEF),
608+
(range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
609+
(range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
610+
(range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
611+
(range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
612+
(range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
613+
(range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
614+
(
615+
[0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF],
616+
WidthState.KHMER_COENG_ELIGIBLE_LETTER,
617+
),
618+
([0x17A4], WidthState.WIDE),
619+
([0x17D8], WidthState.THREE),
594620
([0x1A10], WidthState.BUGINESE_LETTER_YA),
595621
(range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
596622
([0x2D6F], WidthState.TIFINAGH_CONSONANT),
@@ -1189,7 +1215,11 @@ def lookup_fns(
11891215
s += f" '\\u{{{lo:X}}}'"
11901216
if hi != lo:
11911217
s += f"..='\\u{{{hi:X}}}'"
1192-
s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
1218+
if width.is_carried():
1219+
width_info = width.name
1220+
else:
1221+
width_info = "DEFAULT"
1222+
s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"
11931223

11941224
s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION),
11951225
}}
@@ -1323,6 +1353,11 @@ def lookup_fns(
13231353
return (0, WidthInfo::DEFAULT);
13241354
}
13251355
1356+
// Khmer coeng signs
1357+
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
1358+
return (-1, WidthInfo::DEFAULT);
1359+
}
1360+
13261361
// Buginese <a, -i> ZWJ ya ligature
13271362
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
13281363
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -1519,7 +1554,7 @@ def emit_module(
15191554
)
15201555

15211556
for variant in WidthState:
1522-
if variant.table_width() == CharWidthInTable.SPECIAL:
1557+
if variant.is_carried():
15231558
if variant.is_cjk_only():
15241559
module.write(' #[cfg(feature = "cjk")]\n')
15251560
module.write(
@@ -1913,7 +1948,7 @@ def emit_module(
19131948
test_width_variants = []
19141949
test_width_variants_cjk = []
19151950
for variant in WidthState:
1916-
if variant.table_width() == CharWidthInTable.SPECIAL:
1951+
if variant.is_carried():
19171952
if not variant.is_cjk_only():
19181953
test_width_variants.append(variant)
19191954
if not variant.is_non_cjk_only():
@@ -1991,10 +2026,7 @@ def emit_module(
19912026
)
19922027

19932028
for variant in WidthState:
1994-
if (
1995-
variant.table_width() == CharWidthInTable.SPECIAL
1996-
and not variant.is_cjk_only()
1997-
):
2029+
if variant.is_carried() and not variant.is_cjk_only():
19982030
module.write(f" WidthInfo::{variant.name},\n")
19992031

20002032
module.write(
@@ -2006,10 +2038,7 @@ def emit_module(
20062038
)
20072039

20082040
for variant in WidthState:
2009-
if (
2010-
variant.table_width() == CharWidthInTable.SPECIAL
2011-
and not variant.is_non_cjk_only()
2012-
):
2041+
if variant.is_carried() and not variant.is_non_cjk_only():
20132042
module.write(f" WidthInfo::{variant.name},\n")
20142043

20152044
module.write(

src/lib.rs

+11-5
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@
7272
//! with [`Joining_Group`]`=Alef`, has total width 1. For example: `لا`‎, `لآ`‎, `ڸا`‎, `لٟٞأ`
7373
//! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
7474
//! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
75+
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
76+
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
77+
//! have width 0.
7578
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
7679
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
7780
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -85,8 +88,10 @@
8588
//! - Is a [default-ignorable][`Default_Ignorable_Code_Point`] [combining mark][combining marks].
8689
//! 2. In all other cases, the width of the string equals the sum of its character widths:
8790
//! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
88-
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
89-
//! 3. The following have width 0:
91+
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) and
92+
//! [`'\u{17A4}'` KHMER INDEPENDENT VOWEL QAA](https://util.unicode.org/UnicodeJsps/character.jsp?a=17A4) have width 2.
93+
//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
94+
//! 4. The following have width 0:
9095
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
9196
//! with the [`Default_Ignorable_Code_Point`] property.
9297
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -109,15 +114,15 @@
109114
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
110115
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
111116
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
112-
//! 4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
117+
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
113118
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
114-
//! 5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
119+
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
115120
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
116121
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
117122
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
118123
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
119124
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
120-
//! 6. All other characters have width 1.
125+
//! 7. All other characters have width 1.
121126
//!
122127
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
123128
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
@@ -150,6 +155,7 @@
150155
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
151156
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
152157
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
158+
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
153159
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
154160
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
155161
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184

src/tables.rs

+21-4
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ impl WidthInfo {
5858
const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b0011110000000101);
5959
const OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011100000000110);
6060
const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011110000000110);
61+
const KHMER_COENG_ELIGIBLE_LETTER: Self = Self(0b0011110000000111);
6162

6263
/// Whether this width mode is ligature_transparent
6364
/// (has 5th MSB set.)
@@ -159,6 +160,8 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
159160
'\u{A}' => (1, WidthInfo::LINE_FEED),
160161
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
161162
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
163+
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
164+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
162165
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
163166
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
164167
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -255,6 +258,11 @@ fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
255258
return (0, WidthInfo::DEFAULT);
256259
}
257260

261+
// Khmer coeng signs
262+
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
263+
return (-1, WidthInfo::DEFAULT);
264+
}
265+
258266
// Buginese <a, -i> ZWJ ya ligature
259267
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
260268
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -436,6 +444,8 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
436444
'\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
437445
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
438446
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
447+
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
448+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
439449
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
440450
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
441451
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -539,6 +549,11 @@ fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {
539549
return (0, WidthInfo::DEFAULT);
540550
}
541551

552+
// Khmer coeng signs
553+
(WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => {
554+
return (-1, WidthInfo::DEFAULT);
555+
}
556+
542557
// Buginese <a, -i> ZWJ ya ligature
543558
(WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => {
544559
return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
@@ -1206,8 +1221,8 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
12061221
0x55, 0x55,
12071222
],
12081223
[
1209-
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x10, 0x00,
1210-
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1224+
0x7F, 0xFF, 0xFD, 0xF7, 0xFF, 0xFD, 0xD7, 0x5F, 0x77, 0xD6, 0xD5, 0xD7, 0x55, 0x10, 0x00,
1225+
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
12111226
0x55, 0x55,
12121227
],
12131228
[
@@ -2575,7 +2590,7 @@ mod tests {
25752590
}
25762591
}
25772592

2578-
static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 37] = [
2593+
static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 38] = [
25792594
WidthInfo::DEFAULT,
25802595
WidthInfo::LINE_FEED,
25812596
WidthInfo::EMOJI_MODIFIER,
@@ -2613,10 +2628,11 @@ mod tests {
26132628
WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
26142629
WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
26152630
WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
2631+
WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
26162632
];
26172633

26182634
#[cfg(feature = "cjk")]
2619-
static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 38] = [
2635+
static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 39] = [
26202636
WidthInfo::DEFAULT,
26212637
WidthInfo::LINE_FEED,
26222638
WidthInfo::EMOJI_MODIFIER,
@@ -2655,6 +2671,7 @@ mod tests {
26552671
WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU,
26562672
WidthInfo::OLD_TURKIC_LETTER_ORKHON_I,
26572673
WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I,
2674+
WidthInfo::KHMER_COENG_ELIGIBLE_LETTER,
26582675
];
26592676

26602677
#[rustfmt::skip]

tests/tests.rs

+40
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,46 @@ fn test_old_turkic_ligature() {
385385
assert_width!("\u{200D}\u{10C32}", 1, 1);
386386
}
387387

388+
#[test]
389+
fn test_khmer_coeng() {
390+
assert_width!("ល", 1, 1);
391+
assert_width!("ង", 1, 1);
392+
assert_width!("លង", 2, 2);
393+
assert_width!("ល្ង", 1, 1);
394+
395+
for c in '\0'..=char::MAX {
396+
if matches!(
397+
c,
398+
'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}'
399+
| '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}'
400+
| '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}'
401+
| '\u{17A0}' | '\u{17A2}' | '\u{17A7}'
402+
| '\u{17AB}'..='\u{17AC}' | '\u{17AF}'
403+
) {
404+
assert_width!(format!("\u{17D2}{c}"), 0, 0);
405+
assert_width!(format!("\u{17D2}\u{200D}\u{200D}{c}"), 0, 0);
406+
} else {
407+
assert_width!(
408+
format!("\u{17D2}{c}"),
409+
c.width().unwrap_or(1),
410+
c.width_cjk().unwrap_or(1)
411+
);
412+
}
413+
}
414+
}
415+
416+
#[test]
417+
fn test_khmer_qaa() {
418+
assert_width!("\u{17A4}", 2, 2);
419+
assert_width!("\u{17A2}\u{17A6}", 2, 2);
420+
}
421+
422+
#[test]
423+
fn test_khmer_sign_beyyal() {
424+
assert_width!("\u{17D8}", 3, 3);
425+
assert_width!("\u{17D4}\u{179B}\u{17D4}", 3, 3);
426+
}
427+
388428
#[test]
389429
fn test_emoji_modifier() {
390430
assert_width!("\u{1F46A}", 2, 2);

0 commit comments

Comments
 (0)