Skip to content

Commit dc86c74

Browse files
Assign the same CJK width to canonically equivalent strings
1 parent a2db56b commit dc86c74

File tree

4 files changed

+462
-394
lines changed

4 files changed

+462
-394
lines changed

scripts/unicode.py

+43-1
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@
1515
# - DerivedCoreProperties.txt
1616
# - EastAsianWidth.txt
1717
# - HangulSyllableType.txt
18+
# - NormalizationTest.txt (for tests only)
1819
# - PropList.txt
1920
# - ReadMe.txt
21+
# - Scripts.txt
22+
# - UnicodeData.txt
23+
# - emoji/emoji-data.txt
2024
# - emoji/emoji-variation-sequences.txt
25+
# - extracted/DerivedGeneralCategory.txt
2126
#
2227
# Since this should not require frequent updates, we just store this
2328
# out-of-line and check the generated module into git.
@@ -142,6 +147,7 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
142147
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
143148
144149
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
150+
145151
with fetch_open("EastAsianWidth.txt") as eaw:
146152
# matches a width assignment for a single codepoint, e.g. "1F336;N # ..."
147153
single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
@@ -179,7 +185,43 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
179185
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
180186
width_map.append(EffectiveWidth.NARROW)
181187

182-
return width_map
188+
# Ambiguous characters from the Latin, Greek, and Cyrillic scripts are narrow
189+
load_property(
190+
"Scripts.txt",
191+
r"(?:Latin|Greek|Cyrillic)",
192+
lambda cp: (
193+
operator.setitem(width_map, cp, EffectiveWidth.NARROW)
194+
if width_map[cp] == EffectiveWidth.AMBIGUOUS
195+
and not (0x2160 <= cp <= 0x217F) # Roman numerals remain ambiguous
196+
else None
197+
),
198+
)
199+
200+
# Ambiguous `Modifier_Symbol`s are narrow
201+
load_property(
202+
"extracted/DerivedGeneralCategory.txt",
203+
"Sk",
204+
lambda cp: (
205+
operator.setitem(width_map, cp, EffectiveWidth.NARROW)
206+
if width_map[cp] == EffectiveWidth.AMBIGUOUS
207+
else None
208+
),
209+
)
210+
211+
# GREEK ANO TELEIA (U+0387) canonically decomposes to U+00B7 MIDDLE DOT, so it must get the same (ambiguous) width
212+
width_map[0x0387] = EffectiveWidth.AMBIGUOUS
213+
214+
# Canonical equivalence: a character that decomposes to an ambiguous base followed by U+0338 COMBINING LONG SOLIDUS OVERLAY is ambiguous too
215+
with fetch_open("UnicodeData.txt") as udata:
216+
single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")
217+
for line in udata.readlines():
218+
if match := single.match(line):
219+
composed = int(match.group(1), 16)
220+
decomposed = int(match.group(2), 16)
221+
if width_map[decomposed] == EffectiveWidth.AMBIGUOUS:
222+
width_map[composed] = EffectiveWidth.AMBIGUOUS
223+
224+
return width_map
183225

184226

185227
def load_zero_widths() -> list[bool]:

src/lib.rs

+29-13
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,9 @@
4040
//! 3. The sequence `"\r\n"` has width 1.
4141
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
4242
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43-
//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44-
//! 6. The following have width 0:
43+
//! 5. In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
44+
//! 6. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
45+
//! 7. The following have width 0:
4546
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4647
//! with the [`Default_Ignorable_Code_Point`] property.
4748
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -64,18 +65,26 @@
6465
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
6566
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
6667
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
67-
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
68+
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6869
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
69-
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
70-
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
71-
//! 9. All other characters have width 1.
70+
//! 9. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
71+
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
72+
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
73+
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
74+
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
75+
//! - Either does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
76+
//! 10. All other characters have width 1.
77+
//!
78+
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
7279
//!
7380
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
7481
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
7582
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
83+
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
7684
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
7785
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
7886
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
87+
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
7988
//!
8089
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
8190
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
@@ -84,14 +93,13 @@
8493
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
8594
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
8695
//!
87-
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
96+
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
8897
//!
8998
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
9099
//!
91100
//! ## Canonical equivalence
92101
//!
93-
//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
94-
//! However, this guarantee does not currently hold for the CJK width variants.
102+
//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).
95103
96104
#![forbid(unsafe_code)]
97105
#![deny(missing_docs)]
@@ -198,14 +206,17 @@ enum NextCharInfo {
198206
#[default]
199207
Default,
200208
/// `'\n'`
201-
LineFeed = 0x0A,
209+
LineFeed,
210+
/// `'\u{0338}'`
211+
/// Tracked so that CJK widths are preserved under canonical equivalence.
212+
CombiningLongSolidusOverlay,
202213
/// `'\u{A4FC}'..='\u{A4FD}'`
203214
/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
204215
TrailingLisuToneLetter,
205216
/// `'\u{FE0E}'`
206-
Vs15 = 0x0E,
217+
Vs15,
207218
/// `'\u{FE0F}'`
208-
Vs16 = 0x0F,
219+
Vs16,
209220
}
210221

211222
fn str_width(s: &str, is_cjk: bool) -> usize {
@@ -222,7 +233,11 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
222233
/// they're treated as single width.
223234
#[inline]
224235
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
225-
if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
236+
if (is_cjk
237+
&& next_info == NextCharInfo::CombiningLongSolidusOverlay
238+
&& matches!(c, '<' | '=' | '>'))
239+
|| (next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c))
240+
{
226241
(2, NextCharInfo::Default)
227242
} else if c <= '\u{A0}' {
228243
match c {
@@ -235,6 +250,7 @@ fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextC
235250
('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
236251
(0, NextCharInfo::Default)
237252
}
253+
('\u{0338}', _) => (0, NextCharInfo::CombiningLongSolidusOverlay),
238254
('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
239255
('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
240256
('\u{FE0F}', _) => (0, NextCharInfo::Vs16),

0 commit comments

Comments
 (0)