Skip to content

Commit b83714b

Browse files
Add API to correct defective combining character sequences
1 parent a6a221a commit b83714b

File tree

5 files changed

+413
-1
lines changed

5 files changed

+413
-1
lines changed

scripts/unicode.py

+114-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# Since this should not require frequent updates, we just store this
2020
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121
import collections
22+
import re
2223
import urllib.request
2324

2425
UNICODE_VERSION = "15.1.0"
@@ -66,6 +67,8 @@
6667
class UnicodeData(object):
6768
def __init__(self):
6869
self._load_unicode_data()
70+
self._load_default_ignorable_marks()
71+
6972
self.norm_props = self._load_norm_props()
7073
self.norm_tests = self._load_norm_tests()
7174

@@ -100,6 +103,11 @@ def _load_unicode_data(self):
100103
self.general_category_mark = []
101104
self.general_category_public_assigned = []
102105

106+
# Characters that cannot be part of a combining character sequence:
107+
# control characters, format characters other than ZWJ and ZWNJ,
108+
# the line and paragraph separators, and noncharacters.
109+
self.not_in_ccs = []
110+
103111
assigned_start = 0;
104112
prev_char_int = -1;
105113
prev_name = "";
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
125133
if category == 'M' or 'M' in expanded_categories.get(category, []):
126134
self.general_category_mark.append(char_int)
127135

136+
if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
137+
self.not_in_ccs.append(char_int)
138+
128139
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
129140
if category not in ['Co', 'Cs']:
130141
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):
135146

136147
self.general_category_public_assigned.append((assigned_start, prev_char_int))
137148

149+
# Mark noncharacters as nongraphic
150+
for i in range(0xFDD0, 0xFDF0):
151+
self.not_in_ccs.append(i)
152+
for prefix in range(0, 0x11):
153+
shifted = prefix << 16
154+
self.not_in_ccs.append(shifted | 0xFFFE)
155+
self.not_in_ccs.append(shifted | 0xFFFF)
156+
157+
self.not_in_ccs.sort()
158+
159+
def _load_default_ignorable_marks(self):
160+
default_ignorable_cps = set()
161+
162+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
163+
multiple = re.compile(
164+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
165+
)
166+
167+
for line in self._fetch("DerivedCoreProperties.txt").splitlines():
168+
raw_data = None # (low, high)
169+
if match := single.match(line):
170+
raw_data = (match.group(1), match.group(1))
171+
elif match := multiple.match(line):
172+
raw_data = (match.group(1), match.group(2))
173+
else:
174+
continue
175+
low = int(raw_data[0], 16)
176+
high = int(raw_data[1], 16)
177+
for cp in range(low, high + 1):
178+
default_ignorable_cps.add(cp)
179+
180+
self.default_ignorable_marks = []
181+
for cp in self.general_category_mark:
182+
if cp in default_ignorable_cps:
183+
self.default_ignorable_marks.append(cp)
184+
185+
self.default_ignorable_marks.sort()
186+
138187
def _load_cjk_compat_ideograph_variants(self):
139188
for line in self._fetch("StandardizedVariants.txt").splitlines():
140189
strip_comments = line.split('#', 1)[0].strip()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):
454503

455504
def gen_public_assigned(general_category_public_assigned, out):
456505
# This could be done as a hash but the table is somewhat small.
457-
out.write("#[inline]\n")
506+
out.write("\n#[inline]\n")
458507
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
459508
out.write(" match c {\n")
460509

@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
476525
out.write("}\n")
477526
out.write("\n")
478527

528+
def gen_not_in_ccs(not_in_ccs, out):
    """Write the Rust predicate `not_in_ccs(c: char) -> bool` to `out`.

    `not_in_ccs` is a list of code points, expected in ascending order since
    only neighbouring entries are merged. Runs of consecutive code points
    become inclusive ranges, and all of them are emitted as the alternatives
    of a single `=> true` match arm.
    """
    # Coalesce runs of consecutive code points into (first, last) pairs.
    spans = []
    for cp in not_in_ccs:
        if spans and spans[-1][1] == cp - 1:
            spans[-1] = (spans[-1][0], cp)
        else:
            spans.append((cp, cp))

    # Render each pair as a char literal or an inclusive char range pattern.
    patterns = [
        "'\\u{%s}'" % hexify(lo) if lo == hi
        else "'\\u{%s}'..='\\u{%s}'" % (hexify(lo), hexify(hi))
        for lo, hi in spans
    ]

    out.write("\n#[inline]\n")
    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
    out.write(" match c {\n")
    if patterns:
        out.write(" " + "\n | ".join(patterns))
    out.write(" => true,\n")
    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")
557+
558+
def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Write the Rust predicate `is_default_ignorable_mark(c: char) -> bool`.

    `default_ignorable_marks` is a list of code points, expected in ascending
    order since only neighbouring entries are merged. Each run of consecutive
    code points is emitted as one inclusive range pattern; isolated code
    points are emitted as single char literals.
    """
    # Fold the code point list into inclusive [first, last] runs.
    runs = []
    for cp in default_ignorable_marks:
        if runs and runs[-1][1] == cp - 1:
            runs[-1] = (runs[-1][0], cp)
        else:
            runs.append((cp, cp))

    def pattern(first, last):
        # One Rust match pattern for the run first..=last.
        if first == last:
            return "'\\u{%s}'" % hexify(first)
        return "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))

    alternatives = [pattern(first, last) for first, last in runs]

    out.write("\n#[inline]\n")
    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
    out.write(" match c {\n")
    if alternatives:
        out.write(" " + "\n | ".join(alternatives))
    out.write(" => true,\n")
    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")
587+
479588
def gen_stream_safe(leading, trailing, out):
480589
# This could be done as a hash but the table is very small.
481590
out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711
gen_public_assigned(data.general_category_public_assigned, out)
603712
out.write("\n")
604713

714+
gen_not_in_ccs(data.not_in_ccs, out)
715+
716+
gen_default_ignorable_mark(data.default_ignorable_marks, out)
717+
605718
gen_nfc_qc(data.norm_props, out)
606719
out.write("\n")
607720

src/correct_ccs.rs

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#[cfg(not(feature = "std"))]
2+
use alloc::collections::VecDeque;
3+
use core::iter::FusedIterator;
4+
#[cfg(feature = "std")]
5+
use std::collections::VecDeque;
6+
7+
use crate::{lookups, tables};
8+
9+
/// The role a character can play within a combining character sequence (CCS).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CcsKind {
    /// A base character: a graphic character that is not a combining mark.
    Base,

    /// A combining character that is not a `Default_Ignorable_Code_Point`.
    NonIgnorableCombining,

    /// A combining character that is default-ignorable, or ZWJ / ZWNJ.
    IgnorableCombining,
}
20+
21+
impl CcsKind {
22+
fn of(c: char) -> Option<Self> {
23+
if c == '\u{200C}' || c == '\u{200D}' {
24+
// ZWNJ || ZWJ
25+
Some(CcsKind::IgnorableCombining)
26+
} else if lookups::is_combining_mark(c) {
27+
if tables::is_default_ignorable_mark(c) {
28+
Some(CcsKind::IgnorableCombining)
29+
} else {
30+
Some(CcsKind::NonIgnorableCombining)
31+
}
32+
} else if tables::not_in_ccs(c) {
33+
None
34+
} else {
35+
Some(CcsKind::Base)
36+
}
37+
}
38+
}
39+
40+
/// Iterator adaptor that repairs
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
/// by emitting U+00A0 NO-BREAK SPACE immediately before each of them.
///
/// Private use characters, and unassigned code points other than noncharacters,
/// count as valid base characters here, so a combining character sequence
/// that begins with one of them is passed through unchanged.
///
/// A combining character sequence made up solely of `Default_Ignorable_Code_Point`s
/// is likewise left unchanged. Deciding that requires look-ahead, so this iterator
/// may buffer up to the entire length of its input; it is *not* "stream-safe",
/// *even if* it is combined with [`StreamSafe`][crate::StreamSafe].
#[derive(Clone, Debug)]
pub struct CorrectDefectiveCcs<I> {
    /// Whether the last character emitted was part of a CCS.
    in_ccs: bool,
    /// Items waiting to be emitted, oldest first. A `None` entry records that
    /// the inner iterator returned `None` while the buffer was being filled.
    buffer: VecDeque<Option<char>>,
    /// Whether the last character in `buffer` is part of a CCS.
    /// (Updated only when `in_ccs` is set from false to true.)
    end_of_buffer_in_ccs: bool,
    /// The wrapped source of characters.
    iter: I,
}
62+
63+
impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
64+
type Item = char;
65+
66+
fn next(&mut self) -> Option<Self::Item> {
67+
if self.in_ccs {
68+
if let Some(c) = self.buffer.pop_front() {
69+
// Empty buffer
70+
71+
if self.buffer.is_empty() {
72+
self.in_ccs = self.end_of_buffer_in_ccs;
73+
}
74+
c
75+
} else {
76+
// Forward from inner iterator
77+
78+
let c = self.iter.next();
79+
if c.map_or(true, tables::not_in_ccs) {
80+
self.in_ccs = false;
81+
}
82+
c
83+
}
84+
} else {
85+
if self.buffer.is_empty() {
86+
// We don't have a buffer of default ignorable combining characters built up
87+
88+
let c = self.iter.next()?;
89+
match CcsKind::of(c) {
90+
// Character not in CCS, just forward it
91+
None => return Some(c),
92+
93+
// Character starts non-defective CCS,
94+
// label ourselves as in CCS and forward it
95+
Some(CcsKind::Base) => {
96+
self.in_ccs = true;
97+
return Some(c);
98+
}
99+
100+
// Character starts defective CCS and is not default-ignorable.
101+
// Put it in the buffer to emit on next iteration,
102+
// mark ourselves as in CCS,
103+
// and emit NO-BREAK SPACE
104+
Some(CcsKind::NonIgnorableCombining) => {
105+
self.in_ccs = true;
106+
self.end_of_buffer_in_ccs = true;
107+
self.buffer.push_back(Some(c));
108+
return Some('\u{00A0}'); // NO-BREAK SPACE
109+
}
110+
111+
// Character starts defective CCS and is default-ignorable.
112+
// Put it in the buffer, and fall through to loop below
113+
// to find out whether we emit a NO-BREAK SPACE first.
114+
Some(CcsKind::IgnorableCombining) => {
115+
self.buffer.push_back(Some(c));
116+
}
117+
}
118+
}
119+
120+
loop {
121+
// We do have a buffer of default ignorable combining characters built up,
122+
// and we need to figure out whether to emit a NO-BREAK SPACE first.
123+
124+
let c = self.iter.next();
125+
match c.and_then(CcsKind::of) {
126+
// Inner iterator yielded character outside CCS (or `None`).
127+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
128+
None => {
129+
self.in_ccs = true;
130+
self.end_of_buffer_in_ccs = false;
131+
let ret = self.buffer.pop_front().unwrap();
132+
self.buffer.push_back(c);
133+
return ret;
134+
}
135+
136+
// Inner iterator yielded character that starts a new CCS.
137+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
138+
Some(CcsKind::Base) => {
139+
self.in_ccs = true;
140+
self.end_of_buffer_in_ccs = true;
141+
let ret = self.buffer.pop_front().unwrap();
142+
self.buffer.push_back(c);
143+
return ret;
144+
}
145+
146+
// Inner iterator yielded non-ignorable combining character.
147+
// Emit the built-up buffer with leading NO-BREAK SPACE.
148+
Some(CcsKind::NonIgnorableCombining) => {
149+
self.in_ccs = true;
150+
self.end_of_buffer_in_ccs = true;
151+
self.buffer.push_back(c);
152+
return Some('\u{00A0}'); // NO-BREAK SPACE
153+
}
154+
155+
// Inner iterator yielded ignorable combining character.
156+
// Add it to the buffer, don't emit anything.
157+
Some(CcsKind::IgnorableCombining) => {
158+
self.buffer.push_back(c);
159+
}
160+
}
161+
}
162+
}
163+
}
164+
}
165+
166+
// The adaptor is declared fused whenever the wrapped iterator is fused.
impl<I> FusedIterator for CorrectDefectiveCcs<I> where I: Iterator<Item = char> + FusedIterator {}
167+
168+
impl<I> CorrectDefectiveCcs<I> {
169+
pub(crate) fn new(iter: I) -> Self {
170+
Self {
171+
in_ccs: false,
172+
buffer: VecDeque::new(),
173+
end_of_buffer_in_ccs: false,
174+
iter,
175+
}
176+
}
177+
}

0 commit comments

Comments
 (0)