Skip to content

Commit 523bd20

Browse files
committed
ucd-util: fix canonicalization of 'isc'
This commit fixes a bug where 'isc' was canonicalized to 'c'. 'isc' is an alias for 'ISO_Comment', but the 'is' prefix was being dropped since canonicalization permits ignoring 'is' prefixes when designating property names. This is the root cause of a bug in the regex library: rust-lang/regex#466
1 parent 7a5cd62 commit 523bd20

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

ucd-util/src/name.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,10 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
9494
// names/aliases had a particular structure (unlike character names), but
9595
// we assume that it's ASCII only and drop anything that isn't ASCII.
9696
let mut start = 0;
97+
let mut starts_with_is = false;
9798
if slice.len() >= 2 {
9899
// Ignore any "is" prefix.
99-
let starts_with_is =
100+
starts_with_is =
100101
slice[0..2] == b"is"[..]
101102
|| slice[0..2] == b"IS"[..]
102103
|| slice[0..2] == b"iS"[..]
@@ -121,6 +122,16 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
121122
next_write += 1;
122123
}
123124
}
125+
// Special case: ISO_Comment has an 'isc' abbreviation. Since we generally
126+
// ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
127+
// fire and ends up creating an alias for 'c' to 'ISO_Comment', but 'c'
128+
// is actually an alias for the 'Other' general category.
129+
if starts_with_is && next_write == 1 && slice[0] == b'c' {
130+
slice[0] = b'i';
131+
slice[1] = b's';
132+
slice[2] = b'c';
133+
next_write = 3;
134+
}
124135
&mut slice[..next_write]
125136
}
126137

@@ -162,6 +173,9 @@ mod tests {
162173
assert_eq!(sym_norm("Greek"), "greek");
163174
assert_eq!(sym_norm("isGreek"), "greek");
164175
assert_eq!(sym_norm("IS_Greek"), "greek");
176+
assert_eq!(sym_norm("isc"), "isc");
177+
assert_eq!(sym_norm("is c"), "isc");
178+
assert_eq!(sym_norm("is_c"), "isc");
165179
}
166180

167181
#[test]

0 commit comments

Comments
 (0)