Skip to content

Commit d1df3fe

Browse files
authored
Auto merge of #34485 - tbu-:pr_unicode_debug_str, r=alexcrichton
Escape fewer Unicode codepoints in `Debug` impl of `str`. Use the same procedure as Python to determine whether a character is printable, described in [PEP 3138]. In particular, this means that the following character classes are escaped: - Cc (Other, Control) - Cf (Other, Format) - Cs (Other, Surrogate), even though they can't appear in Rust strings - Co (Other, Private Use) - Cn (Other, Not Assigned) - Zl (Separator, Line) - Zp (Separator, Paragraph) - Zs (Separator, Space), except for the ASCII space `' '` `0x20` This allows for user-friendly inspection of strings that are not English (e.g. compare `"\u{e9}\u{e8}\u{ea}"` to `"éèê"`). Fixes #34318. CC #34422. [PEP 3138]: https://www.python.org/dev/peps/pep-3138/
2 parents 748ecb1 + 3d09b4a commit d1df3fe

File tree

16 files changed

+1032
-29
lines changed

16 files changed

+1032
-29
lines changed

src/etc/char_private.py

+154
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
# This script uses the following Unicode tables:
14+
# - Categories.txt
15+
16+
import os
17+
import subprocess
18+
19+
def to_ranges(iter, boundaries=(0x10000, 0x20000)):
    """Collapse a sequence of integers into half-open ``(start, end)`` ranges.

    A range is extended for as long as each incoming value is exactly one
    more than the previous one.  A new range is also started whenever the
    incoming value is one of ``boundaries``, so a yielded range never
    contains values on both sides of a boundary.

    Args:
        iter: iterable of integers, consumed lazily in order.
        boundaries: values at which a run is always cut.  The default keeps
            the splits at 0x10000 and 0x20000.

    Yields:
        ``(start, end)`` tuples with ``end`` exclusive.  Nothing is yielded
        for an empty input.
    """
    start = end = None
    for i in iter:
        if start is not None and i == end and i not in boundaries:
            # `i` directly continues the open run.
            end += 1
            continue
        if start is not None:
            yield (start, end)
        start, end = i, i + 1
    # Flush the run that was still open when the input ended.
    if start is not None:
        yield (start, end)
30+
31+
def get_escaped(dictionary):
    """Yield, in ascending order, every code point that has to be escaped.

    Args:
        dictionary: mapping from code point (int) to its two-letter Unicode
            general category.  Code points missing from the mapping are
            treated as "Cn" (not assigned).

    Yields:
        Each integer in ``range(0x110000)`` whose category is one of
        Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs -- except the ASCII space, which is
        never yielded.
    """
    # Built once instead of re-splitting a string on every iteration.
    escaped_categories = ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
    space = ord(' ')
    for i in range(0x110000):
        if i != space and dictionary.get(i, "Cn") in escaped_categories:
            yield i
35+
36+
def get_file(f):
    """Open the local copy of the file named by URL ``f``, fetching it if absent.

    The local file name is the last path component of ``f``, looked up in
    the current working directory.  If it does not exist, ``curl`` is run
    to download ``f`` into the current directory and the file is opened
    afterwards.

    Args:
        f: URL of the file.

    Returns:
        The file opened for reading in text mode; the caller closes it.

    Raises:
        subprocess.CalledProcessError: if ``curl`` exits with a non-zero
            status.
    """
    local_name = os.path.basename(f)
    try:
        return open(local_name)
    except FileNotFoundError:
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the server's error page under the expected name; --location follows
        # redirects so the real file is stored rather than the redirect body.
        subprocess.run(["curl", "--fail", "--location", "-O", f], check=True)
        return open(local_name)
42+
43+
def main():
    """Print the generated Rust source of the `is_printable` lookup to stdout.

    Reads the code point -> general category table from Categories.txt
    (via `get_file`), computes the ranges of code points that must be
    escaped, and sorts them into three groups:

    - code points below 0x10000 go to SINGLETONS0 / NORMAL0,
    - code points in 0x10000..0x20000 go to SINGLETONS1 / NORMAL1, stored
      with the 0x10000 bit cleared,
    - everything from 0x20000 upwards is emitted as explicit range
      comparisons inside `is_printable`.

    Runs of one or two code points are stored as individual entries in the
    singleton tables; longer runs are stored as (start, length) pairs in
    the normal tables.
    """
    # `with` closes the table file as soon as it has been parsed.
    with get_file("http://www.unicode.org/notes/tn36/Categories.txt") as file:
        fields = (line.split() for line in file)
        dictionary = {int(f[0], 16): f[1] for f in fields}

    CUTOFF=0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    for a, b in to_ranges(get_escaped(dictionary)):
        if a >= 2 * CUTOFF:
            # `>=`, not `>`: a short run starting exactly at 2 * CUTOFF must
            # not fall through to the singleton tables, whose entries are
            # emitted as u16 values.
            extra.append((a, b - a))
        elif a == b - 1:
            if a & CUTOFF:
                singletons1.append(a & ~CUTOFF)
            else:
                singletons0.append(a)
        elif a == b - 2:
            if a & CUTOFF:
                singletons1.append(a & ~CUTOFF)
                singletons1.append((a + 1) & ~CUTOFF)
            else:
                singletons0.append(a)
                singletons0.append(a + 1)
        elif a & CUTOFF:
            normal1.append((a & ~CUTOFF, b - a))
        else:
            normal0.append((a, b - a))

    print("""\
// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/char_private.py",
// do not edit directly!

use slice::SliceExt;

fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool {
for &s in singletons {
if x == s {
return false;
} else if x < s {
break;
}
}
for w in normal.chunks(2) {
let start = w[0];
let len = w[1];
let difference = (x as i32) - (start as i32);
if 0 <= difference {
if difference < len as i32 {
return false;
}
} else {
break;
}
}
true
}

pub fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1, NORMAL1)
} else {\
""")
    # One explicit range test per escaped run at or above 2 * CUTOFF.
    for a, b in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
        print(" return false;")
        print(" }")
    print("""\
true
}
}\
""")
    print()
    print("const SINGLETONS0: &'static [u16] = &[")
    for s in singletons0:
        print(" 0x{:x},".format(s))
    print("];")
    print("const SINGLETONS1: &'static [u16] = &[")
    for s in singletons1:
        print(" 0x{:x},".format(s))
    print("];")
    print("const NORMAL0: &'static [u16] = &[")
    for a, b in normal0:
        print(" 0x{:x}, 0x{:x},".format(a, b))
    print("];")
    print("const NORMAL1: &'static [u16] = &[")
    for a, b in normal1:
        print(" 0x{:x}, 0x{:x},".format(a, b))
    print("];")
152+
153+
if __name__ == '__main__':
154+
main()

src/libcollections/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#![feature(allow_internal_unstable)]
3434
#![feature(box_patterns)]
3535
#![feature(box_syntax)]
36+
#![cfg_attr(not(test), feature(char_escape_debug))]
3637
#![feature(core_intrinsics)]
3738
#![feature(dropck_parametricity)]
3839
#![feature(fmt_internals)]

src/libcollections/str.rs

+8
Original file line numberDiff line numberDiff line change
@@ -1697,6 +1697,14 @@ impl str {
16971697
return s;
16981698
}
16991699

1700+
/// Escapes each char in `s` with `char::escape_debug`.
1701+
#[unstable(feature = "str_escape",
1702+
reason = "return type may change to be an iterator",
1703+
issue = "27791")]
1704+
pub fn escape_debug(&self) -> String {
1705+
self.chars().flat_map(|c| c.escape_debug()).collect()
1706+
}
1707+
17001708
/// Escapes each char in `s` with `char::escape_default`.
17011709
#[unstable(feature = "str_escape",
17021710
reason = "return type may change to be an iterator",

src/libcollectionstest/str.rs

+18-2
Original file line numberDiff line numberDiff line change
@@ -703,16 +703,32 @@ fn test_escape_unicode() {
703703
assert_eq!("\u{1d4ea}\r".escape_unicode(), "\\u{1d4ea}\\u{d}");
704704
}
705705

706+
#[test]
fn test_escape_debug() {
    // (input, expected result of `escape_debug`) pairs.
    let cases = [
        ("abc", "abc"),
        ("a c", "a c"),
        ("éèê", "éèê"),
        ("\r\n\t", "\\r\\n\\t"),
        ("'\"\\", "\\'\\\"\\\\"),
        ("\u{7f}\u{ff}", "\\u{7f}\u{ff}"),
        ("\u{100}\u{ffff}", "\u{100}\\u{ffff}"),
        ("\u{10000}\u{10ffff}", "\u{10000}\\u{10ffff}"),
        ("ab\u{200b}", "ab\\u{200b}"),
        ("\u{10d4ea}\r", "\\u{10d4ea}\\r"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(input.escape_debug(), expected);
    }
}
719+
706720
#[test]
707721
fn test_escape_default() {
708722
assert_eq!("abc".escape_default(), "abc");
709723
assert_eq!("a c".escape_default(), "a c");
724+
assert_eq!("éèê".escape_default(), "\\u{e9}\\u{e8}\\u{ea}");
710725
assert_eq!("\r\n\t".escape_default(), "\\r\\n\\t");
711726
assert_eq!("'\"\\".escape_default(), "\\'\\\"\\\\");
727+
assert_eq!("\u{7f}\u{ff}".escape_default(), "\\u{7f}\\u{ff}");
712728
assert_eq!("\u{100}\u{ffff}".escape_default(), "\\u{100}\\u{ffff}");
713729
assert_eq!("\u{10000}\u{10ffff}".escape_default(), "\\u{10000}\\u{10ffff}");
714-
assert_eq!("ab\u{fb00}".escape_default(), "ab\\u{fb00}");
715-
assert_eq!("\u{1d4ea}\r".escape_default(), "\\u{1d4ea}\\r");
730+
assert_eq!("ab\u{200b}".escape_default(), "ab\\u{200b}");
731+
assert_eq!("\u{10d4ea}\r".escape_default(), "\\u{10d4ea}\\r");
716732
}
717733

718734
#[test]

src/libcore/char.rs

+37
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use prelude::v1::*;
1919

20+
use char_private::is_printable;
2021
use mem::transmute;
2122

2223
// UTF-8 ranges and tags for encoding characters
@@ -263,6 +264,8 @@ pub trait CharExt {
263264
fn escape_unicode(self) -> EscapeUnicode;
264265
#[stable(feature = "core", since = "1.6.0")]
265266
fn escape_default(self) -> EscapeDefault;
267+
#[unstable(feature = "char_escape_debug", issue = "35068")]
268+
fn escape_debug(self) -> EscapeDebug;
266269
#[stable(feature = "core", since = "1.6.0")]
267270
fn len_utf8(self) -> usize;
268271
#[stable(feature = "core", since = "1.6.0")]
@@ -326,6 +329,19 @@ impl CharExt for char {
326329
EscapeDefault { state: init_state }
327330
}
328331

332+
#[inline]
333+
fn escape_debug(self) -> EscapeDebug {
334+
let init_state = match self {
335+
'\t' => EscapeDefaultState::Backslash('t'),
336+
'\r' => EscapeDefaultState::Backslash('r'),
337+
'\n' => EscapeDefaultState::Backslash('n'),
338+
'\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
339+
c if is_printable(c) => EscapeDefaultState::Char(c),
340+
c => EscapeDefaultState::Unicode(c.escape_unicode()),
341+
};
342+
EscapeDebug(EscapeDefault { state: init_state })
343+
}
344+
329345
#[inline]
330346
fn len_utf8(self) -> usize {
331347
let code = self as u32;
@@ -600,6 +616,27 @@ impl ExactSizeIterator for EscapeDefault {
600616
}
601617
}
602618

619+
/// An iterator that yields the literal escape code of a `char`.
620+
///
621+
/// This `struct` is created by the [`escape_debug()`] method on [`char`]. See its
622+
/// documentation for more.
623+
///
624+
/// [`escape_debug()`]: ../../std/primitive.char.html#method.escape_debug
625+
/// [`char`]: ../../std/primitive.char.html
626+
#[unstable(feature = "char_escape_debug", issue = "35068")]
627+
#[derive(Clone, Debug)]
628+
pub struct EscapeDebug(EscapeDefault);
629+
630+
#[unstable(feature = "char_escape_debug", issue = "35068")]
631+
impl Iterator for EscapeDebug {
632+
type Item = char;
633+
fn next(&mut self) -> Option<char> { self.0.next() }
634+
fn size_hint(&self) -> (usize, Option<usize>) { self.0.size_hint() }
635+
}
636+
637+
#[unstable(feature = "char_escape_debug", issue = "35068")]
638+
impl ExactSizeIterator for EscapeDebug { }
639+
603640
/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
604641
/// value.
605642
///

0 commit comments

Comments
 (0)