Skip to content

Commit a1c0721

Browse files
committed
Rework CStrUnit.
- Rename it as `MixedUnit`, because it will soon be used in more than just C string literals.
- Change the `Byte` variant to `HighByte` and use it only for `\x80`..`\xff` cases. This fixes the old inexactness where ASCII chars could be encoded with either `Byte` or `Char`.
- Add useful comments.
- Remove `is_ascii`, in favour of `u8::is_ascii`.
1 parent ef1e222 commit a1c0721

File tree

3 files changed

+52
-42
lines changed

3 files changed

+52
-42
lines changed

Diff for: compiler/rustc_ast/src/util/literal.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
6+
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
77
Mode,
88
};
99
use rustc_span::symbol::{kw, sym, Symbol};
@@ -127,10 +127,10 @@ impl LitKind {
127127
let s = symbol.as_str();
128128
let mut buf = Vec::with_capacity(s.len());
129129
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
130-
Ok(CStrUnit::Byte(b)) => buf.push(b),
131-
Ok(CStrUnit::Char(c)) => {
130+
Ok(MixedUnit::Char(c)) => {
132131
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
133132
}
133+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
134134
Err(err) => {
135135
assert!(!err.is_fatal(), "failed to unescape C string literal")
136136
}

Diff for: compiler/rustc_lexer/src/unescape.rs

+45-34
Original file line numberDiff line numberDiff line change
@@ -101,32 +101,45 @@ where
101101
}
102102
}
103103

104-
/// A unit within CStr. Must not be a nul character.
105-
pub enum CStrUnit {
106-
Byte(u8),
104+
/// Used for mixed UTF-8 string literals, i.e. those that allow both Unicode
105+
/// chars and high bytes.
106+
pub enum MixedUnit {
107+
/// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
108+
/// and Unicode chars (written directly or via `\u` escapes).
109+
///
110+
/// For example, if '¥' appears in a string it is represented here as
111+
/// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
112+
/// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`.
107113
Char(char),
114+
115+
/// Used for high bytes (`\x80`..`\xff`).
116+
///
117+
/// For example, if `\xa5` appears in a string it is represented here as
118+
/// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
119+
/// byte string as the single byte `0xa5`.
120+
HighByte(u8),
108121
}
109122

110-
impl From<u8> for CStrUnit {
111-
fn from(value: u8) -> Self {
112-
CStrUnit::Byte(value)
123+
impl From<char> for MixedUnit {
124+
fn from(c: char) -> Self {
125+
MixedUnit::Char(c)
113126
}
114127
}
115128

116-
impl From<char> for CStrUnit {
117-
fn from(value: char) -> Self {
118-
CStrUnit::Char(value)
129+
impl From<u8> for MixedUnit {
130+
fn from(n: u8) -> Self {
131+
if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) }
119132
}
120133
}
121134

122135
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
123136
where
124-
F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
137+
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
125138
{
126139
match mode {
127140
CStr => {
128141
unescape_non_raw_common(src, mode, &mut |r, mut result| {
129-
if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result {
142+
if let Ok(MixedUnit::Char('\0')) = result {
130143
result = Err(EscapeError::NulInCStr);
131144
}
132145
callback(r, result)
@@ -137,7 +150,8 @@ where
137150
if let Ok('\0') = result {
138151
result = Err(EscapeError::NulInCStr);
139152
}
140-
callback(r, result.map(CStrUnit::Char))
153+
// High bytes aren't possible in raw strings.
154+
callback(r, result.map(MixedUnit::Char))
141155
});
142156
}
143157
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
@@ -217,20 +231,19 @@ impl Mode {
217231
}
218232
}
219233

220-
fn scan_escape<T: From<u8> + From<char>>(
234+
fn scan_escape<T: From<char> + From<u8>>(
221235
chars: &mut Chars<'_>,
222236
mode: Mode,
223237
) -> Result<T, EscapeError> {
224238
// Previous character was '\\', unescape what follows.
225-
let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? {
226-
'"' => b'"',
227-
'n' => b'\n',
228-
'r' => b'\r',
229-
't' => b'\t',
230-
'\\' => b'\\',
231-
'\'' => b'\'',
232-
'0' => b'\0',
233-
239+
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
240+
'"' => '"',
241+
'n' => '\n',
242+
'r' => '\r',
243+
't' => '\t',
244+
'\\' => '\\',
245+
'\'' => '\'',
246+
'0' => '\0',
234247
'x' => {
235248
// Parse hexadecimal character code.
236249

@@ -240,15 +253,17 @@ fn scan_escape<T: From<u8> + From<char>>(
240253
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
241254
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
242255

243-
let value = hi * 16 + lo;
244-
245-
if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
246-
return Err(EscapeError::OutOfRangeHexEscape);
247-
}
256+
let value = (hi * 16 + lo) as u8;
248257

249-
value as u8
258+
return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() {
259+
Err(EscapeError::OutOfRangeHexEscape)
260+
} else {
261+
// This may be a high byte, but that will only happen if `T` is
262+
// `MixedUnit`, because of the `ascii_escapes_should_be_ascii`
263+
// check above.
264+
Ok(T::from(value as u8))
265+
};
250266
}
251-
252267
'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from),
253268
_ => return Err(EscapeError::InvalidEscape),
254269
};
@@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
336351

337352
/// Takes the contents of a string literal (without quotes) and produces a
338353
/// sequence of escaped characters or errors.
339-
fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
354+
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
340355
where
341356
F: FnMut(Range<usize>, Result<T, EscapeError>),
342357
{
@@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 {
430445
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
431446
res as u8
432447
}
433-
434-
fn is_ascii(x: u32) -> bool {
435-
x <= 0x7F
436-
}

Diff for: src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs

+4-5
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::{
66
};
77

88
use rustc_lexer::unescape::{
9-
unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode,
9+
unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode,
1010
};
1111

1212
use crate::{
@@ -336,10 +336,9 @@ impl ast::CString {
336336
let mut buf = Vec::new();
337337
let mut prev_end = 0;
338338
let mut has_error = false;
339-
let mut char_buf = [0u8; 4];
340-
let mut extend_unit = |buf: &mut Vec<u8>, unit: CStrUnit| match unit {
341-
CStrUnit::Byte(b) => buf.push(b),
342-
CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()),
339+
let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
340+
MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
341+
MixedUnit::HighByte(b) => buf.push(b),
343342
};
344343
unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match (
345344
unescaped,

0 commit comments

Comments
 (0)