Skip to content

Commit 8d7f254

Browse files
committed
Validate and transcribe raw strings via unescape module
1 parent 238f136 commit 8d7f254

File tree

5 files changed

+62
-50
lines changed

5 files changed

+62
-50
lines changed

src/libsyntax/parse/lexer/mod.rs

+24-21
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,12 @@ impl<'a> StringReader<'a> {
10861086
Ok(TokenKind::lit(token::Str, symbol, suffix))
10871087
}
10881088
'r' => {
1089-
let (kind, symbol) = self.scan_raw_string();
1089+
let (start, end, hash_count) = self.scan_raw_string();
1090+
let symbol = self.name_from_to(start, end);
1091+
self.validate_raw_str_escape(start, end);
10901092
let suffix = self.scan_optional_raw_name();
10911093

1092-
Ok(TokenKind::lit(kind, symbol, suffix))
1094+
Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
10931095
}
10941096
'-' => {
10951097
if self.nextch_is('>') {
@@ -1243,7 +1245,7 @@ impl<'a> StringReader<'a> {
12431245
id
12441246
}
12451247

1246-
fn scan_raw_string(&mut self) -> (token::LitKind, Symbol) {
1248+
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
12471249
let start_bpos = self.pos;
12481250
self.bump();
12491251
let mut hash_count: u16 = 0;
@@ -1273,7 +1275,6 @@ impl<'a> StringReader<'a> {
12731275
self.bump();
12741276
let content_start_bpos = self.pos;
12751277
let mut content_end_bpos;
1276-
let mut valid = true;
12771278
'outer: loop {
12781279
match self.ch {
12791280
None => {
@@ -1289,29 +1290,14 @@ impl<'a> StringReader<'a> {
12891290
}
12901291
break;
12911292
}
1292-
Some(c) => {
1293-
if c == '\r' && !self.nextch_is('\n') {
1294-
let last_bpos = self.pos;
1295-
self.err_span_(start_bpos,
1296-
last_bpos,
1297-
"bare CR not allowed in raw string, use \\r \
1298-
instead");
1299-
valid = false;
1300-
}
1301-
}
1293+
_ => (),
13021294
}
13031295
self.bump();
13041296
}
13051297

13061298
self.bump();
13071299

1308-
let symbol = if valid {
1309-
self.name_from_to(content_start_bpos, content_end_bpos)
1310-
} else {
1311-
Symbol::intern("??")
1312-
};
1313-
1314-
(token::StrRaw(hash_count), symbol)
1300+
(content_start_bpos, content_end_bpos, hash_count)
13151301
}
13161302

13171303
fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
@@ -1421,6 +1407,23 @@ impl<'a> StringReader<'a> {
14211407
});
14221408
}
14231409

1410+
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
1411+
self.with_str_from_to(content_start, content_end, |lit: &str| {
1412+
unescape::unescape_raw_str(lit, &mut |range, c| {
1413+
if let Err(err) = c {
1414+
emit_unescape_error(
1415+
&self.sess.span_diagnostic,
1416+
lit,
1417+
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
1418+
unescape::Mode::Str,
1419+
range,
1420+
err,
1421+
)
1422+
}
1423+
})
1424+
});
1425+
}
1426+
14241427
fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
14251428
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
14261429
unescape::unescape_byte_str(lit, &mut |range, c| {

src/libsyntax/parse/literal.rs

+12-25
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use crate::ast::{self, Lit, LitKind};
44
use crate::parse::parser::Parser;
55
use crate::parse::PResult;
66
use crate::parse::token::{self, Token, TokenKind};
7-
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
7+
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_raw_str, unescape_byte};
88
use crate::print::pprust;
99
use crate::symbol::{kw, sym, Symbol};
1010
use crate::tokenstream::{TokenStream, TokenTree};
@@ -141,7 +141,17 @@ impl LitKind {
141141
// Ditto.
142142
let s = symbol.as_str();
143143
let symbol = if s.contains('\r') {
144-
Symbol::intern(&raw_str_lit(&s))
144+
let mut buf = String::with_capacity(s.len());
145+
let mut error = Ok(());
146+
unescape_raw_str(&s, &mut |_, unescaped_char| {
147+
match unescaped_char {
148+
Ok(c) => buf.push(c),
149+
Err(_) => error = Err(LitError::LexerError),
150+
}
151+
});
152+
error?;
153+
buf.shrink_to_fit();
154+
Symbol::intern(&buf)
145155
} else {
146156
symbol
147157
};
@@ -350,29 +360,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
350360
}
351361
}
352362

353-
/// Parses a string representing a raw string literal into its final form. The
354-
/// only operation this does is convert embedded CRLF into a single LF.
355-
fn raw_str_lit(lit: &str) -> String {
356-
debug!("raw_str_lit: {:?}", lit);
357-
let mut res = String::with_capacity(lit.len());
358-
359-
let mut chars = lit.chars().peekable();
360-
while let Some(c) = chars.next() {
361-
if c == '\r' {
362-
if *chars.peek().unwrap() != '\n' {
363-
panic!("lexer accepted bare CR");
364-
}
365-
chars.next();
366-
res.push('\n');
367-
} else {
368-
res.push(c);
369-
}
370-
}
371-
372-
res.shrink_to_fit();
373-
res
374-
}
375-
376363
// Checks if `s` looks like i32 or u1234 etc.
377364
fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
378365
s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())

src/libsyntax/parse/unescape.rs

+22
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,28 @@ where
6666
})
6767
}
6868

69+
/// Takes a contents of a string literal (without quotes) and produces a
70+
/// sequence of characters or errors.
71+
/// NOTE: Raw strings do not perform any explicit character escaping, here we
72+
/// only translate CRLF to LF and produce errors on bare CR.
73+
pub(crate) fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
74+
where
75+
F: FnMut(Range<usize>, Result<char, EscapeError>),
76+
{
77+
let mut byte_offset: usize = 0;
78+
79+
let mut chars = literal_text.chars().peekable();
80+
while let Some(curr) = chars.next() {
81+
let result = match (curr, chars.peek()) {
82+
('\r', Some('\n')) => Ok(curr),
83+
('\r', _) => Err(EscapeError::BareCarriageReturn),
84+
_ => Ok(curr),
85+
};
86+
callback(byte_offset..(byte_offset + curr.len_utf8()), result);
87+
byte_offset += curr.len_utf8();
88+
}
89+
}
90+
6991
#[derive(Debug, Clone, Copy)]
7092
pub(crate) enum Mode {
7193
Char,

src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ fn main() {
2121
let _s = "foobar"; //~ ERROR: bare CR not allowed in string
2222

2323
// the following string literal has a bare CR in it
24-
let _s = r"barfoo"; //~ ERROR: bare CR not allowed in raw string
24+
let _s = r"barfoo"; //~ ERROR: bare CR not allowed in string
2525

2626
// the following string literal has a bare CR in it
2727
let _s = "foo\bar"; //~ ERROR: unknown character escape: \r

src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr

+3-3
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ error: bare CR not allowed in string, use \r instead
2828
LL | let _s = "foobar";
2929
| ^
3030

31-
error: bare CR not allowed in raw string, use \r instead
32-
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:14
31+
error: bare CR not allowed in string, use \r instead
32+
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:19
3333
|
3434
LL | let _s = r"barfoo";
35-
| ^^^^^
35+
| ^
3636

3737
error: unknown character escape: \r
3838
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:27:19

0 commit comments

Comments
 (0)