Skip to content

Commit 5a21f76

Browse files
committed
Revert "Auto merge of rust-lang#62948 - matklad:failable-file-loading, r=petrochenkov"
This reverts commit ef1ecbe, reversing changes made to fc8765d. That change unfortunately broke rustfix on Windows: rust-lang/rustfix#176. Specifically, what ef1ecbe did was to enforce normalization of \r\n to \n at file loading time, similarly to how we deal with the Byte Order Mark. Normalization changes raw offsets in files, which are exposed via `--error-format=json` and used by rustfix. The proper solution here (which also handles the latent case with BOM) is rust-lang#65074. However, since it's somewhat involved, and we are time sensitive, we prefer to revert the original change on beta.
1 parent 9689670 commit 5a21f76

File tree

6 files changed

+104
-102
lines changed

6 files changed

+104
-102
lines changed

src/librustc_lexer/src/lib.rs

+2
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ impl Cursor<'_> {
268268
loop {
269269
match self.nth_char(0) {
270270
'\n' => break,
271+
'\r' if self.nth_char(1) == '\n' => break,
271272
EOF_CHAR if self.is_eof() => break,
272273
_ => {
273274
self.bump();
@@ -440,6 +441,7 @@ impl Cursor<'_> {
440441
match self.nth_char(0) {
441442
'/' if !first => break,
442443
'\n' if self.nth_char(1) != '\'' => break,
444+
'\r' if self.nth_char(1) == '\n' => break,
443445
EOF_CHAR if self.is_eof() => break,
444446
'\'' => {
445447
self.bump();

src/librustc_lexer/src/unescape.rs

+28-8
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
128128
if first_char != '\\' {
129129
return match first_char {
130130
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
131-
'\r' => Err(EscapeError::BareCarriageReturn),
131+
'\r' => Err(if chars.clone().next() == Some('\n') {
132+
EscapeError::EscapeOnlyChar
133+
} else {
134+
EscapeError::BareCarriageReturn
135+
}),
132136
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
133137
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
134138
_ => {
@@ -240,15 +244,27 @@ where
240244

241245
let unescaped_char = match first_char {
242246
'\\' => {
243-
let second_char = chars.clone().next();
244-
match second_char {
245-
Some('\n') => {
247+
let (second_char, third_char) = {
248+
let mut chars = chars.clone();
249+
(chars.next(), chars.next())
250+
};
251+
match (second_char, third_char) {
252+
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
246253
skip_ascii_whitespace(&mut chars);
247254
continue;
248255
}
249256
_ => scan_escape(first_char, &mut chars, mode),
250257
}
251258
}
259+
'\r' => {
260+
let second_char = chars.clone().next();
261+
if second_char == Some('\n') {
262+
chars.next();
263+
Ok('\n')
264+
} else {
265+
scan_escape(first_char, &mut chars, mode)
266+
}
267+
}
252268
'\n' => Ok('\n'),
253269
'\t' => Ok('\t'),
254270
_ => scan_escape(first_char, &mut chars, mode),
@@ -282,11 +298,15 @@ where
282298
while let Some(curr) = chars.next() {
283299
let start = initial_len - chars.as_str().len() - curr.len_utf8();
284300

285-
let result = match curr {
286-
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
287-
c if mode.is_bytes() && !c.is_ascii() =>
301+
let result = match (curr, chars.clone().next()) {
302+
('\r', Some('\n')) => {
303+
chars.next();
304+
Ok('\n')
305+
},
306+
('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
307+
(c, _) if mode.is_bytes() && !c.is_ascii() =>
288308
Err(EscapeError::NonAsciiCharInByteString),
289-
c => Ok(c),
309+
(c, _) => Ok(c),
290310
};
291311
let end = initial_len - chars.as_str().len();
292312

src/librustc_lexer/src/unescape/tests.rs

+8-3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ fn test_unescape_char_bad() {
1111
check(r"\", EscapeError::LoneSlash);
1212

1313
check("\n", EscapeError::EscapeOnlyChar);
14+
check("\r\n", EscapeError::EscapeOnlyChar);
1415
check("\t", EscapeError::EscapeOnlyChar);
1516
check("'", EscapeError::EscapeOnlyChar);
1617
check("\r", EscapeError::BareCarriageReturn);
@@ -30,7 +31,6 @@ fn test_unescape_char_bad() {
3031
check(r"\v", EscapeError::InvalidEscape);
3132
check(r"\💩", EscapeError::InvalidEscape);
3233
check(r"\●", EscapeError::InvalidEscape);
33-
check("\\\r", EscapeError::InvalidEscape);
3434

3535
check(r"\x", EscapeError::TooShortHexEscape);
3636
check(r"\x0", EscapeError::TooShortHexEscape);
@@ -116,9 +116,10 @@ fn test_unescape_str_good() {
116116

117117
check("foo", "foo");
118118
check("", "");
119-
check(" \t\n", " \t\n");
119+
check(" \t\n\r\n", " \t\n\n");
120120

121121
check("hello \\\n world", "hello world");
122+
check("hello \\\r\n world", "hello world");
122123
check("thread's", "thread's")
123124
}
124125

@@ -133,6 +134,7 @@ fn test_unescape_byte_bad() {
133134
check(r"\", EscapeError::LoneSlash);
134135

135136
check("\n", EscapeError::EscapeOnlyChar);
137+
check("\r\n", EscapeError::EscapeOnlyChar);
136138
check("\t", EscapeError::EscapeOnlyChar);
137139
check("'", EscapeError::EscapeOnlyChar);
138140
check("\r", EscapeError::BareCarriageReturn);
@@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() {
236238

237239
check("foo", b"foo");
238240
check("", b"");
239-
check(" \t\n", b" \t\n");
241+
check(" \t\n\r\n", b" \t\n\n");
240242

241243
check("hello \\\n world", b"hello world");
244+
check("hello \\\r\n world", b"hello world");
242245
check("thread's", b"thread's")
243246
}
244247

@@ -250,6 +253,7 @@ fn test_unescape_raw_str() {
250253
assert_eq!(unescaped, expected);
251254
}
252255

256+
check("\r\n", &[(0..2, Ok('\n'))]);
253257
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
254258
check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
255259
}
@@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() {
262266
assert_eq!(unescaped, expected);
263267
}
264268

269+
check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
265270
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
266271
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
267272
check(

src/libsyntax/parse/lexer/mod.rs

+66-15
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span};
88
use rustc_lexer::Base;
99
use rustc_lexer::unescape;
1010

11+
use std::borrow::Cow;
1112
use std::char;
13+
use std::iter;
1214
use std::convert::TryInto;
1315
use rustc_data_structures::sync::Lrc;
1416
use log::debug;
@@ -179,7 +181,18 @@ impl<'a> StringReader<'a> {
179181
let string = self.str_from(start);
180182
// comments with only more "/"s are not doc comments
181183
let tok = if is_doc_comment(string) {
182-
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
184+
let mut idx = 0;
185+
loop {
186+
idx = match string[idx..].find('\r') {
187+
None => break,
188+
Some(it) => idx + it + 1
189+
};
190+
if string[idx..].chars().next() != Some('\n') {
191+
self.err_span_(start + BytePos(idx as u32 - 1),
192+
start + BytePos(idx as u32),
193+
"bare CR not allowed in doc-comment");
194+
}
195+
}
183196
token::DocComment(Symbol::intern(string))
184197
} else {
185198
token::Comment
@@ -204,10 +217,15 @@ impl<'a> StringReader<'a> {
204217
}
205218

206219
let tok = if is_doc_comment {
207-
self.forbid_bare_cr(start,
208-
string,
209-
"bare CR not allowed in block doc-comment");
210-
token::DocComment(Symbol::intern(string))
220+
let has_cr = string.contains('\r');
221+
let string = if has_cr {
222+
self.translate_crlf(start,
223+
string,
224+
"bare CR not allowed in block doc-comment")
225+
} else {
226+
string.into()
227+
};
228+
token::DocComment(Symbol::intern(&string[..]))
211229
} else {
212230
token::Comment
213231
};
@@ -473,16 +491,49 @@ impl<'a> StringReader<'a> {
473491
&self.src[self.src_index(start)..self.src_index(end)]
474492
}
475493

476-
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
477-
let mut idx = 0;
478-
loop {
479-
idx = match s[idx..].find('\r') {
480-
None => break,
481-
Some(it) => idx + it + 1
482-
};
483-
self.err_span_(start + BytePos(idx as u32 - 1),
484-
start + BytePos(idx as u32),
485-
errmsg);
494+
/// Converts CRLF to LF in the given string, raising an error on bare CR.
495+
fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
496+
let mut chars = s.char_indices().peekable();
497+
while let Some((i, ch)) = chars.next() {
498+
if ch == '\r' {
499+
if let Some((lf_idx, '\n')) = chars.peek() {
500+
return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
501+
}
502+
let pos = start + BytePos(i as u32);
503+
let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
504+
self.err_span_(pos, end_pos, errmsg);
505+
}
506+
}
507+
return s.into();
508+
509+
fn translate_crlf_(rdr: &StringReader<'_>,
510+
start: BytePos,
511+
s: &str,
512+
mut j: usize,
513+
mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
514+
errmsg: &str)
515+
-> String {
516+
let mut buf = String::with_capacity(s.len());
517+
// Skip first CR
518+
buf.push_str(&s[.. j - 1]);
519+
while let Some((i, ch)) = chars.next() {
520+
if ch == '\r' {
521+
if j < i {
522+
buf.push_str(&s[j..i]);
523+
}
524+
let next = i + ch.len_utf8();
525+
j = next;
526+
if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
527+
let pos = start + BytePos(i as u32);
528+
let end_pos = start + BytePos(next as u32);
529+
rdr.err_span_(pos, end_pos, errmsg);
530+
}
531+
}
532+
}
533+
if j < s.len() {
534+
buf.push_str(&s[j..]);
535+
}
536+
buf
486537
}
487538
}
488539

src/libsyntax_pos/lib.rs

-56
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,6 @@ impl SourceFile {
10641064
mut src: String,
10651065
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
10661066
remove_bom(&mut src);
1067-
normalize_newlines(&mut src);
10681067

10691068
let src_hash = {
10701069
let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) {
12321231
}
12331232
}
12341233

1235-
1236-
/// Replaces `\r\n` with `\n` in-place in `src`.
1237-
///
1238-
/// Returns error if there's a lone `\r` in the string
1239-
fn normalize_newlines(src: &mut String) {
1240-
if !src.as_bytes().contains(&b'\r') {
1241-
return;
1242-
}
1243-
1244-
// We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
1245-
// While we *can* call `as_mut_vec` and do surgery on the live string
1246-
// directly, let's rather steal the contents of `src`. This makes the code
1247-
// safe even if a panic occurs.
1248-
1249-
let mut buf = std::mem::replace(src, String::new()).into_bytes();
1250-
let mut gap_len = 0;
1251-
let mut tail = buf.as_mut_slice();
1252-
loop {
1253-
let idx = match find_crlf(&tail[gap_len..]) {
1254-
None => tail.len(),
1255-
Some(idx) => idx + gap_len,
1256-
};
1257-
tail.copy_within(gap_len..idx, 0);
1258-
tail = &mut tail[idx - gap_len..];
1259-
if tail.len() == gap_len {
1260-
break;
1261-
}
1262-
gap_len += 1;
1263-
}
1264-
1265-
// Account for removed `\r`.
1266-
// After `set_len`, `buf` is guaranteed to contain utf-8 again.
1267-
let new_len = buf.len() - gap_len;
1268-
unsafe {
1269-
buf.set_len(new_len);
1270-
*src = String::from_utf8_unchecked(buf);
1271-
}
1272-
1273-
fn find_crlf(src: &[u8]) -> Option<usize> {
1274-
let mut search_idx = 0;
1275-
while let Some(idx) = find_cr(&src[search_idx..]) {
1276-
if src[search_idx..].get(idx + 1) != Some(&b'\n') {
1277-
search_idx += idx + 1;
1278-
continue;
1279-
}
1280-
return Some(search_idx + idx);
1281-
}
1282-
None
1283-
}
1284-
1285-
fn find_cr(src: &[u8]) -> Option<usize> {
1286-
src.iter().position(|&b| b == b'\r')
1287-
}
1288-
}
1289-
12901234
// _____________________________________________________________________________
12911235
// Pos, BytePos, CharPos
12921236
//

src/libsyntax_pos/tests.rs

-20
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,3 @@ fn test_lookup_line() {
1616
assert_eq!(lookup_line(lines, BytePos(28)), 2);
1717
assert_eq!(lookup_line(lines, BytePos(29)), 2);
1818
}
19-
20-
#[test]
21-
fn test_normalize_newlines() {
22-
fn check(before: &str, after: &str) {
23-
let mut actual = before.to_string();
24-
normalize_newlines(&mut actual);
25-
assert_eq!(actual.as_str(), after);
26-
}
27-
check("", "");
28-
check("\n", "\n");
29-
check("\r", "\r");
30-
check("\r\r", "\r\r");
31-
check("\r\n", "\n");
32-
check("hello world", "hello world");
33-
check("hello\nworld", "hello\nworld");
34-
check("hello\r\nworld", "hello\nworld");
35-
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
36-
check("\r\r\n", "\r\n");
37-
check("hello\rworld", "hello\rworld");
38-
}

0 commit comments

Comments
 (0)