Revert "Auto merge of rust-lang#62948 - matklad:failable-file-loading, r=petrochenkov"

matklad · matklad · commit 5a21f766f02e · 2019-10-10T18:18:04.000+03:00
This reverts commit ef1ecbe, reversing changes made to fc8765d. That changed unfortunately broke rustfix on windows: rust-lang/rustfix#176 Specifically, what ef1ecbe did was to enforce normalization of \r\n to \n at file loading time, similarly to how we deal with Byte Order Mark. Normalization changes raw offsets in files, which are exposed via `--error-format=json`, and used by rusfix. The proper solution here (which also handles the latent case with BOM) is rust-lang#65074 However, since it's somewhat involved, and we are time sensitive, we prefer to revert the original change on beta.
diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
@@ -268,6 +268,7 @@ impl Cursor<'_> {
         loop {
             match self.nth_char(0) {
                 '\n' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
                 EOF_CHAR if self.is_eof() => break,
                 _ => {
                     self.bump();
@@ -440,6 +441,7 @@ impl Cursor<'_> {
             match self.nth_char(0) {
                 '/' if !first => break,
                 '\n' if self.nth_char(1) != '\'' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
                 EOF_CHAR if self.is_eof() => break,
                 '\'' => {
                     self.bump();
diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs
@@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
     if first_char != '\\' {
         return match first_char {
             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
-            '\r' => Err(EscapeError::BareCarriageReturn),
+            '\r' => Err(if chars.clone().next() == Some('\n') {
+                EscapeError::EscapeOnlyChar
+            } else {
+                EscapeError::BareCarriageReturn
+            }),
             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
             _ => {
@@ -240,15 +244,27 @@ where
 
         let unescaped_char = match first_char {
             '\\' => {
-                let second_char = chars.clone().next();
-                match second_char {
-                    Some('\n') => {
+                let (second_char, third_char) = {
+                    let mut chars = chars.clone();
+                    (chars.next(), chars.next())
+                };
+                match (second_char, third_char) {
+                    (Some('\n'), _) | (Some('\r'), Some('\n')) => {
                         skip_ascii_whitespace(&mut chars);
                         continue;
                     }
                     _ => scan_escape(first_char, &mut chars, mode),
                 }
             }
+            '\r' => {
+                let second_char = chars.clone().next();
+                if second_char == Some('\n') {
+                    chars.next();
+                    Ok('\n')
+                } else {
+                    scan_escape(first_char, &mut chars, mode)
+                }
+            }
             '\n' => Ok('\n'),
             '\t' => Ok('\t'),
             _ => scan_escape(first_char, &mut chars, mode),
@@ -282,11 +298,15 @@ where
     while let Some(curr) = chars.next() {
         let start = initial_len - chars.as_str().len() - curr.len_utf8();
 
-        let result = match curr {
-            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            c if mode.is_bytes() && !c.is_ascii() =>
+        let result = match (curr, chars.clone().next()) {
+            ('\r', Some('\n')) => {
+                chars.next();
+                Ok('\n')
+            },
+            ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
+            (c, _) if mode.is_bytes() && !c.is_ascii() =>
                 Err(EscapeError::NonAsciiCharInByteString),
-            c => Ok(c),
+            (c, _) => Ok(c),
         };
         let end = initial_len - chars.as_str().len();
 
diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs
@@ -11,6 +11,7 @@ fn test_unescape_char_bad() {
     check(r"\", EscapeError::LoneSlash);
 
     check("\n", EscapeError::EscapeOnlyChar);
+    check("\r\n", EscapeError::EscapeOnlyChar);
     check("\t", EscapeError::EscapeOnlyChar);
     check("'", EscapeError::EscapeOnlyChar);
     check("\r", EscapeError::BareCarriageReturn);
@@ -30,7 +31,6 @@ fn test_unescape_char_bad() {
     check(r"\v", EscapeError::InvalidEscape);
     check(r"\💩", EscapeError::InvalidEscape);
     check(r"\●", EscapeError::InvalidEscape);
-    check("\\\r", EscapeError::InvalidEscape);
 
     check(r"\x", EscapeError::TooShortHexEscape);
     check(r"\x0", EscapeError::TooShortHexEscape);
@@ -116,9 +116,10 @@ fn test_unescape_str_good() {
 
     check("foo", "foo");
     check("", "");
-    check(" \t\n", " \t\n");
+    check(" \t\n\r\n", " \t\n\n");
 
     check("hello \\\n     world", "hello world");
+    check("hello \\\r\n     world", "hello world");
     check("thread's", "thread's")
 }
 
@@ -133,6 +134,7 @@ fn test_unescape_byte_bad() {
     check(r"\", EscapeError::LoneSlash);
 
     check("\n", EscapeError::EscapeOnlyChar);
+    check("\r\n", EscapeError::EscapeOnlyChar);
     check("\t", EscapeError::EscapeOnlyChar);
     check("'", EscapeError::EscapeOnlyChar);
     check("\r", EscapeError::BareCarriageReturn);
@@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() {
 
     check("foo", b"foo");
     check("", b"");
-    check(" \t\n", b" \t\n");
+    check(" \t\n\r\n", b" \t\n\n");
 
     check("hello \\\n     world", b"hello world");
+    check("hello \\\r\n     world", b"hello world");
     check("thread's", b"thread's")
 }
 
@@ -250,6 +253,7 @@ fn test_unescape_raw_str() {
         assert_eq!(unescaped, expected);
     }
 
+    check("\r\n", &[(0..2, Ok('\n'))]);
     check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
     check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
 }
@@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() {
         assert_eq!(unescaped, expected);
     }
 
+    check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
     check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
     check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
     check(
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
@@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span};
 use rustc_lexer::Base;
 use rustc_lexer::unescape;
 
+use std::borrow::Cow;
 use std::char;
+use std::iter;
 use std::convert::TryInto;
 use rustc_data_structures::sync::Lrc;
 use log::debug;
@@ -179,7 +181,18 @@ impl<'a> StringReader<'a> {
                 let string = self.str_from(start);
                 // comments with only more "/"s are not doc comments
                 let tok = if is_doc_comment(string) {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
+                    let mut idx = 0;
+                    loop {
+                        idx = match string[idx..].find('\r') {
+                            None => break,
+                            Some(it) => idx + it + 1
+                        };
+                        if string[idx..].chars().next() != Some('\n') {
+                            self.err_span_(start + BytePos(idx as u32 - 1),
+                                            start + BytePos(idx as u32),
+                                            "bare CR not allowed in doc-comment");
+                        }
+                    }
                     token::DocComment(Symbol::intern(string))
                 } else {
                     token::Comment
@@ -204,10 +217,15 @@ impl<'a> StringReader<'a> {
                 }
 
                 let tok = if is_doc_comment {
-                    self.forbid_bare_cr(start,
-                                        string,
-                                        "bare CR not allowed in block doc-comment");
-                    token::DocComment(Symbol::intern(string))
+                    let has_cr = string.contains('\r');
+                    let string = if has_cr {
+                        self.translate_crlf(start,
+                                            string,
+                                            "bare CR not allowed in block doc-comment")
+                    } else {
+                        string.into()
+                    };
+                    token::DocComment(Symbol::intern(&string[..]))
                 } else {
                     token::Comment
                 };
@@ -473,16 +491,49 @@ impl<'a> StringReader<'a> {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
-    fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
-        let mut idx = 0;
-        loop {
-            idx = match s[idx..].find('\r') {
-                None => break,
-                Some(it) => idx + it + 1
-            };
-            self.err_span_(start + BytePos(idx as u32 - 1),
-                           start + BytePos(idx as u32),
-                           errmsg);
+    /// Converts CRLF to LF in the given string, raising an error on bare CR.
+    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
+        let mut chars = s.char_indices().peekable();
+        while let Some((i, ch)) = chars.next() {
+            if ch == '\r' {
+                if let Some((lf_idx, '\n')) = chars.peek() {
+                    return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
+                }
+                let pos = start + BytePos(i as u32);
+                let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
+                self.err_span_(pos, end_pos, errmsg);
+            }
+        }
+        return s.into();
+
+        fn translate_crlf_(rdr: &StringReader<'_>,
+                           start: BytePos,
+                           s: &str,
+                           mut j: usize,
+                           mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
+                           errmsg: &str)
+                           -> String {
+            let mut buf = String::with_capacity(s.len());
+            // Skip first CR
+            buf.push_str(&s[.. j - 1]);
+            while let Some((i, ch)) = chars.next() {
+                if ch == '\r' {
+                    if j < i {
+                        buf.push_str(&s[j..i]);
+                    }
+                    let next = i + ch.len_utf8();
+                    j = next;
+                    if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
+                        let pos = start + BytePos(i as u32);
+                        let end_pos = start + BytePos(next as u32);
+                        rdr.err_span_(pos, end_pos, errmsg);
+                    }
+                }
+            }
+            if j < s.len() {
+                buf.push_str(&s[j..]);
+            }
+            buf
         }
     }
 
diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs
@@ -1064,7 +1064,6 @@ impl SourceFile {
                mut src: String,
                start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
         remove_bom(&mut src);
-        normalize_newlines(&mut src);
 
         let src_hash = {
             let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) {
     }
 }
 
-
-/// Replaces `\r\n` with `\n` in-place in `src`.
-///
-/// Returns error if there's a lone `\r` in the string
-fn normalize_newlines(src: &mut String) {
-    if !src.as_bytes().contains(&b'\r') {
-        return;
-    }
-
-    // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
-    // While we *can* call `as_mut_vec` and do surgery on the live string
-    // directly, let's rather steal the contents of `src`. This makes the code
-    // safe even if a panic occurs.
-
-    let mut buf = std::mem::replace(src, String::new()).into_bytes();
-    let mut gap_len = 0;
-    let mut tail = buf.as_mut_slice();
-    loop {
-        let idx = match find_crlf(&tail[gap_len..]) {
-            None => tail.len(),
-            Some(idx) => idx + gap_len,
-        };
-        tail.copy_within(gap_len..idx, 0);
-        tail = &mut tail[idx - gap_len..];
-        if tail.len() == gap_len {
-            break;
-        }
-        gap_len += 1;
-    }
-
-    // Account for removed `\r`.
-    // After `set_len`, `buf` is guaranteed to contain utf-8 again.
-    let new_len = buf.len() - gap_len;
-    unsafe {
-        buf.set_len(new_len);
-        *src = String::from_utf8_unchecked(buf);
-    }
-
-    fn find_crlf(src: &[u8]) -> Option<usize> {
-        let mut search_idx = 0;
-        while let Some(idx) = find_cr(&src[search_idx..]) {
-            if src[search_idx..].get(idx + 1) != Some(&b'\n') {
-                search_idx += idx + 1;
-                continue;
-            }
-            return Some(search_idx + idx);
-        }
-        None
-    }
-
-    fn find_cr(src: &[u8]) -> Option<usize> {
-        src.iter().position(|&b| b == b'\r')
-    }
-}
-
 // _____________________________________________________________________________
 // Pos, BytePos, CharPos
 //
diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs
@@ -16,23 +16,3 @@ fn test_lookup_line() {
     assert_eq!(lookup_line(lines, BytePos(28)), 2);
     assert_eq!(lookup_line(lines, BytePos(29)), 2);
 }
-
-#[test]
-fn test_normalize_newlines() {
-    fn check(before: &str, after: &str) {
-        let mut actual = before.to_string();
-        normalize_newlines(&mut actual);
-        assert_eq!(actual.as_str(), after);
-    }
-    check("", "");
-    check("\n", "\n");
-    check("\r", "\r");
-    check("\r\r", "\r\r");
-    check("\r\n", "\n");
-    check("hello world", "hello world");
-    check("hello\nworld", "hello\nworld");
-    check("hello\r\nworld", "hello\nworld");
-    check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
-    check("\r\r\n", "\r\n");
-    check("hello\rworld", "hello\rworld");
-}