Remove TokenKind::InvalidPrefix.

nnethercote · nnethercote · commit e9a0c3c98c56 · 2024-11-19T18:06:22.000+11:00
It was added in rust-lang#123752 to handle some cases involving emoji, but it isn't necessary because it's always treated the same as `TokenKind::InvalidIdent`. This commit removes it, which makes things a little simpler.
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -99,10 +99,6 @@ pub enum TokenKind {
     /// several tokens: `'r` and `#` and `foo`.
     RawLifetime,
 
-    /// Similar to the above, but *always* an error on every edition. This is used
-    /// for emoji identifier recovery, as those are not meant to be ever accepted.
-    InvalidPrefix,
-
     /// Guarded string literal prefix: `#"` or `##`.
     ///
     /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
@@ -466,7 +462,7 @@ impl Cursor<'_> {
                 Literal { kind, suffix_start }
             }
             // Identifier starting with an emoji. Only lexed for graceful error recovery.
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
             _ => Unknown,
         };
         let res = Token::new(token_kind, self.pos_within_token());
@@ -550,23 +546,22 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
-            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
+            c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
             _ => Ident,
         }
     }
 
-    fn invalid_ident_or_prefix(&mut self) -> TokenKind {
+    fn invalid_ident(&mut self) -> TokenKind {
         // Start is already eaten, eat the rest of identifier.
         self.eat_while(|c| {
             const ZERO_WIDTH_JOINER: char = '\u{200d}';
             is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
         });
-        // Known prefixes must have been handled earlier. So if
-        // we see a prefix here, it is definitely an unknown prefix.
-        match self.first() {
-            '#' | '"' | '\'' => InvalidPrefix,
-            _ => InvalidIdent,
-        }
+        // An invalid identifier followed by '#' or '"' or '\'' could be
+        // interpreted as an invalid literal prefix. We don't bother doing that
+        // because the treatment of invalid identifiers and invalid prefixes
+        // would be the same.
+        InvalidIdent
     }
 
     fn c_or_byte_string(
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     let ident = Symbol::intern(lifetime_name);
                     token::Lifetime(ident, IdentIsRaw::No)
                 }
-                rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix
+                rustc_lexer::TokenKind::InvalidIdent
                     // Do not recover an identifier with emoji if the codepoint is a confusable
                     // with a recoverable substitution token, like `➖`.
                     if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
@@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                 rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
                 rustc_lexer::TokenKind::Unknown
-                | rustc_lexer::TokenKind::InvalidIdent
-                | rustc_lexer::TokenKind::InvalidPrefix => {
+                | rustc_lexer::TokenKind::InvalidIdent => {
                     // Don't emit diagnostics for sequences of the same invalid token
                     if swallow_next_invalid > 0 {
                         swallow_next_invalid -= 1;
diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
@@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
                 },
                 Some(c) => c,
             },
-            TokenKind::RawIdent
-            | TokenKind::UnknownPrefix
-            | TokenKind::InvalidPrefix
-            | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
+            TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
+                Class::Ident(self.new_span(before, text))
+            }
             TokenKind::Lifetime { .. }
             | TokenKind::RawLifetime
             | TokenKind::UnknownPrefixLifetime => Class::Lifetime,
diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
                 rustc_lexer::TokenKind::Ident => {
                     SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
                 }
-                rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
+                rustc_lexer::TokenKind::InvalidIdent => {
                     err = "Ident contains invalid characters";
                     IDENT
                 }

Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@ impl<'a> Converter<'a> {`
`183`	`183`	`rustc_lexer::TokenKind::Ident => {`
`184`	`184`	`SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)`
`185`	`185`	`}`
`186`		`- rustc_lexer::TokenKind::InvalidPrefix \| rustc_lexer::TokenKind::InvalidIdent => {`
	`186`	`+ rustc_lexer::TokenKind::InvalidIdent => {`
`187`	`187`	`err = "Ident contains invalid characters";`
`188`	`188`	`IDENT`
`189`	`189`	`}`