Entity decoding supports prefix matches

jhy · jhy · commit 5ee376bc5bbe · 2024-11-22T13:03:49.000+11:00
When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). Finds the longest prefix. Validated that this matches browser behavior, and that extended entities *don't* prefix-match (like `&clubsuite;`). Fixes #2207
diff --git a/CHANGES.md b/CHANGES.md
@@ -33,6 +33,8 @@
   `Connection.Response#cookies()` will provide the last one set. Generally it is better to use
   the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that
   applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
+* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
+  attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
 
 ## 1.18.1 (2024-Jul-10)
 
diff --git a/pom.xml b/pom.xml
@@ -98,6 +98,7 @@
                 <ignore>java.io.UncheckedIOException</ignore>
                 <ignore>java.util.Comparator</ignore> <!-- Comparator.comparingInt() -->
                 <ignore>java.util.List</ignore> <!-- List#stream() -->
+                <ignore>java.util.ArrayList</ignore> <!-- List / ArrayList #sort() -->
                 <ignore>java.util.LinkedHashMap</ignore> <!-- LinkedHashMap#computeIfAbsent() -->
                 <ignore>java.util.Map</ignore> <!-- Map#computeIfAbsent() -->
                 <ignore>java.util.Objects</ignore>
diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java
@@ -11,7 +11,9 @@
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
 
 import static org.jsoup.nodes.Document.OutputSettings.*;
@@ -36,6 +38,9 @@ public class Entities {
     private static final char[] codeDelims = {',', ';'};
     private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
 
+    private static final int BaseCount = 106;
+    private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching
+
     public enum EscapeMode {
         /**
          * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
@@ -50,6 +55,12 @@ public enum EscapeMode {
          */
         extended(EntitiesData.fullPoints, 2125);
 
+        static {
+            // order the base names longest first, so a linear scan for a prefix hits the longest match first
+            Collections.addAll(baseSorted, base.nameKeys);
+            baseSorted.sort((a, b) -> Integer.compare(b.length(), a.length())); // descending length; avoids int subtraction
+        }
+
         // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
         private String[] nameKeys;
         private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
@@ -134,6 +145,19 @@ public static int codepointsForName(final String name, final int[] codepoints) {
         return 0;
     }
 
+    /**
+     Finds the longest base named entity that is a prefix of the input (case-sensitive). E.g. input "notit" returns "not".
+     @param input text to test against the base entity names
+     @return longest entity name that is a prefix of the input, or "" if no entity matches
+     */
+    public static String findPrefix(String input) {
+        for (String name : baseSorted) { // sorted longest first, so the first hit is the longest match
+            if (input.startsWith(name)) return name;
+        }
+        return emptyName;
+        // if perf critical, could look at using a Trie vs a scan
+    }
+
     /**
      HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
      both in attributes and in text data.
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -228,7 +228,12 @@ void advanceTransition(TokeniserState newState) {
                 reader.rewindToMark();
                 if (looksLegit) // named with semicolon
                     characterReferenceError("invalid named reference [%s]", nameRef);
-                return null;
+                if (inAttribute) return null;
+                // check if there's a base prefix match; consume and use that if so
+                String prefix = Entities.findPrefix(nameRef);
+                if (prefix.isEmpty()) return null;
+                reader.matchConsume(prefix);
+                nameRef = prefix;
             }
             if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
                 // don't want that to match
diff --git a/src/test/java/org/jsoup/nodes/EntitiesTest.java b/src/test/java/org/jsoup/nodes/EntitiesTest.java
@@ -112,6 +112,13 @@ public class EntitiesTest {
         assertEquals("Hello &= &", Entities.unescape(text, false));
     }
 
+    @Test public void prefixMatch() {
+        // base entity "not" is a prefix of "notit", so it decodes in text; https://github.com/jhy/jsoup/issues/2207
+        // example from https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
+        String text = "I'm &notit; I tell you. I'm &notin; I tell you.";
+        assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", Entities.unescape(text, false));
+        assertEquals("I'm &notit; I tell you. I'm ∉ I tell you.", Entities.unescape(text, true)); // not for attributes: &notit; is left as-is
+    }
 
     @Test public void caseSensitive() {
         String unescaped = "Ü ü & &";
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -925,9 +925,8 @@ private static Stream<Arguments> dupeAttributeData() {
         assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
     }
 
-    @Test public void doesNotFindShortestMatchingEntity() {
-        // previous behaviour was to identify a possible entity, then chomp down the string until a match was found.
-        // (as defined in html5.) However in practise that lead to spurious matches against the author's intent.
+    @Test public void doesNotFindExtendedPrefixMatchingEntity() {
+        // only base entities, not extended entities, should allow prefix match (i.e., those in the spec named list that don't include a trailing ; - https://html.spec.whatwg.org/multipage/named-characters.html)
         String html = "One &clubsuite; &clubsuit;";
         Document doc = Jsoup.parse(html);
         assertEquals(StringUtil.normaliseWhitespace("One &amp;clubsuite; ♣"), doc.body().html());
@@ -941,6 +940,23 @@ private static Stream<Arguments> dupeAttributeData() {
         assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", doc.body().html());
     }
 
+    @Test public void findsBasePrefixEntity() {
+        // https://github.com/jhy/jsoup/issues/2207
+        String html = "a&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.";
+        Document doc = Jsoup.parse(html);
+        doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
+        assertEquals("a&nbsp;c&shy;c I'm &not;it; I tell you. I'm &notin; I tell you.", doc.body().html());
+        assertEquals("a cc I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().text());
+
+        // and in an attribute:
+        html = "<a title=\"&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.\">One</a>";
+        doc = Jsoup.parse(html);
+        doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
+        Element el = doc.expectFirst("a");
+        assertEquals("<a title=\"&amp;nbspc&amp;shyc I'm &amp;notit; I tell you. I'm &notin; I tell you.\">One</a>", el.outerHtml());
+        assertEquals("&nbspc&shyc I'm &notit; I tell you. I'm ∉ I tell you.", el.attr("title"));
+    }
+
     @Test public void handlesXmlDeclarationAsBogusComment() {
         String html = "<?xml encoding='UTF-8' ?><body>One</body>";
         Document doc = Jsoup.parse(html);