Skip to content

Commit 5ee376b

Browse files
committed
Entity decoding supports prefix matches
When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). Finds the longest matching prefix. Validated that this matches browser behavior, and that extended entities *don't* prefix-match (like `&clubsuite;`). Fixes #2207
1 parent 708fc1f commit 5ee376b

File tree

6 files changed

+59
-4
lines changed

6 files changed

+59
-4
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
`Connection.Response#cookies()` will provide the last one set. Generally it is better to use
3434
the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that
3535
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
36+
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
37+
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
3638

3739
## 1.18.1 (2024-Jul-10)
3840

pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
<ignore>java.io.UncheckedIOException</ignore>
9999
<ignore>java.util.Comparator</ignore> <!-- Comparator.comparingInt() -->
100100
<ignore>java.util.List</ignore> <!-- List#stream() -->
101+
<ignore>java.util.ArrayList</ignore> <!-- List / ArrayList #sort() -->
101102
<ignore>java.util.LinkedHashMap</ignore> <!-- LinkedHashMap#computeIfAbsent() -->
102103
<ignore>java.util.Map</ignore> <!-- Map#computeIfAbsent() -->
103104
<ignore>java.util.Objects</ignore>

src/main/java/org/jsoup/nodes/Entities.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
import java.io.IOException;
1212
import java.nio.charset.Charset;
1313
import java.nio.charset.CharsetEncoder;
14+
import java.util.ArrayList;
1415
import java.util.Arrays;
16+
import java.util.Collections;
1517
import java.util.HashMap;
1618

1719
import static org.jsoup.nodes.Document.OutputSettings.*;
@@ -36,6 +38,9 @@ public class Entities {
3638
private static final char[] codeDelims = {',', ';'};
3739
private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
3840

41+
private static final int BaseCount = 106;
42+
private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching
43+
3944
public enum EscapeMode {
4045
/**
4146
* Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
@@ -50,6 +55,12 @@ public enum EscapeMode {
5055
*/
5156
extended(EntitiesData.fullPoints, 2125);
5257

58+
static {
59+
// sort the base names by length, for prefix matching
60+
Collections.addAll(baseSorted, base.nameKeys);
61+
baseSorted.sort((a, b) -> b.length() - a.length());
62+
}
63+
5364
// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
5465
private String[] nameKeys;
5566
private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
@@ -134,6 +145,19 @@ public static int codepointsForName(final String name, final int[] codepoints) {
134145
return 0;
135146
}
136147

148+
/**
149+
Finds the longest base named entity that is a prefix of the input. For example, input "notit" would return "not".
150+
151+
@return longest entity name that is a prefix of the input, or "" if no entity matches
152+
*/
153+
public static String findPrefix(String input) {
154+
for (String name : baseSorted) {
155+
if (input.startsWith(name)) return name;
156+
}
157+
return emptyName;
158+
// if perf critical, could look at using a Trie vs a scan
159+
}
160+
137161
/**
138162
HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
139163
both in attributes and in text data.

src/main/java/org/jsoup/parser/Tokeniser.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,12 @@ void advanceTransition(TokeniserState newState) {
228228
reader.rewindToMark();
229229
if (looksLegit) // named with semicolon
230230
characterReferenceError("invalid named reference [%s]", nameRef);
231-
return null;
231+
if (inAttribute) return null;
232+
// check if there's a base prefix match; consume and use that if so
233+
String prefix = Entities.findPrefix(nameRef);
234+
if (prefix.isEmpty()) return null;
235+
reader.matchConsume(prefix);
236+
nameRef = prefix;
232237
}
233238
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
234239
// don't want that to match

src/test/java/org/jsoup/nodes/EntitiesTest.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@ public class EntitiesTest {
112112
assertEquals("Hello &= &", Entities.unescape(text, false));
113113
}
114114

115+
@Test public void prefixMatch() {
116+
// https://github.com/jhy/jsoup/issues/2207
117+
// example from https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
118+
String text = "I'm &notit; I tell you. I'm &notin; I tell you.";
119+
assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", Entities.unescape(text, false));
120+
assertEquals("I'm &notit; I tell you. I'm ∉ I tell you.", Entities.unescape(text, true)); // not for attributes
121+
}
115122

116123
@Test public void caseSensitive() {
117124
String unescaped = "Ü ü & &";

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -925,9 +925,8 @@ private static Stream<Arguments> dupeAttributeData() {
925925
assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
926926
}
927927

928-
@Test public void doesNotFindShortestMatchingEntity() {
929-
// previous behaviour was to identify a possible entity, then chomp down the string until a match was found.
930-
// (as defined in html5.) However in practice that led to spurious matches against the author's intent.
928+
@Test public void doesNotFindExtendedPrefixMatchingEntity() {
929+
// only base entities, not extended entities, should allow a prefix match (i.e., only those names that the spec's named character reference list also defines without a trailing ';' - https://html.spec.whatwg.org/multipage/named-characters.html)
931930
String html = "One &clubsuite; &clubsuit;";
932931
Document doc = Jsoup.parse(html);
933932
assertEquals(StringUtil.normaliseWhitespace("One &amp;clubsuite; ♣"), doc.body().html());
@@ -941,6 +940,23 @@ private static Stream<Arguments> dupeAttributeData() {
941940
assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", doc.body().html());
942941
}
943942

943+
@Test public void findsBasePrefixEntity() {
944+
// https://github.com/jhy/jsoup/issues/2207
945+
String html = "a&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.";
946+
Document doc = Jsoup.parse(html);
947+
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
948+
assertEquals("a&nbsp;c&shy;c I'm &not;it; I tell you. I'm &notin; I tell you.", doc.body().html());
949+
assertEquals("a cc I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().text());
950+
951+
// and in an attribute:
952+
html = "<a title=\"&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.\">One</a>";
953+
doc = Jsoup.parse(html);
954+
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
955+
Element el = doc.expectFirst("a");
956+
assertEquals("<a title=\"&amp;nbspc&amp;shyc I'm &amp;notit; I tell you. I'm &notin; I tell you.\">One</a>", el.outerHtml());
957+
assertEquals("&nbspc&shyc I'm &notit; I tell you. I'm ∉ I tell you.", el.attr("title"));
958+
}
959+
944960
@Test public void handlesXmlDeclarationAsBogusComment() {
945961
String html = "<?xml encoding='UTF-8' ?><body>One</body>";
946962
Document doc = Jsoup.parse(html);

0 commit comments

Comments
 (0)