Skip to content

Commit 0ef4b70

Browse files
committed
Allow < in tag name state
We used to have specific handling for this, but that moved us out of spec, and the author's intent is not clear-cut. Fixes #2230
1 parent 51909b1 commit 0ef4b70

File tree

5 files changed

+22
-26
lines changed

5 files changed

+22
-26
lines changed

CHANGES.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@
3535
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
3636
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
3737
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
38-
* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
38+
* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly
39+
created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
40+
* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
41+
character node. [2230](https://github.com/jhy/jsoup/issues/2230)
3942

4043
## 1.18.1 (2024-Jul-10)
4144

src/main/java/org/jsoup/parser/CharacterReader.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ String consumeRawData() {
489489

490490
String consumeTagName() {
491491
// '\t', '\n', '\r', '\f', ' ', '/', '>'
492-
// NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
492+
// NOTE: out of spec; does not stop and append on nullChar but eats
493493
bufferUp();
494494
int pos = bufPos;
495495
final int start = pos;
@@ -505,7 +505,6 @@ String consumeTagName() {
505505
case ' ':
506506
case '/':
507507
case '>':
508-
case '<':
509508
break OUTER;
510509
}
511510
pos++;

src/main/java/org/jsoup/parser/TokeniserState.java

-4
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,6 @@ enum TokeniserState {
160160
case '/':
161161
t.transition(SelfClosingStartTag);
162162
break;
163-
case '<': // NOTE: out of spec, but clear author intent
164-
r.unconsume();
165-
t.error(this);
166-
// intended fall through to next >
167163
case '>':
168164
t.emitTagPending();
169165
t.transition(Data);

src/test/java/org/jsoup/parser/HtmlParserTest.java

+17-2
Original file line numberDiff line numberDiff line change
@@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) {
16491649
// when the Element is created, the name got normalized to "template" and so looked like there should be a
16501650
// template on the stack during resetInsertionMode for the select.
16511651
// The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
1652-
Document doc = Jsoup.parse("<template\u001E<select<input<");
1652+
Document doc = Jsoup.parse("<template\u001E><select><input>");
16531653
assertNotNull(doc);
1654-
assertEquals("<template><select></select><input>&lt;</template>",
1654+
assertEquals("<template><select></select><input></template>",
16551655
TextUtil.stripNewlines(doc.head().html()));
16561656
}
16571657

@@ -1924,4 +1924,19 @@ private static void assertMathNamespace(Element el) {
19241924
TextUtil.normalizeSpaces(doc.body().html())
19251925
);
19261926
}
1927+
1928+
@Test void gtAfterTagClose() {
1929+
// https://github.com/jhy/jsoup/issues/2230
1930+
String html = "<div>Div</div<> <a>One<a<b>Hello</b>";
1931+
// this gives us an element "a<b", which is gross, but matches the spec & browsers
1932+
Document doc = Jsoup.parse(html);
1933+
Element body = doc.body();
1934+
assertEquals("<div> Div <a>One<a<b> Hello </a<b></a></div>", TextUtil.normalizeSpaces(body.html()));
1935+
1936+
Elements abs = doc.getElementsByTag("a<b");
1937+
assertEquals(1, abs.size());
1938+
Element ab = abs.first();
1939+
assertEquals("Hello", ab.text());
1940+
assertEquals("a<b", ab.tag().normalName());
1941+
}
19271942
}

src/test/java/org/jsoup/parser/TokeniserStateTest.java

-17
Original file line numberDiff line numberDiff line change
@@ -198,13 +198,6 @@ public void testPublicAndSystemIdentifiersWithWhitespace() {
198198
}
199199
}
200200

201-
@Test public void handlesLessInTagThanAsNewTag() {
202-
// out of spec, but clear author intent
203-
String html = "<p\n<p<div id=one <span>Two";
204-
Document doc = Jsoup.parse(html);
205-
assertEquals("<p></p><p></p><div id=\"one\"><span>Two</span></div>", TextUtil.stripNewlines(doc.body().html()));
206-
}
207-
208201
@Test
209202
public void testUnconsumeAtBufferBoundary() {
210203
String triggeringSnippet = "<a href=\"\"foo";
@@ -250,16 +243,6 @@ public void testMalformedSelfClosingTag() {
250243
assertEquals(7, errorList.get(0).getPosition());
251244
}
252245

253-
@Test
254-
public void testOpeningAngleBracketInTagName() {
255-
String triggeringSnippet = "<html<";
256-
ParseErrorList errorList = ParseErrorList.tracking(1);
257-
258-
Parser.parseFragment(triggeringSnippet, null, "", errorList);
259-
260-
assertEquals(5, errorList.get(0).getPosition());
261-
}
262-
263246
@Test
264247
public void rcData() {
265248
Document doc = Jsoup.parse("<title>One \0Two</title>");

0 commit comments

Comments
 (0)