NaturalIntelligence · nullcatalyst · Sep 24, 2022
diff --git a/spec/html_spec.js b/spec/html_spec.js
@@ -78,4 +78,66 @@ const parsingOptions = {
     output = output.replace('₹','&inr;');
   expect(output.replace(/\s+/g, "")).toEqual(html.replace(/\s+/g, ""));
     });
+
+
+    it("should fail to parse HTML <script> tags containing '<' without using options.ignoreTagsInNodes", function () {
+        const html = `
+            <html lang="en">
+                <body>
+                    <script>
+                        // Without options.ignoreTagsInNodes, '<' will attempt to create a new tag, throwing an error since it's missing a closing '>'
+                        if (1 < 2) {}
+                    </script>
+                </body>
+            </html>`;
+
+        const parsingOptions = {
+            ignoreAttributes: false,
+            preserveOrder: true,
+            unpairedTags: ["hr", "br", "link", "meta"],
+            stopNodes: ["*.pre", "*.script"],
+            processEntities: true,
+            htmlEntities: true,
+        };
+        const parser = new XMLParser(parsingOptions);
+        expect(function () { parser.parse(html); }).toThrow();
+    });
+
+
+    it("should parse HTML <script> tags containing '<' by using options.ignoreTagsInNodes", function () {
+        const html = `
+            <html lang="en">
+                <body>
+                    <script>
+                        // The character '<' should not create a new tag, and should not need a corresponding '>'
+                        if (1 < 2) {}
+                    </script>
+                </body>
+            </html>`;
+
+        const parsingOptions = {
+            ignoreAttributes: false,
+            preserveOrder: true,
+            unpairedTags: ["hr", "br", "link", "meta"],
+            stopNodes: ["*.pre", "*.script"],
+            ignoreTagsInNodes: ["*.script"],
+            processEntities: true,
+            htmlEntities: true,
+        };
+        const parser = new XMLParser(parsingOptions);
+        let result = parser.parse(html);
+
+        const builderOptions = {
+            ignoreAttributes: false,
+            format: true,
+            preserveOrder: true,
+            suppressEmptyNode: false,
+            unpairedTags: ["hr", "br", "link", "meta"],
+            stopNodes: ["*.pre", "*.script"],
+        }
+        const builder = new XMLBuilder(builderOptions);
+        let output = builder.build(result);
+        expect(output.replace(/\s+/g, "")).toEqual(html.replace(/\s+/g, ""));
+    });
+
 });
diff --git a/src/fxp.d.ts b/src/fxp.d.ts
@@ -23,6 +23,7 @@ Control how tag value should be parsed. Called only if tag value is not empty
   attributeValueProcessor: (attrName: string, attrValue: string, jPath: string) => string;
   numberParseOptions: strnumOptions;
   stopNodes: string[];
+  ignoreTagsInNodes: string[];
   unpairedTags: string[];
   alwaysCreateTextNode: boolean;
   isArray: (tagName: string, jPath: string, isLeafNode: boolean, isAttribute: boolean) => boolean;

diff --git a/src/xmlparser/OptionsBuilder.js b/src/xmlparser/OptionsBuilder.js
@@ -22,7 +22,8 @@ const defaultOptions = {
     attributeValueProcessor: function(attrName, val) {
       return val;
     },
-    stopNodes: [], //nested tags will not be parsed even for errors
+    stopNodes: [], // nested tags will not be parsed even for errors
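+    ignoreTagsInNodes: [], // stop nodes in which a bare '<' (e.g. "a < b" inside <script>) is not read as the start of a nested tag; only consulted for nodes that also match stopNodes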
     alwaysCreateTextNode: false,
     isArray: () => false,
     commentPropName: false,

diff --git a/src/xmlparser/OrderedObjParser.js b/src/xmlparser/OrderedObjParser.js
@@ -46,7 +46,7 @@ class OrderedObjParser{
     this.parseTextData = parseTextData;
     this.resolveNameSpace = resolveNameSpace;
     this.buildAttributesMap = buildAttributesMap;
-    this.isItStopNode = isItStopNode;
+    this.checkNodePathMatch = checkNodePathMatch;
     this.replaceEntitiesValue = replaceEntitiesValue;
     this.readStopNodeData = readStopNodeData;
     this.saveTextToParentTag = saveTextToParentTag;
@@ -289,7 +289,7 @@ const parseXml = function(xmlData) {
           currentNode = this.tagsNodeStack.pop();
         }
 
-        if (this.isItStopNode(this.options.stopNodes, jPath, tagName)) { //TODO: namespace
+        if (this.checkNodePathMatch(this.options.stopNodes, jPath, tagName)) { //TODO: namespace
           let tagContent = "";
           //self-closing tag
           if(tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1){
@@ -302,7 +302,7 @@ const parseXml = function(xmlData) {
           //normal tag
           else{
             //read until closing tag is found
-            const result = this.readStopNodeData(xmlData, tagName, closeIndex + 1);
+            const result = this.readStopNodeData(xmlData, tagName, closeIndex + 1, this.checkNodePathMatch(this.options.ignoreTagsInNodes, jPath, tagName));
             if(!result) throw new Error(`Unexpected end of ${tagName}`);
             i = result.i;
             tagContent = result.tagContent;
@@ -403,15 +403,15 @@ function saveTextToParentTag(textData, currentNode, jPath, isLeafNode) {
 //TODO: use jPath to simplify the logic
 /**
  * 
- * @param {string[]} stopNodes 
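+ * @param {string[]} nodePaths path expressions to test against: either a full jPath, or "*.<tagName>" to match that tag regardless of its path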
  * @param {string} jPath
- * @param {string} currentTagName 
+ * @param {string} currentTagName name of the tag being checked; used to build the "*.<tagName>" wildcard form
  */
-function isItStopNode(stopNodes, jPath, currentTagName){
+function checkNodePathMatch(nodePaths, jPath, currentTagName) {
   const allNodesExp = "*." + currentTagName;
-  for (const stopNodePath in stopNodes) {
-    const stopNodeExp = stopNodes[stopNodePath];
-    if( allNodesExp === stopNodeExp || jPath === stopNodeExp  ) return true;
+  for (const nodePath in nodePaths) {
+    const nodeExp = nodePaths[nodePath];
+    if (allNodesExp === nodeExp || jPath === nodeExp) return true;
   }
   return false;
 }
@@ -494,8 +494,9 @@ function readTagExp(xmlData,i, removeNSPrefix, closingChar = ">"){
  * @param {string} xmlData 
  * @param {string} tagName 
  * @param {number} i 
+ * @param {boolean} ignoreNestedTags Ignores nested tags if true. This allows parsing of tags like <script>if (a < b) {}</script> without the < triggering a new open tag.
  */
-function readStopNodeData(xmlData, tagName, i){
+function readStopNodeData(xmlData, tagName, i, ignoreNestedTags){
   const startIndex = i;
   // Starting at 1 since we already have an open tag
   let openTagCount = 1;
@@ -524,7 +525,7 @@ function readStopNodeData(xmlData, tagName, i){
         } else if(xmlData.substr(i + 1, 2) === '![') { 
           const closeIndex = findClosingIndex(xmlData, "]]>", i, "StopNode is not closed.") - 2;
           i=closeIndex;
-        } else {
+        } else if (!ignoreNestedTags) {
           const tagData = readTagExp(xmlData, i, '>')
 
           if (tagData) {