Add support for new parse-latin, parse-english

wooorm · wooorm · commit 95bc616a3457 · 2023-01-09T11:30:03.000+01:00
This is a breaking change: these parsers were updated,
which cleans up their API a lot and adds types.
diff --git a/lib/index.js b/lib/index.js
@@ -4,10 +4,12 @@
  * @typedef {import('nlcst').Root} NlcstRoot
  * @typedef {import('nlcst').Paragraph} NlcstParagraph
  * @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
+ * @typedef {import('nlcst').Sentence} NlcstSentence
  * @typedef {import('nlcst').Source} NlcstSource
  * @typedef {import('nlcst').Content} NlcstContent
  * @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
  * @typedef {NlcstRoot|NlcstContent} NlcstNode
+ * @typedef {Extract<NlcstNode, import('unist').Parent>} NlcstParent
  *
  * @typedef {import('hast').Root} HastRoot
  * @typedef {import('hast').Element} HastElement
@@ -19,11 +21,11 @@
  * @typedef {import('vfile').VFile} VFile
  *
  * @typedef {{
- *   parse(nodes: Array<NlcstContent>): NlcstRoot
- *   tokenizeSource(value: string): NlcstSource
- *   tokenizeWhiteSpace(value: string): NlcstWhiteSpace
- *   tokenizeParagraph(nodes: Array<NlcstSentenceContent>): NlcstParagraph
- *   tokenize(value: string): Array<NlcstSentenceContent>
+ *   tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
+ *   tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
+ *   parse(value: string | null | undefined): NlcstRoot
+ *   tokenizeParagraph(value: string | null | undefined): NlcstParagraph
+ *   tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
  * }} ParserInstance
  * @typedef {new () => ParserInstance} ParserConstructor
  */
@@ -34,7 +36,7 @@ import {phrasing} from 'hast-util-phrasing'
 import {toString} from 'hast-util-to-string'
 import {whitespace} from 'hast-util-whitespace'
 import {toString as nlcstToString} from 'nlcst-to-string'
-import {pointStart} from 'unist-util-position'
+import {pointStart, pointEnd} from 'unist-util-position'
 import {location} from 'vfile-location'
 
 const source = convertElement(['code', dataNlcstSourced])
@@ -74,6 +76,10 @@ const flowAccepting = convertElement([
   'dialog'
 ])
 
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
+const terminalMarker = /^([!.?\u2026\u203D]+)$/
+
 /**
  * Transform `tree` to nlcst.
  *
@@ -179,11 +185,37 @@ export function toNlcst(tree, file, Parser) {
    * @param {HastElementContent|Array<HastElementContent>} node
    */
   function add(node) {
-    /** @type {Array<NlcstSentenceContent>|undefined} */
+    /** @type {Array<NlcstSentenceContent> | undefined} */
     const result = Array.isArray(node) ? all(node) : one(node)
 
     if (result && result.length > 0) {
-      results.push(parser.tokenizeParagraph(result))
+      const start = pointStart(result[0])
+      const end = pointEnd(result[result.length - 1])
+
+      // Turn into a sentence.
+      /** @type {NlcstSentence} */
+      const sentence = {type: 'SentenceNode', children: result}
+      if (start && end) sentence.position = {start, end}
+
+      let index = -1
+      while (parser.tokenizeSentencePlugins[++index]) {
+        parser.tokenizeSentencePlugins[index](sentence)
+      }
+
+      // Turn into a paragraph.
+      /** @type {NlcstParagraph} */
+      const paragraph = {
+        type: 'ParagraphNode',
+        children: splitNode(sentence, 'PunctuationNode', terminalMarker)
+      }
+      if (start && end) paragraph.position = {start: {...start}, end: {...end}}
+
+      index = -1
+      while (parser.tokenizeParagraphPlugins[++index]) {
+        parser.tokenizeParagraphPlugins[index](paragraph)
+      }
+
+      results.push(paragraph)
     }
   }
 
@@ -238,13 +270,13 @@ export function toNlcst(tree, file, Parser) {
       change = true
     } else if (node.type === 'element' && !ignore(node)) {
       if (node.tagName === 'wbr') {
-        replacement = [parser.tokenizeWhiteSpace(' ')]
+        replacement = [{type: 'WhiteSpaceNode', value: ' '}]
         change = true
       } else if (node.tagName === 'br') {
-        replacement = [parser.tokenizeWhiteSpace('\n')]
+        replacement = [{type: 'WhiteSpaceNode', value: '\n'}]
         change = true
       } else if (source(node)) {
-        replacement = [parser.tokenizeSource(toString(node))]
+        replacement = [{type: 'SourceNode', value: toString(node)}]
         change = true
       } else {
         replacement = all(node.children)
@@ -327,3 +359,51 @@ function dataNlcstSourced(node) {
 function dataNlcstIgnore(node) {
   return Boolean(node.properties && node.properties.dataNlcst === 'ignore')
 }
+
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
+/**
+ * Split one parent node into several parents of the same type.
+ *
+ * Walks `node.children` and closes a new parent after every child whose
+ * `type` is `childType` and whose `nlcstToString` result matches
+ * `expression`, and also after the final child, so no child is dropped.
+ * Children are sliced into the new parents; `node` itself is not changed.
+ * A new parent gets a `position` only when both its first and its last
+ * child have one; its `start` and `end` are the same point objects as
+ * those on the children (they are not copied).
+ *
+ * @template {NlcstParent} TheNode
+ * @param {TheNode} node
+ *   Parent to split.
+ * @param {NlcstContent['type']} childType
+ *   Type of child that can end a split.
+ * @param {RegExp} expression
+ *   Expression tested against `nlcstToString(child)` for such children.
+ * @returns {Array<TheNode>}
+ *   New parents, in order; empty when `node` has no children.
+ */
+function splitNode(node, childType, expression) {
+  /** @type {Array<TheNode>} */
+  const result = []
+  let index = -1
+  let start = 0
+
+  while (++index < node.children.length) {
+    const token = node.children[index]
+
+    if (
+      index === node.children.length - 1 ||
+      (token.type === childType && expression.test(nlcstToString(token)))
+    ) {
+      /** @type {TheNode} */
+      // @ts-expect-error: fine
+      const parent = {
+        type: node.type,
+        children: node.children.slice(start, index + 1)
+      }
+
+      const first = node.children[start]
+      const last = token
+      if (first.position && last.position) {
+        parent.position = {
+          start: first.position.start,
+          end: last.position.end
+        }
+      }
+
+      result.push(parent)
+      start = index + 1
+    }
+  }
+
+  return result
+}
diff --git a/package.json b/package.json
@@ -53,9 +53,9 @@
     "@types/tape": "^4.0.0",
     "c8": "^7.0.0",
     "is-hidden": "^2.0.0",
-    "parse-dutch": "^5.0.0",
-    "parse-english": "^5.0.0",
-    "parse-latin": "^5.0.0",
+    "parse-dutch": "^6.0.0",
+    "parse-english": "^6.0.0",
+    "parse-latin": "^6.0.0",
     "prettier": "^2.0.0",
     "rehype": "^12.0.0",
     "remark-cli": "^11.0.0",
diff --git a/test/index.js b/test/index.js
@@ -3,11 +3,8 @@ import path from 'node:path'
 import test from 'tape'
 import {rehype} from 'rehype'
 import {VFile} from 'vfile'
-// @ts-expect-error: to do type.
 import {ParseLatin} from 'parse-latin'
-// @ts-expect-error: to do type.
 import {ParseDutch} from 'parse-dutch'
-// @ts-expect-error: to do type.
 import {ParseEnglish} from 'parse-english'
 import {isHidden} from 'is-hidden'
 import {toNlcst} from '../index.js'