
Commit 95bc616

Add support for new parse-latin, parse-english
This is a breaking change: these parsers were updated, which cleans up their API a lot and adds types.
1 parent 9b8cca5 commit 95bc616
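
For a sense of the breaking change, the updated ParserInstance typedef in lib/index.js (see the diff below) drops the node-array based parse/tokenizeSource/tokenizeWhiteSpace/tokenizeParagraph methods and instead describes string-accepting methods plus sentence and paragraph plugin arrays. A minimal sketch of the new shape, assuming parse-english@^6 as bumped in package.json (illustrative only, not part of this commit):

import {ParseEnglish} from 'parse-english'

const parser = new ParseEnglish()

// `parse`, `tokenizeParagraph`, and `tokenize` now take a string (or nullish)
// rather than arrays of nodes.
const root = parser.parse('One sentence. Two sentences.')
console.log(root.type) // 'RootNode'

// Sentence- and paragraph-level hooks are exposed as plugin arrays; the
// updated `add()` in lib/index.js runs these one by one.
console.log(Array.isArray(parser.tokenizeSentencePlugins)) // true
console.log(Array.isArray(parser.tokenizeParagraphPlugins)) // true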

3 files changed (+94, -17 lines)

Diff for: lib/index.js

+91 -11
@@ -4,10 +4,12 @@
  * @typedef {import('nlcst').Root} NlcstRoot
  * @typedef {import('nlcst').Paragraph} NlcstParagraph
  * @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
+ * @typedef {import('nlcst').Sentence} NlcstSentence
  * @typedef {import('nlcst').Source} NlcstSource
  * @typedef {import('nlcst').Content} NlcstContent
  * @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
  * @typedef {NlcstRoot|NlcstContent} NlcstNode
+ * @typedef {Extract<NlcstNode, import('unist').Parent>} NlcstParent
  *
  * @typedef {import('hast').Root} HastRoot
  * @typedef {import('hast').Element} HastElement
@@ -19,11 +21,11 @@
  * @typedef {import('vfile').VFile} VFile
  *
  * @typedef {{
- *   parse(nodes: Array<NlcstContent>): NlcstRoot
- *   tokenizeSource(value: string): NlcstSource
- *   tokenizeWhiteSpace(value: string): NlcstWhiteSpace
- *   tokenizeParagraph(nodes: Array<NlcstSentenceContent>): NlcstParagraph
- *   tokenize(value: string): Array<NlcstSentenceContent>
+ *   tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
+ *   tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
+ *   parse(value: string | null | undefined): NlcstRoot
+ *   tokenizeParagraph(value: string | null | undefined): NlcstParagraph
+ *   tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
  * }} ParserInstance
  * @typedef {new () => ParserInstance} ParserConstructor
  */
@@ -34,7 +36,7 @@ import {phrasing} from 'hast-util-phrasing'
 import {toString} from 'hast-util-to-string'
 import {whitespace} from 'hast-util-whitespace'
 import {toString as nlcstToString} from 'nlcst-to-string'
-import {pointStart} from 'unist-util-position'
+import {pointStart, pointEnd} from 'unist-util-position'
 import {location} from 'vfile-location'
 
 const source = convertElement(['code', dataNlcstSourced])
@@ -74,6 +76,10 @@ const flowAccepting = convertElement([
   'dialog'
 ])
 
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
+const terminalMarker = /^([!.?\u2026\u203D]+)$/
+
 /**
  * Transform `tree` to nlcst.
  *
@@ -179,11 +185,37 @@ export function toNlcst(tree, file, Parser) {
    * @param {HastElementContent|Array<HastElementContent>} node
    */
   function add(node) {
-    /** @type {Array<NlcstSentenceContent>|undefined} */
+    /** @type {Array<NlcstSentenceContent> | undefined} */
     const result = Array.isArray(node) ? all(node) : one(node)
 
     if (result && result.length > 0) {
-      results.push(parser.tokenizeParagraph(result))
+      const start = pointStart(result[0])
+      const end = pointEnd(result[result.length - 1])
+
+      // Turn into a sentence.
+      /** @type {NlcstSentence} */
+      const sentence = {type: 'SentenceNode', children: result}
+      if (start && end) sentence.position = {start, end}
+
+      let index = -1
+      while (parser.tokenizeSentencePlugins[++index]) {
+        parser.tokenizeSentencePlugins[index](sentence)
+      }
+
+      // Turn into a paragraph.
+      /** @type {NlcstParagraph} */
+      const paragraph = {
+        type: 'ParagraphNode',
+        children: splitNode(sentence, 'PunctuationNode', terminalMarker)
+      }
+      if (start && end) paragraph.position = {start: {...start}, end: {...end}}
+
+      index = -1
+      while (parser.tokenizeParagraphPlugins[++index]) {
+        parser.tokenizeParagraphPlugins[index](paragraph)
+      }
+
+      results.push(paragraph)
     }
   }
 
@@ -238,13 +270,13 @@ export function toNlcst(tree, file, Parser) {
       change = true
     } else if (node.type === 'element' && !ignore(node)) {
       if (node.tagName === 'wbr') {
-        replacement = [parser.tokenizeWhiteSpace(' ')]
+        replacement = [{type: 'WhiteSpaceNode', value: ' '}]
         change = true
       } else if (node.tagName === 'br') {
-        replacement = [parser.tokenizeWhiteSpace('\n')]
+        replacement = [{type: 'WhiteSpaceNode', value: '\n'}]
         change = true
       } else if (source(node)) {
-        replacement = [parser.tokenizeSource(toString(node))]
+        replacement = [{type: 'SourceNode', value: toString(node)}]
         change = true
       } else {
         replacement = all(node.children)
@@ -327,3 +359,51 @@ function dataNlcstSourced(node) {
 function dataNlcstIgnore(node) {
   return Boolean(node.properties && node.properties.dataNlcst === 'ignore')
 }
+
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
+/**
+ * A function that splits one node into several nodes.
+ *
+ * @template {NlcstParent} TheNode
+ * @param {TheNode} node
+ * @param {NlcstContent['type']} childType
+ * @param {RegExp} expression
+ * @returns {Array<TheNode>}
+ */
+function splitNode(node, childType, expression) {
+  /** @type {Array<TheNode>} */
+  const result = []
+  let index = -1
+  let start = 0
+
+  while (++index < node.children.length) {
+    const token = node.children[index]
+
+    if (
+      index === node.children.length - 1 ||
+      (token.type === childType && expression.test(nlcstToString(token)))
+    ) {
+      /** @type {TheNode} */
+      // @ts-expect-error: fine
+      const parent = {
+        type: node.type,
+        children: node.children.slice(start, index + 1)
+      }
+
+      const first = node.children[start]
+      const last = token
+      if (first.position && last.position) {
+        parent.position = {
+          start: first.position.start,
+          end: last.position.end
+        }
+      }
+
+      result.push(parent)
+      start = index + 1
+    }
+  }
+
+  return result
+}
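
The splitNode helper added above is what turns the single SentenceNode built in add() into the paragraph's children: it closes a group after every child of the given type whose text matches the expression (terminalMarker, i.e. '.', '!', '?', '…', '‽') and after the last child. A hypothetical illustration of the expected grouping (made-up nlcst nodes, not taken from the tests):

// One SentenceNode holding two sentences' worth of children.
const sentence = {
  type: 'SentenceNode',
  children: [
    {type: 'WordNode', children: [{type: 'TextNode', value: 'Hi'}]},
    {type: 'PunctuationNode', value: '.'},
    {type: 'WhiteSpaceNode', value: ' '},
    {type: 'WordNode', children: [{type: 'TextNode', value: 'Bye'}]},
    {type: 'PunctuationNode', value: '.'}
  ]
}

// splitNode(sentence, 'PunctuationNode', terminalMarker) should yield two
// SentenceNodes: one with children 0-1 ('Hi' and the first '.'), one with
// children 2-4 (the white space, 'Bye', and the final '.'). The parser's
// tokenizeParagraphPlugins then run over the resulting ParagraphNode.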

Diff for: package.json

+3 -3
@@ -53,9 +53,9 @@
     "@types/tape": "^4.0.0",
     "c8": "^7.0.0",
     "is-hidden": "^2.0.0",
-    "parse-dutch": "^5.0.0",
-    "parse-english": "^5.0.0",
-    "parse-latin": "^5.0.0",
+    "parse-dutch": "^6.0.0",
+    "parse-english": "^6.0.0",
+    "parse-latin": "^6.0.0",
     "prettier": "^2.0.0",
     "rehype": "^12.0.0",
     "remark-cli": "^11.0.0",

Diff for: test/index.js

-3
@@ -3,11 +3,8 @@ import path from 'node:path'
 import test from 'tape'
 import {rehype} from 'rehype'
 import {VFile} from 'vfile'
-// @ts-expect-error: to do type.
 import {ParseLatin} from 'parse-latin'
-// @ts-expect-error: to do type.
 import {ParseDutch} from 'parse-dutch'
-// @ts-expect-error: to do type.
 import {ParseEnglish} from 'parse-english'
 import {isHidden} from 'is-hidden'
 import {toNlcst} from '../index.js'
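
From the caller's side, usage is unchanged: toNlcst still takes a hast tree, the corresponding file, and a parser constructor; only the parser major versions move. A minimal sketch in the style of the tests (import path for toNlcst as in test/index.js; the HTML input is made up):

import {rehype} from 'rehype'
import {VFile} from 'vfile'
import {ParseEnglish} from 'parse-english'
import {toNlcst} from '../index.js'

const file = new VFile('<p>Hello, world! How are you?</p>')
const tree = rehype().parse(file)

// Same call shape as before: hast tree, its file, and a parser constructor.
const nlcst = toNlcst(tree, file, ParseEnglish)
console.log(nlcst.type) // 'RootNode'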
