4
4
* @typedef {import('nlcst').Root } NlcstRoot
5
5
* @typedef {import('nlcst').Paragraph } NlcstParagraph
6
6
* @typedef {import('nlcst').WhiteSpace } NlcstWhiteSpace
7
+ * @typedef {import('nlcst').Sentence } NlcstSentence
7
8
* @typedef {import('nlcst').Source } NlcstSource
8
9
* @typedef {import('nlcst').Content } NlcstContent
9
10
* @typedef {import('nlcst').SentenceContent } NlcstSentenceContent
10
11
* @typedef {NlcstRoot|NlcstContent } NlcstNode
12
+ * @typedef {Extract<NlcstNode, import('unist').Parent> } NlcstParent
11
13
*
12
14
* @typedef {import('hast').Root } HastRoot
13
15
* @typedef {import('hast').Element } HastElement
19
21
* @typedef {import('vfile').VFile } VFile
20
22
*
21
23
* @typedef {{
22
- * parse(nodes : Array<NlcstContent>): NlcstRoot
23
- * tokenizeSource(value: string): NlcstSource
24
- * tokenizeWhiteSpace (value: string): NlcstWhiteSpace
25
- * tokenizeParagraph(nodes: Array<NlcstSentenceContent> ): NlcstParagraph
26
- * tokenize(value: string): Array<NlcstSentenceContent>
24
+ * tokenizeSentencePlugins : Array<(node: NlcstSentence) => void>,
25
+ * tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
26
+ * parse (value: string | null | undefined ): NlcstRoot
27
+ * tokenizeParagraph(value: string | null | undefined ): NlcstParagraph
28
+ * tokenize(value: string | null | undefined ): Array<NlcstSentenceContent>
27
29
* }} ParserInstance
28
30
* @typedef {new () => ParserInstance } ParserConstructor
29
31
*/
@@ -34,7 +36,7 @@ import {phrasing} from 'hast-util-phrasing'
34
36
import { toString } from 'hast-util-to-string'
35
37
import { whitespace } from 'hast-util-whitespace'
36
38
import { toString as nlcstToString } from 'nlcst-to-string'
37
- import { pointStart } from 'unist-util-position'
39
+ import { pointStart , pointEnd } from 'unist-util-position'
38
40
import { location } from 'vfile-location'
39
41
40
42
// Element test built from the tag name `code` and the `dataNlcstSourced`
// check; elements that pass are emitted as a single `SourceNode`.
// NOTE(review): presumably either entry matching is enough — confirm against
// `convertElement`.
const source = convertElement ( [ 'code' , dataNlcstSourced ] )
@@ -74,6 +76,10 @@ const flowAccepting = convertElement([
74
76
'dialog'
75
77
] )
76
78
79
+ // Ported from:
80
+ // <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
81
// Matches a value that consists solely of one or more terminal markers:
// `!`, `.`, `?`, the ellipsis (U+2026), or the interrobang (U+203D).
// Whitespace inside a regex literal is significant, so the pattern must stay
// compact.
const terminalMarker = /^([!.?\u2026\u203D]+)$/
82
+
77
83
/**
78
84
* Transform `tree` to nlcst.
79
85
*
@@ -179,11 +185,37 @@ export function toNlcst(tree, file, Parser) {
179
185
* @param {HastElementContent|Array<HastElementContent> } node
180
186
*/
181
187
function add(node) {
  // Convert the hast content: a list goes through `all`, one node through `one`.
  /** @type {Array<NlcstSentenceContent> | undefined} */
  const content = Array.isArray(node) ? all(node) : one(node)

  // Nothing came out of the conversion, so there is nothing to wrap.
  if (!content || content.length === 0) return

  const start = pointStart(content[0])
  const end = pointEnd(content[content.length - 1])

  // First wrap everything in one sentence and let the sentence plugins run.
  /** @type {NlcstSentence} */
  const sentence = {type: 'SentenceNode', children: content}

  if (start && end) {
    sentence.position = {start, end}
  }

  for (let i = 0; parser.tokenizeSentencePlugins[i]; i++) {
    parser.tokenizeSentencePlugins[i](sentence)
  }

  // Then split that sentence on terminal markers and wrap the pieces in a
  // paragraph, which gets its own copies of the points.
  /** @type {NlcstParagraph} */
  const paragraph = {
    type: 'ParagraphNode',
    children: splitNode(sentence, 'PunctuationNode', terminalMarker)
  }

  if (start && end) {
    paragraph.position = {start: {...start}, end: {...end}}
  }

  for (let i = 0; parser.tokenizeParagraphPlugins[i]; i++) {
    parser.tokenizeParagraphPlugins[i](paragraph)
  }

  results.push(paragraph)
}
189
221
@@ -238,13 +270,13 @@ export function toNlcst(tree, file, Parser) {
238
270
change = true
239
271
} else if ( node . type === 'element' && ! ignore ( node ) ) {
240
272
if ( node . tagName === 'wbr' ) {
241
- replacement = [ parser . tokenizeWhiteSpace ( ' ' ) ]
273
+ replacement = [ { type : 'WhiteSpaceNode' , value : ' ' } ]
242
274
change = true
243
275
} else if ( node . tagName === 'br' ) {
244
- replacement = [ parser . tokenizeWhiteSpace ( ' \n') ]
276
+ replacement = [ { type : 'WhiteSpaceNode' , value : ' \n'} ]
245
277
change = true
246
278
} else if ( source ( node ) ) {
247
- replacement = [ parser . tokenizeSource ( toString ( node ) ) ]
279
+ replacement = [ { type : 'SourceNode' , value : toString ( node ) } ]
248
280
change = true
249
281
} else {
250
282
replacement = all ( node . children )
@@ -327,3 +359,51 @@ function dataNlcstSourced(node) {
327
359
function dataNlcstIgnore ( node ) {
328
360
return Boolean ( node . properties && node . properties . dataNlcst === 'ignore' )
329
361
}
362
+
363
+ // Ported from:
364
+ // <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
365
+ /**
366
+ * A function that splits one node into several nodes.
367
+ *
368
+ * @template {NlcstParent} TheNode
369
+ * @param {TheNode } node
370
+ * @param {RegExp } expression
371
+ * @param {NlcstContent['type'] } childType
372
+ * @returns {Array<TheNode> }
373
+ */
374
+ function splitNode ( node , childType , expression ) {
375
+ /** @type {Array<TheNode> } */
376
+ const result = [ ]
377
+ let index = - 1
378
+ let start = 0
379
+
380
+ while ( ++ index < node . children . length ) {
381
+ const token = node . children [ index ]
382
+
383
+ if (
384
+ index === node . children . length - 1 ||
385
+ ( token . type === childType && expression . test ( nlcstToString ( token ) ) )
386
+ ) {
387
+ /** @type {TheNode } */
388
+ // @ts -expect-error: fine
389
+ const parent = {
390
+ type : node . type ,
391
+ children : node . children . slice ( start , index + 1 )
392
+ }
393
+
394
+ const first = node . children [ start ]
395
+ const last = token
396
+ if ( first . position && last . position ) {
397
+ parent . position = {
398
+ start : first . position . start ,
399
+ end : last . position . end
400
+ }
401
+ }
402
+
403
+ result . push ( parent )
404
+ start = index + 1
405
+ }
406
+ }
407
+
408
+ return result
409
+ }
0 commit comments