Skip to content

Commit 5fa24dd

Browse files
authored
feat: add converter to magic-regexp syntax (#368)
1 parent 968d3aa commit 5fa24dd

File tree

7 files changed

+561
-2
lines changed

7 files changed

+561
-2
lines changed

Diff for: build.config.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ import { defineBuildConfig } from 'unbuild'
22
export default defineBuildConfig({
33
declaration: true,
44
rollup: { emitCJS: true },
5-
entries: ['./src/index', './src/transform', './src/further-magic'],
5+
entries: ['./src/index', './src/transform', './src/converter', './src/further-magic'],
66
externals: ['magic-regexp', 'type-level-regexp'],
77
})

Diff for: docs/content/2.getting-started/5.converter.md

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
---
2+
title: Converter (experimental)
3+
---
4+
5+
It is also possible to convert existing regular expressions to `magic-regexp` syntax.
6+
7+
```ts
8+
import { convert } from 'magic-regexp/converter'
9+
10+
convert(/a|b|c/)
11+
// createRegExp(exactly('a').or('b').or('c'))
12+
13+
convert(/(foo)bar\d+/)
14+
// createRegExp(exactly('foo').grouped(), 'bar', oneOrMore(digit))
15+
```
16+
17+
### Options
18+
19+
- `argsOnly` (boolean)
20+
_Default: `false`_
21+
Output only the arguments, without the wrapping `createRegExp(...)` call
22+
23+
```ts
24+
convert(/\w+@\w\.com/, { argsOnly: true })
25+
// oneOrMore(wordChar), '@', wordChar, '.com'
26+
```

Diff for: package.json

+5
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
"import": "./dist/index.mjs",
1111
"require": "./dist/index.cjs"
1212
},
13+
"./converter": {
14+
"import": "./dist/converter.mjs",
15+
"require": "./dist/converter.cjs"
16+
},
1317
"./transform": {
1418
"import": "./dist/transform.mjs",
1519
"require": "./dist/transform.cjs"
@@ -48,6 +52,7 @@
4852
"estree-walker": "^3.0.3",
4953
"magic-string": "^0.30.5",
5054
"mlly": "^1.4.2",
55+
"regexp-tree": "^0.1.27",
5156
"type-level-regexp": "~0.1.17",
5257
"ufo": "^1.3.1",
5358
"unplugin": "^1.5.0"

Diff for: pnpm-lock.yaml

+8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: src/converter.ts

+310
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
import regexpTree from 'regexp-tree'
2+
import type { Expression, Char, ClassRange } from 'regexp-tree/ast'
3+
4+
/** Narrows the `Expression` union to the member(s) carrying the given `type` tag. */
type NodeOf<K extends Expression['type']> = Extract<Expression, { type: K }>

/**
 * Renders a regexp-tree AST node as magic-regexp source text.
 *
 * The result is a comma-separated argument list suitable for placing inside
 * `createRegExp(...)`; a `null` node renders as the empty string.
 *
 * @param node - AST node to render, or `null` for an empty expression.
 * @returns The magic-regexp source text for the node.
 * @throws Error for constructs the converter does not support (complex
 *   character classes, lazy quantifiers, numeric backreferences,
 *   unsupported meta characters).
 */
function build(node: Expression | null): string {
  if (node === null) return ''

  switch (node.type) {
    case 'CharacterClass':
      return buildCharacterClass(node)

    case 'Disjunction':
      return chain(build(node.left), `or(${build(node.right)})`)

    case 'Assertion':
      return buildAssertion(node)

    case 'Char':
      return buildChar(node)

    case 'Repetition':
      return buildRepetition(node)

    case 'Alternative':
      return buildAlternative(node)

    case 'Group':
      if (node.capturing)
        return chain(build(node.expression), node.name ? `as('${node.name}')` : 'grouped()')
      else return chain(build(node.expression))

    /* v8 ignore next 3 */
    case 'Backreference':
      // Same restriction as inside an Alternative: only named references can
      // be expressed with `referenceTo`.
      if (node.kind !== 'name') throw new Error(`Unsupport for number reference`)
      return chain('', `and.referenceTo('${node.reference}')`)
  }
}

/** Renders the small set of character classes that have a direct magic-regexp helper. */
function buildCharacterClass(node: NodeOf<'CharacterClass'>): string {
  const exprs = combineContinuousSimpleChars(node.expressions)

  // TODO: hard coded cases, need to be improved for multi char class
  if (exprs.length === 1) {
    const first = exprs[0]
    if (typeof first === 'string') {
      // Only plain characters, already merged into one quoted string.
      return node.negative ? `charNotIn(${first})` : `charIn(${first})`
    } else if (first.type === 'Char' && first.kind === 'meta' && node.negative) {
      if (first.value === '\\t') return `not.tab`
      if (first.value === '\\n') return `not.linefeed`
      if (first.value === '\\r') return `not.carriageReturn`
    } else {
      const range = normalizeClassRange(first)
      if (range === 'A-Z') {
        return node.negative ? `not.letter.uppercase` : `letter.uppercase`
      } else if (range === 'a-z') {
        return node.negative ? `not.letter.lowercase` : `letter.lowercase`
      }
    }
  } else if (exprs.length === 2) {
    if (typeof exprs[0] !== 'string' && typeof exprs[1] !== 'string') {
      const range1 = normalizeClassRange(exprs[0])
      const range2 = normalizeClassRange(exprs[1])
      if ((range1 === 'A-Z' && range2 === 'a-z') || (range1 === 'a-z' && range2 === 'A-Z')) {
        return node.negative ? `not.letter` : `letter`
      }
    }
  }

  // Anything that fell through the cases above has no supported rendering.
  throw new Error('Unsupported for Complex charactor class')
}

/** Renders a standalone assertion (one that has no neighbour to attach to). */
function buildAssertion(node: NodeOf<'Assertion'>): string {
  switch (node.kind) {
    case '\\b':
      return 'wordBoundary'

    case '\\B':
      return 'not.wordBoundary'

    case '^':
      return chain('', 'at.lineStart()')

    case '$':
      return chain('', 'at.lineEnd()')

    case 'Lookbehind':
      return chain('', `${node.negative ? 'notAfter' : 'after'}(${build(node.assertion)})`)

    case 'Lookahead':
      return chain('', `${node.negative ? 'notBefore' : 'before'}(${build(node.assertion)})`)

    /* v8 ignore next 2 */
    default:
      throw new TypeError(`Unknown Assertion kind: ${(node as any).kind}`)
  }
}

/** Renders a single character: a named helper for meta chars, a quoted literal otherwise. */
function buildChar(node: NodeOf<'Char'>): string {
  if (node.kind === 'meta') {
    switch (node.value) {
      case '.':
        return 'char'

      case '\\w':
        return 'wordChar'
      case '\\d':
        return 'digit'
      case '\\s':
        return 'whitespace'
      case '\\t':
        return 'tab'
      case '\\n':
        return 'linefeed'
      case '\\r':
        return 'carriageReturn'

      case '\\W':
        return 'not.wordChar'
      case '\\D':
        return 'not.digit'
      case '\\S':
        return 'not.whitespace'

      // Written with a literal backslash like the labels above; these meta
      // characters have no magic-regexp helper.
      case '\\f':
      case '\\v':
      default:
        throw new Error(`Unsupported Meta Char: ${node.value}`)
    }
  } else {
    const char = getChar(node)
    if (char === null) throw new Error(`Unknown Char: ${node.value}`)
    return `'${char}'`
  }
}

/** Renders a quantified expression (`+`, `?`, `*`, `{n}`, `{n,}`, `{0,m}`, `{n,m}`). */
function buildRepetition(node: NodeOf<'Repetition'>): string {
  const quantifier = node.quantifier
  const expr = build(node.expression)

  // TODO: support lazy quantifier
  const lazy = !quantifier.greedy
  if (lazy) throw new Error('Unsupported for lazy quantifier')

  switch (quantifier.kind) {
    case '+':
      return `oneOrMore(${expr})`
    case '?':
      return `maybe(${expr})`
    case '*':
      return chain(expr, 'times.any()')
    case 'Range':
      // {1}
      if (quantifier.from === quantifier.to) {
        return chain(expr, `times(${quantifier.from})`)
      }
      // {1,}
      else if (quantifier.to == null) {
        return chain(expr, `times.atLeast(${quantifier.from})`)
      }
      // {0,3}
      else if (quantifier.from === 0) {
        return chain(expr, `times.atMost(${quantifier.to})`)
      }
      // {1,3}
      return chain(expr, `times.between(${quantifier.from}, ${quantifier.to})`)

    /* v8 ignore next 2 */
    default:
      return '' as never
  }
}

/** Renders a sequence, attaching `^`/lookbehind to the next item and `$`/lookahead to the previous one. */
function buildAlternative(node: NodeOf<'Alternative'>): string {
  const alts = combineContinuousSimpleChars(node.expressions)
  const exprs: string[] = []

  for (let i = 0; i < alts.length; i++) {
    const alt = alts[i]

    if (typeof alt === 'string') {
      exprs.push(alt)
      continue
    }

    if (alt.type === 'Assertion') {
      switch (alt.kind) {
        case '^': {
          // Consumes the following item so the assertion can be chained onto it.
          const next = alts[++i]
          if (next === undefined)
            throw new Error(`Unexpected assertion: ${JSON.stringify(alt)}`)
          exprs.push(chain(next, 'at.lineStart()'))
          continue
        }

        case '$': {
          // Re-emits the previously rendered item with the assertion chained on.
          const prev = exprs.pop()
          if (prev === undefined)
            throw new Error(`Unexpected assertion: ${JSON.stringify(alt)}`)
          exprs.push(chain(prev, 'at.lineEnd()'))
          continue
        }

        case 'Lookbehind': {
          const next = alts[++i]
          if (next === undefined)
            throw new Error(`Unexpected assertion: ${JSON.stringify(alt)}`)
          const helper = alt.negative ? 'notAfter' : 'after'
          exprs.push(chain(next, `${helper}(${build(alt.assertion)})`))
          continue
        }

        case 'Lookahead': {
          const prev = exprs.pop()
          if (prev === undefined)
            throw new Error(`Unexpected assertion: ${JSON.stringify(alt)}`)
          const helper = alt.negative ? 'notBefore' : 'before'
          exprs.push(chain(prev, `${helper}(${build(alt.assertion)})`))
          continue
        }
      }
    }

    // TODO: currently not support backreference for cross group
    if (alt.type === 'Backreference') {
      if (alt.kind !== 'name') throw new Error(`Unsupport for number reference`)

      // Everything rendered so far is folded into one `exactly(...)` that the
      // reference is chained onto.
      const ref = chain(`exactly(${exprs.join(', ')})`, `and.referenceTo('${alt.reference}')`)
      exprs.length = 0
      exprs.push(ref)
      continue
    }

    exprs.push(build(alt))
  }

  return exprs.join(', ')
}
217+
218+
function normalizeClassRange(node: Char | ClassRange): string | undefined {
219+
if (node.type === 'ClassRange') return node.from.value + '-' + node.to.value
220+
}
221+
222+
/**
 * Merges each run of consecutive literal characters into one single-quoted
 * string, leaving every other node in place.
 *
 * A node counts as a literal character when it is a `Char` for which
 * `getChar` returns a non-null value.
 *
 * @param expressions - Nodes in source order.
 * @returns The same sequence with literal runs replaced by quoted strings.
 */
function combineContinuousSimpleChars<T extends (Char | ClassRange) | Expression>(
  expressions: T[]
): (T | string)[] {
  const combined: Array<T | string> = []
  let pending = ''

  // Emits the literal run collected so far, if there is one.
  const flush = (): void => {
    if (!pending) return
    combined.push(`'${pending}'`)
    pending = ''
  }

  for (const expression of expressions) {
    const literal = expression.type === 'Char' ? getChar(expression) : null
    if (literal === null) {
      flush()
      combined.push(expression)
    } else {
      pending += literal
    }
  }

  // A run that reaches the end of the input still has to be emitted.
  flush()

  return combined
}
250+
251+
function getChar(char: Char): string | null {
252+
function escapeSimpleChar(char: string): string {
253+
// for generator only because we will output createRegExp('...')
254+
return char === "'" ? "\\'" : char
255+
}
256+
257+
switch (char.kind) {
258+
case 'simple':
259+
return escapeSimpleChar(char.value)
260+
261+
case 'oct':
262+
case 'decimal':
263+
case 'hex':
264+
case 'unicode':
265+
if ('symbol' in char) return escapeSimpleChar((char as any).symbol)
266+
}
267+
268+
return null
269+
}
270+
271+
function chain(expr: Expression | string, helper?: string): string {
272+
let _expr = ''
273+
if (typeof expr === 'string') {
274+
if (expr === '') _expr = "exactly('')"
275+
else _expr = expr.startsWith("'") && expr.endsWith("'") ? `exactly(${expr})` : expr
276+
} else {
277+
_expr = build(expr)
278+
}
279+
return helper ? `${_expr}.${helper}` : _expr
280+
}
281+
282+
function buildFlags(flags: string) {
283+
if (!flags) return ''
284+
285+
const readableFlags = flags.split('').map(flag => {
286+
return (
287+
{
288+
d: 'withIndices',
289+
i: 'caseInsensitive',
290+
g: 'global',
291+
m: 'multiline',
292+
s: 'dotAll',
293+
u: 'unicode',
294+
y: 'sticky',
295+
}[flag] || `'${flag}'`
296+
)
297+
})
298+
299+
return '[' + readableFlags.join(', ') + ']'
300+
}
301+
302+
export function convert(regex: RegExp, { argsOnly = false } = {}) {
303+
const ast = regexpTree.parse(regex)
304+
305+
if (ast.type !== 'RegExp') throw new TypeError(`Unexpected RegExp AST: ${ast.type}`)
306+
307+
const flags = buildFlags(ast.flags)
308+
const args = build(ast.body) + (flags ? `, ${flags}` : '')
309+
return argsOnly ? args : `createRegExp(${args})`
310+
}

0 commit comments

Comments
 (0)