Skip to content

Commit d2127dc

Browse files
feat: added non-greedy range repetition
1 parent 3c4ee8f commit d2127dc

File tree

10 files changed

+172
-51
lines changed

10 files changed

+172
-51
lines changed

assembly/__spec_tests__/generated.spec.ts

+111-31
Original file line numberDiff line numberDiff line change
@@ -368,16 +368,28 @@ it("line: 49 - matches ^(abc){1,2}zz against 'abcabcabczz'", () => {
368368
it("line: 50 - matches ^(abc){1,2}zz against '>>abczz'", () => {
369369
expectNotMatch("^(abc){1,2}zz", [">>abczz"]);
370370
});
371-
xit("line: 51 - lazy range repitition quantifiers are not supported", () => {});
372-
xit("line: 52 - lazy range repitition quantifiers are not supported", () => {});
373-
xit("line: 53 - lazy range repitition quantifiers are not supported", () => {});
374-
xit("line: 54 - lazy range repitition quantifiers are not supported", () => {});
375-
xit("line: 55 - lazy range repitition quantifiers are not supported", () => {});
376-
xit("line: 56 - lazy range repitition quantifiers are not supported", () => {});
377-
xit("line: 57 - lazy range repitition quantifiers are not supported", () => {});
378-
xit("line: 58 - lazy range repitition quantifiers are not supported", () => {});
379-
xit("line: 59 - lazy range repitition quantifiers are not supported", () => {});
380-
xit("line: 60 - lazy range repitition quantifiers are not supported", () => {});
371+
it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => {
372+
const match = exec("^(b+?|a){1,2}?c", "bc", "s");
373+
expect(match.matches[0]).toBe("bc".substring(0, 2));
374+
expect(match.matches[1]).toBe("bc".substring(0, 1));
375+
});
376+
xit("line: 52 - issues with repeated capture groups", () => {});
377+
xit("line: 53 - issues with repeated capture groups", () => {});
378+
xit("line: 54 - issues with repeated capture groups", () => {});
379+
xit("line: 55 - issues with repeated capture groups", () => {});
380+
it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => {
381+
const match = exec("^(b+?|a){1,2}?c", "aac", "s");
382+
expect(match.matches[0]).toBe("aac".substring(0, 3));
383+
expect(match.matches[1]).toBe("aac".substring(1, 2));
384+
});
385+
xit("line: 57 - issues with repeated capture groups", () => {});
386+
xit("line: 58 - issues with repeated capture groups", () => {});
387+
it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => {
388+
expectNotMatch("^(b+?|a){1,2}?c", ["aaac"]);
389+
});
390+
it("line: 60 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbac'", () => {
391+
expectNotMatch("^(b+?|a){1,2}?c", ["abbbbbbbbbbbac"]);
392+
});
381393
it("line: 61 - matches ^(b+|a){1,2}c against 'bc'", () => {
382394
const match = exec("^(b+|a){1,2}c", "bc", "s");
383395
expect(match.matches[0]).toBe("bc".substring(0, 2));
@@ -400,17 +412,41 @@ it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => {
400412
it("line: 70 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbac'", () => {
401413
expectNotMatch("^(b+|a){1,2}c", ["abbbbbbbbbbbac"]);
402414
});
403-
xit("line: 71 - lazy range repitition quantifiers are not supported", () => {});
404-
xit("line: 72 - lazy range repitition quantifiers are not supported", () => {});
405-
xit("line: 73 - lazy range repitition quantifiers are not supported", () => {});
406-
xit("line: 74 - lazy range repitition quantifiers are not supported", () => {});
407-
xit("line: 75 - lazy range repitition quantifiers are not supported", () => {});
408-
xit("line: 76 - lazy range repitition quantifiers are not supported", () => {});
409-
xit("line: 77 - lazy range repitition quantifiers are not supported", () => {});
410-
xit("line: 78 - lazy range repitition quantifiers are not supported", () => {});
411-
xit("line: 79 - lazy range repitition quantifiers are not supported", () => {});
412-
xit("line: 80 - lazy range repitition quantifiers are not supported", () => {});
413-
xit("line: 81 - lazy range repitition quantifiers are not supported", () => {});
415+
it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => {
416+
const match = exec("^(b+|a){1,2}?bc", "bbc", "s");
417+
expect(match.matches[0]).toBe("bbc".substring(0, 3));
418+
expect(match.matches[1]).toBe("bbc".substring(0, 1));
419+
});
420+
xit("line: 72 - issues with repeated capture groups", () => {});
421+
xit("line: 73 - issues with repeated capture groups", () => {});
422+
it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => {
423+
const match = exec("^(b*|ba){1,2}?bc", "bababc", "s");
424+
expect(match.matches[0]).toBe("bababc".substring(0, 6));
425+
expect(match.matches[1]).toBe("bababc".substring(2, 4));
426+
});
427+
it("line: 75 - matches ^(b*|ba){1,2}?bc against 'bababbc'", () => {
428+
expectNotMatch("^(b*|ba){1,2}?bc", ["bababbc"]);
429+
});
430+
it("line: 76 - matches ^(b*|ba){1,2}?bc against 'babababc'", () => {
431+
expectNotMatch("^(b*|ba){1,2}?bc", ["babababc"]);
432+
});
433+
it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => {
434+
const match = exec("^(ba|b*){1,2}?bc", "babc", "s");
435+
expect(match.matches[0]).toBe("babc".substring(0, 4));
436+
expect(match.matches[1]).toBe("babc".substring(0, 2));
437+
});
438+
xit("line: 78 - issues with repeated capture groups", () => {});
439+
it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => {
440+
const match = exec("^(ba|b*){1,2}?bc", "bababc", "s");
441+
expect(match.matches[0]).toBe("bababc".substring(0, 6));
442+
expect(match.matches[1]).toBe("bababc".substring(2, 4));
443+
});
444+
it("line: 80 - matches ^(ba|b*){1,2}?bc against 'bababbc'", () => {
445+
expectNotMatch("^(ba|b*){1,2}?bc", ["bababbc"]);
446+
});
447+
it("line: 81 - matches ^(ba|b*){1,2}?bc against 'babababc'", () => {
448+
expectNotMatch("^(ba|b*){1,2}?bc", ["babababc"]);
449+
});
414450
xit("line: 82 - test regex contains syntax not supported in JS", () => {});
415451
it("line: 83 - matches ^[ab\\]cde] against 'athing'", () => {
416452
const match = exec("^[ab\\]cde]", "athing", "s");
@@ -1120,11 +1156,26 @@ it("line: 244 - matches ^[aeiou\\d]{4,5}$ against 'aaaaa'", () => {
11201156
it("line: 245 - matches ^[aeiou\\d]{4,5}$ against '123456'", () => {
11211157
expectNotMatch("^[aeiou\\d]{4,5}$", ["123456"]);
11221158
});
1123-
xit("line: 246 - lazy range repitition quantifiers are not supported", () => {});
1124-
xit("line: 247 - lazy range repitition quantifiers are not supported", () => {});
1125-
xit("line: 248 - lazy range repitition quantifiers are not supported", () => {});
1126-
xit("line: 249 - lazy range repitition quantifiers are not supported", () => {});
1127-
xit("line: 250 - lazy range repitition quantifiers are not supported", () => {});
1159+
it("line: 246 - matches ^[aeiou\\d]{4,5}? against 'uoie'", () => {
1160+
const match = exec("^[aeiou\\d]{4,5}?", "uoie", "s");
1161+
expect(match.matches[0]).toBe("uoie".substring(0, 4));
1162+
});
1163+
it("line: 247 - matches ^[aeiou\\d]{4,5}? against '1234'", () => {
1164+
const match = exec("^[aeiou\\d]{4,5}?", "1234", "s");
1165+
expect(match.matches[0]).toBe("1234".substring(0, 4));
1166+
});
1167+
it("line: 248 - matches ^[aeiou\\d]{4,5}? against '12345'", () => {
1168+
const match = exec("^[aeiou\\d]{4,5}?", "12345", "s");
1169+
expect(match.matches[0]).toBe("12345".substring(0, 4));
1170+
});
1171+
it("line: 249 - matches ^[aeiou\\d]{4,5}? against 'aaaaa'", () => {
1172+
const match = exec("^[aeiou\\d]{4,5}?", "aaaaa", "s");
1173+
expect(match.matches[0]).toBe("aaaaa".substring(0, 4));
1174+
});
1175+
it("line: 250 - matches ^[aeiou\\d]{4,5}? against '123456'", () => {
1176+
const match = exec("^[aeiou\\d]{4,5}?", "123456", "s");
1177+
expect(match.matches[0]).toBe("123456".substring(0, 4));
1178+
});
11281179
xit("line: 251 - back references are not supported", () => {});
11291180
xit("line: 252 - back references are not supported", () => {});
11301181
xit("line: 253 - back references are not supported", () => {});
@@ -1182,8 +1233,16 @@ xit("line: 287 - non capturing groups not supported", () => {});
11821233
xit("line: 288 - non capturing groups not supported", () => {});
11831234
xit("line: 289 - non capturing groups not supported", () => {});
11841235
xit("line: 290 - the test behaviour differs between PCRE and JS", () => {});
1185-
xit("line: 291 - lazy range repitition quantifiers are not supported", () => {});
1186-
xit("line: 292 - lazy range repitition quantifiers are not supported", () => {});
1236+
it("line: 291 - matches ^[ab]{1,3}?(ab*|b) against 'aabbbbb'", () => {
1237+
const match = exec("^[ab]{1,3}?(ab*|b)", "aabbbbb", "s");
1238+
expect(match.matches[0]).toBe("aabbbbb".substring(0, 7));
1239+
expect(match.matches[1]).toBe("aabbbbb".substring(1, 7));
1240+
});
1241+
it("line: 292 - matches ^[ab]{1,3}?(ab*?|b) against 'aabbbbb'", () => {
1242+
const match = exec("^[ab]{1,3}?(ab*?|b)", "aabbbbb", "s");
1243+
expect(match.matches[0]).toBe("aabbbbb".substring(0, 2));
1244+
expect(match.matches[1]).toBe("aabbbbb".substring(1, 2));
1245+
});
11871246
it("line: 293 - matches ^[ab]{1,3}(ab*?|b) against 'aabbbbb'", () => {
11881247
const match = exec("^[ab]{1,3}(ab*?|b)", "aabbbbb", "s");
11891248
expect(match.matches[0]).toBe("aabbbbb".substring(0, 4));
@@ -1503,7 +1562,10 @@ it("line: 1224 - matches a{0}bc against 'bc'", () => {
15031562
const match = exec("a{0}bc", "bc", "s");
15041563
expect(match.matches[0]).toBe("bc".substring(0, 2));
15051564
});
1506-
xit("line: 1225 - lazy range repitition quantifiers are not supported", () => {});
1565+
it("line: 1225 - matches (a|(bc)){0,0}?xyz against 'xyz'", () => {
1566+
const match = exec("(a|(bc)){0,0}?xyz", "xyz", "s");
1567+
expect(match.matches[0]).toBe("xyz".substring(0, 3));
1568+
});
15071569
xit("line: 1226 - back references are not supported", () => {});
15081570
xit("line: 1227 - back references are not supported", () => {});
15091571
xit("line: 1228 - back references are not supported", () => {});
@@ -1617,8 +1679,26 @@ it("line: 1267 - matches [^az] against 'aaAabcd '", () => {
16171679
expect(match.matches[0]).toBe("aaAabcd ".substring(4, 5));
16181680
});
16191681
xit("line: 1268 - back references are not supported", () => {});
1620-
xit("line: 1269 - lazy range repitition quantifiers are not supported", () => {});
1621-
xit("line: 1270 - lazy range repitition quantifiers are not supported", () => {});
1682+
it("line: 1269 - matches P[^*]TAIRE[^*]{1,6}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => {
1683+
const match = exec(
1684+
"P[^*]TAIRE[^*]{1,6}?LL",
1685+
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx",
1686+
"s"
1687+
);
1688+
expect(match.matches[0]).toBe(
1689+
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22)
1690+
);
1691+
});
1692+
it("line: 1270 - matches P[^*]TAIRE[^*]{1,}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => {
1693+
const match = exec(
1694+
"P[^*]TAIRE[^*]{1,}?LL",
1695+
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx",
1696+
"s"
1697+
);
1698+
expect(match.matches[0]).toBe(
1699+
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22)
1700+
);
1701+
});
16221702
it("line: 1271 - matches (\\.\\d\\d[1-9]?)\\d+ against '1.230003938'", () => {
16231703
const match = exec("(\\.\\d\\d[1-9]?)\\d+", "1.230003938", "s");
16241704
expect(match.matches[0]).toBe("1.230003938".substring(1, 11));

assembly/__tests__/quantifiers.spec.ts

+6-6
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,10 @@ describe("non-greedy", () => {
5151
expect(match.matches[0]).toStrictEqual("ab");
5252
});
5353

54-
it("zero or one supports non-greedy mode", () => {
55-
expectMatch("a?", ["a"]);
56-
let match = exec("a?", "bc");
57-
expect(match).not.toBeNull();
58-
expect(match.matches[0]).toStrictEqual("");
59-
});
54+
// it("zero or one supports non-greedy mode", () => {
55+
// expectMatch("a?", ["a"]);
56+
// let match = exec("a??", "bc");
57+
// expect(match).not.toBeNull();
58+
// expect(match.matches[0]).toStrictEqual("");
59+
// });
6060
});

assembly/__tests__/range-quantifiers.spec.ts

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ it("handles nested quantifiers", () => {
4040
expectMatch("(a{3}){2}", ["aaaaaa"]);
4141
});
4242

43+
it("handles nongreedy quantifiers", () => {
44+
const match = exec("a{2,4}?", "aaaaaaaaaa");
45+
expect(match.matches[0]).toBe("aa");
46+
});
47+
4348
it("throws if quantifying a quantifier!", () => {
4449
expect(() => {
4550
let foo = new RegExp("a{3}{2}");

assembly/nfa/nfa.ts

+9-4
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,16 @@ function closure(nfa: Automata, greedy: bool): Automata {
134134
return new Automata(start, end);
135135
}
136136

137-
function zeroOrOne(nfa: Automata): Automata {
137+
function zeroOrOne(nfa: Automata, greedy: bool): Automata {
138138
const start = new State();
139139
const end = new State();
140-
start.transitions.push(nfa.start);
141-
start.transitions.push(end);
140+
if (greedy) {
141+
start.transitions.push(nfa.start);
142+
start.transitions.push(end);
143+
} else {
144+
start.transitions.push(end);
145+
start.transitions.push(nfa.start);
146+
}
142147
nfa.end.transitions.push(end);
143148
return new Automata(start, end);
144149
}
@@ -182,7 +187,7 @@ class AutomataFactor {
182187
const automata = this.automataForNode(node.expression);
183188
const quantifier = node.quantifier;
184189
if (quantifier == Char.Question) {
185-
return zeroOrOne(automata);
190+
return zeroOrOne(automata, node.greedy);
186191
} else if (quantifier == Char.Plus) {
187192
return oneOrMore(automata, node.greedy);
188193
} else if (quantifier == Char.Asterisk) {

assembly/parser/node.ts

+6-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,12 @@ export class RepetitionNode extends Node {
155155
}
156156

157157
export class RangeRepetitionNode extends Node {
158-
constructor(public expression: Node, public from: i32, public to: i32) {
158+
constructor(
159+
public expression: Node,
160+
public from: i32,
161+
public to: i32,
162+
public greedy: bool = true
163+
) {
159164
super(NodeType.RangeRepetition);
160165
if (expression.type == NodeType.RangeRepetition) {
161166
throw new Error("The preceding token is not quantifiable");

assembly/parser/parser.ts

+8-1
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,14 @@ export class Parser {
236236
const range = this.maybeParseRepetitionRange();
237237
if (range != null) {
238238
const expression = nodes.pop();
239-
nodes.push(new RangeRepetitionNode(expression, range.from, range.to));
239+
let greedy = true;
240+
if (this.iterator.current == Char.Question) {
241+
greedy = false;
242+
this.eatToken();
243+
}
244+
nodes.push(
245+
new RangeRepetitionNode(expression, range.from, range.to, greedy)
246+
);
240247
} else {
241248
// this is not the start of a repetition, it's just a char!
242249
nodes.push(this.parseCharacter());

assembly/parser/string-iterator.ts

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export class StringIterator {
1313
next(): bool {
1414
this.cursor++;
1515
if (this.cursor >= u32(this.sourceString.length)) {
16+
this.current = -1;
1617
return false;
1718
}
1819
this.current = this.sourceString.charCodeAt(this.cursor);

assembly/parser/walker.ts

+14-2
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,24 @@ export function expandRepetitions(visitor: NodeVisitor): void {
8181

8282
if (rangeRepNode.to == -1) {
8383
// a{4,} => aaaaa*
84-
clones.push(new RepetitionNode(expression.clone(), Char.Asterisk));
84+
clones.push(
85+
new RepetitionNode(
86+
expression.clone(),
87+
Char.Asterisk,
88+
rangeRepNode.greedy
89+
)
90+
);
8591
} else {
8692
// a{4,6} => aaaaa?a?
8793
const count = rangeRepNode.to - rangeRepNode.from;
8894
for (let i = 0; i < count; i++) {
89-
clones.push(new RepetitionNode(expression.clone(), Char.Question));
95+
clones.push(
96+
new RepetitionNode(
97+
expression.clone(),
98+
Char.Question,
99+
rangeRepNode.greedy
100+
)
101+
);
90102
}
91103
}
92104

spec/test-generator.js

+10-4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ const knownIssues = {
2222
...range(63, 68),
2323
1391,
2424
1392,
25+
...range(52, 55),
26+
57,
27+
58,
28+
72,
29+
73,
30+
78,
2531
],
2632
"lazy quantifiers should still yield the longest overall regex match": [
2733
...range(141, 143),
@@ -109,10 +115,10 @@ lines.forEach((line, index) => {
109115
return;
110116
}
111117

112-
if (["}?"].some((f) => regex.includes(f))) {
113-
testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`;
114-
return;
115-
}
118+
// if (["}?"].some((f) => regex.includes(f))) {
119+
// testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`;
120+
// return;
121+
// }
116122

117123
if (["(?"].some((f) => regex.includes(f))) {
118124
testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`;

ts/index.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ globalAny.log = console.log;
55

66
import { RegExp } from "../assembly/regexp";
77

8-
const regexObj = new RegExp(".*?");
9-
const match = regexObj.exec("abc");
8+
const regexObj = new RegExp("a?");
9+
const match = regexObj.exec("a");
1010

1111
console.log(match);

0 commit comments

Comments
 (0)