Skip to content

Commit c0537db

Browse files
authored Jul 27, 2024
Regex parsing fixes for newline characters and marker verbs
1 parent a079c45 commit c0537db

File tree

3 files changed

+46
-11
lines changed

3 files changed

+46
-11
lines changed
 

‎resources/RegexGrammar.pp

+9 -7
Original file line number | Diff line number | Diff line change
@@ -41,10 +41,6 @@
4141
// @license New BSD License
4242
//
4343

44-
45-
// Skip.
46-
%skip nl \n
47-
4844
// Character classes.
4945
%token negative_class_ \[\^ -> class
5046
%token class_ \[ -> class
@@ -58,7 +54,7 @@
5854
%token class:character \\([aefnrtb]|c[\x00-\x7f])
5955
%token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
6056
%token class:character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
61-
%token class:literal \\.|.
57+
%token class:literal \\.|.|\n
6258

6359
// Internal options.
6460
// See https://www.regular-expressions.info/refmodifiers.html
@@ -82,6 +78,11 @@
8278
%token co:_comment \) -> default
8379
%token co:comment .*?(?=(?<!\\)\))
8480

81+
// Marker verbs
82+
%token marker_ \(\*: -> mark
83+
%token mark:name [^)]+
84+
%token mark:_marker \) -> default
85+
8586
// Capturing group.
8687
%token named_capturing_ \(\?P?< -> nc
8788
%token nc:_named_capturing > -> default
@@ -122,7 +123,7 @@
122123
%token character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
123124
%token anchor \\([bBAZzG])|\^|\$
124125
%token match_point_reset \\K
125-
%token literal \\.|.
126+
%token literal \\.|.|\n
126127

127128

128129
// Rules.
@@ -190,7 +191,8 @@
190191
| literal()
191192

192193
#capturing:
193-
::comment_:: <comment>? ::_comment:: #comment
194+
::marker_:: <name> ::_marker:: #mark
195+
| ::comment_:: <comment>? ::_comment:: #comment
194196
| (
195197
::named_capturing_:: <capturing_name> ::_named_capturing:: #namedcapturing
196198
| ::non_capturing_:: #noncapturing

‎src/Type/Php/RegexArrayShapeMatcher.php

+31 -4
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ private function matchRegex(string $regex, ?int $flags, TrinaryLogic $wasMatched
125125
// regex could not be parsed by Hoa/Regex
126126
return null;
127127
}
128-
[$groupList, $groupCombinations] = $parseResult;
128+
[$groupList, $groupCombinations, $markVerbs] = $parseResult;
129129

130130
$trailingOptionals = 0;
131131
foreach (array_reverse($groupList) as $captureGroup) {
@@ -152,6 +152,7 @@ private function matchRegex(string $regex, ?int $flags, TrinaryLogic $wasMatched
152152
$wasMatched,
153153
$trailingOptionals,
154154
$flags ?? 0,
155+
$markVerbs,
155156
);
156157

157158
if (!$this->containsUnmatchedAsNull($flags ?? 0)) {
@@ -189,6 +190,7 @@ private function matchRegex(string $regex, ?int $flags, TrinaryLogic $wasMatched
189190
$wasMatched,
190191
$trailingOptionals,
191192
$flags ?? 0,
193+
$markVerbs,
192194
);
193195

194196
$combiTypes[] = $combiType;
@@ -211,6 +213,7 @@ private function matchRegex(string $regex, ?int $flags, TrinaryLogic $wasMatched
211213
$wasMatched,
212214
$trailingOptionals,
213215
$flags ?? 0,
216+
$markVerbs,
214217
);
215218
}
216219

@@ -266,12 +269,14 @@ private function getOnlyTopLevelAlternationId(array $captureGroups): ?int
266269

267270
/**
268271
* @param array<RegexCapturingGroup> $captureGroups
272+
* @param list<string> $markVerbs
269273
*/
270274
private function buildArrayType(
271275
array $captureGroups,
272276
TrinaryLogic $wasMatched,
273277
int $trailingOptionals,
274278
int $flags,
279+
array $markVerbs,
275280
): Type
276281
{
277282
$builder = ConstantArrayTypeBuilder::createEmpty();
@@ -325,6 +330,18 @@ private function buildArrayType(
325330
$i++;
326331
}
327332

333+
if (count($markVerbs) > 0) {
334+
$markTypes = [];
335+
foreach ($markVerbs as $mark) {
336+
$markTypes[] = new ConstantStringType($mark);
337+
}
338+
$builder->setOffsetValueType(
339+
$this->getKeyType('MARK'),
340+
TypeCombinator::union(...$markTypes),
341+
true,
342+
);
343+
}
344+
328345
return $builder->getArray();
329346
}
330347

@@ -372,7 +389,7 @@ private function getValueType(Type $baseType, int $flags): Type
372389
}
373390

374391
/**
375-
* @return array{array<int, RegexCapturingGroup>, array<int, array<int, int[]>>}|null
392+
* @return array{array<int, RegexCapturingGroup>, array<int, array<int, int[]>>, list<string>}|null
376393
*/
377394
private function parseGroups(string $regex): ?array
378395
{
@@ -398,6 +415,7 @@ private function parseGroups(string $regex): ?array
398415
$groupCombinations = [];
399416
$alternationId = -1;
400417
$captureGroupId = 100;
418+
$markVerbs = [];
401419
$this->walkRegexAst(
402420
$ast,
403421
false,
@@ -408,14 +426,16 @@ private function parseGroups(string $regex): ?array
408426
$captureGroupId,
409427
$capturingGroups,
410428
$groupCombinations,
429+
$markVerbs,
411430
);
412431

413-
return [$capturingGroups, $groupCombinations];
432+
return [$capturingGroups, $groupCombinations, $markVerbs];
414433
}
415434

416435
/**
417436
* @param array<int, RegexCapturingGroup> $capturingGroups
418437
* @param array<int, array<int, int[]>> $groupCombinations
438+
* @param list<string> $markVerbs
419439
*/
420440
private function walkRegexAst(
421441
TreeNode $ast,
@@ -427,6 +447,7 @@ private function walkRegexAst(
427447
int &$captureGroupId,
428448
array &$capturingGroups,
429449
array &$groupCombinations,
450+
array &$markVerbs,
430451
): void
431452
{
432453
$group = null;
@@ -441,7 +462,7 @@ private function walkRegexAst(
441462
);
442463
$parentGroup = $group;
443464
} elseif ($ast->getId() === '#namedcapturing') {
444-
$name = $ast->getChild(0)->getValue()['value'];
465+
$name = $ast->getChild(0)->getValueValue();
445466
$group = new RegexCapturingGroup(
446467
$captureGroupId++,
447468
$name,
@@ -483,6 +504,11 @@ private function walkRegexAst(
483504
$inAlternation = true;
484505
}
485506

507+
if ($ast->getId() === '#mark') {
508+
$markVerbs[] = $ast->getChild(0)->getValueValue();
509+
return;
510+
}
511+
486512
if ($group instanceof RegexCapturingGroup) {
487513
$capturingGroups[$group->getId()] = $group;
488514

@@ -506,6 +532,7 @@ private function walkRegexAst(
506532
$captureGroupId,
507533
$capturingGroups,
508534
$groupCombinations,
535+
$markVerbs,
509536
);
510537

511538
if ($ast->getId() !== '#alternation') {

‎tests/PHPStan/Analyser/nsrt/preg_match_shapes.php

+6
Original file line number | Diff line number | Diff line change
@@ -518,6 +518,12 @@ function bug11323(string $s): void {
518518
if (preg_match('{([^1-4])}', $s, $matches)) {
519519
assertType('array{string, non-empty-string}', $matches);
520520
}
521+
if (preg_match("{([\r\n]+)(\n)([\n])}", $s, $matches)) {
522+
assertType('array{string, non-empty-string, non-empty-string, non-empty-string}', $matches);
523+
}
524+
if (preg_match('/foo(*:first)|bar(*:second)([x])/', $s, $matches)) {
525+
assertType("array{0: string, 1?: non-empty-string, MARK?: 'first'|'second'}", $matches);
526+
}
521527
}
522528

523529
function (string $s): void {

0 commit comments

Comments
 (0)