Skip to content

Commit d28def9

Browse files
jacobtylerwalls
and
lihu
authored
Ignore quantifiers when splitting comma-separated regexes (#8898)
Do not split on commas if they are between braces, since that indicates a quantifier. Also added a protection for slow implementations since existing workarounds may result in long strings of chained regular expressions. Adjust the existing test for an invalid regex so that it is truly invalid. Co-authored-by: lihu <[email protected]>
1 parent 1f8c4d9 commit d28def9

File tree

5 files changed

+64
-4
lines changed

5 files changed

+64
-4
lines changed

doc/whatsnew/fragments/7229.bugfix

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
When parsing comma-separated lists of regular expressions in the config, ignore
2+
commas that are inside braces since those indicate quantifiers, not delineation
3+
between expressions.
4+
5+
Closes #7229

pylint/config/argument.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def _regex_transformer(value: str) -> Pattern[str]:
114114
def _regexp_csv_transfomer(value: str) -> Sequence[Pattern[str]]:
    """Compile every regular expression found in a comma separated string.

    The splitting is delegated to ``pylint_utils._check_regexp_csv`` and each
    resulting chunk is compiled with ``_regex_transformer``; the compiled
    patterns are returned in the order they appear in ``value``.
    """
    return [
        _regex_transformer(regexp)
        for regexp in pylint_utils._check_regexp_csv(value)
    ]
120120

pylint/utils/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
HAS_ISORT_5,
1515
IsortDriver,
1616
_check_csv,
17+
_check_regexp_csv,
1718
_splitstrip,
1819
_unquote,
1920
decoding_stream,
@@ -32,6 +33,7 @@
3233
"HAS_ISORT_5",
3334
"IsortDriver",
3435
"_check_csv",
36+
"_check_regexp_csv",
3537
"_splitstrip",
3638
"_unquote",
3739
"decoding_stream",

pylint/utils/utils.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
import textwrap
2323
import tokenize
2424
import warnings
25-
from collections.abc import Sequence
25+
from collections import deque
26+
from collections.abc import Iterable, Sequence
2627
from io import BufferedReader, BytesIO
2728
from typing import (
2829
TYPE_CHECKING,
@@ -253,6 +254,31 @@ def _check_csv(value: list[str] | tuple[str] | str) -> Sequence[str]:
253254
return _splitstrip(value)
254255

255256

257+
def _check_regexp_csv(value: list[str] | tuple[str] | str) -> Iterable[str]:
    r"""Split a comma-separated list of regexps, taking care to avoid splitting
    a regex employing a comma as quantifier, as in `\d{1,2}`.

    :param value: Either an already split ``list``/``tuple`` of regexps, which
        is yielded back unchanged, or a single string to split on commas.
    :returns: An iterable over the individual regexps. For string input each
        regexp is stripped of surrounding whitespace, and entries that are
        empty after stripping (``"foo, "``, ``"foo,,bar"``) are dropped so that
        they can never be compiled into an empty, match-everything pattern.

    A comma is treated as part of a quantifier while a ``{`` has been seen
    without a following ``}``; consequently an unclosed ``{`` keeps the rest of
    the string in one piece.
    """
    if isinstance(value, (list, tuple)):
        yield from value
        return

    pieces: list[str] = []
    # Characters of the regexp currently being read; joined once per piece so
    # the whole split stays linear in the length of ``value``.
    current: list[str] = []
    open_braces = False
    for char in value:
        if char == "{":
            open_braces = True
        elif char == "}" and open_braces:
            open_braces = False

        if char == "," and not open_braces:
            # A separator comma ends the current piece and is itself dropped.
            pieces.append("".join(current))
            current = []
        else:
            current.append(char)
    pieces.append("".join(current))

    # Strip first, then filter: a whitespace-only piece must not survive.
    yield from (piece for piece in map(str.strip, pieces) if piece)
280+
281+
256282
def _comment(string: str) -> str:
257283
"""Return string as a comment."""
258284
lines = [line.strip() for line in string.splitlines()]

tests/config/test_config.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
from __future__ import annotations
66

77
import os
8+
import re
89
from pathlib import Path
910
from tempfile import TemporaryDirectory
11+
from typing import Any
1012

1113
import pytest
1214
from pytest import CaptureFixture
@@ -115,6 +117,31 @@ def test_unknown_py_version(capsys: CaptureFixture) -> None:
115117
assert "the-newest has an invalid format, should be a version string." in output.err
116118

117119

120+
# Pairs of (raw comma-separated option value, regex sources expected once the
# value has been split into individual patterns).
CSV_REGEX_COMMA_CASES = [
    ("foo", ["foo"]),
    ("foo,bar", ["foo", "bar"]),
    ("foo, bar", ["foo", "bar"]),
    ("foo, bar{1,3}", ["foo", "bar{1,3}"]),
    # Several quantifiers in one value, one of them open-ended.
    ("foo{1,3}, bar{2,}", ["foo{1,3}", "bar{2,}"]),
]
126+
127+
128+
@pytest.mark.parametrize("in_string,expected", CSV_REGEX_COMMA_CASES)
def test_csv_regex_comma_in_quantifier(in_string: str, expected: list[str]) -> None:
    """Verify that a comma-separated regex option is split correctly even when
    some of the commas belong to quantifier expressions.
    """
    # Run pylint on an empty module with the raw value passed on the command
    # line, then inspect what ended up in the parsed configuration.
    runner = Run(
        [str(EMPTY_MODULE), rf"--bad-names-rgx={in_string}"],
        exit=False,
    )
    compiled: list[re.Pattern[Any]] = runner.linter.config.bad_names_rgxs

    assert compiled == [re.compile(regex) for regex in expected]
143+
144+
118145
def test_regex_error(capsys: CaptureFixture) -> None:
119146
"""Check that we correctly error when an an option is passed whose value is an invalid regular expression."""
120147
with pytest.raises(SystemExit):
@@ -137,12 +164,12 @@ def test_csv_regex_error(capsys: CaptureFixture) -> None:
137164
"""
138165
with pytest.raises(SystemExit):
139166
Run(
140-
[str(EMPTY_MODULE), r"--bad-names-rgx=(foo{1,3})"],
167+
[str(EMPTY_MODULE), r"--bad-names-rgx=(foo{1,}, foo{1,3}})"],
141168
exit=False,
142169
)
143170
output = capsys.readouterr()
144171
assert (
145-
r"Error in provided regular expression: (foo{1 beginning at index 0: missing ), unterminated subpattern"
172+
r"Error in provided regular expression: (foo{1,} beginning at index 0: missing ), unterminated subpattern"
146173
in output.err
147174
)
148175

0 commit comments

Comments
 (0)