|
5 | 5 | import re
|
6 | 6 | import sys
|
7 | 7 | from functools import lru_cache
|
8 |
| -from typing import List, Pattern |
| 8 | +from typing import List, Match, Pattern |
| 9 | + |
| 10 | +from blib2to3.pytree import Leaf |
9 | 11 |
|
10 | 12 | if sys.version_info < (3, 8):
|
11 | 13 | from typing_extensions import Final
|
|
18 | 20 | r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
|
19 | 21 | )
|
# Optional whitespace, at least one tab, optional whitespace again, and then a
# single captured non-whitespace character.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")

# A run of backslashes followed by the body of a \u, \U, \x or \N{...} escape.
# Named groups: "backslashes" (the whole run), "body" (everything after it), and
# one of "u" / "U" / "x" / "N" holding the hex digits or the character name.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    + "|".join(
        (
            r"(u(?P<u>[a-fA-F0-9]{4}))",  # Character with 16-bit hex value xxxx
            r"(U(?P<U>[a-fA-F0-9]{8}))",  # Character with 32-bit hex value xxxxxxxx
            r"(x(?P<x>[a-fA-F0-9]{2}))",  # Character with hex value hh
            r"(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})",  # Character named name in the Unicode database
        )
    )
    + r")",
    re.VERBOSE,
)
21 | 32 |
|
22 | 33 |
|
23 | 34 | def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
|
@@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
|
236 | 247 | return s # Prefer double quotes
|
237 | 248 |
|
238 | 249 | return f"{prefix}{new_quote}{new_body}{new_quote}"
|
| 250 | + |
| 251 | + |
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    r"""Normalize the letter case of escape sequences in a string literal.

    Hex digits in ``\xhh``, ``\uxxxx`` and ``\Uxxxxxxxx`` are lowercased and the
    character name in ``\N{name}`` is uppercased. The result is written back to
    ``leaf.value``; nothing is returned.

    Literals whose prefix contains ``r`` are left untouched. In literals whose
    prefix contains ``b`` only ``\xhh`` is normalized: ``\u``, ``\U`` and
    ``\N{...}`` are not escape sequences in bytes literals, so changing their
    case would change the bytes the literal denotes.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        return  # Raw literals contain no escape sequences.

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even run means every backslash is itself escaped, so the body
            # is ordinary text rather than an escape sequence.
            return back_slashes + groups["body"]

        if groups["x"]:
            # \x is the only one of these escapes valid in both str and bytes.
            return back_slashes + "x" + groups["x"].lower()

        if is_bytes:
            # \u, \U and \N{...} are literal characters in bytes; keep them.
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
0 commit comments