Skip to content

Commit eabff67

Browse files
Format hex code in unicode escape sequences in string literals (#2916)
Co-authored-by: Jelle Zijlstra <[email protected]>
1 parent 1557f7d commit eabff67

File tree

5 files changed

+82
-1
lines changed

5 files changed

+82
-1
lines changed

CHANGES.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
<!-- Changes that affect Black's preview style -->
1818

19+
- Format hex codes in Unicode escape sequences in string literals (#2916)
1920
- Add parentheses around `if`-`else` expressions (#2278)
2021
- Improve the performance on large expressions that contain many strings (#3467)
2122
- Fix a crash in preview style with assert + parenthesized string (#3415)

src/black/linegen.py

+4
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
get_string_prefix,
6060
normalize_string_prefix,
6161
normalize_string_quotes,
62+
normalize_unicode_escape_sequences,
6263
)
6364
from black.trans import (
6465
CannotTransform,
@@ -368,6 +369,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
368369
yield from self.visit_default(node)
369370

370371
def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
372+
if Preview.hex_codes_in_unicode_sequences in self.mode:
373+
normalize_unicode_escape_sequences(leaf)
374+
371375
if is_docstring(leaf) and "\\\n" not in leaf.value:
372376
# We're ignoring docstrings with backslash newline escapes because changing
373377
# indentation of those changes the AST representation of the code.

src/black/mode.py

+1
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b
153153
class Preview(Enum):
154154
"""Individual preview style features."""
155155

156+
hex_codes_in_unicode_sequences = auto()
156157
annotation_parens = auto()
157158
empty_lines_before_class_or_def_with_leading_comments = auto()
158159
handle_trailing_commas_in_head = auto()

src/black/strings.py

+43-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import re
66
import sys
77
from functools import lru_cache
8-
from typing import List, Pattern
8+
from typing import List, Match, Pattern
9+
10+
from blib2to3.pytree import Leaf
911

1012
if sys.version_info < (3, 8):
1113
from typing_extensions import Final
@@ -18,6 +20,15 @@
1820
r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
1921
)
2022
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
23+
# Matches a run of backslashes followed by the body of a hex or named escape.
# Named groups:
#   backslashes - the run of one or more backslashes directly before the body
#   body        - the escape text after the backslashes (e.g. ``x1F``)
#   u / U / x   - only the hex digits of the respective escape form
#   N           - only the character name between the braces of ``N{...}``
# The pattern matches no matter how many backslashes precede the body; it does
# not itself decide whether that run really introduces an escape sequence.
# NOTE(review): re.VERBOSE looks like a no-op here, since the only whitespace in
# the pattern is the space inside the N{...} character class - confirm before
# relying on or removing the flag.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
2132

2233

2334
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
236247
return s # Prefer double quotes
237248

238249
return f"{prefix}{new_quote}{new_body}{new_quote}"
250+
251+
252+
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    r"""Normalize the letter case of escape sequences in a string leaf, in place.

    Hex digits of ``\xhh``, ``\uxxxx`` and ``\Uxxxxxxxx`` escapes are lowercased
    and the character name of ``\N{name}`` escapes is uppercased. The result is
    written back to ``leaf.value``; nothing is returned.

    Literals are left alone where rewriting would change their value:

    * raw literals (an ``r`` in the prefix) are returned untouched;
    * in bytes literals (a ``b`` in the prefix) only ``\xhh`` is normalized,
      because ``\u``, ``\U`` and ``\N{...}`` are not escape sequences there and
      their characters are part of the bytes value;
    * a match preceded by an even number of backslashes is kept verbatim.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        # Raw literal: leave it untouched.
        return

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]
        body = groups["body"]

        if len(back_slashes) % 2 == 0:
            # The backslashes pair up and escape each other, so the body is
            # ordinary text rather than an escape sequence.
            return back_slashes + body

        if groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()

        if is_bytes:
            # \u, \U and \N{} are only recognized in str literals; inside
            # bytes they are literal characters and must not be re-cased.
            return back_slashes + body

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
x = "\x1F"
2+
x = "\\x1B"
3+
x = "\\\x1B"
4+
x = "\U0001F60E"
5+
x = "\u0001F60E"
6+
x = r"\u0001F60E"
7+
x = "don't format me"
8+
x = "\xA3"
9+
x = "\u2717"
10+
x = "\uFaCe"
11+
x = "\N{ox}\N{OX}"
12+
x = "\N{lAtIn smaLL letteR x}"
13+
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
14+
x = b"\x1Fdon't byte"
15+
x = rb"\x1Fdon't format"
16+
17+
# output
18+
19+
x = "\x1f"
20+
x = "\\x1B"
21+
x = "\\\x1b"
22+
x = "\U0001f60e"
23+
x = "\u0001F60E"
24+
x = r"\u0001F60E"
25+
x = "don't format me"
26+
x = "\xa3"
27+
x = "\u2717"
28+
x = "\uface"
29+
x = "\N{OX}\N{OX}"
30+
x = "\N{LATIN SMALL LETTER X}"
31+
x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
32+
x = b"\x1fdon't byte"
33+
x = rb"\x1Fdon't format"

0 commit comments

Comments
 (0)