Skip to content

gh-131507: Add support for syntax highlighting in PyREPL #131562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions Lib/_pyrepl/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
from dataclasses import dataclass, field, fields
from _colorize import can_colorize, ANSIColors


from . import commands, console, input
from .utils import wlen, unbracket, disp_str
from .utils import wlen, unbracket, disp_str, gen_colors
from .trace import trace


Expand All @@ -38,8 +37,7 @@
from .types import Callback, SimpleContextManager, KeySpec, CommandName


# syntax classes:

# syntax classes
SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3)


Expand Down Expand Up @@ -144,16 +142,17 @@ class Reader:
Instance variables of note include:

* buffer:
A *list* (*not* a string at the moment :-) containing all the
characters that have been entered.
A per-character list containing all the characters that have been
entered. Does not include color information.
* console:
Hopefully encapsulates the OS dependent stuff.
* pos:
A 0-based index into 'buffer' for where the insertion point
is.
* screeninfo:
Ahem. This list contains some info needed to move the
insertion point around reasonably efficiently.
A list of screen position tuples. Each list element is a tuple
representing information on visible line length for a given line.
Allows for efficient skipping of color escape sequences.
* cxy, lxy:
the position of the insertion point in screen ...
* syntax_table:
Expand Down Expand Up @@ -316,6 +315,11 @@ def calc_screen(self) -> list[str]:
pos -= offset

prompt_from_cache = (offset and self.buffer[offset - 1] != "\n")

if self.can_colorize:
colors = list(gen_colors(self.get_unicode()))
else:
colors = None
lines = "".join(self.buffer[offset:]).split("\n")
cursor_found = False
lines_beyond_cursor = 0
Expand Down Expand Up @@ -343,7 +347,7 @@ def calc_screen(self) -> list[str]:
screeninfo.append((0, []))
pos -= line_len + 1
prompt, prompt_len = self.process_prompt(prompt)
chars, char_widths = disp_str(line)
chars, char_widths = disp_str(line, colors, offset)
wrapcount = (sum(char_widths) + prompt_len) // self.console.width
trace("wrapcount = {wrapcount}", wrapcount=wrapcount)
if wrapcount == 0 or not char_widths:
Expand Down Expand Up @@ -567,6 +571,7 @@ def insert(self, text: str | list[str]) -> None:
def update_cursor(self) -> None:
    """Move the cursor to reflect changes in self.pos"""
    # Recompute the cursor coordinates from the current buffer position and
    # cache them on the reader (pos2xy() is defined elsewhere; presumably it
    # maps self.pos to screen coordinates -- confirm against its definition).
    self.cxy = self.pos2xy()
    # Record the position/coordinate pair in the trace log for debugging.
    trace("update_cursor({pos}) = {cxy}", pos=self.pos, cxy=self.cxy)
    # Hand the cached coordinates to the console as separate arguments.
    self.console.move_cursor(*self.cxy)

def after_command(self, cmd: Command) -> None:
Expand Down
123 changes: 119 additions & 4 deletions Lib/_pyrepl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,56 @@
import unicodedata
import functools

from idlelib import colorizer
Copy link
Contributor

@Wulian233 Wulian233 Mar 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from idlelib import colorizer

On Windows, IDLE is not available unless the optional Tcl/Tk component is selected during installation, so `idlelib` cannot be relied on here.

See https://github.com/python/cpython/blob/main/Tools%2Fmsi%2Ftcltk%2Ftcltk.wixproj

from typing import cast, Iterator, Literal, Match, NamedTuple, Pattern, Self
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of these are deprecated aliases. Although they cause no issues at present, it would be better to switch to `collections.abc` (e.g. `collections.abc.Iterator`).

https://docs.python.org/3.14/library/typing.html#typing.Iterator

from _colorize import ANSIColors

from .types import CharBuffer, CharWidths
from .trace import trace

ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
COLORIZE_RE: Pattern[str] = colorizer.prog
IDENTIFIER_RE: Pattern[str] = colorizer.idprog
IDENTIFIERS_AFTER = {"def", "class"}
COLORIZE_GROUP_NAME_MAP: dict[str, str] = colorizer.prog_group_name_to_tag

type ColorTag = (
Literal["KEYWORD"]
| Literal["BUILTIN"]
| Literal["COMMENT"]
| Literal["STRING"]
| Literal["DEFINITION"]
| Literal["SYNC"]
)
Comment on lines +20 to +27
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
type ColorTag = (
Literal["KEYWORD"]
| Literal["BUILTIN"]
| Literal["COMMENT"]
| Literal["STRING"]
| Literal["DEFINITION"]
| Literal["SYNC"]
)
type ColorTag = Literal["KEYWORD", "BUILTIN", "COMMENT", "STRING", "DEFINITION", "SYNC"]



class Span(NamedTuple):
"""Span indexing that's inclusive on both ends."""

start: int
end: int

@classmethod
def from_re(cls, m: Match[str], group: int | str) -> Self:
re_span = m.span(group)
return cls(re_span[0], re_span[1] - 1)


class ColorSpan(NamedTuple):
    """A region of the input paired with the color tag to render it with."""

    # Inclusive start/end indices of the region.
    span: Span
    # Tag name; used as the lookup key into TAG_TO_ANSI when rendering.
    tag: ColorTag


# ANSI escape sequence emitted for each color tag.  "SYNC" maps to the reset
# sequence; disp_str() appends it to the last character of a colored run.
TAG_TO_ANSI: dict[ColorTag, str] = {
    "KEYWORD": ANSIColors.BOLD_BLUE,
    "BUILTIN": ANSIColors.CYAN,
    "COMMENT": ANSIColors.RED,
    "STRING": ANSIColors.GREEN,
    "DEFINITION": ANSIColors.BOLD_WHITE,
    "SYNC": ANSIColors.RESET,
}


@functools.cache
Expand Down Expand Up @@ -41,25 +85,82 @@ def unbracket(s: str, including_content: bool = False) -> str:
return s.translate(ZERO_WIDTH_TRANS)


def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
r"""Decompose the input buffer into a printable variant.
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
    """Lazily yield the color spans found in `buffer`.

    Every match of COLORIZE_RE in `buffer` is expanded by gen_color_spans()
    into zero or more ColorSpan items, in match order.  This is a generator,
    not a list: callers that need to index into or mutate the result must
    wrap the call in list().

    The input `buffer` should be a valid start of a Python code block, i.e.
    it cannot be a block starting in the middle of a multiline string.
    """
    for match in COLORIZE_RE.finditer(buffer):
        yield from gen_color_spans(match)


def gen_color_spans(re_match: Match[str]) -> Iterator[ColorSpan]:
    """Yield a ColorSpan for each named group of `re_match` that matched text.

    Groups that did not match, or matched the empty string, are skipped.  The
    group name is translated through COLORIZE_GROUP_NAME_MAP (falling back to
    the group name itself) to obtain the tag.  When the matched text is one
    of IDENTIFIERS_AFTER, IDENTIFIER_RE is tried right after the group and,
    if it matches, its group 1 is yielded as a "DEFINITION" span.
    """
    source = re_match.string
    for group_name, text in re_match.groupdict().items():
        if text:
            where = Span.from_re(re_match, group_name)
            color = COLORIZE_GROUP_NAME_MAP.get(group_name, group_name)
            yield ColorSpan(where, cast(ColorTag, color))
            if text not in IDENTIFIERS_AFTER:
                continue
            # Look for the name being defined, starting on the character
            # that follows the (inclusive) end of the keyword span.
            defined = IDENTIFIER_RE.match(source, where.end + 1)
            if defined:
                yield ColorSpan(Span.from_re(defined, 1), "DEFINITION")


def disp_str(
buffer: str, colors: list[ColorSpan] | None = None, start_index: int = 0
) -> tuple[CharBuffer, CharWidths]:
r"""Decompose the input buffer into a printable variant with applied colors.

Returns a tuple of two lists:
- the first list is the input buffer, character by character;
- the first list is the input buffer, character by character, with color
escape codes added (while those codes contain multiple ASCII characters,
each code is considered atomic *and is attached for the corresponding
visible character*);
- the second list is the visible width of each character in the input
buffer.

Note on colors:
- The `colors` list, if provided, is partially consumed within. We're using
a list and not a generator since we need to hold onto the current
unfinished span between calls to disp_str in case of multiline strings.
- The `colors` list is computed from the start of the input block. `buffer`
is only a subset of that input block, a single line within. This is why
we need `start_index` to inform us which position is the start of `buffer`
actually within user input. This allows us to match color spans correctly.

Examples:
>>> utils.disp_str("a = 9")
(['a', ' ', '=', ' ', '9'], [1, 1, 1, 1, 1])

>>> line = "while 1:"
>>> colors = list(utils.gen_colors(line))
>>> utils.disp_str(line, colors=colors)
(['\x1b[1;34mw', 'h', 'i', 'l', 'e\x1b[0m', ' ', '1', ':'], [1, 1, 1, 1, 1, 1, 1, 1])

"""
chars: CharBuffer = []
char_widths: CharWidths = []

if not buffer:
return chars, char_widths

for c in buffer:
while colors and colors[0].span.end < start_index:
# move past irrelevant spans
colors.pop(0)

pre_color = ""
post_color = ""
if colors and colors[0].span.start < start_index:
# looks like we're continuing a previous color (e.g. a multiline str)
pre_color = TAG_TO_ANSI[colors[0].tag]

for i, c in enumerate(buffer, start_index):
if colors and colors[0].span.start == i: # new color starts now
pre_color = TAG_TO_ANSI[colors[0].tag]

if c == "\x1a": # CTRL-Z on Windows
chars.append(c)
char_widths.append(2)
Expand All @@ -73,5 +174,19 @@ def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
else:
chars.append(c)
char_widths.append(str_width(c))

if colors and colors[0].span.end == i: # current color ends now
post_color = TAG_TO_ANSI["SYNC"]
colors.pop(0)

chars[-1] = pre_color + chars[-1] + post_color
pre_color = ""
post_color = ""

if colors and colors[0].span.start < i and colors[0].span.end > i:
# even though the current color should be continued, reset it for now.
# the next call to `disp_str()` will revive it.
chars[-1] += TAG_TO_ANSI["SYNC"]

trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
return chars, char_widths
68 changes: 66 additions & 2 deletions Lib/test/test_pyrepl/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import itertools
import functools
import rlcompleter
from textwrap import dedent
from unittest import TestCase
from unittest.mock import MagicMock

from .support import handle_all_events, handle_events_narrow_console
from .support import ScreenEqualMixin, code_to_events
from .support import prepare_reader, prepare_console
from .support import prepare_reader, prepare_console, reader_force_colors
from _pyrepl.console import Event
from _pyrepl.reader import Reader
from _pyrepl.utils import TAG_TO_ANSI


colors = {k[0].lower(): v for k, v in TAG_TO_ANSI.items() if k != "SYNC"}
colors["z"] = TAG_TO_ANSI["SYNC"]


class TestReader(ScreenEqualMixin, TestCase):
Expand Down Expand Up @@ -123,8 +129,9 @@ def test_setpos_for_xy_simple(self):
def test_control_characters(self):
code = 'flag = "🏳️‍🌈"'
events = code_to_events(code)
reader, _ = handle_all_events(events)
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
self.assert_screen_equal(reader, 'flag = "🏳️\\u200d🌈"', clean=True)
self.assert_screen_equal(reader, 'flag = {s}"🏳️\\u200d🌈"{z}'.format(**colors))

def test_setpos_from_xy_multiple_lines(self):
# fmt: off
Expand Down Expand Up @@ -355,3 +362,60 @@ def test_setpos_from_xy_for_non_printing_char(self):
reader, _ = handle_all_events(events)
reader.setpos_from_xy(8, 0)
self.assertEqual(reader.pos, 7)

def test_syntax_highlighting_basic(self):
code = dedent(
"""\
import re, sys
def funct(case: str = sys.platform) -> None:
match = re.search(
"(me)",
'''
Come on
Come on now
You know that it's time to emerge
''',
)
match case:
case "emscripten": print("on the web")
case "ios" | "android": print("on the phone")
case _: print('arms around', match.group(1))
"""
)
expected = dedent(
"""\
{k}import{z} re, sys
{a}{k}def{z} {d}funct{z}(case: {b}str{z} = sys.platform) -> {k}None{z}:
match = re.search(
{s}"(me)"{z},
{s}'''{z}
{s} Come on{z}
{s} Come on now{z}
{s} You know that it's time to emerge{z}
{s} '''{z},
)
{k}match{z} case:
{k}case{z} {s}"emscripten"{z}: {b}print{z}({s}"on the web"{z})
{k}case{z} {s}"ios"{z} | {s}"android"{z}: {b}print{z}({s}"on the phone"{z})
{k}case{z} {k}_{z}: {b}print{z}({s}'arms around'{z}, match.group(1))
"""
)
expected_sync = expected.format(a="", **colors)
events = code_to_events(code)
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
self.assert_screen_equal(reader, code, clean=True)
self.assert_screen_equal(reader, expected_sync)
self.assertEqual(reader.pos, 2**7 + 2**8)
self.assertEqual(reader.cxy, (0, 14))

async_msg = "{k}async{z} ".format(**colors)
expected_async = expected.format(a=async_msg, **colors)
more_events = itertools.chain(
code_to_events(code),
[Event(evt="key", data="up", raw=bytearray(b"\x1bOA"))] * 13,
code_to_events("async "),
)
reader, _ = handle_all_events(more_events, prepare_reader=reader_force_colors)
self.assert_screen_equal(reader, expected_async)
self.assertEqual(reader.pos, 21)
self.assertEqual(reader.cxy, (6, 1))
23 changes: 17 additions & 6 deletions Lib/test/test_pyrepl/test_windows_console.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from unittest import TestCase
from unittest.mock import MagicMock, call

from .support import handle_all_events, code_to_events
from .support import handle_all_events, code_to_events, reader_no_colors
from .support import prepare_reader as default_prepare_reader

try:
from _pyrepl.console import Event, Console
Expand Down Expand Up @@ -46,14 +47,22 @@ def console(self, events, **kwargs) -> Console:
setattr(console, key, val)
return console

def handle_events(self, events: Iterable[Event], **kwargs):
return handle_all_events(events, partial(self.console, **kwargs))
def handle_events(
    self,
    events: Iterable[Event],
    prepare_console=None,
    prepare_reader=None,
    **kwargs,
):
    """Feed `events` through handle_all_events() and return its result.

    When `prepare_console` is not supplied, a console factory is built from
    self.console with `kwargs` bound to it; `kwargs` are not used otherwise.
    When `prepare_reader` is not supplied, default_prepare_reader is used.
    """
    if not prepare_console:
        prepare_console = partial(self.console, **kwargs)
    if not prepare_reader:
        prepare_reader = default_prepare_reader
    return handle_all_events(events, prepare_console, prepare_reader)

def handle_events_narrow(self, events):
return self.handle_events(events, width=5)

def handle_events_short(self, events):
return self.handle_events(events, height=1)
def handle_events_short(self, events, **kwargs):
return self.handle_events(events, height=1, **kwargs)

def handle_events_height_3(self, events):
return self.handle_events(events, height=3)
Expand Down Expand Up @@ -248,7 +257,9 @@ def test_resize_bigger_on_multiline_function(self):
# fmt: on

events = itertools.chain(code_to_events(code))
reader, console = self.handle_events_short(events)
reader, console = self.handle_events_short(
events, prepare_reader=reader_no_colors
)

console.height = 2
console.getheightwidth = MagicMock(lambda _: (2, 80))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PyREPL now supports syntax highlighting. Contributed by Łukasz Langa.
Loading