fix: Fall back to slugified title as id for non-exact, non-code references ([Hello World][] -> [hello-world][])

pawamoy · pawamoy · commit 13428f15d72d · 2025-01-10T16:20:16.000+01:00
With a heading like `## Welcome`, we should be able to cross-reference it with `[Welcome][]`, without having to specify the actual, slugified identifier: `[Welcome][welcome]`. This is compliant with the original Markdown spec.

How does it work? When the base Markdown converter doesn't convert a reference, autorefs kicks in. It converts the yet-unresolved reference to an `autoref` HTML element. If an identifier was explicitly given, it creates a regular `autoref` element like before. If only a title was provided, then there are two scenarios:

- the title converts to a `code` HTML element, in which case we create a regular `autoref` again (important for API docs)
- the title does not convert to a `code` HTML element, in which case we add a slug to the `autoref` element

`autoref` elements without a slug are handled like before. `autoref` elements with a slug will first try to find a URL for the initial identifier (which is the title), and if that fails, will try again with the slugified title. Slugification is done with the `toc` extension's `slugify` function.

Issue-58: #58
diff --git a/src/mkdocs_autorefs/references.py b/src/mkdocs_autorefs/references.py
@@ -17,11 +17,13 @@
 import markupsafe
 from markdown.core import Markdown
 from markdown.extensions import Extension
+from markdown.extensions.toc import slugify
 from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor
 from markdown.treeprocessors import Treeprocessor
 from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
     from pathlib import Path
     from re import Match
 
@@ -120,7 +122,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:  # noqa: D107
 
     # Code based on
     # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780
-
     def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | None, int | None]:  # type: ignore[override]  # noqa: N802
         """Handle an element that matched.
 
@@ -135,19 +136,19 @@ def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | N
         if not handled:
             return None, None, None
 
-        identifier, end, handled = self.evalId(data, index, text)
+        identifier, slug, end, handled = self._eval_id(data, index, text)
         if not handled or identifier is None:
             return None, None, None
 
-        if re.search(r"[\x00-\x1f]", identifier):
+        if slug is None and re.search(r"[\x00-\x1f]", identifier):
             # Do nothing if the matched reference contains control characters (from 0 to 31 included).
             # Specifically `\x01` is used by Python-Markdown HTML stash when there's inline formatting,
             # but references with Markdown formatting are not possible anyway.
             return None, m.start(0), end
 
-        return self._make_tag(identifier, text), m.start(0), end
+        return self._make_tag(identifier, text, slug=slug), m.start(0), end
 
-    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:  # noqa: N802 (parent's casing)
+    def _eval_id(self, data: str, index: int, text: str) -> tuple[str | None, str | None, int, bool]:
         """Evaluate the id portion of `[ref][id]`.
 
         If `[ref][]` use `[ref]`.
@@ -158,23 +159,28 @@ def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, boo
             text: The text to use when no identifier.
 
         Returns:
-            A tuple containing the identifier, its end position, and whether it matched.
+            A tuple containing the identifier, its optional slug, its end position, and whether it matched.
         """
         m = self.RE_LINK.match(data, pos=index)
         if not m:
-            return None, index, False
+            return None, None, index, False
 
-        identifier = m.group(1)
-        if not identifier:
+        if identifier := m.group(1):
+            # An identifier was provided, match it exactly (later).
+            slug = None
+        else:
+            # Only a title was provided, use it as identifier.
             identifier = text
-            # Allow the entire content to be one placeholder, with the intent of catching things like [`Foo`][].
-            # It doesn't catch [*Foo*][] though, just due to the priority order.
-            # https://github.com/Python-Markdown/markdown/blob/1858c1b601ead62ed49646ae0d99298f41b1a271/markdown/inlinepatterns.py#L78
+
+            # Catch single stash entries, like the result of [`Foo`][].
             if match := INLINE_PLACEHOLDER_RE.fullmatch(identifier):
                 stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]
                 el = stashed_nodes.get(match[1])
                 if isinstance(el, Element) and el.tag == "code":
+                    # The title was wrapped in backticks, we only keep the content,
+                    # and tell autorefs to match the identifier exactly.
                     identifier = "".join(el.itertext())
+                    slug = None
                     # Special case: allow pymdownx.inlinehilite raw <code> snippets but strip them back to unhighlighted.
                     if match := HTML_PLACEHOLDER_RE.fullmatch(identifier):
                         stash_index = int(match.group(1))
@@ -183,9 +189,9 @@ def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, boo
                         self.md.htmlStash.rawHtmlBlocks[stash_index] = escape(identifier)
 
         end = m.end(0)
-        return identifier, end, True
+        return identifier, slug, end, True
 
-    def _make_tag(self, identifier: str, text: str) -> Element:
+    def _make_tag(self, identifier: str, text: str, *, slug: str | None = None) -> Element:
         """Create a tag that can be matched by `AUTO_REF_RE`.
 
         Arguments:
@@ -201,6 +207,8 @@ def _make_tag(self, identifier: str, text: str) -> Element:
             el.attrib.update(self.hook.get_context().as_dict())
         el.set("identifier", identifier)
         el.text = text
+        if slug:
+            el.attrib["slug"] = slug
         return el
 
 
@@ -300,6 +308,7 @@ class _AutorefsAttrs(dict):
         "origin",
         "filepath",
         "lineno",
+        "slug",
     }
 
     @property
@@ -337,6 +346,15 @@ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None
 _html_attrs_parser = _HTMLAttrsParser()
 
 
+def _find_url(identifiers: Iterable[str], url_mapper: Callable[[str], str]) -> str:
+    for identifier in identifiers:
+        try:
+            return url_mapper(identifier)
+        except KeyError:
+            pass
+    raise KeyError(f"None of the identifiers {identifiers} were found")
+
+
 def fix_ref(
     url_mapper: Callable[[str], str],
     unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
@@ -363,11 +381,14 @@ def inner(match: Match) -> str:
         title = match["title"]
         attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>")
         identifier: str = attrs["identifier"]
+        slug = attrs.get("slug", None)
         optional = "optional" in attrs
         hover = "hover" in attrs
 
+        identifiers = (identifier, slug) if slug else (identifier,)
+
         try:
-            url = url_mapper(unescape(identifier))
+            url = _find_url(identifiers, url_mapper)
         except KeyError:
             if optional:
                 if hover:
@@ -376,6 +397,8 @@ def inner(match: Match) -> str:
             unmapped.append((identifier, attrs.context))
             if title == identifier:
                 return f"[{identifier}][]"
+            if title == f"<code>{identifier}</code>" and not slug:
+                return f"[<code>{identifier}</code>][]"
             return f"[{title}][{identifier}]"
 
         parsed = urlsplit(url)
diff --git a/tests/test_references.py b/tests/test_references.py
@@ -181,7 +181,7 @@ def test_missing_reference_with_markdown_text() -> None:
     run_references_test(
         url_map={"NotFoo": "foo.html#NotFoo"},
         source="[`Foo`][Foo]",
-        output="<p>[<code>Foo</code>][Foo]</p>",
+        output="<p>[<code>Foo</code>][]</p>",
         unmapped=[("Foo", None)],
     )
 
@@ -201,8 +201,8 @@ def test_missing_reference_with_markdown_implicit() -> None:
     run_references_test(
         url_map={"Foo-bar": "foo.html#Foo-bar"},
         source="[*Foo-bar*][] and [`Foo`-bar][]",
-        output="<p>[<em>Foo-bar</em>][*Foo-bar*] and [<code>Foo</code>-bar][]</p>",
-        unmapped=[("*Foo-bar*", None)],
+        output="<p>[<em>Foo-bar</em>][*Foo-bar*] and [<code>Foo</code>-bar][`Foo`-bar]</p>",
+        unmapped=[("*Foo-bar*", None), ("`Foo`-bar", None)],
     )
 
 
@@ -405,3 +405,81 @@ def test_keep_data_attributes() -> None:
     source = '<autoref optional identifier="example" class="hi ho" data-foo data-bar="0">e</autoref>'
     output, _ = fix_refs(source, url_map.__getitem__)
     assert output == '<a class="autorefs autorefs-external hi ho" href="https://e.com" data-foo data-bar="0">e</a>'
+
+
+@pytest.mark.parametrize(
+    ("markdown_ref", "exact_expected"),
+    [
+        ("[Foo][]", False),
+        ("[\\`Foo][]", False),
+        ("[\\`\\`Foo][]", False),
+        ("[\\`\\`Foo\\`][]", False),
+        ("[Foo\\`][]", False),
+        ("[Foo\\`\\`][]", False),
+        ("[\\`Foo\\`\\`][]", False),
+        ("[`Foo` `Bar`][]", False),
+        ("[Foo][Foo]", True),
+        ("[`Foo`][]", True),
+        ("[`Foo``Bar`][]", True),
+        ("[`Foo```Bar`][]", True),
+        ("[``Foo```Bar``][]", True),
+        ("[``Foo`Bar``][]", True),
+        ("[```Foo``Bar```][]", True),
+    ],
+)
+def test_mark_identifiers_as_exact(markdown_ref: str, exact_expected: bool) -> None:
+    """Mark code and explicit identifiers as exact (no `slug` attribute in autoref elements)."""
+    plugin = AutorefsPlugin()
+    md = markdown.Markdown(extensions=["attr_list", "toc", AutorefsExtension(plugin)])
+    plugin.current_page = "page"
+    output = md.convert(markdown_ref)
+    if exact_expected:
+        assert "slug=" not in output
+    else:
+        assert "slug=" in output
+
+
+def test_slugified_identifier_fallback() -> None:
+    """Fallback to the slugified identifier when no URL is found."""
+    run_references_test(
+        url_map={"hello-world": "https://e.com#a"},
+        source='<autoref identifier="Hello World" slug="hello-world">Hello World</autoref>',
+        output='<p><a class="autorefs autorefs-external" href="https://e.com#a">Hello World</a></p>',
+    )
+    run_references_test(
+        url_map={"foo-bar": "https://e.com#a"},
+        source="[*Foo*-bar][]",
+        output='<p><a class="autorefs autorefs-external" href="https://e.com#a"><em>Foo</em>-bar</a></p>',
+    )
+    run_references_test(
+        url_map={"foo-bar": "https://e.com#a"},
+        source="[`Foo`-bar][]",
+        output='<p><a class="autorefs autorefs-external" href="https://e.com#a"><code>Foo</code>-bar</a></p>',
+    )
+
+
+def test_no_fallback_for_exact_identifiers() -> None:
+    """Do not fallback to the slugified identifier for exact identifiers."""
+    run_references_test(
+        url_map={"hello-world": "https://e.com"},
+        source='<autoref identifier="Hello World"><code>Hello World</code></autoref>',
+        output="<p>[<code>Hello World</code>][]</p>",
+        unmapped=[("Hello World", None)],
+    )
+
+    run_references_test(
+        url_map={"hello-world": "https://e.com"},
+        source='<autoref identifier="Hello World">Hello World</autoref>',
+        output="<p>[Hello World][]</p>",
+        unmapped=[("Hello World", None)],
+    )
+
+
+def test_no_fallback_for_provided_identifiers() -> None:
+    """Do not slugify provided identifiers."""
+    run_references_test(
+        url_map={"hello-world": "foo.html#hello-world"},
+        source="[Hello][Hello world]",
+        output="<p>[Hello][Hello world]</p>",
+        unmapped=[("Hello world", None)],
+    )