lrpb: add RenderedPageBreak.preceding_pa..fragment

scanny · scanny · commit ac6c216c8f0b · 2023-10-01T20:08:33.000-07:00
diff --git a/features/pbk-split-para.feature b/features/pbk-split-para.feature
@@ -4,13 +4,11 @@ Feature: Split paragraph on rendered page-breaks
   I need to a way to split a paragraph on its first rendered page break
 
 
-  @wip
   Scenario: RenderedPageBreak.preceding_paragraph_fragment is the content before break
     Given a rendered_page_break in a paragraph
      Then rendered_page_break.preceding_paragraph_fragment is the content before break
 
 
-  @wip
   Scenario: RenderedPageBreak.preceding_paragraph_fragment includes the hyperlink
     Given a rendered_page_break in a hyperlink
      Then rendered_page_break.preceding_paragraph_fragment includes the hyperlink
diff --git a/src/docx/oxml/text/pagebreak.py b/src/docx/oxml/text/pagebreak.py
@@ -2,7 +2,15 @@
 
 from __future__ import annotations
 
+import copy
+from typing import TYPE_CHECKING
+
 from docx.oxml.xmlchemy import BaseOxmlElement
+from docx.shared import lazyproperty
+
+if TYPE_CHECKING:
+    from docx.oxml.text.hyperlink import CT_Hyperlink
+    from docx.oxml.text.paragraph import CT_P
 
 
 class CT_LastRenderedPageBreak(BaseOxmlElement):
@@ -16,3 +24,151 @@ class CT_LastRenderedPageBreak(BaseOxmlElement):
     `w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
     distinguished behavior. CT_Empty is used for many elements.
     """
+
+    @property
+    def precedes_all_content(self) -> bool:
+        """True when a `w:lastRenderedPageBreak` precedes all paragraph content.
+
+        This is a common case; it occurs whenever a page breaks exactly on a paragraph
+        boundary. Always False for a break inside a hyperlink.
+        """
+        # -- a page-break inside a hyperlink never meets these criteria because there
+        # -- is always part of the hyperlink text before the page-break.
+        if self._is_in_hyperlink:
+            return False
+
+        return bool(
+            # -- NOTE(review): any leading break in first run matches, not only `self` --
+            self._enclosing_p.xpath(
+                # -- in first run of paragraph --
+                f"./w:r[1]"
+                # -- all page-breaks --
+                f"/w:lastRenderedPageBreak"
+                # -- that are not preceded by any content-bearing elements --
+                f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
+            )
+        )
+
+    @property
+    def preceding_fragment_p(self) -> CT_P:
+        """A "loose" `CT_P` containing only the paragraph content before this break.
+
+        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
+        page-break in its paragraph.
+
+        The returned `CT_P` is a deepcopy of the `w:p` ancestor of this page-break with
+        this `w:lastRenderedPageBreak` and the content after it removed; when the break
+        is inside a hyperlink, the whole hyperlink is kept (minus the break itself).
+        """
+        if not self == self._first_lrpb_in_p(self._enclosing_p):
+            raise ValueError("only defined on first rendered page-break in paragraph")
+
+        # -- splitting approach is different when break is inside a hyperlink --
+        return (
+            self._preceding_frag_in_hlink
+            if self._is_in_hyperlink
+            else self._preceding_frag_in_run
+        )
+
+    def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
+        """The `w:hyperlink` grandparent of `lrpb` (which need not be `self`).
+
+        Raises `IndexError` when `lrpb` has no `w:hyperlink` grandparent, so only call
+        when `lrpb` is known to be inside a hyperlink.
+        """
+        return lrpb.xpath("./parent::w:r/parent::w:hyperlink")[0]
+
+    @property
+    def _enclosing_p(self) -> CT_P:
+        """The nearest `w:p` ancestor of this `w:lastRenderedPageBreak` element."""
+        return self.xpath("./ancestor::w:p[1]")[0]
+
+    def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
+        """The first `w:lastRenderedPageBreak` in `p`, in a run or a hyperlink run.
+
+        Raises `ValueError` if there are no rendered page-breaks in `p`.
+        """
+        lrpbs = p.xpath(
+            "./w:r/w:lastRenderedPageBreak | ./w:hyperlink/w:r/w:lastRenderedPageBreak"
+        )
+        if not lrpbs:
+            raise ValueError("no rendered page-breaks in paragraph element")
+        return lrpbs[0]
+
+    @property
+    def _is_in_hyperlink(self) -> bool:
+        """True when this page-break is in a hyperlink run; computed on each access."""
+        return bool(self.xpath("./parent::w:r/parent::w:hyperlink"))
+
+    @lazyproperty
+    def _preceding_frag_in_hlink(self) -> CT_P:
+        """Preceding CT_P fragment when break occurs within a hyperlink.
+
+        Note this is a *partial-function*; it raises `ValueError` when this page-break
+        is not inside a hyperlink.
+        """
+        if not self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- clone's first lrpb; matches `self` only if `self` is first in its `w:p` --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
+        hyperlink = lrpb._enclosing_hyperlink(lrpb)
+
+        # -- delete all w:p inner-content following the hyperlink --
+        for e in hyperlink.xpath("./following-sibling::*"):
+            p.remove(e)
+
+        # -- remove this page-break from inside the hyperlink --
+        lrpb.getparent().remove(lrpb)
+
+        # -- that's it, the entire hyperlink goes into the preceding fragment so
+        # -- the hyperlink is not "split".
+        return p
+
+    @lazyproperty
+    def _preceding_frag_in_run(self) -> CT_P:
+        """Preceding CT_P fragment when break does not occur in a hyperlink.
+
+        Note this is a *partial-function*; it raises `ValueError` when in a hyperlink.
+        """
+        if self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break not in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- clone's first lrpb; matches `self` only if `self` is first in its `w:p` --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
+        enclosing_r = lrpb.xpath("./parent::w:r")[0]
+
+        # -- delete all `w:p` inner-content following that run --
+        for e in enclosing_r.xpath("./following-sibling::*"):
+            p.remove(e)
+
+        # -- then delete all `w:r` inner-content following this lrpb in its run and
+        # -- also remove the page-break itself
+        for e in lrpb.xpath("./following-sibling::*"):
+            enclosing_r.remove(e)
+        enclosing_r.remove(lrpb)
+
+        return p
+
+    @lazyproperty
+    def _run_inner_content_xpath(self) -> str:
+        """XPath union of `self::` tests, one per run inner-content element type."""
+        return (
+            "self::w:br"
+            " | self::w:cr"
+            " | self::w:drawing"
+            " | self::w:noBreakHyphen"
+            " | self::w:ptab"
+            " | self::w:t"
+            " | self::w:tab"
+        )
diff --git a/src/docx/text/pagebreak.py b/src/docx/text/pagebreak.py
@@ -2,10 +2,15 @@
 
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from docx import types as t
 from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
 from docx.shared import Parented
 
+if TYPE_CHECKING:
+    from docx.text.paragraph import Paragraph
+
 
 class RenderedPageBreak(Parented):
     """A page-break inserted by Word during page-layout for print or display purposes.
@@ -27,3 +32,35 @@ def __init__(
         super().__init__(parent)
         self._element = lastRenderedPageBreak
         self._lastRenderedPageBreak = lastRenderedPageBreak
+
+    @property
+    def preceding_paragraph_fragment(self) -> Paragraph | None:
+        """A "loose" paragraph containing the content preceding this page-break.
+
+        Compare `.following_paragraph_fragment` as these two are intended to be used
+        together.
+
+        This value is `None` when no content precedes this page-break. This case is
+        common and occurs whenever a page breaks on an even paragraph boundary.
+        Returning `None` for this case avoids "inserting" a non-existent paragraph into
+        the content stream. Note that content can include DrawingML items like images or
+        charts.
+
+        Note the returned paragraph *is divorced from the document body*. Any changes
+        made to it will not be reflected in the document. It is intended to provide a
+        familiar container (`Paragraph`) to interrogate for the content preceding this
+        page-break in the paragraph in which it occurred.
+
+        Also note that a rendered page-break can occur within a hyperlink; consider a
+        multi-word hyperlink like "excellent Wikipedia article on LLMs" that happens to
+        fall at the end of the last line on a page. THIS METHOD WILL "MOVE" the
+        page-break to occur after such a hyperlink. While this places the "tail" text of
+        the hyperlink on the "wrong" page, it avoids having two hyperlinks each with a
+        fragment of the actual text and pointing to the same address.
+        """
+        if self._lastRenderedPageBreak.precedes_all_content:
+            return None
+
+        from docx.text.paragraph import Paragraph
+
+        return Paragraph(self._lastRenderedPageBreak.preceding_fragment_p, self._parent)
diff --git a/tests/text/test_pagebreak.py b/tests/text/test_pagebreak.py
@@ -0,0 +1,68 @@
+# pyright: reportPrivateUsage=false
+
+"""Unit-test suite for the docx.text.pagebreak module."""
+
+from typing import cast
+
+from docx import types as t
+from docx.oxml.text.paragraph import CT_P
+from docx.text.pagebreak import RenderedPageBreak
+
+from ..unitutil.cxml import element, xml
+
+
+class DescribeRenderedPageBreak:
+    """Unit-test suite for the docx.text.pagebreak.RenderedPageBreak object."""
+
+    def it_produces_None_for_preceding_fragment_when_page_break_is_leading(
+        self, fake_parent: t.StoryChild
+    ):
+        """A page-break with no preceding content is "leading"."""
+        p_cxml = 'w:p/(w:pPr/w:ind,w:r/(w:lastRenderedPageBreak,w:t"foo",w:t"bar"))'
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]  # -- break is first item in the run --
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        preceding_fragment = page_break.preceding_paragraph_fragment
+
+        assert preceding_fragment is None
+
+    def it_can_split_off_the_preceding_paragraph_content_when_in_a_run(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = (
+            "w:p/("
+            "  w:pPr/w:ind"
+            '  ,w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
+            '  ,w:r/w:t"barfoo"'
+            ")"
+        )
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]  # -- break sits mid-run, after "foo" --
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        preceding_fragment = page_break.preceding_paragraph_fragment
+
+        expected_cxml = 'w:p/(w:pPr/w:ind,w:r/w:t"foo")'
+        assert preceding_fragment is not None  # -- narrows `Paragraph | None` --
+        assert preceding_fragment._p.xml == xml(expected_cxml)
+
+    def and_it_can_split_off_the_preceding_paragraph_content_when_in_a_hyperlink(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = (
+            "w:p/("
+            "  w:pPr/w:ind"
+            '  ,w:hyperlink/w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
+            '  ,w:r/w:t"barfoo"'
+            ")"
+        )
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]  # -- break sits inside the hyperlink --
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        preceding_fragment = page_break.preceding_paragraph_fragment
+
+        expected_cxml = 'w:p/(w:pPr/w:ind,w:hyperlink/w:r/(w:t"foo",w:t"bar"))'
+        assert preceding_fragment is not None  # -- narrows `Paragraph | None` --
+        assert preceding_fragment._p.xml == xml(expected_cxml)