lrpb: add RenderedPageBreak.following_pa..fragment

scanny · scanny · commit 1e42c55f9982 · 2023-10-01T20:08:33.000-07:00
diff --git a/features/pbk-split-para.feature b/features/pbk-split-para.feature
@@ -14,13 +14,11 @@ Feature: Split paragraph on rendered page-breaks
      Then rendered_page_break.preceding_paragraph_fragment includes the hyperlink
 
 
-  @wip
   Scenario: RenderedPageBreak.following_paragraph_fragment is the content after break
     Given a rendered_page_break in a paragraph
      Then rendered_page_break.following_paragraph_fragment is the content after break
 
 
-  @wip
   Scenario: RenderedPageBreak.following_paragraph_fragment excludes the hyperlink
     Given a rendered_page_break in a hyperlink
      Then rendered_page_break.following_paragraph_fragment excludes the hyperlink
diff --git a/src/docx/oxml/text/pagebreak.py b/src/docx/oxml/text/pagebreak.py
@@ -25,6 +25,57 @@ class CT_LastRenderedPageBreak(BaseOxmlElement):
     distinguished behavior. CT_Empty is used for many elements.
     """
 
+    @property
+    def following_fragment_p(self) -> CT_P:
+        """A "loose" `CT_P` containing only the paragraph content after this break.
+
+        Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
+        page-break in its paragraph.
+
+        The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
+        page-break with this `w:lastRenderedPageBreak` element and all content preceding
+        it removed.
+
+        NOTE: this `w:p` can itself contain one or more `w:lastRenderedPageBreak`
+        elements (when the paragraph contained more than one). While this is rare, the
+        caller should treat this paragraph the same as other paragraphs and split it if
+        necessary in a following step or recursion.
+        """
+        if not self == self._first_lrpb_in_p(self._enclosing_p):
+            raise ValueError("only defined on first rendered page-break in paragraph")
+
+        # -- splitting approach is different when break is inside a hyperlink --
+        return (
+            self._following_frag_in_hlink
+            if self._is_in_hyperlink
+            else self._following_frag_in_run
+        )
+
+    @property
+    def follows_all_content(self) -> bool:
+        """True when this page-break element is the last "content" in the paragraph.
+
+        This is a very uncommon case and may only occur in contrived cases or where
+        the XML is edited by hand, but it is not precluded by the spec.
+        """
+        # -- a page-break inside a hyperlink never meets these criteria (for our
+        # -- purposes at least) because it is considered "atomic" and always associated
+        # -- with the page it starts on.
+        if self._is_in_hyperlink:
+            return False
+
+        return bool(
+            # -- XPath will match zero-or-one w:lastRenderedPageBreak element --
+            self._enclosing_p.xpath(
+                # -- in last run of paragraph --
+                f"(./w:r)[last()]"
+                # -- all page-breaks --
+                f"/w:lastRenderedPageBreak"
+                # -- that are not followed by any content-bearing elements --
+                f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
+            )
+        )
+
     @property
     def precedes_all_content(self) -> bool:
         """True when a `w:lastRenderedPageBreak` precedes all paragraph content.
@@ -95,6 +146,65 @@ def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
             raise ValueError("no rendered page-breaks in paragraph element")
         return lrpbs[0]
 
+    @lazyproperty
+    def _following_frag_in_hlink(self) -> CT_P:
+        """Following CT_P fragment when break occurs within a hyperlink.
+
+        Note this is a *partial-function* and raises when `lrpb` is not inside a
+        hyperlink.
+        """
+        if not self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
+        hyperlink = lrpb._enclosing_hyperlink(lrpb)
+
+        # -- delete all w:p inner-content preceding the hyperlink --
+        for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
+            p.remove(e)
+
+        # -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
+        hyperlink.getparent().remove(hyperlink)
+
+        # -- that's it, return the remaining fragment of `w:p` clone --
+        return p
+
+    @lazyproperty
+    def _following_frag_in_run(self) -> CT_P:
+        """Following CT_P fragment when break does not occur in a hyperlink.
+
+        Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
+        """
+        if self._is_in_hyperlink:
+            raise ValueError("only defined on a rendered page-break not in a hyperlink")
+
+        # -- work on a clone `w:p` so our mutations don't persist --
+        p = copy.deepcopy(self._enclosing_p)
+
+        # -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
+        lrpb = self._first_lrpb_in_p(p)
+
+        # -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
+        enclosing_r = lrpb.xpath("./parent::w:r")[0]
+
+        # -- delete all w:p inner-content preceding that run (but not w:pPr) --
+        for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
+            p.remove(e)
+
+        # -- then remove all run inner-content preceding this lrpb in its run (but not
+        # -- the `w:rPr`) and also remove the page-break itself
+        for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
+            enclosing_r.remove(e)
+        enclosing_r.remove(lrpb)
+
+        return p
+
     @lazyproperty
     def _is_in_hyperlink(self) -> bool:
         """True when this page-break is embedded in a hyperlink run."""
diff --git a/src/docx/text/pagebreak.py b/src/docx/text/pagebreak.py
@@ -24,6 +24,15 @@ class RenderedPageBreak(Parented):
     Note these are never inserted by `python-docx` because it has no rendering function.
     These are generally only useful for text-extraction of existing documents when
     `python-docx` is being used solely as a document "reader".
+
+    NOTE: a rendered page-break can occur within a hyperlink; consider a multi-word
+    hyperlink like "excellent Wikipedia article on LLMs" that happens to fall close to
+    the end of the last line on a page such that the page breaks between "Wikipedia" and
+    "article". In such a "page-breaks-in-hyperlink" case, THESE METHODS WILL "MOVE" THE
+    PAGE-BREAK to occur after the hyperlink, such that the entire hyperlink appears in
+    the paragraph returned by `.preceding_paragraph_fragment`. While this places the
+    "tail" text of the hyperlink on the "wrong" page, it avoids having two hyperlinks
+    each with a fragment of the actual text and pointing to the same address.
     """
 
     def __init__(
@@ -51,16 +60,43 @@ def preceding_paragraph_fragment(self) -> Paragraph | None:
         familiar container (`Paragraph`) to interrogate for the content preceding this
         page-break in the paragraph in which it occured.
 
-        Also note that a rendered page-break can occur within a hyperlink; consider a
-        multi-word hyperlink like "excellent Wikipedia article on LLMs" that happens to
-        fall at the end of the last line on a page. THIS METHOD WILL "MOVE" the
-        page-break to occur after such a hyperlink. While this places the "tail" text of
-        the hyperlink on the "wrong" page, it avoids having two hyperlinks each with a
-        fragment of the actual text and pointing to the same address.
+        Contains the entire hyperlink when this break occurs within a hyperlink.
         """
         if self._lastRenderedPageBreak.precedes_all_content:
             return None
 
         from docx.text.paragraph import Paragraph
 
         return Paragraph(self._lastRenderedPageBreak.preceding_fragment_p, self._parent)
+
+    @property
+    def following_paragraph_fragment(self) -> Paragraph | None:
+        """A "loose" paragraph containing the content following this page-break.
+
+        HAS POTENTIALLY SURPRISING BEHAVIORS so read carefully to be sure this is what
+        you want. This is primarily targeted toward text-extraction use-cases for which
+        precisely associating text with the page it occurs on is important.
+
+        Compare `.preceding_paragraph_fragment` as these two are intended to be used
+        together.
+
+        This value is `None` when no content follows this page-break. This case is
+        unlikely to occur in practice because Word places even-paragraph-boundary
+        page-breaks on the paragraph *following* the page-break. Still, it is possible
+        and must be checked for. Returning `None` for this case avoids "inserting" an
+        extra, non-existent paragraph into the content stream. Note that content can
+        include DrawingML items like images or charts, not just text.
+
+        The returned paragraph *is divorced from the document body*. Any changes made to
+        it will not be reflected in the document. It is intended to provide a container
+        (`Paragraph`) with familiar properties and methods that can be used to
+        characterize the paragraph content following a mid-paragraph page-break.
+
+        Contains no portion of the hyperlink when this break occurs within a hyperlink.
+        """
+        if self._lastRenderedPageBreak.follows_all_content:
+            return None
+
+        from docx.text.paragraph import Paragraph
+
+        return Paragraph(self._lastRenderedPageBreak.following_fragment_p, self._parent)
diff --git a/tests/text/test_pagebreak.py b/tests/text/test_pagebreak.py
@@ -4,6 +4,8 @@
 
 from typing import cast
 
+import pytest
+
 from docx import types as t
 from docx.oxml.text.paragraph import CT_P
 from docx.text.pagebreak import RenderedPageBreak
@@ -14,6 +16,17 @@
 class DescribeRenderedPageBreak:
     """Unit-test suite for the docx.text.pagebreak.RenderedPageBreak object."""
 
+    def it_raises_on_preceding_fragment_when_page_break_is_not_first_in_paragrah(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = 'w:p/(w:r/(w:t"abc",w:lastRenderedPageBreak,w:lastRenderedPageBreak))'
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[-1]
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        with pytest.raises(ValueError, match="only defined on first rendered page-br"):
+            page_break.preceding_paragraph_fragment
+
     def it_produces_None_for_preceding_fragment_when_page_break_is_leading(
         self, fake_parent: t.StoryChild
     ):
@@ -66,3 +79,69 @@ def and_it_can_split_off_the_preceding_paragraph_content_when_in_a_hyperlink(
         expected_cxml = 'w:p/(w:pPr/w:ind,w:hyperlink/w:r/(w:t"foo",w:t"bar"))'
         assert preceding_fragment is not None
         assert preceding_fragment._p.xml == xml(expected_cxml)
+
+    def it_raises_on_following_fragment_when_page_break_is_not_first_in_paragrah(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = 'w:p/(w:r/(w:lastRenderedPageBreak,w:lastRenderedPageBreak,w:t"abc"))'
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[-1]
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        with pytest.raises(ValueError, match="only defined on first rendered page-br"):
+            page_break.following_paragraph_fragment
+
+    def it_produces_None_for_following_fragment_when_page_break_is_trailing(
+        self, fake_parent: t.StoryChild
+    ):
+        """A page-break with no following content is "trailing"."""
+        p_cxml = 'w:p/(w:pPr/w:ind,w:r/(w:t"foo",w:t"bar",w:lastRenderedPageBreak))'
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        following_fragment = page_break.following_paragraph_fragment
+
+        assert following_fragment is None
+
+    def it_can_split_off_the_following_paragraph_content_when_in_a_run(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = (
+            "w:p/("
+            "  w:pPr/w:ind"
+            '  ,w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
+            '  ,w:r/w:t"foo"'
+            ")"
+        )
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        following_fragment = page_break.following_paragraph_fragment
+
+        expected_cxml = 'w:p/(w:pPr/w:ind,w:r/w:t"bar",w:r/w:t"foo")'
+        assert following_fragment is not None
+        assert following_fragment._p.xml == xml(expected_cxml)
+
+    def and_it_can_split_off_the_following_paragraph_content_when_in_a_hyperlink(
+        self, fake_parent: t.StoryChild
+    ):
+        p_cxml = (
+            "w:p/("
+            "  w:pPr/w:ind"
+            '  ,w:hyperlink/w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
+            '  ,w:r/w:t"baz"'
+            '  ,w:r/w:t"qux"'
+            ")"
+        )
+        p = cast(CT_P, element(p_cxml))
+        lrpb = p.lastRenderedPageBreaks[0]
+        page_break = RenderedPageBreak(lrpb, fake_parent)
+
+        following_fragment = page_break.following_paragraph_fragment
+
+        expected_cxml = 'w:p/(w:pPr/w:ind,w:r/w:t"baz",w:r/w:t"qux")'
+
+        assert following_fragment is not None
+        assert following_fragment._p.xml == xml(expected_cxml)