Skip to content

Commit ac6c216

Browse files
committed
lrpb: add RenderedPageBreak.preceding_paragraph_fragment
1 parent 557fdee commit ac6c216

File tree

4 files changed

+261
-2
lines changed

4 files changed

+261
-2
lines changed

features/pbk-split-para.feature

-2
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@ Feature: Split paragraph on rendered page-breaks
44
I need a way to split a paragraph on its first rendered page break
55

66

7-
@wip
87
Scenario: RenderedPageBreak.preceding_paragraph_fragment is the content before break
98
Given a rendered_page_break in a paragraph
109
Then rendered_page_break.preceding_paragraph_fragment is the content before break
1110

1211

13-
@wip
1412
Scenario: RenderedPageBreak.preceding_paragraph_fragment includes the hyperlink
1513
Given a rendered_page_break in a hyperlink
1614
Then rendered_page_break.preceding_paragraph_fragment includes the hyperlink

src/docx/oxml/text/pagebreak.py

+156
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,15 @@
22

33
from __future__ import annotations
44

5+
import copy
6+
from typing import TYPE_CHECKING
7+
58
from docx.oxml.xmlchemy import BaseOxmlElement
9+
from docx.shared import lazyproperty
10+
11+
if TYPE_CHECKING:
12+
from docx.oxml.text.hyperlink import CT_Hyperlink
13+
from docx.oxml.text.paragraph import CT_P
614

715

816
class CT_LastRenderedPageBreak(BaseOxmlElement):
@@ -16,3 +24,151 @@ class CT_LastRenderedPageBreak(BaseOxmlElement):
1624
`w:lastRenderedPageBreak` maps to `CT_Empty`. This name was added to give it
1725
distinguished behavior. CT_Empty is used for many elements.
1826
"""
27+
28+
@property
def precedes_all_content(self) -> bool:
    """True when a `w:lastRenderedPageBreak` precedes all paragraph content.

    This is a common case; it occurs whenever the page breaks on an even paragraph
    boundary.

    Always `False` for a page-break located inside a hyperlink.
    """
    # -- a page-break inside a hyperlink is never treated as "leading" because
    # -- part of the hyperlink text always comes before the page-break.
    if self._is_in_hyperlink:
        return False

    enclosing_p = self._enclosing_p

    # -- XPath selects, within the first `w:r` child of the paragraph, those
    # -- page-breaks that have no content-bearing element among their preceding
    # -- siblings in that run.
    leading_lrpb_xpath = (
        "./w:r[1]"
        "/w:lastRenderedPageBreak"
        f"[not(preceding-sibling::*[{self._run_inner_content_xpath}])]"
    )
    leading_lrpbs = enclosing_p.xpath(leading_lrpb_xpath)

    # -- any match means a break occurs ahead of all content in that first run --
    return bool(leading_lrpbs)
51+
52+
@property
def preceding_fragment_p(self) -> CT_P:
    """A "loose" `CT_P` containing only the paragraph content before this break.

    Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
    page-break in its paragraph.

    The returned `CT_P` is built from a "clone" (deepcopy) of the `w:p` ancestor of
    this page-break, so the original paragraph is not modified. When the break is in
    a plain run, the break and everything following it are removed from the clone.
    When the break is inside a hyperlink, only the break itself and the content
    following the hyperlink are removed; the hyperlink is kept whole.
    """
    if self != self._first_lrpb_in_p(self._enclosing_p):
        raise ValueError("only defined on first rendered page-break in paragraph")

    # -- splitting approach is different when break is inside a hyperlink --
    if self._is_in_hyperlink:
        return self._preceding_frag_in_hlink
    return self._preceding_frag_in_run
72+
73+
def _enclosing_hyperlink(self, lrpb: CT_LastRenderedPageBreak) -> CT_Hyperlink:
    """The `w:hyperlink` grandparent of the `lrpb` page-break element.

    Note the lookup is performed on the `lrpb` argument, not on `self`.

    Raises `IndexError` when `lrpb` is not in a `w:r` that is a direct child of a
    `w:hyperlink` (e.g. when its grandparent is `w:p`), so only call this for a
    page-break whose `._is_in_hyperlink` is True.
    """
    hyperlinks = lrpb.xpath("./parent::w:r/parent::w:hyperlink")
    return hyperlinks[0]
80+
81+
@property
def _enclosing_p(self) -> CT_P:
    """The nearest `w:p` ancestor of this `w:lastRenderedPageBreak`.

    Raises `IndexError` when this element has no `w:p` ancestor.
    """
    nearest_p_ancestors = self.xpath("./ancestor::w:p[1]")
    return nearest_p_ancestors[0]
85+
86+
def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
    """The first `w:lastRenderedPageBreak` element found in `p`.

    Both page-breaks in runs directly under `p` and page-breaks in runs inside a
    `w:hyperlink` child of `p` are considered.

    Raises `ValueError` if there are no rendered page-breaks in `p`.
    """
    in_run = "./w:r/w:lastRenderedPageBreak"
    in_hyperlink_run = "./w:hyperlink/w:r/w:lastRenderedPageBreak"

    found = p.xpath(f"{in_run} | {in_hyperlink_run}")
    if found:
        return found[0]

    raise ValueError("no rendered page-breaks in paragraph element")
97+
98+
@lazyproperty
def _is_in_hyperlink(self) -> bool:
    """True when this page-break is in a run that is a child of a `w:hyperlink`."""
    hyperlink_grandparents = self.xpath("./parent::w:r/parent::w:hyperlink")
    return bool(hyperlink_grandparents)
102+
103+
@lazyproperty
def _preceding_frag_in_hlink(self) -> CT_P:
    """Preceding `CT_P` fragment when this break occurs within a hyperlink.

    The fragment is a deepcopy of the enclosing `w:p` from which the page-break and
    everything after the hyperlink have been removed. The hyperlink itself is kept
    whole, including any of its content that followed the page-break.

    Note this is a *partial-function*; it raises `ValueError` when this page-break
    is not inside a hyperlink.
    """
    # NOTE(review): `lazyproperty` presumably caches this value, in which case
    # repeated access returns the same clone object -- confirm.
    if not self._is_in_hyperlink:
        raise ValueError("only defined on a rendered page-break in a hyperlink")

    # -- all mutations below happen on a copy, the document itself is untouched --
    fragment_p = copy.deepcopy(self._enclosing_p)

    # -- find the counterpart of this page-break in the copy (`self` belongs to
    # -- the original paragraph) and the hyperlink that contains it --
    cloned_lrpb = self._first_lrpb_in_p(fragment_p)
    cloned_hyperlink = cloned_lrpb._enclosing_hyperlink(cloned_lrpb)

    # -- drop every paragraph child that comes after the hyperlink --
    for trailing in cloned_hyperlink.xpath("./following-sibling::*"):
        fragment_p.remove(trailing)

    # -- drop only the page-break from its run; the rest of the hyperlink stays so
    # -- the hyperlink is not "split" across two fragments --
    enclosing_r = cloned_lrpb.getparent()
    enclosing_r.remove(cloned_lrpb)

    return fragment_p
132+
133+
@lazyproperty
def _preceding_frag_in_run(self) -> CT_P:
    """Preceding `CT_P` fragment when this break does not occur in a hyperlink.

    The fragment is a deepcopy of the enclosing `w:p` from which the page-break,
    the run content after it, and all paragraph content after its run have been
    removed.

    Note this is a *partial-function*; it raises `ValueError` when this page-break
    is inside a hyperlink.
    """
    if self._is_in_hyperlink:
        raise ValueError("only defined on a rendered page-break not in a hyperlink")

    # -- all mutations below happen on a copy, the document itself is untouched --
    fragment_p = copy.deepcopy(self._enclosing_p)

    # -- find the counterpart of this page-break in the copy (`self` belongs to
    # -- the original paragraph) and the run that contains it --
    cloned_lrpb = self._first_lrpb_in_p(fragment_p)
    break_r = cloned_lrpb.xpath("./parent::w:r")[0]

    # -- drop every paragraph child that comes after the run holding the break --
    for trailing in break_r.xpath("./following-sibling::*"):
        fragment_p.remove(trailing)

    # -- within that run, drop whatever follows the break, then the break itself --
    for doomed in [*cloned_lrpb.xpath("./following-sibling::*"), cloned_lrpb]:
        break_r.remove(doomed)

    return fragment_p
162+
163+
@lazyproperty
def _run_inner_content_xpath(self) -> str:
    """XPath fragment matching any run inner-content element.

    The fragment is a union of `self::` tests, one for each tag treated as
    content-bearing, suitable for use inside an XPath predicate.
    """
    content_tags = (
        "w:br",
        "w:cr",
        "w:drawing",
        "w:noBreakHyphen",
        "w:ptab",
        "w:t",
        "w:tab",
    )
    return " | ".join(f"self::{tag}" for tag in content_tags)

src/docx/text/pagebreak.py

+37
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22

33
from __future__ import annotations
44

5+
from typing import TYPE_CHECKING
6+
57
from docx import types as t
68
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
79
from docx.shared import Parented
810

11+
if TYPE_CHECKING:
12+
from docx.text.paragraph import Paragraph
13+
914

1015
class RenderedPageBreak(Parented):
1116
"""A page-break inserted by Word during page-layout for print or display purposes.
@@ -27,3 +32,35 @@ def __init__(
2732
super().__init__(parent)
2833
self._element = lastRenderedPageBreak
2934
self._lastRenderedPageBreak = lastRenderedPageBreak
35+
36+
@property
def preceding_paragraph_fragment(self) -> Paragraph | None:
    """A "loose" paragraph containing the content preceding this page-break.

    Compare `.following_paragraph_fragment` as these two are intended to be used
    together.

    This value is `None` when no content precedes this page-break. This case is
    common and occurs whenever a page breaks on an even paragraph boundary.
    Returning `None` for this case avoids "inserting" a non-existent paragraph into
    the content stream. Note that content can include DrawingML items like images or
    charts.

    Note the returned paragraph *is divorced from the document body*. Any changes
    made to it will not be reflected in the document. It is intended to provide a
    familiar container (`Paragraph`) to interrogate for the content preceding this
    page-break in the paragraph in which it occurred.

    Also note that a rendered page-break can occur within a hyperlink; consider a
    multi-word hyperlink like "excellent Wikipedia article on LLMs" that happens to
    fall at the end of the last line on a page. THIS METHOD WILL "MOVE" the
    page-break to occur after such a hyperlink. While this places the "tail" text of
    the hyperlink on the "wrong" page, it avoids having two hyperlinks each with a
    fragment of the actual text and pointing to the same address.
    """
    lrpb = self._lastRenderedPageBreak

    # -- a "leading" page-break has nothing before it, so there is no fragment --
    if lrpb.precedes_all_content:
        return None

    # -- imported here rather than at module level; the module-level import of
    # -- `Paragraph` is for type-checking only --
    from docx.text.paragraph import Paragraph

    fragment_p = lrpb.preceding_fragment_p
    return Paragraph(fragment_p, self._parent)

tests/text/test_pagebreak.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# pyright: reportPrivateUsage=false
2+
3+
"""Unit-test suite for the docx.text.pagebreak module."""
4+
5+
from typing import cast
6+
7+
from docx import types as t
8+
from docx.oxml.text.paragraph import CT_P
9+
from docx.text.pagebreak import RenderedPageBreak
10+
11+
from ..unitutil.cxml import element, xml
12+
13+
14+
class DescribeRenderedPageBreak:
    """Unit-test suite for the docx.text.pagebreak.RenderedPageBreak object."""

    def it_produces_None_for_preceding_fragment_when_page_break_is_leading(
        self, fake_parent: t.StoryChild
    ):
        """A page-break with no preceding content is "leading"."""
        # -- the break is the first item in the first run; only `w:pPr` precedes it --
        p = cast(
            CT_P,
            element('w:p/(w:pPr/w:ind,w:r/(w:lastRenderedPageBreak,w:t"foo",w:t"bar"))'),
        )
        rendered_page_break = RenderedPageBreak(p.lastRenderedPageBreaks[0], fake_parent)

        fragment = rendered_page_break.preceding_paragraph_fragment

        assert fragment is None

    def it_can_split_off_the_preceding_paragraph_content_when_in_a_run(
        self, fake_parent: t.StoryChild
    ):
        """Content after a break in a plain run is dropped from the fragment."""
        source_cxml = (
            "w:p/("
            " w:pPr/w:ind"
            ' ,w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
            ' ,w:r/w:t"barfoo"'
            ")"
        )
        # -- paragraph properties and the text before the break are retained --
        expected_xml = xml('w:p/(w:pPr/w:ind,w:r/w:t"foo")')
        p = cast(CT_P, element(source_cxml))
        rendered_page_break = RenderedPageBreak(p.lastRenderedPageBreaks[0], fake_parent)

        fragment = rendered_page_break.preceding_paragraph_fragment

        assert fragment is not None
        assert fragment._p.xml == expected_xml

    def and_it_can_split_off_the_preceding_paragraph_content_when_in_a_hyperlink(
        self, fake_parent: t.StoryChild
    ):
        """A hyperlink containing the break is kept whole in the fragment."""
        source_cxml = (
            "w:p/("
            " w:pPr/w:ind"
            ' ,w:hyperlink/w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
            ' ,w:r/w:t"barfoo"'
            ")"
        )
        # -- only the break and the run after the hyperlink are removed --
        expected_xml = xml('w:p/(w:pPr/w:ind,w:hyperlink/w:r/(w:t"foo",w:t"bar"))')
        p = cast(CT_P, element(source_cxml))
        rendered_page_break = RenderedPageBreak(p.lastRenderedPageBreaks[0], fake_parent)

        fragment = rendered_page_break.preceding_paragraph_fragment

        assert fragment is not None
        assert fragment._p.xml == expected_xml

0 commit comments

Comments
 (0)