Skip to content

Commit 1e42c55

Browse files
committed
lrpb: add RenderedPageBreak.following_pa..fragment
1 parent ac6c216 commit 1e42c55

File tree

4 files changed

+231
-8
lines changed

4 files changed

+231
-8
lines changed

features/pbk-split-para.feature

-2
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,11 @@ Feature: Split paragraph on rendered page-breaks
1414
Then rendered_page_break.preceding_paragraph_fragment includes the hyperlink
1515

1616

17-
@wip
1817
Scenario: RenderedPageBreak.following_paragraph_fragment is the content after break
1918
Given a rendered_page_break in a paragraph
2019
Then rendered_page_break.following_paragraph_fragment is the content after break
2120

2221

23-
@wip
2422
Scenario: RenderedPageBreak.following_paragraph_fragment excludes the hyperlink
2523
Given a rendered_page_break in a hyperlink
2624
Then rendered_page_break.following_paragraph_fragment excludes the hyperlink

src/docx/oxml/text/pagebreak.py

+110
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,57 @@ class CT_LastRenderedPageBreak(BaseOxmlElement):
2525
distinguished behavior. CT_Empty is used for many elements.
2626
"""
2727

28+
@property
29+
def following_fragment_p(self) -> CT_P:
30+
"""A "loose" `CT_P` containing only the paragraph content before this break.
31+
32+
Raises `ValueError` if this `w:lastRenderedPageBreak` is not the first rendered
33+
page-break in its paragraph.
34+
35+
The returned `CT_P` is a "clone" (deepcopy) of the `w:p` ancestor of this
36+
page-break with this `w:lastRenderedPageBreak` element and all content preceding
37+
it removed.
38+
39+
NOTE: this `w:p` can itself contain further `w:lastRenderedPageBreak` elements
40+
(when the paragraph contained more than one). While this is rare, the caller
41+
should treat this paragraph the same as other paragraphs and split it if
42+
necessary in a following step or recursion.
43+
"""
44+
if not self == self._first_lrpb_in_p(self._enclosing_p):
45+
raise ValueError("only defined on first rendered page-break in paragraph")
46+
47+
# -- splitting approach is different when break is inside a hyperlink --
48+
return (
49+
self._following_frag_in_hlink
50+
if self._is_in_hyperlink
51+
else self._following_frag_in_run
52+
)
53+
54+
@property
55+
def follows_all_content(self) -> bool:
56+
"""True when this page-break element is the last "content" in the paragraph.
57+
58+
This is a very uncommon case and may only occur in contrived cases or where the
59+
XML is edited by hand, but it is not precluded by the spec.
60+
"""
61+
# -- a page-break inside a hyperlink never meets these criteria (for our
62+
# -- purposes at least) because it is considered "atomic" and always associated
63+
# -- with the page it starts on.
64+
if self._is_in_hyperlink:
65+
return False
66+
67+
return bool(
68+
# -- XPath will match zero-or-one w:lastRenderedPageBreak element --
69+
self._enclosing_p.xpath(
70+
# -- in last run of paragraph --
71+
f"(./w:r)[last()]"
72+
# -- all page-breaks --
73+
f"/w:lastRenderedPageBreak"
74+
# -- that are not followed by any content-bearing elements --
75+
f"[not(following-sibling::*[{self._run_inner_content_xpath}])]"
76+
)
77+
)
78+
2879
@property
2980
def precedes_all_content(self) -> bool:
3081
"""True when a `w:lastRenderedPageBreak` precedes all paragraph content.
@@ -95,6 +146,65 @@ def _first_lrpb_in_p(self, p: CT_P) -> CT_LastRenderedPageBreak:
95146
raise ValueError("no rendered page-breaks in paragraph element")
96147
return lrpbs[0]
97148

149+
@lazyproperty
150+
def _following_frag_in_hlink(self) -> CT_P:
151+
"""Following CT_P fragment when break occurs within a hyperlink.
152+
153+
Note this is a *partial-function* and raises when `lrpb` is not inside a
154+
hyperlink.
155+
"""
156+
if not self._is_in_hyperlink:
157+
raise ValueError("only defined on a rendered page-break in a hyperlink")
158+
159+
# -- work on a clone `w:p` so our mutations don't persist --
160+
p = copy.deepcopy(self._enclosing_p)
161+
162+
# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
163+
lrpb = self._first_lrpb_in_p(p)
164+
165+
# -- locate `w:hyperlink` in which this `w:lastRenderedPageBreak` is found --
166+
hyperlink = lrpb._enclosing_hyperlink(lrpb)
167+
168+
# -- delete all w:p inner-content preceding the hyperlink --
169+
for e in hyperlink.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
170+
p.remove(e)
171+
172+
# -- remove the whole hyperlink, it belongs to the preceding-fragment-p --
173+
hyperlink.getparent().remove(hyperlink)
174+
175+
# -- that's it, return the remaining fragment of `w:p` clone --
176+
return p
177+
178+
@lazyproperty
179+
def _following_frag_in_run(self) -> CT_P:
180+
"""following CT_P fragment when break does not occur in a hyperlink.
181+
182+
Note this is a *partial-function* and raises when `lrpb` is inside a hyperlink.
183+
"""
184+
if self._is_in_hyperlink:
185+
raise ValueError("only defined on a rendered page-break not in a hyperlink")
186+
187+
# -- work on a clone `w:p` so our mutations don't persist --
188+
p = copy.deepcopy(self._enclosing_p)
189+
190+
# -- get this `w:lastRenderedPageBreak` in the cloned `w:p` (not self) --
191+
lrpb = self._first_lrpb_in_p(p)
192+
193+
# -- locate `w:r` in which this `w:lastRenderedPageBreak` is found --
194+
enclosing_r = lrpb.xpath("./parent::w:r")[0]
195+
196+
# -- delete all w:p inner-content preceding that run (but not w:pPr) --
197+
for e in enclosing_r.xpath("./preceding-sibling::*[not(self::w:pPr)]"):
198+
p.remove(e)
199+
200+
# -- then remove all run inner-content preceding this lrpb in its run (but not
201+
# -- the `w:rPr`) and also remove the page-break itself
202+
for e in lrpb.xpath("./preceding-sibling::*[not(self::w:rPr)]"):
203+
enclosing_r.remove(e)
204+
enclosing_r.remove(lrpb)
205+
206+
return p
207+
98208
@lazyproperty
99209
def _is_in_hyperlink(self) -> bool:
100210
"""True when this page-break is embedded in a hyperlink run."""

src/docx/text/pagebreak.py

+42-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ class RenderedPageBreak(Parented):
2424
Note these are never inserted by `python-docx` because it has no rendering function.
2525
These are generally only useful for text-extraction of existing documents when
2626
`python-docx` is being used solely as a document "reader".
27+
28+
NOTE: a rendered page-break can occur within a hyperlink; consider a multi-word
29+
hyperlink like "excellent Wikipedia article on LLMs" that happens to fall close to
30+
the end of the last line on a page such that the page breaks between "Wikipedia" and
31+
"article". In such a "page-breaks-in-hyperlink" case, THESE METHODS WILL "MOVE" THE
32+
PAGE-BREAK to occur after the hyperlink, such that the entire hyperlink appears in
33+
the paragraph returned by `.preceding_paragraph_fragment`. While this places the
34+
"tail" text of the hyperlink on the "wrong" page, it avoids having two hyperlinks
35+
each with a fragment of the actual text and pointing to the same address.
2736
"""
2837

2938
def __init__(
@@ -51,16 +60,43 @@ def preceding_paragraph_fragment(self) -> Paragraph | None:
5160
familiar container (`Paragraph`) to interrogate for the content preceding this
5261
page-break in the paragraph in which it occurred.
5362
54-
Also note that a rendered page-break can occur within a hyperlink; consider a
55-
multi-word hyperlink like "excellent Wikipedia article on LLMs" that happens to
56-
fall at the end of the last line on a page. THIS METHOD WILL "MOVE" the
57-
page-break to occur after such a hyperlink. While this places the "tail" text of
58-
the hyperlink on the "wrong" page, it avoids having two hyperlinks each with a
59-
fragment of the actual text and pointing to the same address.
63+
Contains the entire hyperlink when this break occurs within a hyperlink.
6064
"""
6165
if self._lastRenderedPageBreak.precedes_all_content:
6266
return None
6367

6468
from docx.text.paragraph import Paragraph
6569

6670
return Paragraph(self._lastRenderedPageBreak.preceding_fragment_p, self._parent)
71+
72+
@property
73+
def following_paragraph_fragment(self) -> Paragraph | None:
74+
"""A "loose" paragraph containing the content following this page-break.
75+
76+
HAS POTENTIALLY SURPRISING BEHAVIORS so read carefully to be sure this is what
77+
you want. This is primarily targeted toward text-extraction use-cases for which
78+
precisely associating text with the page it occurs on is important.
79+
80+
Compare `.preceding_paragraph_fragment` as these two are intended to be used
81+
together.
82+
83+
This value is `None` when no content follows this page-break. This case is
84+
unlikely to occur in practice because Word places even-paragraph-boundary
85+
page-breaks on the paragraph *following* the page-break. Still, it is possible
86+
and must be checked for. Returning `None` for this case avoids "inserting" an
87+
extra, non-existent paragraph into the content stream. Note that content can
88+
include DrawingML items like images or charts, not just text.
89+
90+
The returned paragraph *is divorced from the document body*. Any changes made to
91+
it will not be reflected in the document. It is intended to provide a container
92+
(`Paragraph`) with familiar properties and methods that can be used to
93+
characterize the paragraph content following a mid-paragraph page-break.
94+
95+
Contains no portion of the hyperlink when this break occurs within a hyperlink.
96+
"""
97+
if self._lastRenderedPageBreak.follows_all_content:
98+
return None
99+
100+
from docx.text.paragraph import Paragraph
101+
102+
return Paragraph(self._lastRenderedPageBreak.following_fragment_p, self._parent)

tests/text/test_pagebreak.py

+79
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
from typing import cast
66

7+
import pytest
8+
79
from docx import types as t
810
from docx.oxml.text.paragraph import CT_P
911
from docx.text.pagebreak import RenderedPageBreak
@@ -14,6 +16,17 @@
1416
class DescribeRenderedPageBreak:
1517
"""Unit-test suite for the docx.text.pagebreak.RenderedPageBreak object."""
1618

19+
def it_raises_on_preceding_fragment_when_page_break_is_not_first_in_paragrah(
20+
self, fake_parent: t.StoryChild
21+
):
22+
p_cxml = 'w:p/(w:r/(w:t"abc",w:lastRenderedPageBreak,w:lastRenderedPageBreak))'
23+
p = cast(CT_P, element(p_cxml))
24+
lrpb = p.lastRenderedPageBreaks[-1]
25+
page_break = RenderedPageBreak(lrpb, fake_parent)
26+
27+
with pytest.raises(ValueError, match="only defined on first rendered page-br"):
28+
page_break.preceding_paragraph_fragment
29+
1730
def it_produces_None_for_preceding_fragment_when_page_break_is_leading(
1831
self, fake_parent: t.StoryChild
1932
):
@@ -66,3 +79,69 @@ def and_it_can_split_off_the_preceding_paragraph_content_when_in_a_hyperlink(
6679
expected_cxml = 'w:p/(w:pPr/w:ind,w:hyperlink/w:r/(w:t"foo",w:t"bar"))'
6780
assert preceding_fragment is not None
6881
assert preceding_fragment._p.xml == xml(expected_cxml)
82+
83+
def it_raises_on_following_fragment_when_page_break_is_not_first_in_paragrah(
84+
self, fake_parent: t.StoryChild
85+
):
86+
p_cxml = 'w:p/(w:r/(w:lastRenderedPageBreak,w:lastRenderedPageBreak,w:t"abc"))'
87+
p = cast(CT_P, element(p_cxml))
88+
lrpb = p.lastRenderedPageBreaks[-1]
89+
page_break = RenderedPageBreak(lrpb, fake_parent)
90+
91+
with pytest.raises(ValueError, match="only defined on first rendered page-br"):
92+
page_break.following_paragraph_fragment
93+
94+
def it_produces_None_for_following_fragment_when_page_break_is_trailing(
95+
self, fake_parent: t.StoryChild
96+
):
97+
"""A page-break with no following content is "trailing"."""
98+
p_cxml = 'w:p/(w:pPr/w:ind,w:r/(w:t"foo",w:t"bar",w:lastRenderedPageBreak))'
99+
p = cast(CT_P, element(p_cxml))
100+
lrpb = p.lastRenderedPageBreaks[0]
101+
page_break = RenderedPageBreak(lrpb, fake_parent)
102+
103+
following_fragment = page_break.following_paragraph_fragment
104+
105+
assert following_fragment is None
106+
107+
def it_can_split_off_the_following_paragraph_content_when_in_a_run(
108+
self, fake_parent: t.StoryChild
109+
):
110+
p_cxml = (
111+
"w:p/("
112+
" w:pPr/w:ind"
113+
' ,w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
114+
' ,w:r/w:t"foo"'
115+
")"
116+
)
117+
p = cast(CT_P, element(p_cxml))
118+
lrpb = p.lastRenderedPageBreaks[0]
119+
page_break = RenderedPageBreak(lrpb, fake_parent)
120+
121+
following_fragment = page_break.following_paragraph_fragment
122+
123+
expected_cxml = 'w:p/(w:pPr/w:ind,w:r/w:t"bar",w:r/w:t"foo")'
124+
assert following_fragment is not None
125+
assert following_fragment._p.xml == xml(expected_cxml)
126+
127+
def and_it_can_split_off_the_following_paragraph_content_when_in_a_hyperlink(
128+
self, fake_parent: t.StoryChild
129+
):
130+
p_cxml = (
131+
"w:p/("
132+
" w:pPr/w:ind"
133+
' ,w:hyperlink/w:r/(w:t"foo",w:lastRenderedPageBreak,w:t"bar")'
134+
' ,w:r/w:t"baz"'
135+
' ,w:r/w:t"qux"'
136+
")"
137+
)
138+
p = cast(CT_P, element(p_cxml))
139+
lrpb = p.lastRenderedPageBreaks[0]
140+
page_break = RenderedPageBreak(lrpb, fake_parent)
141+
142+
following_fragment = page_break.following_paragraph_fragment
143+
144+
expected_cxml = 'w:p/(w:pPr/w:ind,w:r/w:t"baz",w:r/w:t"qux")'
145+
146+
assert following_fragment is not None
147+
assert following_fragment._p.xml == xml(expected_cxml)

0 commit comments

Comments
 (0)