Skip to content

Commit 557fdee

Browse files
committed
acpt: add RenderedPageBreak split-para scenarios
1 parent 9abd14a commit 557fdee

File tree

2 files changed

+199
-0
lines changed

2 files changed

+199
-0
lines changed

features/pbk-split-para.feature

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Feature: Split paragraph on rendered page-breaks
2+
In order to extract document content with high page-attribution fidelity
3+
As a developer using python-docx
4+
I need a way to split a paragraph on its first rendered page break
5+
6+
7+
@wip
8+
Scenario: RenderedPageBreak.preceding_paragraph_fragment is the content before break
9+
Given a rendered_page_break in a paragraph
10+
Then rendered_page_break.preceding_paragraph_fragment is the content before break
11+
12+
13+
@wip
14+
Scenario: RenderedPageBreak.preceding_paragraph_fragment includes the hyperlink
15+
Given a rendered_page_break in a hyperlink
16+
Then rendered_page_break.preceding_paragraph_fragment includes the hyperlink
17+
18+
19+
@wip
20+
Scenario: RenderedPageBreak.following_paragraph_fragment is the content after break
21+
Given a rendered_page_break in a paragraph
22+
Then rendered_page_break.following_paragraph_fragment is the content after break
23+
24+
25+
@wip
26+
Scenario: RenderedPageBreak.following_paragraph_fragment excludes the hyperlink
27+
Given a rendered_page_break in a hyperlink
28+
Then rendered_page_break.following_paragraph_fragment excludes the hyperlink

features/steps/pagebreak.py

+171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""Step implementations for rendered page-break related features."""
2+
3+
from __future__ import annotations
4+
5+
from behave import given, then
6+
from behave.runner import Context
7+
8+
from docx import Document
9+
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
10+
11+
from helpers import test_docx
12+
13+
# given ===================================================
14+
15+
16+
@given("a rendered_page_break in a hyperlink")
def given_a_rendered_page_break_in_a_hyperlink(context: Context):
    """Store a rendered page-break from the test document on `context`.

    Opens the `par-rendered-page-breaks` test document and assigns the first
    rendered page-break of the paragraph at index 2 to
    `context.rendered_page_break`. Per the step text this is the break located
    inside a hyperlink; that depends on the fixture file's contents.
    """
    docx_path = test_docx("par-rendered-page-breaks")
    # -- third paragraph of the fixture is the one this step targets --
    target_paragraph = Document(docx_path).paragraphs[2]
    context.rendered_page_break = target_paragraph.rendered_page_breaks[0]
21+
22+
23+
@given("a rendered_page_break in a paragraph")
def given_a_rendered_page_break_in_a_paragraph(context: Context):
    """Store a rendered page-break from the test document on `context`.

    Opens the `par-rendered-page-breaks` test document and assigns the first
    rendered page-break of the paragraph at index 1 to
    `context.rendered_page_break`.
    """
    docx_path = test_docx("par-rendered-page-breaks")
    # -- second paragraph of the fixture is the one this step targets --
    target_paragraph = Document(docx_path).paragraphs[1]
    context.rendered_page_break = target_paragraph.rendered_page_breaks[0]
28+
29+
30+
# then =====================================================
31+
32+
33+
@then("rendered_page_break.preceding_paragraph_fragment includes the hyperlink")
def then_rendered_page_break_preceding_paragraph_fragment_includes_the_hyperlink(
    context: Context,
):
    """Verify the preceding fragment when the page-break is in a hyperlink.

    Reads `context.rendered_page_break.preceding_paragraph_fragment` and checks,
    in order: its type name, its text, its alignment, the style name of the
    first run of its first hyperlink, and that hyperlink's address. The first
    mismatch raises `AssertionError` reporting the expected and actual values.
    """
    fragment = context.rendered_page_break.preceding_paragraph_fragment

    # -- each probe is a thunk so a value is read only after every earlier
    # -- check has passed, the same as a flat sequence of assert statements
    checks = (
        (lambda: type(fragment).__name__, "Paragraph"),
        (lambda: fragment.text, "Page break in>><<this hyperlink"),
        (
            lambda: fragment.alignment,
            WD_PARAGRAPH_ALIGNMENT.RIGHT,  # pyright: ignore
        ),
        (lambda: fragment.hyperlinks[0].runs[0].style.name, "Hyperlink"),
        (lambda: fragment.hyperlinks[0].address, "http://google.com/"),
    )

    for probe, expected_value in checks:
        actual_value = probe()
        assert (
            actual_value == expected_value
        ), f"expected: '{expected_value}', got: '{actual_value}'"
68+
69+
70+
@then("rendered_page_break.preceding_paragraph_fragment is the content before break")
def then_rendered_page_break_preceding_paragraph_fragment_is_the_content_before_break(
    context: Context,
):
    """Verify the preceding fragment holds only the content before the break.

    Reads `context.rendered_page_break.preceding_paragraph_fragment` and checks,
    in order: its type name, its text, its alignment, and the style name of its
    first run. The first mismatch raises `AssertionError` reporting the expected
    and actual values.
    """
    fragment = context.rendered_page_break.preceding_paragraph_fragment

    # -- each probe is a thunk so a value is read only after every earlier
    # -- check has passed, the same as a flat sequence of assert statements
    checks = (
        (lambda: type(fragment).__name__, "Paragraph"),
        (lambda: fragment.text, "Page break here>>"),
        (
            lambda: fragment.alignment,
            WD_PARAGRAPH_ALIGNMENT.CENTER,  # pyright: ignore
        ),
        (lambda: fragment.runs[0].style.name, "Default Paragraph Font"),
    )

    for probe, expected_value in checks:
        actual_value = probe()
        assert (
            actual_value == expected_value
        ), f"expected: '{expected_value}', got: '{actual_value}'"
99+
100+
101+
@then("rendered_page_break.following_paragraph_fragment excludes the hyperlink")
def then_rendered_page_break_following_paragraph_fragment_excludes_the_hyperlink(
    context: Context,
):
    """Verify the following fragment when the page-break is in a hyperlink.

    Reads `context.rendered_page_break.following_paragraph_fragment` and checks,
    in order: its type name, its text, its alignment, that it has no hyperlinks,
    and the type names of its rendered page-breaks. The first mismatch raises
    `AssertionError` reporting the expected and actual values.
    """
    fragment = context.rendered_page_break.following_paragraph_fragment

    # -- each probe is a thunk so a value is read only after every earlier
    # -- check has passed, the same as a flat sequence of assert statements
    checks = (
        # -- paragraph fragment is a Paragraph object --
        (lambda: type(fragment).__name__, "Paragraph"),
        # -- paragraph text is only the fragment after the page-break --
        (lambda: fragment.text, " and another one here>><<with text following"),
        # -- paragraph properties are preserved --
        (
            lambda: fragment.alignment,
            WD_PARAGRAPH_ALIGNMENT.RIGHT,  # pyright: ignore
        ),
        # -- paragraph has no hyperlinks --
        (lambda: fragment.hyperlinks, []),
        # -- following paragraph fragment retains any remaining page-breaks --
        (
            lambda: [type(rpb).__name__ for rpb in fragment.rendered_page_breaks],
            ["RenderedPageBreak"],
        ),
    )

    for probe, expected_value in checks:
        actual_value = probe()
        assert (
            actual_value == expected_value
        ), f"expected: '{expected_value}', got: '{actual_value}'"
141+
142+
143+
@then("rendered_page_break.following_paragraph_fragment is the content after break")
def then_rendered_page_break_following_paragraph_fragment_is_the_content_after_break(
    context: Context,
):
    """Verify the following fragment holds only the content after the break.

    Reads `context.rendered_page_break.following_paragraph_fragment` and checks,
    in order: its type name, its text, its alignment, and the style name of its
    first run. The first mismatch raises `AssertionError` reporting the expected
    and actual values.
    """
    fragment = context.rendered_page_break.following_paragraph_fragment

    # -- each probe is a thunk so a value is read only after every earlier
    # -- check has passed, the same as a flat sequence of assert statements
    checks = (
        (lambda: type(fragment).__name__, "Paragraph"),
        (lambda: fragment.text, "<<followed by more text."),
        (
            lambda: fragment.alignment,
            WD_PARAGRAPH_ALIGNMENT.CENTER,  # pyright: ignore
        ),
        (lambda: fragment.runs[0].style.name, "Default Paragraph Font"),
    )

    for probe, expected_value in checks:
        actual_value = probe()
        assert (
            actual_value == expected_value
        ), f"expected: '{expected_value}', got: '{actual_value}'"

0 commit comments

Comments
 (0)