textsplitter.py
import logging
from abc import ABC
from collections.abc import Generator

import tiktoken

from .page import Page, SplitPage

logger = logging.getLogger("scripts")


class TextSplitter(ABC):
    """
    Splits a list of pages into smaller chunks
    :param pages: The pages to split
    :return: A generator of SplitPage
    """

    def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
        if False:
            yield  # pragma: no cover - this is necessary for mypy to type check


ENCODING_MODEL = "text-embedding-ada-002"

STANDARD_WORD_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]

# See W3C document https://www.w3.org/TR/jlreq/#cl-01
CJK_WORD_BREAKS = [
    "、",
    "，",
    "；",
    "：",
    "（",
    "）",
    "【",
    "】",
    "「",
    "」",
    "『",
    "』",
    "〔",
    "〕",
    "〈",
    "〉",
    "《",
    "》",
    "〖",
    "〗",
    "〘",
    "〙",
    "〚",
    "〛",
    "〝",
    "〞",
    "〟",
    "〰",
    "–",
    "—",
    "‘",
    "’",
    "‚",
    "‛",
    "“",
    "”",
    "„",
    "‟",
    "‹",
    "›",
]

STANDARD_SENTENCE_ENDINGS = [".", "!", "?"]

# See CL05 and CL06, based on JIS X 4051:2004
# https://www.w3.org/TR/jlreq/#cl-04
CJK_SENTENCE_ENDINGS = ["。", "！", "？", "‼", "⁇", "⁈", "⁉"]

# NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002
bpe = tiktoken.encoding_for_model(ENCODING_MODEL)

DEFAULT_OVERLAP_PERCENT = 10  # See semantic search article for 10% overlap performance
DEFAULT_SECTION_LENGTH = 1000  # Roughly 400-500 tokens for English
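

# Illustrative sketch (not part of the original module): DEFAULT_SECTION_LENGTH above is
# a character budget, while SentenceTextSplitter.max_tokens_per_section below is a token
# budget. Per the comment above, ~1000 English characters encode to roughly 400-500
# tokens, i.e. about 2-2.5 characters per token with this encoding.
def _count_tokens(text: str) -> int:
    """Count BPE tokens for a string using the module-level encoding (helper added for illustration only)."""
    return len(bpe.encode(text))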


class SentenceTextSplitter(TextSplitter):
    """
    Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once.
    """

    def __init__(self, max_tokens_per_section: int = 500):
        self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS
        self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS
        self.max_section_length = DEFAULT_SECTION_LENGTH
        self.sentence_search_limit = 100
        self.max_tokens_per_section = max_tokens_per_section
        self.section_overlap = int(self.max_section_length * DEFAULT_OVERLAP_PERCENT / 100)

    def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
        """
        Recursively splits a page by maximum number of tokens to better handle languages with higher token/word ratios.
        """
        tokens = bpe.encode(text)
        if len(tokens) <= self.max_tokens_per_section:
            # Section is already within max tokens, return
            yield SplitPage(page_num=page_num, text=text)
        else:
            # Start from the center and try to find the closest sentence ending by spiralling outward.
            # If we reach the outer thirds, just split in half with a DEFAULT_OVERLAP_PERCENT overlap on each side of the midpoint.
            start = int(len(text) // 2)
            pos = 0
            boundary = int(len(text) // 3)
            split_position = -1
            while start - pos > boundary:
                if text[start - pos] in self.sentence_endings:
                    split_position = start - pos
                    break
                elif text[start + pos] in self.sentence_endings:
                    split_position = start + pos
                    break
                else:
                    pos += 1

            if split_position > 0:
                first_half = text[: split_position + 1]
                second_half = text[split_position + 1 :]
            else:
                # Split the page in half and call this function again
                # Overlap first and second halves by DEFAULT_OVERLAP_PERCENT%
                middle = int(len(text) // 2)
                overlap = int(len(text) * (DEFAULT_OVERLAP_PERCENT / 100))
                first_half = text[: middle + overlap]
                second_half = text[middle - overlap :]
            yield from self.split_page_by_max_tokens(page_num, first_half)
            yield from self.split_page_by_max_tokens(page_num, second_half)
    def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
        def find_page(offset):
            num_pages = len(pages)
            for i in range(num_pages - 1):
                if offset >= pages[i].offset and offset < pages[i + 1].offset:
                    return pages[i].page_num
            return pages[num_pages - 1].page_num

        all_text = "".join(page.text for page in pages)
        if len(all_text.strip()) == 0:
            return

        length = len(all_text)
        if length <= self.max_section_length:
            yield from self.split_page_by_max_tokens(page_num=find_page(0), text=all_text)
            return

        start = 0
        end = length
        while start + self.section_overlap < length:
            last_word = -1
            end = start + self.max_section_length

            if end > length:
                end = length
            else:
                # Try to find the end of the sentence
                while (
                    end < length
                    and (end - start - self.max_section_length) < self.sentence_search_limit
                    and all_text[end] not in self.sentence_endings
                ):
                    if all_text[end] in self.word_breaks:
                        last_word = end
                    end += 1
                if end < length and all_text[end] not in self.sentence_endings and last_word > 0:
                    end = last_word  # Fall back to at least keeping a whole word
            if end < length:
                end += 1

            # Try to find the start of the sentence or at least a whole word boundary
            last_word = -1
            while (
                start > 0
                and start > end - self.max_section_length - 2 * self.sentence_search_limit
                and all_text[start] not in self.sentence_endings
            ):
                if all_text[start] in self.word_breaks:
                    last_word = start
                start -= 1
            if all_text[start] not in self.sentence_endings and last_word > 0:
                start = last_word
            if start > 0:
                start += 1

            section_text = all_text[start:end]
            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)

            last_figure_start = section_text.rfind("<figure")
            if last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
                "</figure"
            ):
                # If the section ends with an unclosed figure, we need to start the next section with the figure.
                start = min(end - self.section_overlap, start + last_figure_start)
                logger.info(
                    f"Section ends with unclosed figure, starting next section with the figure at page {find_page(start)} offset {start} figure start {last_figure_start}"
                )
            else:
                start = end - self.section_overlap

        if start + self.section_overlap < end:
            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=all_text[start:end])


class SimpleTextSplitter(TextSplitter):
    """
    Class that splits pages into smaller chunks based on a max object length. It is not aware of the content of the page.
    This is required because embedding models may not be able to analyze an entire page at once.
    """

    def __init__(self, max_object_length: int = 1000):
        self.max_object_length = max_object_length

    def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
        all_text = "".join(page.text for page in pages)
        if len(all_text.strip()) == 0:
            return

        length = len(all_text)
        if length <= self.max_object_length:
            yield SplitPage(page_num=0, text=all_text)
            return

        # It's too big, so we need to split it
        for i in range(0, length, self.max_object_length):
            yield SplitPage(page_num=i // self.max_object_length, text=all_text[i : i + self.max_object_length])
        return
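

# --- Usage sketch (illustrative, not part of the original module) ------------------
# Shows one way the splitters above might be driven. It assumes Page can be constructed
# as Page(page_num=..., offset=..., text=...) and that SplitPage exposes page_num/text
# attributes; both are only inferred from how they are used in this module, so check
# .page for the real definitions before relying on them.
def _example_usage() -> None:
    first_text = "First page. " * 200
    second_text = "Second page. " * 200
    pages = [
        Page(page_num=0, offset=0, text=first_text),
        Page(page_num=1, offset=len(first_text), text=second_text),
    ]

    # Sentence-aware splitting with a token cap per section
    for split in SentenceTextSplitter(max_tokens_per_section=500).split_pages(pages):
        logger.info("sentence split -> page %d, %d characters", split.page_num, len(split.text))

    # Content-agnostic splitting into fixed-size chunks
    for split in SimpleTextSplitter(max_object_length=1000).split_pages(pages):
        logger.info("simple split -> chunk %d, %d characters", split.page_num, len(split.text))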