Commit b11fe00

Merge pull request #927 from ayaan-qadri/dev
Added list of string support to sent_tokenize
2 parents 2875544 + 1a2b457 commit b11fe00
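
In short, sent_tokenize() now also accepts a pre-tokenized list of words and returns the sentences as lists of those words. A quick sketch of the new behaviour, mirroring the whitespace-engine test case added in this commit (import assumed from the installed package):

    from pythainlp.tokenize import sent_tokenize

    words = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
    sent_tokenize(words, engine="whitespace")
    # -> [['ผม', 'กิน', 'ข้าว'], ['\n', 'เธอ', 'เล่น', 'เกม']]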

File tree: 2 files changed (+117, -15 lines)

pythainlp/tokenize/core.py (+100, -15)
@@ -6,6 +6,7 @@
 """
 import re
 from typing import Iterable, List, Union
+import copy

 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -198,7 +199,7 @@ def word_tokenize(

         word_tokenize(text, engine="newmm", keep_whitespace=False)
         # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
-
+
     Join broken formatted numeric (e.g. time, decimals, IP addresses)::

         text = "เงิน1,234บาท19:32น 127.0.0.1"
@@ -322,17 +323,50 @@ def word_tokenize(
     return segments


+def indices_words(words):
+    indices = []
+    start_index = 0
+    for word in words:
+        end_index = start_index + len(word) - 1
+        indices.append((start_index, end_index))
+        start_index += len(word)
+
+    return indices
+
+
+def map_indices_to_words(index_list, sentences):
+    result = []
+    c = copy.copy(index_list)
+    n_sum = 0
+    for sentence in sentences:
+        words = sentence
+        sentence_result = []
+        n = 0
+        for start, end in c:
+            if start > n_sum + len(words) - 1:
+                break
+            else:
+                word = sentence[start - n_sum:end + 1 - n_sum]
+                sentence_result.append(word)
+                n += 1
+
+        result.append(sentence_result)
+        n_sum += len(words)
+        for _ in range(n):
+            del c[0]
+    return result
+
 def sent_tokenize(
-    text: str,
+    text: Union[str, List[str]],
     engine: str = DEFAULT_SENT_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
 ) -> List[str]:
     """
     Sentence tokenizer.

-    Tokenizes running text into "sentences"
+    Tokenizes running text into "sentences". Supports both string and list of strings.

-    :param str text: the text to be tokenized
+    :param text: the text (string) or list of words (list of strings) to be tokenized
     :param str engine: choose among *'crfcut'*, *'whitespace'*, \
         *'whitespace+newline'*
     :return: list of split sentences
@@ -394,38 +428,84 @@ def sent_tokenize(
         'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
     """

-    if not text or not isinstance(text, str):
+    if not text or not isinstance(text, (str, list)):
         return []

+    is_list_input = isinstance(text, list)
+
+    if is_list_input:
+
+        try:
+            original_text = "".join(text)
+        except ValueError:
+            return []
+
+    else:
+        original_text = text
+
     segments = []

     if engine == "crfcut":
         from pythainlp.tokenize.crfcut import segment

-        segments = segment(text)
+        segments = segment(original_text)
+
+        if is_list_input:
+            word_indices = indices_words(text)
+            result = map_indices_to_words(word_indices, [original_text])
+            return result
     elif engine == "whitespace":
-        segments = re.split(r" +", text, flags=re.U)
+        segments = re.split(r" +", original_text, flags=re.U)
+        if is_list_input:
+            result = []
+            _temp = []
+            for i, w in enumerate(text):
+                if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
+                    if _temp == []:
+                        continue
+                    result.append(_temp)
+                    _temp = []
+                else:
+                    _temp.append(w)
+                if i + 1 == len(text):
+                    result.append(_temp)
+            return result
     elif engine == "whitespace+newline":
-        segments = text.split()
+        segments = original_text.split()
+        if is_list_input:
+            result = []
+            _temp = []
+            for i, w in enumerate(text):
+                if (
+                    (re.findall(r"\s", w) != [] or
+                     re.findall(r"\n", w) != []) and
+                    re.findall(r"\w", w) == []
+                ):
+                    if _temp == []:
+                        continue
+                    result.append(_temp)
+                    _temp = []
+                else:
+                    _temp.append(w)
+                if i + 1 == len(text):
+                    result.append(_temp)
+            return result
     elif engine == "tltk":
         from pythainlp.tokenize.tltk import sent_tokenize as segment
-
-        segments = segment(text)
+        segments = segment(original_text)
     elif engine == "thaisum":
         from pythainlp.tokenize.thaisumcut import (
             ThaiSentenceSegmentor as segmentor,
         )
-
         segment = segmentor()
-        segments = segment.split_into_sentences(text)
+        segments = segment.split_into_sentences(original_text)
     elif engine.startswith("wtp"):
         if "-" not in engine:
             _size = "mini"
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-
-        segments = segment(text, size=_size, tokenize="sentence")
+        segments = segment(original_text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
@@ -435,7 +515,12 @@ def sent_tokenize(
     if not keep_whitespace:
         segments = strip_whitespace(segments)

-    return segments
+    if is_list_input and engine not in ["crfcut"]:
+        word_indices = indices_words(text)
+        result = map_indices_to_words(word_indices, segments)
+        return result
+    else:
+        return [segments]


 def paragraph_tokenize(
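
For reference, a minimal sketch (not part of the diff) of how the two new helpers above fit together; the word list and the two-segment split below are hypothetical:

    words = ["ผม", "กิน", "ข้าว", "เธอ", "เล่น", "เกม"]
    text = "".join(words)            # "ผมกินข้าวเธอเล่นเกม"

    spans = indices_words(words)     # character span of each word in text
    # [(0, 1), (2, 4), (5, 8), (9, 11), (12, 15), (16, 18)]

    # Given segments that exactly partition the joined text, the spans are
    # mapped back so that each segment becomes a list of the original words:
    map_indices_to_words(spans, ["ผมกินข้าว", "เธอเล่นเกม"])
    # -> [['ผม', 'กิน', 'ข้าว'], ['เธอ', 'เล่น', 'เกม']]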

tests/test_tokenize.py (+17)

@@ -333,6 +333,23 @@ def test_sent_tokenize(self):
         # engine="wtp-large",
         # ),
         # )
+        sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="crfcut"),
+            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="whitespace"),
+            [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="whitespace+newline"),
+            [["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="thaisum"),
+            [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
+        )
         self.assertFalse(
             " "
             in sent_tokenize(
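
The new assertions should be reproducible locally with the standard unittest runner, e.g. (module path assumed from the repository layout):

    python -m unittest tests.test_tokenize -k test_sent_tokenize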
