Skip to content

Commit ef0e01d

Browse files
authored
Merge pull request #1058 from PyThaiNLP/wannaphong/add-display-cell-tokenizer
Add display cell tokenizer
2 parents ae4c5fa + 9c86f85 commit ef0e01d

File tree

4 files changed

+55
-0
lines changed

4 files changed

+55
-0
lines changed

docs/api/tokenize.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ Modules
4444

4545
The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.
4646

47+
.. autofunction:: display_cell_tokenize
48+
4749
Tokenization Engines
4850
--------------------
4951

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"syllable_tokenize",
1717
"word_detokenize",
1818
"word_tokenize",
19+
"display_cell_tokenize",
1920
]
2021

2122
from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
3839
syllable_tokenize,
3940
word_detokenize,
4041
word_tokenize,
42+
display_cell_tokenize,
4143
)
4244

4345
from pythainlp.corpus import get_corpus as _get_corpus

pythainlp/tokenize/core.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,46 @@ def syllable_tokenize(
733733
)
734734

735735

736+
# One display cell is either a non-combining character followed by any run
# of Thai combining marks (U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E), or a run
# of such marks that has no preceding character to attach to (only possible
# at the very start of the text). Compiled once instead of matching a
# pattern string against every single character.
_DISPLAY_CELL_RE = re.compile(
    r"[^\u0E31\u0E34-\u0E3A\u0E47-\u0E4E][\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]*"
    r"|[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]+"
)


def display_cell_tokenize(text: str) -> List[str]:
    """
    Display cell tokenizer.

    Tokenizes Thai text into display cells without splitting tone marks.
    Each cell is one character together with the combining marks
    (U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E) that directly follow it.

    Sara am (U+0E33) is first decomposed into nikhahit (U+0E4D) plus
    sara aa (U+0E32); the nikhahit stays with the preceding cell and the
    sara aa becomes a cell of its own, so the returned cells are not
    always substrings of the original text.

    :param str text: text to be tokenized
    :return: list of display cells; an empty list if `text` is empty
        or is not a `str`
    :rtype: List[str]
    :Example:

    Tokenize Thai text into display cells::

        from pythainlp.tokenize import display_cell_tokenize

        text = "แม่น้ำอยู่ที่ไหน"
        display_cell_tokenize(text)
        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
    """
    if not text or not isinstance(text, str):
        return []

    # Decompose sara am (U+0E33) into nikhahit (U+0E4D) + sara aa (U+0E32).
    text = text.replace("\u0e33", "\u0e4d\u0e32")

    # The two alternatives of the pattern cover every possible character,
    # so findall partitions the whole string without dropping anything.
    return _DISPLAY_CELL_RE.findall(text)
774+
775+
736776
class Tokenizer:
737777
"""
738778
Tokenizer class for a custom tokenizer.

tests/core/test_tokenize.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
tcc_p,
2020
word_detokenize,
2121
word_tokenize,
22+
display_cell_tokenize,
2223
)
2324
from pythainlp.util import dict_trie
2425

@@ -638,3 +639,13 @@ def test_tcc_p(self):
638639
# )
639640
self.assertEqual(list(tcc_p.tcc("")), [])
640641
self.assertEqual(tcc_p.tcc_pos(""), set())
642+
643+
def test_display_cell_tokenize(self):
    """Check display_cell_tokenize on empty input and sample Thai words."""
    # Empty input yields no cells.
    self.assertEqual(display_cell_tokenize(""), [])

    # (input text, expected display cells), checked in order.
    expectations = (
        (
            "แม่น้ำอยู่ที่ไหน",
            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
        ),
        ("สวัสดี", ['ส', 'วั', 'ส', 'ดี']),
        ("ทดสอบ", ["ท", "ด", "ส", "อ", "บ"]),
        ("ภาษาไทย", ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"]),
    )
    for sentence, cells in expectations:
        self.assertEqual(display_cell_tokenize(sentence), cells)

0 commit comments

Comments
 (0)