Skip to content

Add display cell tokenizer #1058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ Modules

The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.

.. autofunction:: display_cell_tokenize

Tokenization Engines
--------------------

Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"syllable_tokenize",
"word_detokenize",
"word_tokenize",
"display_cell_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
Expand All @@ -38,6 +39,7 @@
syllable_tokenize,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
Expand Down
40 changes: 40 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,46 @@ def syllable_tokenize(
)


def display_cell_tokenize(text: str) -> List[str]:
    """
    Display cell tokenizer.

    Tokenizes Thai text into display cells without splitting tone marks:
    each cell is a base character plus any combining characters
    (vowels above/below, tone marks) that render in the same column.

    :param str text: text to be tokenized
    :return: list of display cells (empty list for empty/non-string input)
    :rtype: List[str]
    :Example:

    Tokenize Thai text into display cells::

        from pythainlp.tokenize import display_cell_tokenize

        text = "แม่น้ำอยู่ที่ไหน"
        display_cell_tokenize(text)
        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
    """
    if not text or not isinstance(text, str):
        return []

    # Thai combining characters that attach to the preceding base character:
    # MAI HAN-AKAT (U+0E31), vowels/marks above and below plus PHINTHU
    # (U+0E34-U+0E3A), and tone marks/signs above (U+0E47-U+0E4E).
    # Compiled once per call instead of re-matching the pattern per character.
    combining = re.compile(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]")

    # Decompose SARA AM (U+0E33) into NIKHAHIT (U+0E4D) + SARA AA (U+0E32)
    # so the nikhahit part stays in the preceding consonant's cell.
    text = text.replace("ำ", "ํา")

    display_cells = []
    current_cell = ""

    for char in text:
        if combining.match(char):
            # Combining mark: extend the current display cell.
            current_cell += char
        else:
            # Base character: flush the previous cell and start a new one.
            if current_cell:
                display_cells.append(current_cell)
            current_cell = char

    if current_cell:
        display_cells.append(current_cell)

    return display_cells


class Tokenizer:
"""
Tokenizer class for a custom tokenizer.
Expand Down
11 changes: 11 additions & 0 deletions tests/core/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
tcc_p,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)
from pythainlp.util import dict_trie

Expand Down Expand Up @@ -604,3 +605,13 @@ def test_tcc_p(self):
# )
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())

def test_display_cell_tokenize(self):
    # Table-driven check: (input text, expected display cells).
    cases = [
        ("", []),
        (
            "แม่น้ำอยู่ที่ไหน",
            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
        ),
        ("สวัสดี", ['ส', 'วั', 'ส', 'ดี']),
        ("ทดสอบ", ["ท", "ด", "ส", "อ", "บ"]),
        ("ภาษาไทย", ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"]),
    ]
    for text, expected in cases:
        self.assertEqual(display_cell_tokenize(text), expected)
Loading