Merge pull request #1058 from PyThaiNLP/wannaphong/add-display-cell-tokenizer

wannaphong · web-flow · commit ef0e01d7f371 · 2025-01-13T12:53:40.000+07:00
Add display cell tokenizer
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
@@ -44,6 +44,8 @@ Modules
     
     The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.
 
+.. autofunction:: display_cell_tokenize
+
 Tokenization Engines
 --------------------
 
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
     "syllable_tokenize",
     "word_detokenize",
     "word_tokenize",
+    "display_cell_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
     syllable_tokenize,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -733,6 +733,46 @@ def syllable_tokenize(
     )
 
 
+def display_cell_tokenize(text: str) -> List[str]:
+    """
+    Display cell tokenizer.
+
+    Tokenizes Thai text into display cells without splitting tone marks:
+    each cell is one character followed by any run of combining marks
+    (U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E) attached to it.
+
+    SARA AM (U+0E33) is decomposed into NIKHAHIT (U+0E4D) + SARA AA
+    (U+0E32) before splitting, so joining the returned cells does not
+    reproduce input that contains SARA AM.
+
+    :param str text: text to be tokenized
+    :return: list of display cells; an empty list if `text` is empty
+        or is not a string
+    :rtype: List[str]
+    :Example:
+
+    Tokenize Thai text into display cells::
+
+        from pythainlp.tokenize import display_cell_tokenize
+
+        text = "แม่น้ำอยู่ที่ไหน"
+        display_cell_tokenize(text)
+        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    # Decompose SARA AM so NIKHAHIT stays on the preceding cell. Written as
+    # explicit code points because the two literals look alike on screen.
+    text = text.replace("\u0e33", "\u0e4d\u0e32")
+
+    # One regex pass instead of a re.match call per character. A cell is a
+    # non-combining character plus its trailing marks; the second
+    # alternative keeps marks that open the text with no base character.
+    marks = r"\u0E31\u0E34-\u0E3A\u0E47-\u0E4E"
+    return re.findall("[^{0}][{0}]*|[{0}]+".format(marks), text)
+
+
 class Tokenizer:
     """
     Tokenizer class for a custom tokenizer.
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
     tcc_p,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 from pythainlp.util import dict_trie
 
@@ -638,3 +639,13 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
+
+    def test_display_cell_tokenize(self):
+        """Check display-cell output for empty input and Thai samples."""
+        # (input text, expected display cells), asserted in this order.
+        expectations = (
+            ("", []),
+            (
+                "แม่น้ำอยู่ที่ไหน",
+                ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
+            ),
+            ("สวัสดี", ["ส", "วั", "ส", "ดี"]),
+            ("ทดสอบ", ["ท", "ด", "ส", "อ", "บ"]),
+            ("ภาษาไทย", ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"]),
+        )
+        for raw_text, cells in expectations:
+            self.assertEqual(display_cell_tokenize(raw_text), cells)

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`	`"syllable_tokenize",`
`17`	`17`	`"word_detokenize",`
`18`	`18`	`"word_tokenize",`
	`19`	`+ "display_cell_tokenize",`
`19`	`20`	`]`
`20`	`21`
`21`	`22`	`from pythainlp.corpus import thai_syllables, thai_words`
`@@ -38,6 +39,7 @@`
`38`	`39`	`syllable_tokenize,`
`39`	`40`	`word_detokenize,`
`40`	`41`	`word_tokenize,`
	`42`	`+ display_cell_tokenize,`
`41`	`43`	`)`
`42`	`44`
`43`	`45`	`from pythainlp.corpus import get_corpus as _get_corpus`