Skip to content

Add display cell tokenizer #1058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ Modules

The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.

.. autofunction:: display_cell_tokenize

Tokenization Engines
--------------------

Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"syllable_tokenize",
"word_detokenize",
"word_tokenize",
"display_cell_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
Expand All @@ -38,6 +39,7 @@
syllable_tokenize,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
Expand Down
40 changes: 40 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,46 @@ def syllable_tokenize(
)


def display_cell_tokenize(text: str) -> List[str]:
    """
    Display cell tokenizer.

    Tokenizes Thai text into display cells without splitting tone marks:
    each cell is a base character plus any combining characters
    (vowels above/below, tone marks) that render in the same column.

    :param str text: text to be tokenized
    :return: list of display cells (empty list for empty/non-string input)
    :rtype: List[str]
    :Example:

    Tokenize Thai text into display cells::

        from pythainlp.tokenize import display_cell_tokenize

        text = "แม่น้ำอยู่ที่ไหน"
        display_cell_tokenize(text)
        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
    """
    if not text or not isinstance(text, str):
        return []

    # Thai combining characters that attach to the preceding base character:
    # MAI HAN-AKAT (U+0E31), vowels/marks above and below plus PHINTHU
    # (U+0E34-U+0E3A), and tone marks/signs above (U+0E47-U+0E4E).
    # Compiled once per call instead of re-matching the pattern per character.
    combining = re.compile(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]")

    # Decompose SARA AM (U+0E33) into NIKHAHIT (U+0E4D) + SARA AA (U+0E32)
    # so the nikhahit part stays in the preceding consonant's cell.
    text = text.replace("ำ", "ํา")

    display_cells = []
    current_cell = ""

    for char in text:
        if combining.match(char):
            # Combining mark: extend the current display cell.
            current_cell += char
        else:
            # Base character: flush the previous cell and start a new one.
            if current_cell:
                display_cells.append(current_cell)
            current_cell = char

    if current_cell:
        display_cells.append(current_cell)

    return display_cells


class Tokenizer:
"""
Tokenizer class for a custom tokenizer.
Expand Down
11 changes: 11 additions & 0 deletions tests/core/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
tcc_p,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)
from pythainlp.util import dict_trie

Expand Down Expand Up @@ -604,3 +605,13 @@ def test_tcc_p(self):
# )
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())

def test_display_cell_tokenize(self):
    # Table-driven check: (input text, expected display cells).
    cases = [
        ("", []),
        (
            "แม่น้ำอยู่ที่ไหน",
            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
        ),
        ("สวัสดี", ['ส', 'วั', 'ส', 'ดี']),
        ("ทดสอบ", ["ท", "ด", "ส", "อ", "บ"]),
        ("ภาษาไทย", ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"]),
    ]
    for text, expected in cases:
        self.assertEqual(display_cell_tokenize(text), expected)
Loading