Skip to content

Commit 2287b30

Browse files
authored
Merge pull request #1024 from PyThaiNLP/remove-clause_tokenize
Remove clause_tokenize
2 parents 216d443 + 3e8501f commit 2287b30

File tree

5 files changed

+0
-127
lines changed

5 files changed

+0
-127
lines changed

docs/api/tokenize.rst

-4
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a
88
Modules
99
-------
1010

11-
.. autofunction:: clause_tokenize
12-
:noindex:
13-
14-
Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks.
1511

1612
.. autofunction:: sent_tokenize
1713
:noindex:

pythainlp/tokenize/__init__.py

-2
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
"THAI2FIT_TOKENIZER",
1010
"Tokenizer",
1111
"Trie",
12-
"clause_tokenize",
1312
"paragraph_tokenize",
1413
"sent_tokenize",
1514
"subword_tokenize",
@@ -32,7 +31,6 @@
3231

3332
from pythainlp.tokenize.core import (
3433
Tokenizer,
35-
clause_tokenize,
3634
paragraph_tokenize,
3735
sent_tokenize,
3836
subword_tokenize,

pythainlp/tokenize/core.py

-37
Original file line number | Diff line number | Diff line change
@@ -25,43 +25,6 @@
2525
from pythainlp.util.trie import Trie, dict_trie
2626

2727

28-
def clause_tokenize(doc: List[str]) -> List[List[str]]:
29-
"""
30-
Clause tokenizer. (or Clause segmentation)
31-
Tokenizes running word list into list of clauses (list of strings).
32-
Split by CRF trained on Blackboard Treebank.
33-
34-
:param str doc: word list to be clause tokenized
35-
:return: list of clauses
36-
:rtype: list[list[str]]
37-
:Example:
38-
::
39-
40-
from pythainlp.tokenize import clause_tokenize
41-
42-
clause_tokenize(
43-
[
44-
"ฉัน",
45-
"นอน",
46-
"และ",
47-
"คุณ",
48-
"เล่น",
49-
"มือถือ",
50-
"ส่วน",
51-
"น้อง",
52-
"เขียน",
53-
"โปรแกรม",
54-
]
55-
)
56-
# [['ฉัน', 'นอน'],
57-
# ['และ', 'คุณ', 'เล่น', 'มือถือ'],
58-
# ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
59-
"""
60-
from pythainlp.tokenize.crfcls import segment
61-
62-
return segment(doc)
63-
64-
6528
def word_detokenize(
6629
segments: Union[List[List[str]], List[str]], output: str = "str"
6730
) -> Union[List[str], str]:

pythainlp/tokenize/crfcls.py

-77
This file was deleted.

tests/extra/testx_tokenize.py

-7
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020
tltk,
2121
word_tokenize,
2222
)
23-
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
2423

2524
from ..core.test_tokenize import (
2625
SENT_1,
@@ -31,12 +30,6 @@
3130
)
3231

3332

34-
class ClauseTokenizeTestCase(unittest.TestCase):
35-
def test_clause_tokenize(self):
36-
self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"]))
37-
self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list)
38-
39-
4033
class DetokenizeTestCase(unittest.TestCase):
4134
def test_numeric_data_format(self):
4235
engines = ["attacut", "deepcut", "sefr_cut"]

0 commit comments

Comments (0)