Commit 7ac06ff

Remove pythainlp.tokenize.syllable_tokenize

1 parent 552b6d2

6 files changed: +2 additions, −118 deletions
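
For downstream code, the replacement path shown in this diff is subword_tokenize, which (per the updated tests below) accepts the same "dict" and "ssg" engines the removed function offered. A minimal migration sketch, assuming a PyThaiNLP build that includes this commit:

    from pythainlp.tokenize import subword_tokenize

    # Previously: syllable_tokenize("แมวกินปลา", engine="ssg")
    # Now the same engines are reachable through subword_tokenize:
    print(subword_tokenize("แมวกินปลา", engine="ssg"))    # ['แมว', 'กิน', 'ปลา'] per the updated tests
    print(subword_tokenize("สวัสดีชาวโลก", engine="dict"))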

docs/api/tokenize.rst

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ Modules
 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
     :members:

pythainlp/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -48,7 +48,6 @@
     Tokenizer,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 from pythainlp.transliterate import romanize, transliterate

pythainlp/cli/tokenize.py

Lines changed: 1 addition & 13 deletions
@@ -12,7 +12,6 @@
     DEFAULT_WORD_TOKENIZE_ENGINE,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
@@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
 
-class SyllableTokenizationApp(SubAppBase):
-    def __init__(self, *args, **kwargs):
-        self.keep_whitespace = True
-        self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
-        self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
-        self.run = syllable_tokenize
-        super().__init__(*args, **kwargs)
-
-
 class SentenceTokenizationApp(SubAppBase):
     def __init__(self, *args, **kwargs):
         self.keep_whitespace = True
@@ -132,7 +122,7 @@ def __init__(self, argv):
             ),
         )
         parser.add_argument(
-            "token_type", type=str, help="[subword|syllable|word|sent]",
+            "token_type", type=str, help="[subword|word|sent]",
         )
 
         args = parser.parse_args(argv[2:3])
@@ -142,8 +132,6 @@ def __init__(self, argv):
         argv = argv[3:]
         if token_type.startswith("w"):
             WordTokenizationApp("word", argv)
-        elif token_type.startswith("sy"):
-            SyllableTokenizationApp("syllable", argv)
         elif token_type.startswith("su"):
             SubwordTokenizationApp("subword", argv)
         elif token_type.startswith("se"):
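
On the command line, the syllable token_type disappears with this hunk; subword is the surviving route. A hedged before/after, assuming the console script is installed under the entry-point name thainlp:

    # Before this commit:
    #   thainlp tokenize syllable "รถไฟ"
    # After, per the new help text [subword|word|sent]:
    thainlp tokenize subword "รถไฟ"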

pythainlp/tokenize/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -10,7 +10,6 @@
     "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
-    "syllable_tokenize",
     "word_tokenize",
 ]
 
@@ -31,7 +30,6 @@
     clause_tokenize,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
pythainlp/tokenize/core.py

Lines changed: 0 additions & 74 deletions
@@ -422,80 +422,6 @@ def subword_tokenize(
     return segments
 
 
-def syllable_tokenize(
-    text: str,
-    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    keep_whitespace: bool = True,
-) -> List[str]:
-    """
-    Syllable tokenizer.
-
-    **syllable_tokenize is deprecated, use subword_tokenize instead**
-
-    Tokenizes text into syllables (Thai: พยางค์), units of
-    pronunciation having one vowel sound. For example, the word 'รถไฟ'
-    contains two syllables: 'รถ' and 'ไฟ'.
-
-    Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
-    with *newmm* as a tokenizer. It tokenizes the text with the
-    dictionary of Thai words from :func:`pythainlp.corpus.common.thai_words`
-    and then the dictionary of Thai syllables from
-    :func:`pythainlp.corpus.common.thai_syllables`,
-    so that only syllables are obtained.
-
-    :param str text: input string to be tokenized
-    :param str engine: name of the syllable tokenizer
-    :return: list of syllables where whitespaces in the text **are included**
-    :rtype: list[str]
-
-    **Options for engine**
-        * *dict* (default) - newmm word tokenizer with a syllable dictionary
-        * *ssg* - CRF syllable segmenter for Thai
-
-    :Example:
-    ::
-
-        from pythainlp.tokenize import syllable_tokenize
-
-        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
-        syllable_tokenize(text)
-        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
-        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
-    """
-    warnings.warn(
-        """syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
-        use subword_tokenize instead""",
-        PendingDeprecationWarning
-    )
-
-    if not text or not isinstance(text, str):
-        return []
-
-    segments = []
-
-    if engine == "dict" or engine == "default":  # use syllable dictionary
-        words = word_tokenize(text)
-        for word in words:
-            segments.extend(
-                word_tokenize(
-                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
-                )
-            )
-    elif engine == "ssg":
-        from pythainlp.tokenize.ssg import segment
-
-        segments = segment(text)
-    else:
-        raise ValueError(
-            f"""Tokenizer \"{engine}\" not found.
-            It might be a typo; if not, please consult our document."""
-        )
-
-    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
-
-    return segments
-
-
 class Tokenizer:
     """
     Tokenizer class, for a custom tokenizer.
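
The deleted "dict" engine was a thin wrapper: word-tokenize first, then re-tokenize each word against a syllable-dictionary trie. Callers who relied on that exact behaviour can reproduce it with public APIs; a sketch under the assumption that thai_syllables() and dict_trie remain available as referenced in the deleted docstring (the helper name syllables_dict is hypothetical):

    from typing import List

    from pythainlp.corpus.common import thai_syllables
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    # Build a syllable-dictionary trie like the DEFAULT_SYLLABLE_DICT_TRIE
    # that the deleted code wrapped.
    _SYLLABLE_TRIE = dict_trie(thai_syllables())

    def syllables_dict(text: str, keep_whitespace: bool = True) -> List[str]:
        """Hypothetical stand-in for syllable_tokenize(engine="dict")."""
        if not text or not isinstance(text, str):
            return []
        segments: List[str] = []
        for word in word_tokenize(text):  # first pass: newmm word tokenizer
            segments.extend(word_tokenize(word, custom_dict=_SYLLABLE_TRIE))
        if not keep_whitespace:
            segments = [s.strip(" ") for s in segments if s.strip(" ")]
        return segments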

tests/test_tokenize.py

Lines changed: 1 addition & 27 deletions
@@ -17,7 +17,6 @@
     sent_tokenize,
     ssg,
     subword_tokenize,
-    syllable_tokenize,
     tcc,
     word_tokenize,
     sefr_cut,
@@ -317,7 +316,6 @@ def test_subword_tokenize(self):
         )
         self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict"))
         self.assertEqual(subword_tokenize(None, engine="ssg"), [])
-        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
         self.assertEqual(
             subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
         )
@@ -344,30 +342,6 @@ def test_subword_tokenize(self):
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
 
-    def test_syllable_tokenize(self):
-        self.assertEqual(syllable_tokenize(None), [])
-        self.assertEqual(syllable_tokenize(""), [])
-        self.assertEqual(
-            syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"]
-        )
-        self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก"))
-        self.assertEqual(syllable_tokenize(None, engine="ssg"), [])
-        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
-        self.assertEqual(
-            syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
-        )
-        self.assertTrue(
-            "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
-        )
-        self.assertFalse(
-            "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
-        )
-        self.assertFalse(
-            " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
-        )
-        with self.assertRaises(ValueError):
-            syllable_tokenize("กรอเทป", engine="XX")  # engine does not exist
-
     def test_word_tokenize(self):
         self.assertEqual(word_tokenize(""), [])
         self.assertEqual(
@@ -626,7 +600,7 @@ def test_ssg(self):
         self.assertEqual(ssg.segment(None), [])
         self.assertEqual(ssg.segment(""), [])
         self.assertTrue(
-            "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+            "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
        )
 
     def test_tcc(self):
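
The rewritten test_ssg assertion above captures the drop-in parity; the rest of the deleted syllable suite is subsumed by test_subword_tokenize. A quick equivalence check one could run interactively, assuming the ssg model is available locally:

    from pythainlp.tokenize import subword_tokenize

    assert "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
    assert subword_tokenize(None, engine="ssg") == []  # same empty-input contract as the removed function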
