Merge pull request #687 from PyThaiNLP/remove-deprecated-function

wannaphong · web-flow · commit 6a88f6ff460c · 2022-08-08T21:40:52.000+07:00
Remove deprecated functions (syllable_tokenize, delete_tone, thai_time, and the module-level word_vector exports)
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
@@ -11,7 +11,6 @@ Modules
 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
    :members:
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
@@ -48,7 +48,6 @@
     Tokenizer,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 from pythainlp.transliterate import romanize, transliterate
diff --git a/pythainlp/cli/tokenize.py b/pythainlp/cli/tokenize.py
@@ -12,7 +12,6 @@
     DEFAULT_WORD_TOKENIZE_ENGINE,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
@@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
 
-class SyllableTokenizationApp(SubAppBase):
-    def __init__(self, *args, **kwargs):
-        self.keep_whitespace = True
-        self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
-        self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
-        self.run = syllable_tokenize
-        super().__init__(*args, **kwargs)
-
-
 class SentenceTokenizationApp(SubAppBase):
     def __init__(self, *args, **kwargs):
         self.keep_whitespace = True
@@ -132,7 +122,7 @@ def __init__(self, argv):
             ),
         )
         parser.add_argument(
-            "token_type", type=str, help="[subword|syllable|word|sent]",
+            "token_type", type=str, help="[subword|word|sent]",
         )
 
         args = parser.parse_args(argv[2:3])
@@ -142,8 +132,6 @@ def __init__(self, argv):
         argv = argv[3:]
         if token_type.startswith("w"):
             WordTokenizationApp("word", argv)
-        elif token_type.startswith("sy"):
-            SyllableTokenizationApp("syllable", argv)
         elif token_type.startswith("su"):
             SubwordTokenizationApp("subword", argv)
         elif token_type.startswith("se"):
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
@@ -10,7 +10,6 @@
     "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
-    "syllable_tokenize",
     "word_tokenize",
 ]
 
@@ -31,7 +30,6 @@
     clause_tokenize,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
@@ -429,80 +429,6 @@ def subword_tokenize(
     return segments
 
 
-def syllable_tokenize(
-    text: str,
-    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    keep_whitespace: bool = True,
-) -> List[str]:
-    """
-    Syllable tokenizer.
-
-    **syllable_tokenize is deprecated, use subword_tokenize instead**
-
-    Tokenizes text into syllable (Thai: พยางค์), a unit of
-    pronunciation having one vowel sound.  For example, the word 'รถไฟ'
-    contains two syallbles including 'รถ', and 'ไฟ'.
-
-    Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
-    with *newmm* as a tokenizer. The function tokenize the text with
-    the dictionary of Thai words from
-    :func:`pythainlp.corpus.common.thai_words`
-    and then dictionary of Thai syllable from
-    :func:`pythainlp.corpus.common.thai_syllables`.
-    As a result, only syllables are obtained.
-
-    :param str text: input string to be tokenized
-    :param str engine: name of the syllable tokenizer
-    :return: list of syllables where whitespaces in the text **are included**
-    :rtype: list[str]
-    **Options for engine**
-        * *dict* (default) - newmm word tokenizer with a syllable dictionary
-        * *ssg* - CRF syllable segmenter for Thai
-    :Example::
-    ::
-
-        from pythainlp.tokenize import syllable_tokenize
-
-        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
-        syllable_tokenize(text)
-        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
-        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
-    """
-    warnings.warn(
-        """syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
-        use subword_tokenize instead""",
-        PendingDeprecationWarning
-    )
-
-    if not text or not isinstance(text, str):
-        return []
-
-    segments = []
-
-    if engine == "dict" or engine == "default":  # use syllable dictionary
-        words = word_tokenize(text)
-        for word in words:
-            segments.extend(
-                word_tokenize(
-                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
-                )
-            )
-    elif engine == "ssg":
-        from pythainlp.tokenize.ssg import segment
-
-        segments = segment(text)
-    else:
-        raise ValueError(
-            f"""Tokenizer \"{engine}\" not found.
-            It might be a typo; if not, please consult our document."""
-        )
-
-    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
-
-    return segments
-
-
 class Tokenizer:
     """
     Tokenizer class, for a custom tokenizer.
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -9,7 +9,6 @@
     "bahttext",
     "collate",
     "countthai",
-    "delete_tone",
     "dict_trie",
     "digit_to_text",
     "display_thai_char",
@@ -35,7 +34,6 @@
     "thai_digit_to_arabic_digit",
     "thai_keyboard_dist",
     "thai_strftime",
-    "thai_time",
     "thai_to_eng",
     "thai_word_tone_detector",
     "thaiword_to_date",
@@ -72,7 +70,6 @@
 from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
 from pythainlp.util.keywords import find_keyword, rank
 from pythainlp.util.normalize import (
-    delete_tone,
     normalize,
     maiyamok,
     remove_dangling,
@@ -92,7 +89,7 @@
     thai_word_tone_detector,
 )
 from pythainlp.util.thaiwordcheck import is_native_thai
-from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
+from pythainlp.util.time import thaiword_to_time, time_to_thaiword
 from pythainlp.util.trie import Trie, dict_trie
 from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num
 from pythainlp.util.syllable import (
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
@@ -129,9 +129,9 @@ def remove_tonemark(text: str) -> str:
     :Example:
     ::
 
-        from pythainlp.util import delete_tone
+        from pythainlp.util import remove_tonemark
 
-        delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
+        remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
         # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
     """
     for ch in tonemarks:
@@ -248,17 +248,6 @@ def normalize(text: str) -> str:
     return text
 
 
-def delete_tone(text: str) -> str:
-    """
-    DEPRECATED: Please use remove_tonemark().
-    """
-    warnings.warn(
-        "delete_tone is deprecated, use remove_tonemark instead",
-        DeprecationWarning,
-    )
-    return remove_tonemark(text)
-
-
 def maiyamok(sent: Union[str, List[str]]) -> List[str]:
     """
     Thai MaiYaMok
diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py
@@ -230,21 +230,6 @@ def time_to_thaiword(
     return text
 
 
-def thai_time(
-    time_data: Union[time, datetime, str],
-    fmt: str = "24h",
-    precision: Union[str, None] = None,
-) -> str:
-    """
-    DEPRECATED: Please use time_to_thaiword().
-    """
-    warnings.warn(
-        "thai_time is deprecated, use time_to_thaiword instead",
-        DeprecationWarning,
-    )
-    return time_to_thaiword(time_data, fmt, precision)
-
-
 def thaiword_to_time(text: str, padding: bool = True) -> str:
     """
     Convert Thai time in words into time (H:M).
diff --git a/pythainlp/word_vector/__init__.py b/pythainlp/word_vector/__init__.py
@@ -5,19 +5,9 @@
 Initial code from https://github.com/cstorm125/thai2fit
 """
 __all__ = [
-    "doesnt_match",
-    "get_model",
-    "most_similar_cosmul",
-    "sentence_vectorizer",
-    "similarity",
     "WordVector",
 ]
 
 from pythainlp.word_vector.core import (
-    doesnt_match,
-    get_model,
-    most_similar_cosmul,
-    sentence_vectorizer,
-    similarity,
     WordVector,
 )
diff --git a/pythainlp/word_vector/core.py b/pythainlp/word_vector/core.py
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
diff --git a/tests/test_util.py b/tests/test_util.py
diff --git a/tests/test_word_vector.py b/tests/test_word_vector.py

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,6 @@`
`48`	`48`	`Tokenizer,`
`49`	`49`	`sent_tokenize,`
`50`	`50`	`subword_tokenize,`
`51`		`- syllable_tokenize,`
`52`	`51`	`word_tokenize,`
`53`	`52`	`)`
`54`	`53`	`from pythainlp.transliterate import romanize, transliterate`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,6 @@`
`10`	`10`	`"clause_tokenize",`
`11`	`11`	`"sent_tokenize",`
`12`	`12`	`"subword_tokenize",`
`13`		`- "syllable_tokenize",`
`14`	`13`	`"word_tokenize",`
`15`	`14`	`]`
`16`	`15`
`@@ -31,7 +30,6 @@`
`31`	`30`	`clause_tokenize,`
`32`	`31`	`sent_tokenize,`
`33`	`32`	`subword_tokenize,`
`34`		`- syllable_tokenize,`
`35`	`33`	`word_tokenize,`
`36`	`34`	`)`
`37`	`35`