Skip to content

Commit 6a88f6f

Browse files
authored
Merge pull request #687 from PyThaiNLP/remove-deprecated-function
Remove deprecated function
2 parents c6c8824 + 7ac06ff commit 6a88f6f

File tree

13 files changed

+5
-473
lines changed

13 files changed

+5
-473
lines changed

docs/api/tokenize.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ Modules
1111
.. autofunction:: clause_tokenize
1212
.. autofunction:: sent_tokenize
1313
.. autofunction:: subword_tokenize
14-
.. autofunction:: syllable_tokenize
1514
.. autofunction:: word_tokenize
1615
.. autoclass:: Tokenizer
1716
:members:

pythainlp/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
Tokenizer,
4949
sent_tokenize,
5050
subword_tokenize,
51-
syllable_tokenize,
5251
word_tokenize,
5352
)
5453
from pythainlp.transliterate import romanize, transliterate

pythainlp/cli/tokenize.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
DEFAULT_WORD_TOKENIZE_ENGINE,
1313
sent_tokenize,
1414
subword_tokenize,
15-
syllable_tokenize,
1615
word_tokenize,
1716
)
1817

@@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
7978
super().__init__(*args, **kwargs)
8079

8180

82-
class SyllableTokenizationApp(SubAppBase):
83-
def __init__(self, *args, **kwargs):
84-
self.keep_whitespace = True
85-
self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
86-
self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
87-
self.run = syllable_tokenize
88-
super().__init__(*args, **kwargs)
89-
90-
9181
class SentenceTokenizationApp(SubAppBase):
9282
def __init__(self, *args, **kwargs):
9383
self.keep_whitespace = True
@@ -132,7 +122,7 @@ def __init__(self, argv):
132122
),
133123
)
134124
parser.add_argument(
135-
"token_type", type=str, help="[subword|syllable|word|sent]",
125+
"token_type", type=str, help="[subword|word|sent]",
136126
)
137127

138128
args = parser.parse_args(argv[2:3])
@@ -142,8 +132,6 @@ def __init__(self, argv):
142132
argv = argv[3:]
143133
if token_type.startswith("w"):
144134
WordTokenizationApp("word", argv)
145-
elif token_type.startswith("sy"):
146-
SyllableTokenizationApp("syllable", argv)
147135
elif token_type.startswith("su"):
148136
SubwordTokenizationApp("subword", argv)
149137
elif token_type.startswith("se"):

pythainlp/tokenize/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
"clause_tokenize",
1111
"sent_tokenize",
1212
"subword_tokenize",
13-
"syllable_tokenize",
1413
"word_tokenize",
1514
]
1615

@@ -31,7 +30,6 @@
3130
clause_tokenize,
3231
sent_tokenize,
3332
subword_tokenize,
34-
syllable_tokenize,
3533
word_tokenize,
3634
)
3735

pythainlp/tokenize/core.py

Lines changed: 0 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -429,80 +429,6 @@ def subword_tokenize(
429429
return segments
430430

431431

432-
def syllable_tokenize(
433-
text: str,
434-
engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
435-
keep_whitespace: bool = True,
436-
) -> List[str]:
437-
"""
438-
Syllable tokenizer.
439-
440-
**syllable_tokenize is deprecated, use subword_tokenize instead**
441-
442-
Tokenizes text into syllables (Thai: พยางค์), each a unit of
443-
pronunciation having one vowel sound. For example, the word 'รถไฟ'
444-
contains two syllables: 'รถ' and 'ไฟ'.
445-
446-
Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
447-
with *newmm* as a tokenizer. The function tokenizes the text with
448-
the dictionary of Thai words from
449-
:func:`pythainlp.corpus.common.thai_words`
450-
and then the dictionary of Thai syllables from
451-
:func:`pythainlp.corpus.common.thai_syllables`.
452-
As a result, only syllables are obtained.
453-
454-
:param str text: input string to be tokenized
455-
:param str engine: name of the syllable tokenizer
456-
:return: list of syllables where whitespaces in the text **are included**
457-
:rtype: list[str]
458-
**Options for engine**
459-
* *dict* (default) - newmm word tokenizer with a syllable dictionary
460-
* *ssg* - CRF syllable segmenter for Thai
461-
:Example::
462-
::
463-
464-
from pythainlp.tokenize import syllable_tokenize
465-
466-
text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
467-
syllable_tokenize(text)
468-
['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
469-
'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
470-
"""
471-
warnings.warn(
472-
"""syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
473-
use subword_tokenize instead""",
474-
PendingDeprecationWarning
475-
)
476-
477-
if not text or not isinstance(text, str):
478-
return []
479-
480-
segments = []
481-
482-
if engine == "dict" or engine == "default": # use syllable dictionary
483-
words = word_tokenize(text)
484-
for word in words:
485-
segments.extend(
486-
word_tokenize(
487-
text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
488-
)
489-
)
490-
elif engine == "ssg":
491-
from pythainlp.tokenize.ssg import segment
492-
493-
segments = segment(text)
494-
else:
495-
raise ValueError(
496-
f"""Tokenizer \"{engine}\" not found.
497-
It might be a typo; if not, please consult our document."""
498-
)
499-
500-
if not keep_whitespace:
501-
segments = [token.strip(" ") for token in segments if token.strip(" ")]
502-
503-
return segments
504-
505-
506432
class Tokenizer:
507433
"""
508434
Tokenizer class, for a custom tokenizer.

pythainlp/util/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
"bahttext",
1010
"collate",
1111
"countthai",
12-
"delete_tone",
1312
"dict_trie",
1413
"digit_to_text",
1514
"display_thai_char",
@@ -35,7 +34,6 @@
3534
"thai_digit_to_arabic_digit",
3635
"thai_keyboard_dist",
3736
"thai_strftime",
38-
"thai_time",
3937
"thai_to_eng",
4038
"thai_word_tone_detector",
4139
"thaiword_to_date",
@@ -72,7 +70,6 @@
7270
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
7371
from pythainlp.util.keywords import find_keyword, rank
7472
from pythainlp.util.normalize import (
75-
delete_tone,
7673
normalize,
7774
maiyamok,
7875
remove_dangling,
@@ -92,7 +89,7 @@
9289
thai_word_tone_detector,
9390
)
9491
from pythainlp.util.thaiwordcheck import is_native_thai
95-
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
92+
from pythainlp.util.time import thaiword_to_time, time_to_thaiword
9693
from pythainlp.util.trie import Trie, dict_trie
9794
from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num
9895
from pythainlp.util.syllable import (

pythainlp/util/normalize.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ def remove_tonemark(text: str) -> str:
129129
:Example:
130130
::
131131
132-
from pythainlp.util import delete_tone
132+
from pythainlp.util import remove_tonemark
133133
134-
delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
134+
remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
135135
# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
136136
"""
137137
for ch in tonemarks:
@@ -248,17 +248,6 @@ def normalize(text: str) -> str:
248248
return text
249249

250250

251-
def delete_tone(text: str) -> str:
252-
"""
253-
DEPRECATED: Please use remove_tonemark().
254-
"""
255-
warnings.warn(
256-
"delete_tone is deprecated, use remove_tonemark instead",
257-
DeprecationWarning,
258-
)
259-
return remove_tonemark(text)
260-
261-
262251
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
263252
"""
264253
Thai MaiYaMok

pythainlp/util/time.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -230,21 +230,6 @@ def time_to_thaiword(
230230
return text
231231

232232

233-
def thai_time(
234-
time_data: Union[time, datetime, str],
235-
fmt: str = "24h",
236-
precision: Union[str, None] = None,
237-
) -> str:
238-
"""
239-
DEPRECATED: Please use time_to_thaiword().
240-
"""
241-
warnings.warn(
242-
"thai_time is deprecated, use time_to_thaiword instead",
243-
DeprecationWarning,
244-
)
245-
return time_to_thaiword(time_data, fmt, precision)
246-
247-
248233
def thaiword_to_time(text: str, padding: bool = True) -> str:
249234
"""
250235
Convert Thai time in words into time (H:M).

pythainlp/word_vector/__init__.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,9 @@
55
Initial code from https://github.com/cstorm125/thai2fit
66
"""
77
__all__ = [
8-
"doesnt_match",
9-
"get_model",
10-
"most_similar_cosmul",
11-
"sentence_vectorizer",
12-
"similarity",
138
"WordVector",
149
]
1510

1611
from pythainlp.word_vector.core import (
17-
doesnt_match,
18-
get_model,
19-
most_similar_cosmul,
20-
sentence_vectorizer,
21-
similarity,
2212
WordVector,
2313
)

0 commit comments

Comments
 (0)