Commit 7ac06ff

Remove pythainlp.tokenize.syllable_tokenize

1 parent 552b6d2

6 files changed: +2 additions, −118 deletions
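
For downstream code, the replacement path shown in this diff is subword_tokenize, which (per the updated tests below) accepts the same "dict" and "ssg" engines the removed function offered. A minimal migration sketch, assuming a PyThaiNLP build that includes this commit:

    from pythainlp.tokenize import subword_tokenize

    # Previously: syllable_tokenize("แมวกินปลา", engine="ssg")
    # Now the same engines are reachable through subword_tokenize:
    print(subword_tokenize("แมวกินปลา", engine="ssg"))    # ['แมว', 'กิน', 'ปลา'] per the updated tests
    print(subword_tokenize("สวัสดีชาวโลก", engine="dict"))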

docs/api/tokenize.rst

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ Modules
 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
-.. autofunction:: syllable_tokenize
 .. autofunction:: word_tokenize
 .. autoclass:: Tokenizer
     :members:

pythainlp/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -48,7 +48,6 @@
     Tokenizer,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 from pythainlp.transliterate import romanize, transliterate

pythainlp/cli/tokenize.py

Lines changed: 1 addition & 13 deletions
@@ -12,7 +12,6 @@
     DEFAULT_WORD_TOKENIZE_ENGINE,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
@@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
 
-class SyllableTokenizationApp(SubAppBase):
-    def __init__(self, *args, **kwargs):
-        self.keep_whitespace = True
-        self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
-        self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
-        self.run = syllable_tokenize
-        super().__init__(*args, **kwargs)
-
-
 class SentenceTokenizationApp(SubAppBase):
     def __init__(self, *args, **kwargs):
         self.keep_whitespace = True
@@ -132,7 +122,7 @@ def __init__(self, argv):
             ),
         )
         parser.add_argument(
-            "token_type", type=str, help="[subword|syllable|word|sent]",
+            "token_type", type=str, help="[subword|word|sent]",
         )
 
         args = parser.parse_args(argv[2:3])
@@ -142,8 +132,6 @@ def __init__(self, argv):
         argv = argv[3:]
         if token_type.startswith("w"):
             WordTokenizationApp("word", argv)
-        elif token_type.startswith("sy"):
-            SyllableTokenizationApp("syllable", argv)
         elif token_type.startswith("su"):
             SubwordTokenizationApp("subword", argv)
         elif token_type.startswith("se"):
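
On the command line, the syllable token_type disappears with this hunk; subword is the surviving route. A hedged before/after, assuming the console script is installed under the entry-point name thainlp:

    # Before this commit:
    #   thainlp tokenize syllable "รถไฟ"
    # After, per the new help text [subword|word|sent]:
    thainlp tokenize subword "รถไฟ"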

pythainlp/tokenize/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -10,7 +10,6 @@
     "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
-    "syllable_tokenize",
     "word_tokenize",
 ]
 
@@ -31,7 +30,6 @@
     clause_tokenize,
     sent_tokenize,
     subword_tokenize,
-    syllable_tokenize,
     word_tokenize,
 )
 
pythainlp/tokenize/core.py

Lines changed: 0 additions & 74 deletions
@@ -422,80 +422,6 @@ def subword_tokenize(
     return segments
 
 
-def syllable_tokenize(
-    text: str,
-    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
-    keep_whitespace: bool = True,
-) -> List[str]:
-    """
-    Syllable tokenizer.
-
-    **syllable_tokenize is deprecated, use subword_tokenize instead**
-
-    Tokenizes text into syllables (Thai: พยางค์), units of
-    pronunciation having one vowel sound. For example, the word 'รถไฟ'
-    contains two syllables: 'รถ' and 'ไฟ'.
-
-    Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
-    with *newmm* as a tokenizer. It tokenizes the text with the
-    dictionary of Thai words from :func:`pythainlp.corpus.common.thai_words`
-    and then the dictionary of Thai syllables from
-    :func:`pythainlp.corpus.common.thai_syllables`,
-    so that only syllables are obtained.
-
-    :param str text: input string to be tokenized
-    :param str engine: name of the syllable tokenizer
-    :return: list of syllables where whitespaces in the text **are included**
-    :rtype: list[str]
-
-    **Options for engine**
-        * *dict* (default) - newmm word tokenizer with a syllable dictionary
-        * *ssg* - CRF syllable segmenter for Thai
-
-    :Example:
-    ::
-
-        from pythainlp.tokenize import syllable_tokenize
-
-        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
-        syllable_tokenize(text)
-        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
-        'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
-    """
-    warnings.warn(
-        """syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
-        use subword_tokenize instead""",
-        PendingDeprecationWarning
-    )
-
-    if not text or not isinstance(text, str):
-        return []
-
-    segments = []
-
-    if engine == "dict" or engine == "default":  # use syllable dictionary
-        words = word_tokenize(text)
-        for word in words:
-            segments.extend(
-                word_tokenize(
-                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
-                )
-            )
-    elif engine == "ssg":
-        from pythainlp.tokenize.ssg import segment
-
-        segments = segment(text)
-    else:
-        raise ValueError(
-            f"""Tokenizer \"{engine}\" not found.
-            It might be a typo; if not, please consult our document."""
-        )
-
-    if not keep_whitespace:
-        segments = [token.strip(" ") for token in segments if token.strip(" ")]
-
-    return segments
-
-
 class Tokenizer:
     """
     Tokenizer class, for a custom tokenizer.
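
The deleted "dict" engine was a thin wrapper: word-tokenize first, then re-tokenize each word against a syllable-dictionary trie. Callers who relied on that exact behaviour can reproduce it with public APIs; a sketch under the assumption that thai_syllables() and dict_trie remain available as referenced in the deleted docstring (the helper name syllables_dict is hypothetical):

    from typing import List

    from pythainlp.corpus.common import thai_syllables
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    # Build a syllable-dictionary trie like the DEFAULT_SYLLABLE_DICT_TRIE
    # that the deleted code wrapped.
    _SYLLABLE_TRIE = dict_trie(thai_syllables())

    def syllables_dict(text: str, keep_whitespace: bool = True) -> List[str]:
        """Hypothetical stand-in for syllable_tokenize(engine="dict")."""
        if not text or not isinstance(text, str):
            return []
        segments: List[str] = []
        for word in word_tokenize(text):  # first pass: newmm word tokenizer
            segments.extend(word_tokenize(word, custom_dict=_SYLLABLE_TRIE))
        if not keep_whitespace:
            segments = [s.strip(" ") for s in segments if s.strip(" ")]
        return segments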

tests/test_tokenize.py

Lines changed: 1 addition & 27 deletions
@@ -17,7 +17,6 @@
     sent_tokenize,
     ssg,
     subword_tokenize,
-    syllable_tokenize,
     tcc,
     word_tokenize,
     sefr_cut,
@@ -317,7 +316,6 @@ def test_subword_tokenize(self):
         )
         self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict"))
         self.assertEqual(subword_tokenize(None, engine="ssg"), [])
-        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
         self.assertEqual(
             subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
         )
@@ -344,30 +342,6 @@ def test_subword_tokenize(self):
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
 
-    def test_syllable_tokenize(self):
-        self.assertEqual(syllable_tokenize(None), [])
-        self.assertEqual(syllable_tokenize(""), [])
-        self.assertEqual(
-            syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"]
-        )
-        self.assertFalse("า" in syllable_tokenize("สวัสดีชาวโลก"))
-        self.assertEqual(syllable_tokenize(None, engine="ssg"), [])
-        self.assertEqual(syllable_tokenize("", engine="ssg"), [])
-        self.assertEqual(
-            syllable_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
-        )
-        self.assertTrue(
-            "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
-        )
-        self.assertFalse(
-            "า" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
-        )
-        self.assertFalse(
-            " " in syllable_tokenize("พันธมิตร ชา นม", keep_whitespace=False)
-        )
-        with self.assertRaises(ValueError):
-            syllable_tokenize("กรอเทป", engine="XX")  # engine does not exist
-
     def test_word_tokenize(self):
         self.assertEqual(word_tokenize(""), [])
         self.assertEqual(
@@ -626,7 +600,7 @@ def test_ssg(self):
         self.assertEqual(ssg.segment(None), [])
         self.assertEqual(ssg.segment(""), [])
         self.assertTrue(
-            "ดาว" in syllable_tokenize("สวัสดีดาวอังคาร", engine="ssg")
+            "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
        )
 
     def test_tcc(self):
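
The rewritten test_ssg assertion above captures the drop-in parity; the rest of the deleted syllable suite is subsumed by test_subword_tokenize. A quick equivalence check one could run interactively, assuming the ssg model is available locally:

    from pythainlp.tokenize import subword_tokenize

    assert "ดาว" in subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")
    assert subword_tokenize(None, engine="ssg") == []  # same empty-input contract as the removed function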
