Skip to content

Commit 2287b30

Browse files
authored
Merge pull request #1024 from PyThaiNLP/remove-clause_tokenize
Remove clause_tokenize
2 parents 216d443 + 3e8501f commit 2287b30

File tree

5 files changed

+0
-127
lines changed

5 files changed

+0
-127
lines changed

docs/api/tokenize.rst

-4
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a
88
Modules
99
-------
1010

11-
.. autofunction:: clause_tokenize
12-
:noindex:
13-
14-
Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks.
1511

1612
.. autofunction:: sent_tokenize
1713
:noindex:

pythainlp/tokenize/__init__.py

-2
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
"THAI2FIT_TOKENIZER",
1010
"Tokenizer",
1111
"Trie",
12-
"clause_tokenize",
1312
"paragraph_tokenize",
1413
"sent_tokenize",
1514
"subword_tokenize",
@@ -32,7 +31,6 @@
3231

3332
from pythainlp.tokenize.core import (
3433
Tokenizer,
35-
clause_tokenize,
3634
paragraph_tokenize,
3735
sent_tokenize,
3836
subword_tokenize,

pythainlp/tokenize/core.py

-37
Original file line number | Diff line number | Diff line change
@@ -25,43 +25,6 @@
2525
from pythainlp.util.trie import Trie, dict_trie
2626

2727

28-
def clause_tokenize(doc: List[str]) -> List[List[str]]:
29-
"""
30-
Clause tokenizer. (or Clause segmentation)
31-
Tokenizes running word list into list of clauses (list of strings).
32-
Split by CRF trained on Blackboard Treebank.
33-
34-
:param str doc: word list to be clause tokenized
35-
:return: list of clauses
36-
:rtype: list[list[str]]
37-
:Example:
38-
::
39-
40-
from pythainlp.tokenize import clause_tokenize
41-
42-
clause_tokenize(
43-
[
44-
"ฉัน",
45-
"นอน",
46-
"และ",
47-
"คุณ",
48-
"เล่น",
49-
"มือถือ",
50-
"ส่วน",
51-
"น้อง",
52-
"เขียน",
53-
"โปรแกรม",
54-
]
55-
)
56-
# [['ฉัน', 'นอน'],
57-
# ['และ', 'คุณ', 'เล่น', 'มือถือ'],
58-
# ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
59-
"""
60-
from pythainlp.tokenize.crfcls import segment
61-
62-
return segment(doc)
63-
64-
6528
def word_detokenize(
6629
segments: Union[List[List[str]], List[str]], output: str = "str"
6730
) -> Union[List[str], str]:

pythainlp/tokenize/crfcls.py

-77
This file was deleted.

tests/extra/testx_tokenize.py

-7
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020
tltk,
2121
word_tokenize,
2222
)
23-
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
2423

2524
from ..core.test_tokenize import (
2625
SENT_1,
@@ -31,12 +30,6 @@
3130
)
3231

3332

34-
class ClauseTokenizeTestCase(unittest.TestCase):
35-
def test_clause_tokenize(self):
36-
self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"]))
37-
self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list)
38-
39-
4033
class DetokenizeTestCase(unittest.TestCase):
4134
def test_numeric_data_format(self):
4235
engines = ["attacut", "deepcut", "sefr_cut"]

0 commit comments

Comments (0)