Skip to content

Commit 02e9cb5

Browse files
committed
Update other non-Thai characters in newmm
1 parent fb3e7bb commit 02e9cb5

File tree

2 files changed

+10
-1
lines changed

2 files changed

+10
-1
lines changed

pythainlp/tokenize/newmm.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
44 44
\d+([,\.]\d+)*| # numbers
45 45
[ \t]+| # spaces
46 46
\r?\n| # newlines
47-
[^\u0E00-\u0E7F]+ # other non-Thai characters
47+
[^\u0E00-\u0E7F \t]+ # other non-Thai characters
48 48
"""
49 49
)
50 50

tests/test_tokenize.py

+9
Original file line number | Diff line number | Diff line change
@@ -653,6 +653,15 @@ def test_newmm(self):
653 653
keep_whitespace=False,
654 654
)
655 655
)
656+
self.assertEqual(
657+
word_tokenize("(คนไม่เอา)", engine="newmm"), ['(', 'คน', 'ไม่', 'เอา', ')']
658+
)
659+
self.assertEqual(
660+
word_tokenize("กม/ชม", engine="newmm"), ['กม', '/', 'ชม']
661+
)
662+
self.assertEqual(
663+
word_tokenize("สีหน้า(รถ)", engine="newmm"), ['สีหน้า', '(', 'รถ', ')']
664+
)
656 665

657 666
def test_newmm_longtext(self):
658 667
self.assertIsInstance(

0 commit comments

Comments
 (0)