Skip to content

Commit cae175c

Browse files
authored
Merge pull request #1062 from PyThaiNLP/wannaphong/fix-tokenizer
2 parents f56e7d9 + 97dfe87 commit cae175c

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

pythainlp/tokenize/longest.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
_TRAILING_CHAR = ["ๆ", "ฯ"]
3939

4040
_RE_NONTHAI = re.compile(r"[A-Za-z\d]*")
41+
_RE_SPACES = re.compile(r"\s+")
4142

4243
_KNOWN = True
4344
_UNKNOWN = False
@@ -134,7 +135,15 @@ def __segment(self, text: str):
134135
token_statuses.append(_KNOWN)
135136
begin_pos += len(match)
136137

137-
return tokens
138+
# Group consecutive spaces into one token
139+
grouped_tokens = []
140+
for token in tokens:
141+
if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
142+
grouped_tokens[-1] += token
143+
else:
144+
grouped_tokens.append(token)
145+
146+
return grouped_tokens
138147

139148
def tokenize(self, text: str) -> List[str]:
140149
tokens = self.__segment(text)

tests/core/test_tokenize.py

+12
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,18 @@ def test_longest(self):
390390
longest_tokenizer.word_tokenize("เฉียบพลัน"),
391391
["เฉียบพลัน"],
392392
)
393+
self.assertEqual(
394+
longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
395+
["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
396+
)
397+
self.assertEqual(
398+
longest.segment("ทดสอบ  ทดสอบ"),
399+
["ทดสอบ", "  ", "ทดสอบ"],
400+
)
401+
self.assertEqual(
402+
longest.segment("ทดสอบ   ทดสอบ"),
403+
["ทดสอบ", "   ", "ทดสอบ"],
404+
)
393405

394406
def test_mm(self):
395407
self.assertEqual(multi_cut.segment(None), [])

0 commit comments

Comments (0)