Commit 638702f

Fix bug in Longest Matching tokenizer to preprocess spaces consistently
Fixes #1061

Update the Longest Matching tokenizer to preprocess spaces consistently with the Multi-Cut tokenizer.

* Modify `pythainlp/tokenize/longest.py` to group consecutive spaces into one token.
* Add test cases in `tests/core/test_tokenize.py` to verify consistent preprocessing of spaces between the Longest Matching and Multi-Cut tokenizers.

1 parent: 9a9d11f
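
In practice, the change means a run of consecutive spaces now comes back as a single token. A minimal illustration of the intended behavior (the post-fix output follows the tests added below; the pre-fix output in the comment is an assumption about the old behavior, not something shown in this commit):

    from pythainlp.tokenize import longest

    # After this commit: consecutive spaces are grouped into one token.
    print(longest.segment("ทดสอบ  ทดสอบ"))  # expected: ['ทดสอบ', '  ', 'ทดสอบ']

    # Before this commit (assumed): each space surfaced as its own token,
    # e.g. ['ทดสอบ', ' ', ' ', 'ทดสอบ'], diverging from multi_cut.segment().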

2 files changed: +23 −1

pythainlp/tokenize/longest.py (+11 −1)
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
 # SPDX-FileType: SOURCE
+# SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 """
 Dictionary-based longest-matching Thai word segmentation. Implementation is based
@@ -38,6 +39,7 @@
 _TRAILING_CHAR = ["ๆ", "ฯ"]
 
 _RE_NONTHAI = re.compile(r"[A-Za-z\d]*")
+_RE_SPACES = re.compile(r"\s+")
 
 _KNOWN = True
 _UNKNOWN = False
@@ -134,7 +136,15 @@ def __segment(self, text: str):
                     token_statuses.append(_KNOWN)
                 begin_pos += len(match)
 
-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens
 
     def tokenize(self, text: str) -> List[str]:
         tokens = self.__segment(text)
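
For reference, the grouping pass works on the already-segmented token list: it walks it once and merges any whitespace-only token into an immediately preceding whitespace-only token. A standalone sketch of the same logic (`group_spaces` is a hypothetical name for illustration, not a library function):

    from typing import List

    def group_spaces(tokens: List[str]) -> List[str]:
        """Merge runs of whitespace-only tokens into single tokens."""
        grouped: List[str] = []
        for token in tokens:
            if token.isspace() and grouped and grouped[-1].isspace():
                grouped[-1] += token  # extend the previous whitespace token
            else:
                grouped.append(token)
        return grouped

    print(group_spaces(["ก", " ", " ", "ข"]))  # ['ก', '  ', 'ข']

Note that the commit also adds `_RE_SPACES = re.compile(r"\s+")`, but the grouping itself relies on `str.isspace()` rather than that regex.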

tests/core/test_tokenize.py (+12 −0)
@@ -390,6 +390,18 @@ def test_longest(self):
             longest_tokenizer.word_tokenize("เฉียบพลัน"),
             ["เฉียบพลัน"],
         )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
 
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
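
Since the stated goal is parity with the Multi-Cut tokenizer, the new assertions can be cross-checked by hand (that `multi_cut.segment` returns exactly the same token list is assumed from the commit message; the diff only asserts the Longest Matching side):

    from pythainlp.tokenize import longest, multi_cut

    text = "ทดสอบ ทดสอบ ทดสอบ"
    expected = ["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"]

    assert longest.segment(text) == expected
    assert multi_cut.segment(text) == expected  # assumed parity per the commit message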
