Commit 638702f

Fix bug in Longest Matching tokenizer to preprocess spaces consistently
Fixes #1061

Update the Longest Matching tokenizer to preprocess spaces consistently with the Multi-Cut tokenizer.

* Modify `pythainlp/tokenize/longest.py` to group consecutive spaces into one token.
* Add test cases in `tests/core/test_tokenize.py` to verify consistent preprocessing of spaces between the Longest Matching and Multi-Cut tokenizers.

1 parent: 9a9d11f
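
In practice, the change means a run of consecutive spaces now comes back as a single token. A minimal illustration of the intended behavior (the post-fix output follows the tests added below; the pre-fix output in the comment is an assumption about the old behavior, not something shown in this commit):

    from pythainlp.tokenize import longest

    # After this commit: consecutive spaces are grouped into one token.
    print(longest.segment("ทดสอบ  ทดสอบ"))  # expected: ['ทดสอบ', '  ', 'ทดสอบ']

    # Before this commit (assumed): each space surfaced as its own token,
    # e.g. ['ทดสอบ', ' ', ' ', 'ทดสอบ'], diverging from multi_cut.segment().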

2 files changed: +23 −1

pythainlp/tokenize/longest.py (+11 −1)
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
 # SPDX-FileType: SOURCE
+# SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 """
 Dictionary-based longest-matching Thai word segmentation. Implementation is based
@@ -38,6 +39,7 @@
 _TRAILING_CHAR = ["ๆ", "ฯ"]
 
 _RE_NONTHAI = re.compile(r"[A-Za-z\d]*")
+_RE_SPACES = re.compile(r"\s+")
 
 _KNOWN = True
 _UNKNOWN = False
@@ -134,7 +136,15 @@ def __segment(self, text: str):
                     token_statuses.append(_KNOWN)
                 begin_pos += len(match)
 
-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens
 
     def tokenize(self, text: str) -> List[str]:
         tokens = self.__segment(text)
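
For reference, the grouping pass works on the already-segmented token list: it walks it once and merges any whitespace-only token into an immediately preceding whitespace-only token. A standalone sketch of the same logic (`group_spaces` is a hypothetical name for illustration, not a library function):

    from typing import List

    def group_spaces(tokens: List[str]) -> List[str]:
        """Merge runs of whitespace-only tokens into single tokens."""
        grouped: List[str] = []
        for token in tokens:
            if token.isspace() and grouped and grouped[-1].isspace():
                grouped[-1] += token  # extend the previous whitespace token
            else:
                grouped.append(token)
        return grouped

    print(group_spaces(["ก", " ", " ", "ข"]))  # ['ก', '  ', 'ข']

Note that the commit also adds `_RE_SPACES = re.compile(r"\s+")`, but the grouping itself relies on `str.isspace()` rather than that regex.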

tests/core/test_tokenize.py (+12 −0)
@@ -390,6 +390,18 @@ def test_longest(self):
             longest_tokenizer.word_tokenize("เฉียบพลัน"),
             ["เฉียบพลัน"],
         )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
 
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
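
Since the stated goal is parity with the Multi-Cut tokenizer, the new assertions can be cross-checked by hand (that `multi_cut.segment` returns exactly the same token list is assumed from the commit message; the diff only asserts the Longest Matching side):

    from pythainlp.tokenize import longest, multi_cut

    text = "ทดสอบ ทดสอบ ทดสอบ"
    expected = ["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"]

    assert longest.segment(text) == expected
    assert multi_cut.segment(text) == expected  # assumed parity per the commit message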
