
[Ready] Reduce reload word tokenizer engine in word_tokenize #1064


Merged · 5 commits · Jan 13, 2025
11 changes: 8 additions & 3 deletions pythainlp/tokenize/attacut.py
@@ -8,7 +8,7 @@
 :See Also:
     * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
 """
-from typing import List
+from typing import Dict, List

 from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)


+_tokenizers: Dict[str, AttacutTokenizer] = {}
+
+
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     if not text or not isinstance(text, str):
         return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
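
The change above memoizes the AttacutTokenizer per model name, so repeated calls to segment() stop reloading the model. A minimal, self-contained sketch of that caching pattern follows; ExpensiveTokenizer is a hypothetical stand-in so the example runs without attacut installed.

```python
# Sketch of the per-model cache; ExpensiveTokenizer is a stand-in for AttacutTokenizer.
from typing import Dict, List


class ExpensiveTokenizer:
    instances_created = 0

    def __init__(self, model: str):
        ExpensiveTokenizer.instances_created += 1  # simulates a costly model load
        self.model = model

    def tokenize(self, text: str) -> List[str]:
        return text.split()


_tokenizers: Dict[str, ExpensiveTokenizer] = {}


def segment(text: str, model: str = "default") -> List[str]:
    if not text or not isinstance(text, str):
        return []
    if model not in _tokenizers:  # construct once per model name, then reuse
        _tokenizers[model] = ExpensiveTokenizer(model)
    return _tokenizers[model].tokenize(text)


segment("a b")
segment("c d")
assert ExpensiveTokenizer.instances_created == 1  # second call reused the cached instance
```
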
16 changes: 11 additions & 5 deletions pythainlp/tokenize/longest.py
@@ -12,7 +12,7 @@

 """
 import re
-from typing import List, Union
+from typing import Dict, List, Union

 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -149,9 +149,10 @@ def tokenize(self, text: str) -> List[str]:
         return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
     """
     Dictionary-based longest matching word segmentation.

@@ -165,4 +166,9 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
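
Here the cache is keyed by id(custom_dict), i.e. by the identity of the Trie object rather than its contents. A small sketch of that identity-keyed cache, using hypothetical stand-ins for Trie and LongestMatchTokenizer:

```python
# Identity-keyed cache sketch; Trie and LongestMatchTokenizer are stand-ins,
# not the real pythainlp classes.
class Trie:
    def __init__(self, words):
        self.words = set(words)


class LongestMatchTokenizer:
    def __init__(self, trie: Trie):
        self.trie = trie  # keeps the trie alive while the cache entry exists


_tokenizers = {}


def get_tokenizer(custom_dict: Trie) -> LongestMatchTokenizer:
    key = id(custom_dict)  # identity of the trie object, not its contents
    if key not in _tokenizers:
        _tokenizers[key] = LongestMatchTokenizer(custom_dict)
    return _tokenizers[key]


trie_a = Trie(["ทดสอบ"])
trie_b = Trie(["ทดสอบ"])  # equal contents, different object
assert get_tokenizer(trie_a) is get_tokenizer(trie_a)      # cache hit
assert get_tokenizer(trie_a) is not get_tokenizer(trie_b)  # keyed on identity, not equality
```

Assuming the cached LongestMatchTokenizer keeps a reference to its trie, the trie stays alive for the lifetime of its cache entry, so its id cannot be recycled for a different dictionary; the trade-off is that the cache grows by one entry per distinct custom dictionary passed in.
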
3 changes: 2 additions & 1 deletion pythainlp/tokenize/pyicu.py
@@ -15,9 +15,10 @@

 from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
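
The pyicu change applies the same idea without a dict: the BreakIterator is built once at import time and reused on every call. Below is a dependency-free sketch of that shared-instance pattern, where the _splitter regex is only a stand-in for the BreakIterator. Note that `global bd` is required only if _gen_words ever rebinds bd, not for calling methods on it, and that sharing one mutable iterator across threads would need extra care since setText changes shared state.

```python
# Sketch of reusing one module-level object across calls
# (_splitter stands in for the ICU BreakIterator; icu is not required here).
import re
from typing import Iterator

_splitter = re.compile(r"\S+")  # built once at import time


def _gen_words(text: str) -> Iterator[str]:
    # No `global` needed: we only call methods on _splitter, never rebind it.
    for match in _splitter.finditer(text):
        yield match.group(0)


print(list(_gen_words("a b c")))  # ['a', 'b', 'c']
```
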
22 changes: 22 additions & 0 deletions tests/core/test_tokenize.py
@@ -403,6 +403,28 @@ def test_longest(self):
             ["ทดสอบ", " ", "ทดสอบ"],
         )

+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize(
+                "ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
+            ),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])),
+            ["ทดสอบท", "ดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
         self.assertEqual(multi_cut.segment(""), [])