Skip to content

Commit d0402d0

Browse files
authored
Merge pull request #667 from PyThaiNLP/Fix-666
Fixed #666
2 parents e5ab511 + d7a1bcb commit d0402d0

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

pythainlp/tokenize/nercut.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def segment(
2525
"DATE",
2626
"TIME",
2727
],
28+
tagger=_thainer
2829
) -> List[str]:
2930
"""
3031
Dictionary-based maximal matching word segmentation, constrained with
@@ -33,18 +34,17 @@ def segment(
3334
3435
:param str text: text to be tokenized to words
3536
:param list taglist: a list of named-entity tags to be used
37+
:param class tagger: NER tagger engine
3638
:return: list of words, tokenized from the text
3739
"""
38-
if not text or not isinstance(text, str):
40+
if not isinstance(text, str):
3941
return []
4042

41-
global _thainer
42-
tagged_words = _thainer.tag(text, pos=False)
43+
tagged_words = tagger.tag(text, pos=False)
4344

4445
words = []
4546
combining_word = ""
46-
combining_word = ""
47-
for curr_word, curr_tag in tagged_words:
47+
for idx, (curr_word, curr_tag) in enumerate(tagged_words):
4848
if curr_tag != "O":
4949
tag = curr_tag[2:]
5050
else:
@@ -68,5 +68,15 @@ def segment(
6868
else:
6969
combining_word = ""
7070
words.append(curr_word)
71+
if idx + 1 == len(tagged_words):
72+
if (
73+
curr_tag.startswith("B-") or curr_tag == "O"
74+
) and combining_word != "":
75+
words.append(combining_word)
76+
combining_word = ""
77+
words.append(curr_word)
78+
else: # if tag is O
79+
combining_word += curr_word
80+
words.append(combining_word)
7181

7282
return words

tests/test_tokenize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,9 @@ def test_nercut(self):
610610
self.assertEqual(nercut.segment(None), [])
611611
self.assertEqual(nercut.segment(""), [])
612612
self.assertIsNotNone(nercut.segment("ทดสอบ"))
613-
self.assertIsNotNone(nercut.segment("ทดสอบ"))
613+
self.assertIsNotNone(nercut.segment("ทุ๊กกโคนน"))
614+
self.assertIsNotNone(nercut.segment("อือหือ"))
615+
self.assertIsNotNone(nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"))
614616
self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
615617

616618
def test_ssg(self):

0 commit comments

Comments
 (0)