@@ -25,6 +25,7 @@ def segment(
25
25
"DATE" ,
26
26
"TIME" ,
27
27
],
28
+ tagger = _thainer
28
29
) -> List [str ]:
29
30
"""
30
31
Dictionary-based maximal matching word segmentation, constrained with
@@ -33,18 +34,17 @@ def segment(
33
34
34
35
:param str text: text to be tokenized to words
35
36
:param list taglist: a list of named-entity tags to be used
37
+ :param class tagger: NER tagger engine
36
38
:return: list of words, tokenized from the text
37
39
"""
38
- if not text or not isinstance (text , str ):
40
+ if not isinstance (text , str ):
39
41
return []
40
42
41
- global _thainer
42
- tagged_words = _thainer .tag (text , pos = False )
43
+ tagged_words = tagger .tag (text , pos = False )
43
44
44
45
words = []
45
46
combining_word = ""
46
- combining_word = ""
47
- for curr_word , curr_tag in tagged_words :
47
+ for idx , (curr_word , curr_tag ) in enumerate (tagged_words ):
48
48
if curr_tag != "O" :
49
49
tag = curr_tag [2 :]
50
50
else :
@@ -68,5 +68,15 @@ def segment(
68
68
else :
69
69
combining_word = ""
70
70
words .append (curr_word )
71
+ if idx + 1 == len (tagged_words ):
72
+ if (
73
+ curr_tag .startswith ("B-" ) or curr_tag == "O"
74
+ ) and combining_word != "" :
75
+ words .append (combining_word )
76
+ combining_word = ""
77
+ words .append (curr_word )
78
+ else : # if tag is O
79
+ combining_word += curr_word
80
+ words .append (combining_word )
71
81
72
82
return words
0 commit comments