Skip to content

Commit dd1b3f8

Browse files
authored
Merge pull request #962 from bact/fix-expand-maiyamok
Fix expand maiyamok
2 parents 1c9a243 + 5d0cb62 commit dd1b3f8

File tree

1 file changed

+40
-25
lines changed

1 file changed

+40
-25
lines changed

pythainlp/util/normalize.py

+40-25
Original file line numberDiff line numberDiff line change
@@ -258,39 +258,54 @@ def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
258258
repetition. This function preprocesses Thai text by replacing
259259
Maiyamok with a word being repeated.
260260
261-
:param Union[str, List[str]] sent: input sentence (list or str)
261+
:param Union[str, List[str]] sent: sentence (list or string)
262262
:return: list of words
263263
:rtype: List[str]
264264
265265
:Example:
266266
::
267-
268267
from pythainlp.util import expand_maiyamok
269268
270-
expand_maiyamok("เด็กๆกิน")
271-
# output: ['เด็ก', 'เด็ก', 'กิน']
269+
expand_maiyamok("คนๆนก")
270+
# output: ['คน', 'คน', 'นก']
272271
"""
273272
if isinstance(sent, str):
274273
sent = word_tokenize(sent)
275-
_list_word: list[str] = []
276-
i = 0
277-
for j, text in enumerate(sent):
278-
if text.isspace() and "ๆ" in sent[j + 1]:
279-
continue
280-
if " ๆ" in text:
281-
text = text.replace(" ๆ", "ๆ")
282-
if "ๆ" == text:
283-
text = _list_word[i - 1]
284-
elif "ๆ" in text:
285-
count = text.count("ๆ")
286-
text = _list_word[i - 1]
287-
for _ in range(count):
288-
_list_word.append(text)
289-
i += 1
274+
275+
# Split off any Maiyamok attached to other text, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
276+
temp_toks: list[str] = []
277+
for _, token in enumerate(sent):
278+
toks = re.split(r"(ๆ)", token)
279+
toks = [tok for tok in toks if tok] # remove empty string ("")
280+
temp_toks.extend(toks)
281+
sent = temp_toks
282+
283+
output_toks: list[str] = []
284+
285+
yamok = "ๆ"
286+
yamok_count = 0
287+
len_sent = len(sent)
288+
for i in range(len_sent - 1, -1, -1): # do it backward
289+
if yamok_count == 0 or (i + 1 >= len_sent):
290+
if sent[i] == yamok:
291+
yamok_count = yamok_count + 1
292+
else:
293+
output_toks.append(sent[i])
290294
continue
291-
_list_word.append(text)
292-
i += 1
293-
return _list_word
295+
296+
if sent[i] == yamok:
297+
yamok_count = yamok_count + 1
298+
else:
299+
if sent[i].isspace():
300+
if yamok_count > 0: # remove space before yamok
301+
continue
302+
else: # unreachable: the guard at the top of the loop ensures yamok_count > 0 here
303+
output_toks.append(sent[i])
304+
else:
305+
output_toks.extend([sent[i]] * (yamok_count + 1))
306+
yamok_count = 0
307+
308+
return output_toks[::-1]
294309

295310

296311
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
@@ -303,7 +318,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
303318
repetition. This function preprocesses Thai text by replacing
304319
Maiyamok with a word being repeated.
305320
306-
:param Union[str, List[str]] sent: input sentence (list or str)
321+
:param Union[str, List[str]] sent: sentence (list or string)
307322
:return: list of words
308323
:rtype: List[str]
309324
@@ -312,8 +327,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
312327
313328
from pythainlp.util import expand_maiyamok
314329
315-
expand_maiyamok("เด็กๆกิน")
316-
# output: ['เด็ก', 'เด็ก', 'กิน']
330+
expand_maiyamok("คนๆนก")
331+
# output: ['คน', 'คน', 'นก']
317332
"""
318333
warn_deprecation(
319334
"pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok"

0 commit comments

Comments
 (0)