@@ -258,39 +258,54 @@ def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
258
258
repetition. This function preprocesses Thai text by replacing
259
259
Maiyamok with a word being repeated.
260
260
261
- :param Union[str, List[str]] sent: input sentence (list or str )
261
+ :param Union[str, List[str]] sent: sentence (list or string )
262
262
:return: list of words
263
263
:rtype: List[str]
264
264
265
265
:Example:
266
266
::
267
-
268
267
from pythainlp.util import expand_maiyamok
269
268
270
- expand_maiyamok("เด็กๆกิน ")
271
- # output: ['เด็ก ', 'เด็ก ', 'กิน ']
269
+ expand_maiyamok("คนๆนก ")
270
+ # output: ['คน ', 'คน ', 'นก ']
272
271
"""
273
272
if isinstance (sent , str ):
274
273
sent = word_tokenize (sent )
275
- _list_word : list [str ] = []
276
- i = 0
277
- for j , text in enumerate (sent ):
278
- if text .isspace () and "ๆ" in sent [j + 1 ]:
279
- continue
280
- if " ๆ" in text :
281
- text = text .replace (" ๆ" , "ๆ" )
282
- if "ๆ" == text :
283
- text = _list_word [i - 1 ]
284
- elif "ๆ" in text :
285
- count = text .count ("ๆ" )
286
- text = _list_word [i - 1 ]
287
- for _ in range (count ):
288
- _list_word .append (text )
289
- i += 1
274
+
275
+ # Split off any Maiyamok that is attached to other text, e.g. "นกๆๆ", "นกๆ ๆ", "นกๆคน"
276
+ temp_toks : list [str ] = []
277
+ for _ , token in enumerate (sent ):
278
+ toks = re .split (r"(ๆ)" , token )
279
+ toks = [tok for tok in toks if tok ] # remove empty string ("")
280
+ temp_toks .extend (toks )
281
+ sent = temp_toks
282
+
283
+ output_toks : list [str ] = []
284
+
285
+ yamok = "ๆ"
286
+ yamok_count = 0
287
+ len_sent = len (sent )
288
+ for i in range (len_sent - 1 , - 1 , - 1 ): # do it backward
289
+ if yamok_count == 0 or (i + 1 >= len_sent ):
290
+ if sent [i ] == yamok :
291
+ yamok_count = yamok_count + 1
292
+ else :
293
+ output_toks .append (sent [i ])
290
294
continue
291
- _list_word .append (text )
292
- i += 1
293
- return _list_word
295
+
296
+ if sent [i ] == yamok :
297
+ yamok_count = yamok_count + 1
298
+ else :
299
+ if sent [i ].isspace ():
300
+ if yamok_count > 0 : # remove space before yamok
301
+ continue
302
+ else : # with preprocessing above, this should not happen
303
+ output_toks .append (sent [i ])
304
+ else :
305
+ output_toks .extend ([sent [i ]] * (yamok_count + 1 ))
306
+ yamok_count = 0
307
+
308
+ return output_toks [::- 1 ]
294
309
295
310
296
311
def maiyamok (sent : Union [str , List [str ]]) -> List [str ]:
@@ -303,7 +318,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
303
318
repetition. This function preprocesses Thai text by replacing
304
319
Maiyamok with a word being repeated.
305
320
306
- :param Union[str, List[str]] sent: input sentence (list or str )
321
+ :param Union[str, List[str]] sent: sentence (list or string )
307
322
:return: list of words
308
323
:rtype: List[str]
309
324
@@ -312,8 +327,8 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
312
327
313
328
from pythainlp.util import expand_maiyamok
314
329
315
- expand_maiyamok("เด็กๆกิน ")
316
- # output: ['เด็ก ', 'เด็ก ', 'กิน ']
330
+ expand_maiyamok("คนๆนก ")
331
+ # output: ['คน ', 'คน ', 'นก ']
317
332
"""
318
333
warn_deprecation (
319
334
"pythainlp.util.maiyamok" , "pythainlp.util.expand_maiyamok"
0 commit comments