"""
import re
from typing import Iterable, List, Union
+import copy

from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -198,7 +199,7 @@ def word_tokenize(
        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
-
+
    Join broken formatted numeric (e.g. time, decimals, IP addresses)::

        text = "เงิน1,234บาท19:32น 127.0.0.1"
@@ -322,17 +323,50 @@ def word_tokenize(
    return segments


+def indices_words(words):
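+    # Compute the inclusive (start, end) character span of each word in
+    # the concatenation of words, e.g. ["ab", "c"] -> [(0, 1), (2, 2)].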
+    indices = []
+    start_index = 0
+    for word in words:
+        end_index = start_index + len(word) - 1
+        indices.append((start_index, end_index))
+        start_index += len(word)
+
+    return indices
+
+
+def map_indices_to_words(index_list, sentences):
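+    # Regroup the spans in index_list into per-sentence word lists by
+    # slicing each span out of the sentence it falls in; n_sum tracks the
+    # running character offset of the sentences already consumed.
+    # e.g. [(0, 1), (2, 2)] over ["ab", "c"] -> [["ab"], ["c"]]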
+    result = []
+    c = copy.copy(index_list)
+    n_sum = 0
+    for sentence in sentences:
+        words = sentence
+        sentence_result = []
+        n = 0
+        for start, end in c:
+            if start > n_sum + len(words) - 1:
+                break
+            else:
+                word = sentence[start - n_sum:end + 1 - n_sum]
+                sentence_result.append(word)
+                n += 1
+
+        result.append(sentence_result)
+        n_sum += len(words)
+        for _ in range(n):
+            del c[0]
+    return result
+

def sent_tokenize(
-    text: str,
+    text: Union[str, List[str]],
    engine: str = DEFAULT_SENT_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Sentence tokenizer.

-    Tokenizes running text into "sentences"
+    Tokenizes running text into "sentences". Supports both a string
+    and a list of words (list of strings) as input.

-    :param str text: the text to be tokenized
+    :param text: the text (string) or list of words (list of strings)
+        to be tokenized
-    :param str engine: choose among *'crfcut'*, *'whitespace'*, \
-        *'whitespace+newline'*
+    :param str engine: choose among *'crfcut'*, *'whitespace'*, \
+        *'whitespace+newline'*, *'tltk'*, *'thaisum'*, and *'wtp'*
-    :return: list of split sentences
+    :return: list of split sentences (string input), or list of lists
+        of words grouped into sentences (list input)
@@ -394,38 +428,84 @@ def sent_tokenize(
         'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
    """

-    if not text or not isinstance(text, str):
+    if not text or not isinstance(text, (str, list)):
        return []

+    is_list_input = isinstance(text, list)
+
+    if is_list_input:
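+        # join the words back into running text: every sentence engine
+        # operates on a string; word boundaries are restored afterwards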
+        try:
+            original_text = "".join(text)
+        except TypeError:
+            # str.join raises TypeError if the list has non-string items
+            return []
+    else:
+        original_text = text
+
    segments = []

    if engine == "crfcut":
        from pythainlp.tokenize.crfcut import segment

-        segments = segment(text)
+        segments = segment(original_text)
+
+        if is_list_input:
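+            # regroup the input words according to crfcut's sentence spans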
+            word_indices = indices_words(text)
+            result = map_indices_to_words(word_indices, segments)
+            return result
    elif engine == "whitespace":
-        segments = re.split(r" +", text, flags=re.U)
+        segments = re.split(r" +", original_text, flags=re.U)
+        if is_list_input:
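+            # group the words into sentences, treating tokens that contain
+            # a space and no word characters as sentence boundaries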
+            result = []
+            _temp = []
+            for i, w in enumerate(text):
+                if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
+                    if _temp == []:
+                        continue
+                    result.append(_temp)
+                    _temp = []
+                else:
+                    _temp.append(w)
+                if i + 1 == len(text):
+                    result.append(_temp)
+            return result
    elif engine == "whitespace+newline":
-        segments = text.split()
+        segments = original_text.split()
+        if is_list_input:
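+            # same grouping, but any whitespace-only token
+            # (spaces or newlines) ends a sentence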
+            result = []
+            _temp = []
+            for i, w in enumerate(text):
+                if (
+                    (re.findall(r"\s", w) != [] or
+                     re.findall(r"\n", w) != []) and
+                    re.findall(r"\w", w) == []
+                ):
+                    if _temp == []:
+                        continue
+                    result.append(_temp)
+                    _temp = []
+                else:
+                    _temp.append(w)
+                if i + 1 == len(text):
+                    result.append(_temp)
+            return result
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment
-
-        segments = segment(text)
+        segments = segment(original_text)
    elif engine == "thaisum":
        from pythainlp.tokenize.thaisumcut import (
            ThaiSentenceSegmentor as segmentor,
        )
-
        segment = segmentor()
-        segments = segment.split_into_sentences(text)
+        segments = segment.split_into_sentences(original_text)
    elif engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment
-
-        segments = segment(text, size=_size, tokenize="sentence")
+        segments = segment(original_text, size=_size, tokenize="sentence")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -435,7 +515,12 @@ def sent_tokenize(
    if not keep_whitespace:
        segments = strip_whitespace(segments)

-    return segments
+    if is_list_input and engine not in ["crfcut"]:
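+        # list input with the engines that did not return above
+        # (tltk, thaisum, wtp): map the word spans back onto the
+        # sentences produced from the joined text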
+        word_indices = indices_words(text)
+        result = map_indices_to_words(word_indices, segments)
+        return result
+    else:
+        return segments


def paragraph_tokenize(
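A minimal sketch of the intended usage after this change (illustrative only; the whitespace engine is shown because its grouping is deterministic, whereas crfcut output depends on the CRF model):

    from pythainlp.tokenize import sent_tokenize

    # String input keeps the old contract: a flat list of sentence strings.
    sent_tokenize("ผมกินข้าว และเดินเล่น", engine="whitespace")
    # -> ['ผมกินข้าว', 'และเดินเล่น']

    # List-of-words input returns the words grouped by sentence;
    # the space-only token marks the boundary.
    sent_tokenize(["ผม", "กิน", "ข้าว", " ", "และ", "เดิน", "เล่น"],
                  engine="whitespace")
    # -> [['ผม', 'กิน', 'ข้าว'], ['และ', 'เดิน', 'เล่น']]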