Skip to content

Commit 73b17e3

Browse files
authored
Merge pull request #844 from pavaris-pm/dev
Add extra segmentation style for `paragraph_tokenize` function
2 parents fcf567c + a0053c1 commit 73b17e3

File tree

2 files changed

+47
-10
lines changed

2 files changed

+47
-10
lines changed

pythainlp/tokenize/core.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,12 @@ def sent_tokenize(
446446
return segments
447447

448448

449-
def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]:
449+
def paragraph_tokenize(
450+
text: str,
451+
engine: str = "wtp-mini",
452+
paragraph_threshold:float=0.5,
453+
style:str='newline',
454+
) -> List[List[str]]:
450455
"""
451456
Paragraph tokenizer.
452457
@@ -492,7 +497,13 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:
492497
else:
493498
_size = engine.split("-")[-1]
494499
from pythainlp.tokenize.wtsplit import tokenize as segment
495-
segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold)
500+
segments = segment(
501+
text,
502+
size=_size,
503+
tokenize="paragraph",
504+
paragraph_threshold=paragraph_threshold,
505+
style=style,
506+
)
496507

497508
else:
498509
raise ValueError(

pythainlp/tokenize/wtsplit.py

+34-8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def _tokenize(
3030
model:str="wtp-bert-mini",
3131
tokenize:str="sentence",
3232
paragraph_threshold:float=0.5,
33+
style:str='newline',
3334
)-> List[str]:
3435
global _MODEL_NAME,_MODEL
3536
if _MODEL_NAME != model:
@@ -38,15 +39,34 @@ def _tokenize(
3839
if tokenize=="sentence":
3940
return _MODEL.split(text,lang_code=lang_code)
4041
else: # Paragraph
41-
return _MODEL.split(
42-
text,
43-
lang_code=lang_code,
44-
do_paragraph_segmentation=True,
45-
paragraph_threshold=paragraph_threshold
42+
if style=='newline':
43+
return _MODEL.split(
44+
text,
45+
lang_code=lang_code,
46+
do_paragraph_segmentation=True,
47+
paragraph_threshold=paragraph_threshold
48+
)
49+
elif style=='opus100':
50+
return _MODEL.split(
51+
text,
52+
lang_code=lang_code,
53+
do_paragraph_segmentation=True,
54+
threshold=paragraph_threshold,
55+
style=style,
56+
)
57+
else:
58+
raise ValueError(
59+
f"""Segmentation style \"{style}\" not found.
59+
It might be a typo; if not, please consult our documentation."""
4661
)
4762

48-
49-
def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]:
63+
def tokenize(
64+
text:str,
65+
size:str="mini",
66+
tokenize:str="sentence",
67+
paragraph_threshold:float=0.5,
68+
style:str='newline',
69+
)-> List[str]:
5070
_model_load=""
5171
if size=="tiny":
5272
_model_load="wtp-bert-tiny"
@@ -56,4 +76,10 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_thres
5676
_model_load="wtp-canine-s-12l"
5777
else: # mini
5878
_model_load="wtp-bert-mini"
59-
return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)
79+
return _tokenize(
80+
text,
81+
model=_model_load,
82+
tokenize=tokenize,
83+
paragraph_threshold=paragraph_threshold,
84+
style=style,
85+
)

0 commit comments

Comments
 (0)