Skip to content

Commit 73b17e3

Browse files
authored
Merge pull request #844 from pavaris-pm/dev
Add extra segmentation style for `paragraph_tokenize` function
2 parents fcf567c + a0053c1 commit 73b17e3

File tree

2 files changed

+47
-10
lines changed

2 files changed

+47
-10
lines changed

pythainlp/tokenize/core.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,12 @@ def sent_tokenize(
446446
return segments
447447

448448

449-
def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]:
449+
def paragraph_tokenize(
450+
text: str,
451+
engine: str = "wtp-mini",
452+
paragraph_threshold:float=0.5,
453+
style:str='newline',
454+
) -> List[List[str]]:
450455
"""
451456
Paragraph tokenizer.
452457
@@ -492,7 +497,13 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:
492497
else:
493498
_size = engine.split("-")[-1]
494499
from pythainlp.tokenize.wtsplit import tokenize as segment
495-
segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold)
500+
segments = segment(
501+
text,
502+
size=_size,
503+
tokenize="paragraph",
504+
paragraph_threshold=paragraph_threshold,
505+
style=style,
506+
)
496507

497508
else:
498509
raise ValueError(

pythainlp/tokenize/wtsplit.py

+34-8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def _tokenize(
3030
model:str="wtp-bert-mini",
3131
tokenize:str="sentence",
3232
paragraph_threshold:float=0.5,
33+
style:str='newline',
3334
)-> List[str]:
3435
global _MODEL_NAME,_MODEL
3536
if _MODEL_NAME != model:
@@ -38,15 +39,34 @@ def _tokenize(
3839
if tokenize=="sentence":
3940
return _MODEL.split(text,lang_code=lang_code)
4041
else: # Paragraph
41-
return _MODEL.split(
42-
text,
43-
lang_code=lang_code,
44-
do_paragraph_segmentation=True,
45-
paragraph_threshold=paragraph_threshold
42+
if style=='newline':
43+
return _MODEL.split(
44+
text,
45+
lang_code=lang_code,
46+
do_paragraph_segmentation=True,
47+
paragraph_threshold=paragraph_threshold
48+
)
49+
elif style=='opus100':
50+
return _MODEL.split(
51+
text,
52+
lang_code=lang_code,
53+
do_paragraph_segmentation=True,
54+
threshold=paragraph_threshold,
55+
style=style,
56+
)
57+
else:
58+
raise ValueError(
59+
f"""Segmentation style \"{style}\" not found.
59+
It might be a typo; if not, please consult our documentation."""
4661
)
4762

48-
49-
def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]:
63+
def tokenize(
64+
text:str,
65+
size:str="mini",
66+
tokenize:str="sentence",
67+
paragraph_threshold:float=0.5,
68+
style:str='newline',
69+
)-> List[str]:
5070
_model_load=""
5171
if size=="tiny":
5272
_model_load="wtp-bert-tiny"
@@ -56,4 +76,10 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_thres
5676
_model_load="wtp-canine-s-12l"
5777
else: # mini
5878
_model_load="wtp-bert-mini"
59-
return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)
79+
return _tokenize(
80+
text,
81+
model=_model_load,
82+
tokenize=tokenize,
83+
paragraph_threshold=paragraph_threshold,
84+
style=style,
85+
)

0 commit comments

Comments
 (0)