@@ -30,6 +30,7 @@ def _tokenize(
30
30
model :str = "wtp-bert-mini" ,
31
31
tokenize :str = "sentence" ,
32
32
paragraph_threshold :float = 0.5 ,
33
+ style :str = 'newline' ,
33
34
)-> List [str ]:
34
35
global _MODEL_NAME ,_MODEL
35
36
if _MODEL_NAME != model :
@@ -38,15 +39,34 @@ def _tokenize(
38
39
if tokenize == "sentence" :
39
40
return _MODEL .split (text ,lang_code = lang_code )
40
41
else : # Paragraph
41
- return _MODEL .split (
42
- text ,
43
- lang_code = lang_code ,
44
- do_paragraph_segmentation = True ,
45
- paragraph_threshold = paragraph_threshold
42
+ if style == 'newline' :
43
+ return _MODEL .split (
44
+ text ,
45
+ lang_code = lang_code ,
46
+ do_paragraph_segmentation = True ,
47
+ paragraph_threshold = paragraph_threshold
48
+ )
49
+ elif style == 'opus100' :
50
+ return _MODEL .split (
51
+ text ,
52
+ lang_code = lang_code ,
53
+ do_paragraph_segmentation = True ,
54
+ threshold = paragraph_threshold ,
55
+ style = style ,
56
+ )
57
+ else :
58
+ raise ValueError (
59
+ f"""Segmentation style \" { style } \" not found.
60
+ It might be a typo; if not, please consult our document."""
46
61
)
47
62
48
-
49
def tokenize(
    text: str,
    size: str = "mini",
    tokenize: str = "sentence",
    paragraph_threshold: float = 0.5,
    style: str = 'newline',
) -> List[str]:
    """Segment text into sentences or paragraphs with a WtP (wtpsplit) model.

    Thin public wrapper that maps a user-facing model ``size`` alias to the
    underlying WtP checkpoint name and delegates to :func:`_tokenize`.

    :param str text: input text to segment
    :param str size: model size alias — ``"tiny"``, ``"base"``, ``"large"``,
        or ``"mini"`` (default); unrecognized values fall back to ``"mini"``
    :param str tokenize: segmentation level — ``"sentence"`` or anything
        else for paragraph segmentation
    :param float paragraph_threshold: probability threshold used when doing
        paragraph segmentation
    :param str style: paragraph segmentation style — ``'newline'`` (default)
        or ``'opus100'``; other values raise ``ValueError`` downstream
    :return: list of segmented sentences or paragraphs
    :rtype: List[str]
    """
    # Dispatch table instead of an if/elif chain; .get() preserves the
    # original behavior of defaulting unknown sizes to the "mini" model.
    # NOTE(review): the "base"/"large" entries were hidden in a diff-hunk
    # gap and are reconstructed from context — confirm against the repo.
    _size_to_model = {
        "tiny": "wtp-bert-tiny",
        "base": "wtp-canine-s-1l",
        "large": "wtp-canine-s-12l",
    }
    _model_load = _size_to_model.get(size, "wtp-bert-mini")
    return _tokenize(
        text,
        model=_model_load,
        tokenize=tokenize,
        paragraph_threshold=paragraph_threshold,
        style=style,
    )
0 commit comments