Skip to content

Commit 1b2f39b

Browse files
authored
Merge pull request #695 from PyThaiNLP/dev
PyThaiNLP v3.1.0-dev1
2 parents 4862cf1 + a700564 commit 1b2f39b

File tree

6 files changed

+92
-14
lines changed

6 files changed

+92
-14
lines changed

docker_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,5 @@ OSKut==1.3
2929
nlpo3==1.2.2
3030
thai-nner==0.3
3131
spacy==2.3.*
32-
wunsen==0.0.1
32+
wunsen==0.0.3
3333
khanaa==0.0.6

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# Copyright (C) 2016-2022 PyThaiNLP Project
55
# URL: <https://pythainlp.github.io/>
66
# For license information, see LICENSE
7-
__version__ = "3.1.0-dev0"
7+
__version__ = "3.1.0-dev1"
88

99
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
1010

pythainlp/transliterate/wunsen.py

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
3+
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
4+
to Thai text
45
By Wunsen
56
67
:See Also:
@@ -12,25 +13,40 @@
1213

1314
class WunsenTransliterate:
1415
"""
15-
Transliterating Japanese/Korean/Vietnamese romanization text to Thai text
16+
Transliterating Japanese/Korean/Mandarin/Vietnamese romanization text
17+
to Thai text
1618
by Wunsen
1719
1820
:See Also:
1921
* `GitHub \
2022
<https://github.com/cakimpei/wunsen>`_
2123
"""
24+
2225
def __init__(self) -> None:
2326
self.thap_value = None
2427
self.lang = None
2528
self.jp_input = None
29+
self.zh_sandhi = None
30+
self.system = None
2631

27-
def transliterate(self, text: str, lang: str, jp_input: str = None):
32+
def transliterate(
33+
self,
34+
text: str,
35+
lang: str,
36+
jp_input: str = None,
37+
zh_sandhi: bool = None,
38+
system: str = None,
39+
):
2840
"""
2941
Use Wunsen for transliteration
3042
3143
:param str text: text to be transliterated to Thai text.
3244
:param str lang: source language
3345
:param str jp_input: Japanese input method (for Japanese only)
46+
:param bool zh_sandhi: Mandarin third-tone sandhi option
47+
(for Mandarin only)
48+
:param str system: transliteration system (for Japanese and
49+
Mandarin only)
3450
3551
:return: Thai text
3652
:rtype: str
@@ -39,8 +55,22 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
3955
* *jp* - Japanese (from Hepburn romanization)
4056
* *ko* - Korean (from Revised Romanization)
4157
* *vi* - Vietnamese (Latin script)
58+
* *zh* - Mandarin (from Hanyu Pinyin)
4259
:Options for jp_input:
4360
* *Hepburn-no diacritic* - Hepburn-no diacritic (without macron)
61+
:Options for zh_sandhi:
62+
* *True* - apply third tone sandhi rule
63+
* *False* - do not apply third tone sandhi rule
64+
:Options for system:
65+
* *ORS61* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
66+
(สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
67+
* *RI35* - for Japanese หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น
68+
(ราชบัณฑิตยสถาน พ.ศ. 2535)
69+
* *RI49* - for Mandarin หลักเกณฑ์การทับศัพท์ภาษาจีน
70+
(ราชบัณฑิตยสถาน พ.ศ. 2549)
71+
* *THC43* - for Mandarin เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดาริน
72+
ด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสาร
73+
ภาษาจีน พ.ศ. 2543)
4474
4575
:Example:
4676
::
@@ -58,24 +88,56 @@ def transliterate(self, text: str, lang: str, jp_input: str = None):
5888
)
5989
# output: 'โอฮาโย'
6090
91+
wt.transliterate("ohayō", lang="jp", system="RI35")
92+
# output: 'โอะฮะโย'
93+
6194
wt.transliterate("annyeonghaseyo", lang="ko")
6295
# output: 'อันนย็องฮาเซโย'
6396
6497
wt.transliterate("xin chào", lang="vi")
6598
# output: 'ซีน จ่าว'
99+
100+
wt.transliterate("ni3 hao3", lang="zh")
101+
# output: 'หนี เห่า'
102+
103+
wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False)
104+
# output: 'หนี่ เห่า'
105+
106+
wt.transliterate("ni3 hao3", lang="zh", system="RI49")
107+
# output: 'หนี ห่าว'
66108
"""
67-
if self.lang != lang or self.jp_input != jp_input:
109+
if (
110+
self.lang != lang
111+
or self.jp_input != jp_input
112+
or self.zh_sandhi != zh_sandhi
113+
or self.system != system
114+
):
68115
if lang == "jp":
69-
if jp_input is None:
70-
self.thap_value = ThapSap("ja")
71-
else:
72-
self.thap_value = ThapSap("ja", input=jp_input)
73116
self.jp_input = jp_input
117+
self.zh_sandhi = None
118+
self.system = system
119+
elif lang == "zh":
120+
self.jp_input = None
121+
self.zh_sandhi = zh_sandhi
122+
self.system = system
74123
elif lang == "ko" or lang == "vi":
75124
self.jp_input = None
76-
self.thap_value = ThapSap(lang)
125+
self.zh_sandhi = None
126+
self.system = None
77127
else:
78128
raise NotImplementedError(
79129
"The %s language is not implemented." % lang
80130
)
131+
self.lang = lang
132+
input_lang = lang
133+
if input_lang == "jp":
134+
input_lang = "ja"
135+
setting = {}
136+
if self.jp_input is not None:
137+
setting.update({"input": self.jp_input})
138+
if self.zh_sandhi is not None:
139+
setting.update({"option": {"sandhi": self.zh_sandhi}})
140+
if self.system is not None:
141+
setting.update({"system": self.system})
142+
self.thap_value = ThapSap(input_lang, **setting)
81143
return self.thap_value.thap(text)

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.0.8
2+
current_version = 3.1.0
33
commit = True
44
tag = True
55
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,13 @@
108108
"nlpo3>=1.2.2",
109109
"onnxruntime>=1.10.0",
110110
"thai_nner",
111-
"wunsen>=0.0.1"
111+
"wunsen>=0.0.3"
112112
],
113113
}
114114

115115
setup(
116116
name="pythainlp",
117-
version="3.1.0-dev0",
117+
version="3.1.0-dev1",
118118
description="Thai Natural Language Processing library",
119119
long_description=readme,
120120
long_description_content_type="text/markdown",

tests/test_transliterate.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,10 @@ def test_transliterate_wunsen(self):
171171
),
172172
'โอฮาโย'
173173
)
174+
self.assertEqual(
175+
wt.transliterate("ohayō", lang="jp", system="RI35"),
176+
'โอะฮะโย'
177+
)
174178
self.assertEqual(
175179
wt.transliterate("annyeonghaseyo", lang="ko"),
176180
'อันนย็องฮาเซโย'
@@ -179,6 +183,18 @@ def test_transliterate_wunsen(self):
179183
wt.transliterate("xin chào", lang="vi"),
180184
'ซีน จ่าว'
181185
)
186+
self.assertEqual(
187+
wt.transliterate("ni3 hao3", lang="zh"),
188+
'หนี เห่า'
189+
)
190+
self.assertEqual(
191+
wt.transliterate("ni3 hao3", lang="zh", zh_sandhi=False),
192+
'หนี่ เห่า'
193+
)
194+
self.assertEqual(
195+
wt.transliterate("ni3 hao3", lang="zh", system="RI49"),
196+
'หนี ห่าว'
197+
)
182198
with self.assertRaises(NotImplementedError):
183199
wt.transliterate("xin chào", lang="vii")
184200

0 commit comments

Comments
 (0)