Skip to content

Commit 2d04829

Browse files
authored
Merge pull request #147 from bact/dev
Minor bug fixes + add test cases + update readme
2 parents 734b5b5 + cc601f1 commit 2d04829

File tree

8 files changed

+104
-60
lines changed

8 files changed

+104
-60
lines changed

README-pypi.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz
2020
- thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset
2121
- Sentiment classifier based on ULMFit and various product review datasets
2222
- Add ULMFit utility to PyThaiNLP
23-
- Add Thai romanization model thai2rom
23+
- Add Thai romanization model ThaiTransliterator
2424
- Retrain POS-tagging model
2525
- Improved word_tokenize (newmm, mm) and dict_word_tokenize
2626
- Documentation added

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
Thai Natural Language Processing in Python.
1212

13-
PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk`, but with focus on Thai language.
13+
PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with a focus on the Thai language.
1414

1515
PyThaiNLP supports Python 3.4+. Since version 1.7, PyThaiNLP deprecates its support for Python 2. Python 2 users can still use PyThaiNLP 1.6.
1616

@@ -44,7 +44,7 @@ Development release
4444
$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
4545
```
4646

47-
Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature.
47+
Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. The gensim and keras packages may also be needed for other modules that rely on these machine learning libraries.
4848

4949
## Documentation
5050

@@ -103,7 +103,7 @@ $ pip install pythainlp
103103
$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
104104
```
105105

106-
หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorhc ก่อน
106+
หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorch ก่อน มอดูลที่อาศัยการเรียนรู้ของเครื่องอื่นๆ อาจจำเป็นต้องติดตั้ง gensim และ keras ก่อนเช่นกัน
107107

108108
## เอกสารการใช้งาน
109109

pythainlp/corpus/ttc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def word_freqs():
2020
ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
2121
โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
2222
"""
23-
path = get_full_data_path("tnc_freq.txt") # try local copy first
23+
path = get_full_data_path("ttc_freq.txt") # try local copy first
2424
if not os.path.exists(path): # if fail, download from internet
2525
response = requests.get(_TCC_FREQ_URL)
2626
with open(path, "wb") as f:

pythainlp/number/thainum.py

+64-49
Original file line numberDiff line numberDiff line change
@@ -90,30 +90,38 @@ def bahttext(amount_number):
9090
"""
9191
Converts a number to Thai text and adds a suffix of "Baht" currency.
9292
93-
Similar to BAHTTEXT funcation in Excel
93+
Similar to BAHTTEXT function in Excel
9494
"""
95-
amount_number = number_format(amount_number, 2).replace(" ", "")
96-
pt = amount_number.find(".")
97-
number, fraction = "", ""
98-
amount_number1 = amount_number.split(".")
95+
ret = ""
9996

100-
if not pt:
101-
number = amount_number
97+
if amount_number is None:
98+
pass
99+
elif amount_number == 0:
100+
ret = "ศูนย์บาทถ้วน"
102101
else:
103-
amount_number = amount_number.split(".")
104-
number = amount_number[0]
105-
fraction = int(amount_number1[1])
102+
amount_number = number_format(amount_number, 2).replace(" ", "")
103+
pt = amount_number.find(".")
104+
number, fraction = "", ""
105+
amount_number1 = amount_number.split(".")
106106

107-
ret = ""
108-
number = ast.literal_eval(number.replace(",", ""))
109-
baht = num_to_thaiword(number)
110-
if baht != "":
111-
ret = "".join([ret, baht, "บาท"])
112-
satang = num_to_thaiword(fraction)
113-
if satang != "":
114-
ret = "".join([ret, satang, "สตางค์"])
115-
else:
116-
ret = "".join([ret, "ถ้วน"])
107+
if not pt:
108+
number = amount_number
109+
else:
110+
amount_number = amount_number.split(".")
111+
number = amount_number[0]
112+
fraction = int(amount_number1[1])
113+
114+
number = ast.literal_eval(number.replace(",", ""))
115+
116+
baht = num_to_thaiword(number)
117+
if baht != "":
118+
ret = "".join([ret, baht, "บาท"])
119+
120+
satang = num_to_thaiword(fraction)
121+
if satang != "" and satang != "ศูนย์":
122+
ret = "".join([ret, satang, "สตางค์"])
123+
else:
124+
ret = "".join([ret, "ถ้วน"])
117125

118126
return ret
119127

@@ -123,38 +131,45 @@ def num_to_thaiword(number):
123131
:param float number: a float number (with decimals) indicating a quantity
124132
:return: a text that indicates the full amount in word form, properly ending each digit with the right term.
125133
"""
126-
position_call = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
127-
number_call = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
128-
129134
ret = ""
130-
if number == 0:
131-
return ret
132-
if number > 1000000:
133-
ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
134-
number = int(math.fmod(number, 1000000))
135-
divider = 100000
136-
137-
pos = 0
138-
while number > 0:
139-
d = int(number / divider)
140-
if (divider == 10) and (d == 2):
141-
ret += "ยี่"
142-
elif (divider == 10) and (d == 1):
143-
ret += ""
144-
elif (divider == 1) and (d == 1) and (ret != ""):
145-
ret += "เอ็ด"
146-
else:
147-
ret += number_call[d]
148-
if d:
149-
ret += position_call[pos]
150-
else:
151-
ret += ""
152-
number = number % divider
153-
divider = divider / 10
154-
pos += 1
135+
136+
if number is None:
137+
pass
138+
elif number == 0:
139+
ret = "ศูนย์"
140+
else:
141+
_POS_CALL = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
142+
_NUM_CALL = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
143+
144+
if number > 1000000:
145+
ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
146+
number = int(math.fmod(number, 1000000))
147+
divider = 100000
148+
149+
pos = 0
150+
while number > 0:
151+
d = int(number / divider)
152+
153+
if (divider == 10) and (d == 2):
154+
ret += "ยี่"
155+
elif (divider == 10) and (d == 1):
156+
ret += ""
157+
elif (divider == 1) and (d == 1) and (ret != ""):
158+
ret += "เอ็ด"
159+
else:
160+
ret += _NUM_CALL[d]
161+
162+
if d:
163+
ret += _POS_CALL[pos]
164+
else:
165+
ret += ""
166+
167+
number = number % divider
168+
divider = divider / 10
169+
pos += 1
155170

156171
return ret
157172

158173

159174
if __name__ == "__main__":
160-
print(bahtext(4000.0))
175+
print(bahttext(4000.0))

pythainlp/number/wordtonum.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
"""
88
import re
99

10+
from pythainlp.tokenize import Tokenizer
11+
1012
_THAIWORD_NUMS = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split())
1113
_THAIWORD_UNITS = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split())
1214
_THAIWORD_NUMS_UNITS = _THAIWORD_NUMS | _THAIWORD_UNITS
@@ -34,12 +36,14 @@
3436
_NU_PAT = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?") # หกสิบ, ร้อยเอ็ด
3537
# assuming that the units are separated already
3638

39+
_TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS)
40+
3741

3842
def _thaiword_to_num(tokens):
3943
len_tokens = len(tokens)
4044

4145
if len_tokens == 0:
42-
return 0
46+
return None
4347

4448
if len_tokens == 1:
4549
return _THAI_INT_MAP[tokens[0]]
@@ -61,7 +65,17 @@ def _thaiword_to_num(tokens):
6165
return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:])
6266

6367

64-
def thaiword_to_num(tokens):
68+
def thaiword_to_num(thaiword):
69+
if not thaiword:
70+
return None
71+
72+
tokens = []
73+
if type(thaiword) == str:
74+
tokens = _TOKENIZER.word_tokenize(thaiword)
75+
elif type(thaiword) in (list, tuple, set, frozenset):
76+
for w in thaiword:
77+
tokens.extend(_TOKENIZER.word_tokenize(w))
78+
6579
res = []
6680
for tok in tokens:
6781
if tok in _THAIWORD_NUMS_UNITS:
@@ -72,4 +86,5 @@ def thaiword_to_num(tokens):
7286
res.extend([t for t in m.groups() if t]) # ตัด None ทิ้ง
7387
else:
7488
pass # should not be here
89+
7590
return _thaiword_to_num(res)

pythainlp/sentiment/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,12 @@ def sentiment(text, engine="old"):
4343
os.path.join(_SENTIMENT_PATH, "vocabulary.data"), "rb"
4444
) as in_strm:
4545
vocabulary = dill.load(in_strm)
46+
4647
with open(
4748
os.path.join(_SENTIMENT_PATH, "sentiment.data"), "rb"
4849
) as in_strm:
4950
classifier = dill.load(in_strm)
51+
5052
text = set(word_tokenize(text)) - _STOPWORDS
5153
featurized_test_sentence = {i: (i in text) for i in vocabulary}
5254

pythainlp/ulmfit/utils.py

-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from pythainlp.corpus import download, get_file
99
from pythainlp.tokenize import word_tokenize
1010

11-
1211
try:
1312
import numpy as np
1413
from fastai.text import *
@@ -43,7 +42,6 @@ def __init__(self, engine="newmm"):
4342
* newmm - dictionary-based, Maximum Matching algorithm + TCC
4443
* longest - dictionary-based, Longest Matching
4544
* icu - use ICU, dictionary-based
46-
* pylexto - use LexTo, dictionary-based
4745
* deepcut - use deepcut, language model-based
4846
"""
4947
self.engine = engine

tests/__init__.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
conceptnet,
88
countries,
99
provinces,
10+
remove,
1011
thai_negations,
1112
thai_stopwords,
1213
thai_syllables,
@@ -66,6 +67,7 @@ def test_corpus(self):
6667
self.assertIsNotNone(thai_stopwords())
6768
self.assertIsNotNone(thai_syllables())
6869
self.assertIsNotNone(thai_words())
70+
self.assertIsNotNone(remove("tnc_freq"))
6971

7072
def test_tnc(self):
7173
self.assertIsNotNone(tnc.word_freqs())
@@ -150,13 +152,25 @@ def test_number(self):
150152
bahttext(5611116.50),
151153
"ห้าล้านหกแสนหนึ่งหมื่นหนึ่งพันหนึ่งร้อยสิบหกบาทห้าสิบสตางค์",
152154
)
155+
self.assertEqual(bahttext(116), "หนึ่งร้อยสิบหกบาทถ้วน")
156+
self.assertEqual(bahttext(0), "ศูนย์บาทถ้วน")
157+
self.assertEqual(bahttext(None), "")
158+
153159
self.assertEqual(num_to_thaiword(112), "หนึ่งร้อยสิบสอง")
160+
self.assertEqual(num_to_thaiword(0), "ศูนย์")
161+
self.assertEqual(num_to_thaiword(None), "")
162+
163+
self.assertEqual(thaiword_to_num("ร้อยสิบสอง"), 112)
154164
self.assertEqual(
155165
thaiword_to_num(
156-
["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
166+
["หก", "ล้าน", "หก", "แสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
157167
),
158168
6666666,
159169
)
170+
self.assertEqual(thaiword_to_num("ยี่สิบ"), 20)
171+
self.assertEqual(thaiword_to_num("ศูนย์"), 0)
172+
self.assertEqual(thaiword_to_num(""), None)
173+
self.assertEqual(thaiword_to_num(None), None)
160174

161175
# ### pythainlp.rank
162176

@@ -181,7 +195,7 @@ def test_romanization_royin(self):
181195

182196
def test_sentiment(self):
183197
text = "เสียใจมาก"
184-
# self.assertEqual(sentiment(text, engine="old"), "neg")
198+
self.assertEqual(sentiment(text, engine="old"), "neg")
185199
# self.assertEqual(sentiment(text, engine="ulmfit"), "neg")
186200

187201
# ### pythainlp.soundex

0 commit comments

Comments
 (0)