Merge pull request #147 from bact/dev

bact · web-flow · commit 2d0482979c26 · 2018-11-04T17:06:18.000+07:00
Minor bug fixes + add test cases + update readme
diff --git a/README-pypi.md b/README-pypi.md
@@ -20,7 +20,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz
 - thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset
 - Sentiment classifier based on ULMFit and various product review datasets
 - Add ULMFit utility to PyThaiNLP
-- Add Thai romanization model thai2rom
+- Add Thai romanization model ThaiTransliterator
 - Retrain POS-tagging model
 - Improved word_tokenize (newmm, mm) and dict_word_tokenize
 - Documentation added
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@
 
 Thai Natural Language Processing in Python.
 
-PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk`, but with focus on Thai language.
+PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with a focus on the Thai language.
 
 PyThaiNLP supports Python 3.4+. Since version 1.7, PyThaiNLP deprecates its support for Python 2. Python 2 users can still use PyThaiNLP 1.6.
 
@@ -44,7 +44,7 @@ Development release
 $ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
 ```
 
-Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature.
+Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. gensim and keras packages may also be needed for other modules that rely on these machine learning libraries.
 
 ## Documentation
 
@@ -103,7 +103,7 @@ $ pip install pythainlp
 $ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
 ```
 
-หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorhc ก่อน
+หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorch ก่อน มอดูลที่อาศัยการเรียนรู้ของเครื่องอื่นๆ อาจจำเป็นต้องติดตั้ง gensim และ keras ก่อนเช่นกัน
 
 ## เอกสารการใช้งาน
 
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
@@ -20,7 +20,7 @@ def word_freqs():
     ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
     โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
     """
-    path = get_full_data_path("tnc_freq.txt")  # try local copy first
+    path = get_full_data_path("ttc_freq.txt")  # try local copy first
     if not os.path.exists(path):  # if fail, download from internet
         response = requests.get(_TCC_FREQ_URL)
         with open(path, "wb") as f:
diff --git a/pythainlp/number/thainum.py b/pythainlp/number/thainum.py
@@ -90,30 +90,38 @@ def bahttext(amount_number):
     """
     Converts a number to Thai text and adds a suffix of "Baht" currency.
 
-    Similar to BAHTTEXT funcation in Excel
+    Similar to BAHTTEXT function in Excel
     """
-    amount_number = number_format(amount_number, 2).replace(" ", "")
-    pt = amount_number.find(".")
-    number, fraction = "", ""
-    amount_number1 = amount_number.split(".")
+    ret = ""
 
-    if not pt:
-        number = amount_number
+    if amount_number is None:
+        pass
+    elif amount_number == 0:
+        ret = "ศูนย์บาทถ้วน"
     else:
-        amount_number = amount_number.split(".")
-        number = amount_number[0]
-        fraction = int(amount_number1[1])
+        amount_number = number_format(amount_number, 2).replace(" ", "")
+        pt = amount_number.find(".")
+        number, fraction = "", ""
+        amount_number1 = amount_number.split(".")
 
-    ret = ""
-    number = ast.literal_eval(number.replace(",", ""))
-    baht = num_to_thaiword(number)
-    if baht != "":
-        ret = "".join([ret, baht, "บาท"])
-    satang = num_to_thaiword(fraction)
-    if satang != "":
-        ret = "".join([ret, satang, "สตางค์"])
-    else:
-        ret = "".join([ret, "ถ้วน"])
+        if not pt:
+            number = amount_number
+        else:
+            amount_number = amount_number.split(".")
+            number = amount_number[0]
+            fraction = int(amount_number1[1])
+
+        number = ast.literal_eval(number.replace(",", ""))
+
+        baht = num_to_thaiword(number)
+        if baht != "":
+            ret = "".join([ret, baht, "บาท"])
+
+        satang = num_to_thaiword(fraction)
+        if satang != "" and satang != "ศูนย์":
+            ret = "".join([ret, satang, "สตางค์"])
+        else:
+            ret = "".join([ret, "ถ้วน"])
 
     return ret
 
@@ -123,38 +131,45 @@ def num_to_thaiword(number):
     :param float number: a float number (with decimals) indicating a quantity
     :return: a text that indicates the full amount in word form, properly ending each digit with the right term.
     """
-    position_call = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
-    number_call = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
-
     ret = ""
-    if number == 0:
-        return ret
-    if number > 1000000:
-        ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
-        number = int(math.fmod(number, 1000000))
-    divider = 100000
-
-    pos = 0
-    while number > 0:
-        d = int(number / divider)
-        if (divider == 10) and (d == 2):
-            ret += "ยี่"
-        elif (divider == 10) and (d == 1):
-            ret += ""
-        elif (divider == 1) and (d == 1) and (ret != ""):
-            ret += "เอ็ด"
-        else:
-            ret += number_call[d]
-        if d:
-            ret += position_call[pos]
-        else:
-            ret += ""
-        number = number % divider
-        divider = divider / 10
-        pos += 1
+
+    if number is None:
+        pass
+    elif number == 0:
+        ret = "ศูนย์"
+    else:
+        _POS_CALL = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
+        _NUM_CALL = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
+
+        if number > 1000000:
+            ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
+            number = int(math.fmod(number, 1000000))
+        divider = 100000
+
+        pos = 0
+        while number > 0:
+            d = int(number / divider)
+
+            if (divider == 10) and (d == 2):
+                ret += "ยี่"
+            elif (divider == 10) and (d == 1):
+                ret += ""
+            elif (divider == 1) and (d == 1) and (ret != ""):
+                ret += "เอ็ด"
+            else:
+                ret += _NUM_CALL[d]
+
+            if d:
+                ret += _POS_CALL[pos]
+            else:
+                ret += ""
+
+            number = number % divider
+            divider = divider / 10
+            pos += 1
 
     return ret
 
 
 if __name__ == "__main__":
-    print(bahtext(4000.0))
+    print(bahttext(4000.0))
diff --git a/pythainlp/number/wordtonum.py b/pythainlp/number/wordtonum.py
@@ -7,6 +7,8 @@
 """
 import re
 
+from pythainlp.tokenize import Tokenizer
+
 _THAIWORD_NUMS = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split())
 _THAIWORD_UNITS = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split())
 _THAIWORD_NUMS_UNITS = _THAIWORD_NUMS | _THAIWORD_UNITS
@@ -34,12 +36,14 @@
 _NU_PAT = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?")  # หกสิบ, ร้อยเอ็ด
 # assuming that the units are separated already
 
+_TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS)
+
 
 def _thaiword_to_num(tokens):
     len_tokens = len(tokens)
 
     if len_tokens == 0:
-        return 0
+        return None
 
     if len_tokens == 1:
         return _THAI_INT_MAP[tokens[0]]
@@ -61,7 +65,17 @@ def _thaiword_to_num(tokens):
         return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:])
 
 
-def thaiword_to_num(tokens):
+def thaiword_to_num(thaiword):
+    if not thaiword:
+        return None
+
+    tokens = []
+    if type(thaiword) == str:
+        tokens = _TOKENIZER.word_tokenize(thaiword)
+    elif type(thaiword) in (list, tuple, set, frozenset):
+        for w in thaiword:
+            tokens.extend(_TOKENIZER.word_tokenize(w))
+
     res = []
     for tok in tokens:
         if tok in _THAIWORD_NUMS_UNITS:
@@ -72,4 +86,5 @@ def thaiword_to_num(tokens):
                 res.extend([t for t in m.groups() if t])  # ตัด None ทิ้ง
             else:
                 pass  # should not be here
+
     return _thaiword_to_num(res)
diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py
@@ -43,10 +43,12 @@ def sentiment(text, engine="old"):
             os.path.join(_SENTIMENT_PATH, "vocabulary.data"), "rb"
         ) as in_strm:
             vocabulary = dill.load(in_strm)
+
         with open(
             os.path.join(_SENTIMENT_PATH, "sentiment.data"), "rb"
         ) as in_strm:
             classifier = dill.load(in_strm)
+
         text = set(word_tokenize(text)) - _STOPWORDS
         featurized_test_sentence = {i: (i in text) for i in vocabulary}
 
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
@@ -8,7 +8,6 @@
 from pythainlp.corpus import download, get_file
 from pythainlp.tokenize import word_tokenize
 
-
 try:
     import numpy as np
     from fastai.text import *
@@ -43,7 +42,6 @@ def __init__(self, engine="newmm"):
             * newmm - dictionary-based, Maximum Matching algorithm + TCC
             * longest - dictionary-based, Longest Matching
             * icu - use ICU, dictionary-based
-            * pylexto - use LexTo, dictionary-based
             * deepcut - use deepcut, language model-based
         """
         self.engine = engine
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -7,6 +7,7 @@
     conceptnet,
     countries,
     provinces,
+    remove,
     thai_negations,
     thai_stopwords,
     thai_syllables,
@@ -66,6 +67,7 @@ def test_corpus(self):
         self.assertIsNotNone(thai_stopwords())
         self.assertIsNotNone(thai_syllables())
         self.assertIsNotNone(thai_words())
+        self.assertIsNotNone(remove("tnc_freq"))
 
     def test_tnc(self):
         self.assertIsNotNone(tnc.word_freqs())
@@ -150,13 +152,25 @@ def test_number(self):
             bahttext(5611116.50),
             "ห้าล้านหกแสนหนึ่งหมื่นหนึ่งพันหนึ่งร้อยสิบหกบาทห้าสิบสตางค์",
         )
+        self.assertEqual(bahttext(116), "หนึ่งร้อยสิบหกบาทถ้วน")
+        self.assertEqual(bahttext(0), "ศูนย์บาทถ้วน")
+        self.assertEqual(bahttext(None), "")
+
         self.assertEqual(num_to_thaiword(112), "หนึ่งร้อยสิบสอง")
+        self.assertEqual(num_to_thaiword(0), "ศูนย์")
+        self.assertEqual(num_to_thaiword(None), "")
+
+        self.assertEqual(thaiword_to_num("ร้อยสิบสอง"), 112)
         self.assertEqual(
             thaiword_to_num(
-                ["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
+                ["หก", "ล้าน", "หก", "แสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
             ),
             6666666,
         )
+        self.assertEqual(thaiword_to_num("ยี่สิบ"), 20)
+        self.assertEqual(thaiword_to_num("ศูนย์"), 0)
+        self.assertEqual(thaiword_to_num(""), None)
+        self.assertEqual(thaiword_to_num(None), None)
 
     # ### pythainlp.rank
 
@@ -181,7 +195,7 @@ def test_romanization_royin(self):
 
     def test_sentiment(self):
         text = "เสียใจมาก"
-        # self.assertEqual(sentiment(text, engine="old"), "neg")
+        self.assertEqual(sentiment(text, engine="old"), "neg")
         # self.assertEqual(sentiment(text, engine="ulmfit"), "neg")
 
     # ### pythainlp.soundex