diff --git a/README-pypi.md b/README-pypi.md
index 5cd150ab4..65d12f03b 100644
--- a/README-pypi.md
+++ b/README-pypi.md
@@ -20,7 +20,7 @@ PyThaiNLP features include Thai word and subword segmentations, soundex, romaniz
 - thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset
 - Sentiment classifier based on ULMFit and various product review datasets
 - Add ULMFit utility to PyThaiNLP
-- Add Thai romanization model thai2rom
+- Add Thai romanization model ThaiTransliterator
 - Retrain POS-tagging model
 - Improved word_tokenize (newmm, mm) and dict_word_tokenize
 - Documentation added
diff --git a/README.md b/README.md
index b4a73a901..2336141ae 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 
 Thai Natural Language Processing in Python.
 
-PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk`, but with focus on Thai language.
+PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with a focus on the Thai language.
 
 PyThaiNLP supports Python 3.4+. Since version 1.7, PyThaiNLP deprecates its support for Python 2. Python 2 users can still use PyThaiNLP 1.6.
 
@@ -44,7 +44,7 @@ Development release
 $ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
 ```
 
-Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature.
+Note: PyTorch is required for the ulmfit sentiment analyser; ```pip install torch``` is needed for the feature. The gensim and keras packages may also be needed for other modules that rely on these machine learning libraries.
 
 ## Documentation
 
@@ -103,7 +103,7 @@ $ pip install pythainlp
 ```
 $ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
 ```
 
-หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorhc ก่อน
+หมายเหตุ: เนื่องจาก ulmfit sentiment analyser ต้องใช้ PyTorch จึงต้อง ```pip install torch``` เพื่อติดตั้ง PyTorch ก่อน มอดูลอื่นๆ ที่อาศัยการเรียนรู้ของเครื่องอาจจำเป็นต้องติดตั้ง gensim และ keras ก่อนเช่นกัน
 
 ## เอกสารการใช้งาน
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 964441924..b3509d715 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -20,7 +20,7 @@ def word_freqs():
     ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
     โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
     """
-    path = get_full_data_path("tnc_freq.txt")  # try local copy first
+    path = get_full_data_path("ttc_freq.txt")  # try local copy first
     if not os.path.exists(path):  # if fail, download from internet
         response = requests.get(_TCC_FREQ_URL)
         with open(path, "wb") as f:
diff --git a/pythainlp/number/thainum.py b/pythainlp/number/thainum.py
index 1851bef36..4f274b6a2 100644
--- a/pythainlp/number/thainum.py
+++ b/pythainlp/number/thainum.py
@@ -90,30 +90,38 @@ def bahttext(amount_number):
     """
     Converts a number to Thai text and adds a suffix of "Baht" currency.
 
-    Similar to BAHTTEXT funcation in Excel
+    Similar to BAHTTEXT function in Excel
     """
-    amount_number = number_format(amount_number, 2).replace(" ", "")
-    pt = amount_number.find(".")
-    number, fraction = "", ""
-    amount_number1 = amount_number.split(".")
+    ret = ""
 
-    if not pt:
-        number = amount_number
+    if amount_number is None:
+        pass
+    elif amount_number == 0:
+        ret = "ศูนย์บาทถ้วน"
     else:
-        amount_number = amount_number.split(".")
-        number = amount_number[0]
-        fraction = int(amount_number1[1])
+        amount_number = number_format(amount_number, 2).replace(" ", "")
+        pt = amount_number.find(".")
+        number, fraction = "", ""
+        amount_number1 = amount_number.split(".")
 
-    ret = ""
-    number = ast.literal_eval(number.replace(",", ""))
-    baht = num_to_thaiword(number)
-    if baht != "":
-        ret = "".join([ret, baht, "บาท"])
-    satang = num_to_thaiword(fraction)
-    if satang != "":
-        ret = "".join([ret, satang, "สตางค์"])
-    else:
-        ret = "".join([ret, "ถ้วน"])
+        if not pt:
+            number = amount_number
+        else:
+            amount_number = amount_number.split(".")
+            number = amount_number[0]
+            fraction = int(amount_number1[1])
+
+        number = ast.literal_eval(number.replace(",", ""))
+
+        baht = num_to_thaiword(number)
+        if baht != "":
+            ret = "".join([ret, baht, "บาท"])
+
+        satang = num_to_thaiword(fraction)
+        if satang != "" and satang != "ศูนย์":
+            ret = "".join([ret, satang, "สตางค์"])
+        else:
+            ret = "".join([ret, "ถ้วน"])
 
     return ret
 
@@ -123,38 +131,45 @@ def num_to_thaiword(number):
     """
     :param float number: a float number (with decimals) indicating a quantity
     :return: a text that indicates the full amount in word form, properly ending each digit with the right term.
     """
-    position_call = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
-    number_call = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
-
     ret = ""
-    if number == 0:
-        return ret
-    if number > 1000000:
-        ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
-        number = int(math.fmod(number, 1000000))
-    divider = 100000
-
-    pos = 0
-    while number > 0:
-        d = int(number / divider)
-        if (divider == 10) and (d == 2):
-            ret += "ยี่"
-        elif (divider == 10) and (d == 1):
-            ret += ""
-        elif (divider == 1) and (d == 1) and (ret != ""):
-            ret += "เอ็ด"
-        else:
-            ret += number_call[d]
-        if d:
-            ret += position_call[pos]
-        else:
-            ret += ""
-        number = number % divider
-        divider = divider / 10
-        pos += 1
+
+    if number is None:
+        pass
+    elif number == 0:
+        ret = "ศูนย์"
+    else:
+        _POS_CALL = ["แสน", "หมื่น", "พัน", "ร้อย", "สิบ", ""]
+        _NUM_CALL = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
+
+        if number > 1000000:
+            ret += num_to_thaiword(int(number / 1000000)) + "ล้าน"
+            number = int(math.fmod(number, 1000000))
+        divider = 100000
+
+        pos = 0
+        while number > 0:
+            d = int(number / divider)
+
+            if (divider == 10) and (d == 2):
+                ret += "ยี่"
+            elif (divider == 10) and (d == 1):
+                ret += ""
+            elif (divider == 1) and (d == 1) and (ret != ""):
+                ret += "เอ็ด"
+            else:
+                ret += _NUM_CALL[d]
+
+            if d:
+                ret += _POS_CALL[pos]
+            else:
+                ret += ""
+
+            number = number % divider
+            divider = divider / 10
+            pos += 1
 
     return ret
 
 if __name__ == "__main__":
-    print(bahtext(4000.0))
+    print(bahttext(4000.0))
diff --git a/pythainlp/number/wordtonum.py b/pythainlp/number/wordtonum.py
index 7e60ccb41..1242b5495 100644
--- a/pythainlp/number/wordtonum.py
+++ b/pythainlp/number/wordtonum.py
@@ -7,6 +7,8 @@
 """
 import re
 
+from pythainlp.tokenize import Tokenizer
+
 _THAIWORD_NUMS = set("ศูนย์ หนึ่ง เอ็ด สอง ยี่ สาม สี่ ห้า หก เจ็ด แปด เก้า".split())
 _THAIWORD_UNITS = set("สิบ ร้อย พัน หมื่น แสน ล้าน".split())
 _THAIWORD_NUMS_UNITS = _THAIWORD_NUMS | _THAIWORD_UNITS
@@ -34,12 +36,14 @@
 _NU_PAT = re.compile("(.+)?(สิบ|ร้อย|พัน|หมื่น|แสน|ล้าน)(.+)?")  # หกสิบ, ร้อยเอ็ด
 # assuming that the units are separated already
 
+_TOKENIZER = Tokenizer(custom_dict=_THAIWORD_NUMS_UNITS)
+
 
 def _thaiword_to_num(tokens):
     len_tokens = len(tokens)
 
     if len_tokens == 0:
-        return 0
+        return None
 
     if len_tokens == 1:
         return _THAI_INT_MAP[tokens[0]]
@@ -61,7 +65,17 @@
     return _THAI_INT_MAP[a] * _THAI_INT_MAP[b] + _thaiword_to_num(tokens[2:])
 
 
-def thaiword_to_num(tokens):
+def thaiword_to_num(thaiword):
+    if not thaiword:
+        return None
+
+    tokens = []
+    if type(thaiword) == str:
+        tokens = _TOKENIZER.word_tokenize(thaiword)
+    elif type(thaiword) in (list, tuple, set, frozenset):
+        for w in thaiword:
+            tokens.extend(_TOKENIZER.word_tokenize(w))
+
     res = []
     for tok in tokens:
         if tok in _THAIWORD_NUMS_UNITS:
@@ -72,4 +86,5 @@
             res.extend([t for t in m.groups() if t])  # ตัด None ทิ้ง
         else:
             pass  # should not be here
+
     return _thaiword_to_num(res)
diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py
index 5a61bc935..dee3e86bf 100644
--- a/pythainlp/sentiment/__init__.py
+++ b/pythainlp/sentiment/__init__.py
@@ -43,10 +43,12 @@ def sentiment(text, engine="old"):
             os.path.join(_SENTIMENT_PATH, "vocabulary.data"), "rb"
         ) as in_strm:
             vocabulary = dill.load(in_strm)
+
         with open(
             os.path.join(_SENTIMENT_PATH, "sentiment.data"), "rb"
         ) as in_strm:
             classifier = dill.load(in_strm)
+
         text = set(word_tokenize(text)) - _STOPWORDS
         featurized_test_sentence = {i: (i in text) for i in vocabulary}
 
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
index 295aa52f5..17e26a0a6 100644
--- a/pythainlp/ulmfit/utils.py
+++ b/pythainlp/ulmfit/utils.py
@@ -8,7 +8,6 @@
 from pythainlp.corpus import download, get_file
 from pythainlp.tokenize import word_tokenize
 
-
 try:
     import numpy as np
     from fastai.text import *
@@ -43,7 +42,6 @@ def __init__(self, engine="newmm"):
         * newmm - dictionary-based, Maximum Matching algorithm + TCC
         * longest - dictionary-based, Longest Matching
         * icu - use ICU, dictionary-based
-        * pylexto - use LexTo, dictionary-based
         * deepcut - use deepcut, language model-based
         """
         self.engine = engine
diff --git a/tests/__init__.py b/tests/__init__.py
index e1bd59312..c2d15362a 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -7,6 +7,7 @@
     conceptnet,
     countries,
     provinces,
+    remove,
     thai_negations,
     thai_stopwords,
     thai_syllables,
@@ -66,6 +67,7 @@ def test_corpus(self):
         self.assertIsNotNone(thai_stopwords())
         self.assertIsNotNone(thai_syllables())
         self.assertIsNotNone(thai_words())
+        self.assertIsNotNone(remove("tnc_freq"))
 
     def test_tnc(self):
         self.assertIsNotNone(tnc.word_freqs())
@@ -150,13 +152,25 @@ def test_number(self):
             bahttext(5611116.50),
             "ห้าล้านหกแสนหนึ่งหมื่นหนึ่งพันหนึ่งร้อยสิบหกบาทห้าสิบสตางค์",
         )
+        self.assertEqual(bahttext(116), "หนึ่งร้อยสิบหกบาทถ้วน")
+        self.assertEqual(bahttext(0), "ศูนย์บาทถ้วน")
+        self.assertEqual(bahttext(None), "")
+        self.assertEqual(num_to_thaiword(112), "หนึ่งร้อยสิบสอง")
+        self.assertEqual(num_to_thaiword(0), "ศูนย์")
+        self.assertEqual(num_to_thaiword(None), "")
+
+        self.assertEqual(thaiword_to_num("ร้อยสิบสอง"), 112)
 
         self.assertEqual(
             thaiword_to_num(
-                ["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
+                ["หก", "ล้าน", "หก", "แสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"]
             ),
             6666666,
         )
+        self.assertEqual(thaiword_to_num("ยี่สิบ"), 20)
+        self.assertEqual(thaiword_to_num("ศูนย์"), 0)
+        self.assertEqual(thaiword_to_num(""), None)
+        self.assertEqual(thaiword_to_num(None), None)
 
     # ### pythainlp.rank
 
@@ -181,7 +195,7 @@ def test_romanization_royin(self):
 
     def test_sentiment(self):
         text = "เสียใจมาก"
-        # self.assertEqual(sentiment(text, engine="old"), "neg")
+        self.assertEqual(sentiment(text, engine="old"), "neg")
         # self.assertEqual(sentiment(text, engine="ulmfit"), "neg")
 
     # ### pythainlp.soundex
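The following usage sketch is not part of the patch; it only illustrates the behaviour the new tests assert. It assumes that `bahttext`, `num_to_thaiword`, and `thaiword_to_num` are importable from `pythainlp.number`, as the test module implies; the expected values are copied from the assertions above.

# Usage sketch based on the test assertions in this patch.
# Assumption: these functions are exposed via pythainlp.number, as the tests imply.
from pythainlp.number import bahttext, num_to_thaiword, thaiword_to_num

# bahttext() now treats 0 and None as explicit special cases.
print(bahttext(5611116.50))  # ห้าล้านหกแสนหนึ่งหมื่นหนึ่งพันหนึ่งร้อยสิบหกบาทห้าสิบสตางค์
print(bahttext(0))           # ศูนย์บาทถ้วน
print(bahttext(None))        # "" (empty string)

# num_to_thaiword() returns "ศูนย์" for zero and "" for None.
print(num_to_thaiword(112))  # หนึ่งร้อยสิบสอง
print(num_to_thaiword(0))    # ศูนย์

# thaiword_to_num() now accepts a plain string (or an iterable of strings)
# and tokenizes it internally with the dictionary-based Tokenizer.
print(thaiword_to_num("ร้อยสิบสอง"))  # 112
print(thaiword_to_num("ยี่สิบ"))  # 20
print(thaiword_to_num(None))  # None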