From 0780c38fd139dd29e16e739d6fc34736eac450c2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:07:36 +0700 Subject: [PATCH 01/19] More test cases --- pythainlp/tokenize/__init__.py | 2 +- tests/__init__.py | 69 ++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e81c3214d..e7ea1f984 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -99,7 +99,7 @@ def sent_tokenize(text, engine="whitespace+newline"): if engine == "whitespace": sentences = nltk.tokenize.WhitespaceTokenizer().tokenize(text) else: # default, use whitespace + newline - sentences = re.sub(r"\n+|\s+", "|", text).split("|") + sentences = re.sub(r"\n+|\s+", "|", text.strip()).split("|") return sentences diff --git a/tests/__init__.py b/tests/__init__.py index ec4a492d6..16fe1d176 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -36,7 +36,17 @@ from pythainlp.spell import correct, spell from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents -from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize +from pythainlp.tokenize import ( + FROZEN_DICT_TRIE, + dict_word_tokenize, + etcc, + multi_cut, + sent_tokenize, + subword_tokenize, + syllable_tokenize, + tcc, + word_tokenize, +) from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( @@ -285,9 +295,38 @@ def test_pos_tag(self): # ### pythainlp.tokenize - def test_syllable_tokenize(self): - self.assertEqual( - syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + def test_dict_word_tokenize(self): + self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) + self.assertIsNotNone( + dict_word_tokenize("รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="longest" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="mm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="XX" + ) + ) + + def test_etcc(self): + self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + self.assertIsNotNone( + etcc.etcc( + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์มีแขนขาหน้าหัวเราะ" + ) ) def test_word_tokenize(self): @@ -295,6 +334,9 @@ def test_word_tokenize(self): word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertEqual(word_tokenize(""), []) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="ulmfit")) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): self.assertEqual( @@ -308,6 +350,8 @@ def test_word_tokenize_mm(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานคร")) + def test_word_tokenize_newmm(self): self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"), @@ -332,12 +376,23 @@ def test_word_tokenize_longest_matching(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + def test_sent_tokenize(self): + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", 
engine="whitespace"), ["รักน้ำ", "รักปลา"] + ) + self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) + + def test_subword_tokenize(self): + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) + + def test_syllable_tokenize(self): + self.assertEqual( + syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + def test_tcc(self): self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - def test_etcc(self): - self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") - # ### pythainlp.transliterate def test_romanize(self): From 612114806cc13cc7d6a24a3a99816f980d4f10f2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:24:42 +0700 Subject: [PATCH 02/19] Add English test cases --- tests/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 16fe1d176..d894838dd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -298,26 +298,26 @@ def test_pos_tag(self): def test_dict_word_tokenize(self): self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) self.assertIsNotNone( - dict_word_tokenize("รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE) + dict_word_tokenize("รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="newmm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="longest" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="mm" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="mm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="XX" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="XX" ) ) @@ -325,7 +325,8 @@ def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์มีแขนขาหน้าหัวเราะ" + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" ) ) @@ -350,7 +351,7 @@ def test_word_tokenize_mm(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานคร")) + self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS")) def test_word_tokenize_newmm(self): self.assertEqual( From 55774991c3c97090e5cc938fcccbb0346a4c8adf Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:33:34 +0700 Subject: [PATCH 03/19] more test cases for spellchecker --- tests/__init__.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index d894838dd..ebca4e778 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -34,6 +34,7 @@ from pythainlp.sentiment import sentiment from pythainlp.soundex import lk82, metasound, soundex, udom83 from pythainlp.spell import correct, spell +from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import ( @@ -252,6 +253,14 @@ def test_spell(self): self.assertEqual(correct(""), "") self.assertEqual(correct(None), "") + self.assertIsNotNone(dictionary()) + 
self.assertGreaterEqual(prob("มี"), 0) + self.assertIsNotNone(known(["เกิด", "abc", ""])) + + checker = NorvigSpellChecker(dict_filter="") + self.assertIsNotNone(checker.dictionary()) + self.assertGreaterEqual(checker.prob("มี"), 0) + # ### pythainlp.summarize def test_summarize(self): @@ -325,8 +334,8 @@ def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + - "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" ) ) From a7689ab1991b3fd967daab744e9d4d94d82db6fa Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 20:32:57 +0700 Subject: [PATCH 04/19] more wordnet test cases --- tests/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index ebca4e778..c1c830254 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest from collections import Counter +from nltk.corpus import wordnet as wn from pythainlp.collation import collate from pythainlp.corpus import ( @@ -97,10 +98,18 @@ def test_ttc(self): self.assertIsNotNone(ttc.word_freqs()) def test_wordnet(self): + self.assertIsNotNone(wordnet.langs()) + self.assertEqual( wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"] ) - self.assertIsNotNone(wordnet.langs()) + self.assertIsNotNone(wordnet.synsets("นก")) + self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ)) + + self.assertIsNotNone(wordnet.lemmas("นก")) + self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADJ)) + + self.assertEqual(wordnet.morphy("dogs"), "dog") # ### pythainlp.date From 7a1f4e40dd6f7f6dbf307f356f624affabb3e021 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 20:51:40 +0700 Subject: [PATCH 05/19] more romanize() (royin) test cases --- pythainlp/transliterate/royin.py | 3 +++ tests/__init__.py | 23 +++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 69a3671d9..415e0fce3 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -168,6 +168,9 @@ def _replace_consonants(word, res): def romanize(word): + if not word: + return "" + word2 = _replace_vowels(_normalize(word)) res = re.findall(_RE_CONSONANT, word2) # 2-character word, all consonants diff --git a/tests/__init__.py b/tests/__init__.py index c1c830254..cba0f573a 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -107,7 +107,7 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ)) self.assertIsNotNone(wordnet.lemmas("นก")) - self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADJ)) + self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) self.assertEqual(wordnet.morphy("dogs"), "dog") @@ -316,26 +316,26 @@ def test_pos_tag(self): def test_dict_word_tokenize(self): self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) self.assertIsNotNone( - dict_word_tokenize("รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE) + dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", 
custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="mm" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="XX" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX" ) ) @@ -422,6 +422,17 @@ def test_romanize(self): self.assertEqual(romanize("ดู", engine="royin"), "du") self.assertEqual(romanize("ดำ", engine="royin"), "dam") self.assertEqual(romanize("บัว", engine="royin"), "bua") + self.assertEqual(romanize("กร", engine="royin"), "kon") + self.assertEqual(romanize("กรร", engine="royin"), "kan") + self.assertEqual(romanize("กรรม", engine="royin"), "kam") + self.assertEqual(romanize(""), "") + self.assertEqual(romanize(None), "") + self.assertIsNotNone(romanize("หาย", engine="royin")) + self.assertIsNotNone(romanize("หยาก", engine="royin")) + self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) + self.assertIsNotNone(romanize("กรม", engine="royin")) + self.assertIsNotNone(romanize("ธรรพ์", engine="royin")) + self.assertIsNotNone(romanize("กฏa์", engine="royin")) # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) def test_transliterate(self): From 65b16ca297fc52f5c021d9d9655c46b21ea10e38 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 21:10:44 +0700 Subject: [PATCH 06/19] handles None and empty cases --- pythainlp/tokenize/__init__.py | 18 ++++++++++++++++++ pythainlp/tokenize/etcc.py | 4 ++++ pythainlp/tokenize/tcc.py | 10 ++++++++-- pythainlp/transliterate/__init__.py | 10 ++++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e7ea1f984..450fd0131 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -34,6 +34,9 @@ def word_tokenize(text, engine="newmm", whitespaces=True): >>> word_tokenize(text, engine="icu") ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด'] """ + if not text: + return [] + if engine == "newmm" or engine == "onecut": from .newmm import mmcut as segment elif engine == "longest" or engine == "longest-matching": @@ -73,6 +76,10 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): >>> dict_word_tokenize("แมวดีดีแมว", trie) ['แมว', 'ดี', 'ดี', 'แมว'] """ + + if not text: + return [] + if engine == "newmm" or engine == "onecut": from .newmm import mmcut as segment elif engine == "longest" or engine == "longest-matching": @@ -94,6 +101,10 @@ def sent_tokenize(text, engine="whitespace+newline"): :return: a list of text, split by whitespace or new line. """ + + if not text: + return [] + sentences = [] if engine == "whitespace": @@ -110,6 +121,9 @@ def subword_tokenize(text, engine="tcc"): :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. :return: a list of tokenized strings. 
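+
+    An illustrative call (the cluster output follows the tcc behaviour asserted in the test suite)::
+
+        subword_tokenize("ประเทศไทย")  # clusters: ป/ระ/เท/ศ/ไท/ย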
""" + if not text: + return "" + from .tcc import tcc return tcc(text) @@ -121,6 +135,10 @@ def syllable_tokenize(text): :return: returns list of strings of syllables """ + + if not text: + return [] + tokens = [] if text: words = word_tokenize(text) diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index a90e0b835..5e73b4586 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -27,6 +27,10 @@ def etcc(text): รับ str ส่งออก str """ + + if not text: + return "" + if re.search(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text): search = re.findall(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text) for i in search: diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index bfb5920e9..54464cd4d 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -60,6 +60,9 @@ def tcc_gen(w): def tcc_pos(text): + if not text: + return set() + p_set = set() p = 0 for w in tcc_gen(text): @@ -68,5 +71,8 @@ def tcc_pos(text): return p_set -def tcc(w, sep="/"): - return sep.join(tcc_gen(w)) \ No newline at end of file +def tcc(text, sep="/"): + if not text: + return "" + + return sep.join(tcc_gen(text)) diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index 48bd5cfd2..7ede03197 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -10,11 +10,17 @@ def romanize(text, engine="royin"): :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ + + if not text: + return "" + if engine == "thai2rom": from .thai2rom import romanize + return romanize(text) else: # use default engine "royin" from .royin import romanize + words = word_tokenize(text) romanized_words = [romanize(word) for word in words] return "".join(romanized_words) @@ -26,6 +32,10 @@ def transliterate(text, engine="ipa"): :param str engine: 'ipa' (default) or 'pyicu'. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. 
""" + + if not text: + return "" + if engine == "pyicu": from .pyicu import transliterate else: From 03dbcc05a0eefc737104093b8102cc66e69ee5c7 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 21:46:48 +0700 Subject: [PATCH 07/19] - handles None and empty cases - more test cases for tokenize --- pythainlp/tokenize/__init__.py | 14 +++++++------- pythainlp/tokenize/longest.py | 8 ++++++-- pythainlp/tokenize/multi_cut.py | 21 +++++++++++++-------- pythainlp/tokenize/newmm.py | 6 +++++- pythainlp/tokenize/pyicu.py | 3 +++ pythainlp/tokenize/tcc.py | 4 ++++ tests/__init__.py | 27 +++++++++++++++++++++++++-- 7 files changed, 63 insertions(+), 20 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 450fd0131..f44ac7410 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -38,14 +38,14 @@ def word_tokenize(text, engine="newmm", whitespaces=True): return [] if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "ulmfit": - from .newmm import mmcut + from .newmm import segment def segment(text): - return mmcut(text, trie=FROZEN_DICT_TRIE) + return segment(text, trie=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment @@ -54,7 +54,7 @@ def segment(text): elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment if not whitespaces: return [token.strip(" ") for token in segment(text) if token.strip(" ")] @@ -81,13 +81,13 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): return [] if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment return segment(text, custom_dict) @@ -189,6 +189,6 @@ def __init__(self, custom_dict=None): self.__trie_dict = Trie(thai_words()) def word_tokenize(self, text, engine="newmm"): - from .newmm import mmcut as segment + from .newmm import segment return segment(text, self.__trie_dict) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 483685da2..33ff1fa0a 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -35,7 +35,7 @@ _UNKNOWN = False -class Tokenizer(object): +class LongestMatchTokenizer(object): def __init__(self, trie): self.__trie = trie @@ -95,6 +95,9 @@ def __longest_matching(self, text, begin_pos): return "" def __segment_text(self, text): + if not text: + return [] + begin_pos = 0 len_text = len(text) tokens = [] @@ -137,4 +140,5 @@ def segment(text, trie=None): """ตัดคำภาษาไทยด้วยวิธี longest matching""" if not trie: trie = DEFAULT_DICT_TRIE - return Tokenizer(trie).tokenize(text) + + return LongestMatchTokenizer(trie).tokenize(text) diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py index 80f621c27..d161bdf4e 100644 --- a/pythainlp/tokenize/multi_cut.py +++ b/pythainlp/tokenize/multi_cut.py @@ -40,7 +40,7 @@ def __init__(self, value, multi=None, in_dict=True): _PAT_ENG = re.compile(_RE_ENG) -def multicut(text, trie=None): +def _multicut(text, trie=None): """ 
ส่งคืน LatticeString คืนมาเป็นก้อนๆ """ @@ -95,18 +95,18 @@ def serialize(p, p2): # helper function def mmcut(text): res = [] - for w in multicut(text): + for w in _multicut(text): mm = min(w.multi, key=lambda x: x.count("/")) res.extend(mm.split("/")) return res -def combine(ww): +def _combine(ww): if ww == []: yield "" else: w = ww[0] - for tail in combine(ww[1:]): + for tail in _combine(ww[1:]): if w.unique: yield w + "|" + tail else: @@ -118,13 +118,18 @@ def segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return ww + if not text: + return [] + + return list(_multicut(text, trie=trie)) def find_all_segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return list(combine(ww)) + if not text: + return [] + + ww = list(_multicut(text, trie=trie)) + return list(_combine(ww)) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 08fda8628..17815fd9f 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -90,7 +90,11 @@ def onecut(text, trie): # ช่วยให้ไม่ต้องพิมพ์ยาวๆ -def mmcut(text, trie=None): +def segment(text, trie=None): + if not text: + return [] + if not trie: trie = DEFAULT_DICT_TRIE + return list(onecut(text, trie)) diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index aefcc9311..23b7b38e4 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -17,5 +17,8 @@ def _gen_words(text): def segment(text): + if not text: + return [] + text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text) return list(_gen_words(text)) diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index 54464cd4d..b50bdb24a 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -48,6 +48,9 @@ def tcc_gen(w): + if not w: + return '' + p = 0 while p < len(w): m = PAT_TCC.match(w[p:]) @@ -68,6 +71,7 @@ def tcc_pos(text): for w in tcc_gen(text): p += len(w) p_set.add(p) + return p_set diff --git a/tests/__init__.py b/tests/__init__.py index cba0f573a..5919f7c91 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -42,13 +42,16 @@ FROZEN_DICT_TRIE, dict_word_tokenize, etcc, + longest, multi_cut, + newmm, sent_tokenize, subword_tokenize, syllable_tokenize, tcc, word_tokenize, ) +from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( @@ -325,7 +328,9 @@ def test_dict_word_tokenize(self): ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", + custom_dict=FROZEN_DICT_TRIE, + engine="longest", ) ) self.assertIsNotNone( @@ -340,6 +345,7 @@ def test_dict_word_tokenize(self): ) def test_etcc(self): + self.assertEqual(etcc.etcc(""), "") self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( @@ -349,21 +355,26 @@ def test_etcc(self): ) def test_word_tokenize(self): + self.assertEqual(word_tokenize(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertEqual(word_tokenize(""), []) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="ulmfit")) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): + self.assertEqual(tokenize_pyicu.segment(None), "") + 
self.assertEqual(tokenize_pyicu.segment(""), "") self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) def test_word_tokenize_mm(self): + self.assertEqual(multi_cut.segment(None), []) + self.assertEqual(multi_cut.segment(""), []) + self.assertEqual(word_tokenize("", engine="mm"), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], @@ -372,6 +383,8 @@ def test_word_tokenize_mm(self): self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS")) def test_word_tokenize_newmm(self): + self.assertEqual(newmm.segment(None), []) + self.assertEqual(newmm.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], @@ -390,26 +403,36 @@ def test_word_tokenize_newmm(self): ) def test_word_tokenize_longest_matching(self): + self.assertEqual(longest.segment(None), []) + self.assertEqual(longest.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) def test_sent_tokenize(self): + self.assertEqual(sent_tokenize(None), []) + self.assertEqual(sent_tokenize(""), []) self.assertEqual( sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), ["รักน้ำ", "รักปลา"] ) self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) def test_subword_tokenize(self): + self.assertEqual(subword_tokenize(None), "") + self.assertEqual(subword_tokenize(""), "") self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) def test_syllable_tokenize(self): + self.assertEqual(syllable_tokenize(None), []) + self.assertEqual(syllable_tokenize(""), []) self.assertEqual( syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) def test_tcc(self): + self.assertEqual(tcc.tcc(None), "") + self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") # ### pythainlp.transliterate From 7190aba154c5449d681c45f33e42a90dd595eb90 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 22:04:52 +0700 Subject: [PATCH 08/19] fix test cases --- pythainlp/tokenize/__init__.py | 4 ++-- pythainlp/tokenize/deepcut.py | 3 +++ tests/__init__.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index f44ac7410..3c97535c0 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -42,10 +42,10 @@ def word_tokenize(text, engine="newmm", whitespaces=True): elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "ulmfit": - from .newmm import segment + from .newmm import segment as segment_ def segment(text): - return segment(text, trie=FROZEN_DICT_TRIE) + return segment_(text, trie=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 395e76583..510a1b848 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -7,4 +7,7 @@ def segment(text): + if not text: + return [] + return deepcut.tokenize(text) diff --git a/tests/__init__.py b/tests/__init__.py index 5919f7c91..921e2a719 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -364,8 +364,8 @@ def test_word_tokenize(self): 
self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): - self.assertEqual(tokenize_pyicu.segment(None), "") - self.assertEqual(tokenize_pyicu.segment(""), "") + self.assertEqual(tokenize_pyicu.segment(None), []) + self.assertEqual(tokenize_pyicu.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], From 69cb3a708a1c39ff56620615c8bf9631b1d87651 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 22:52:40 +0700 Subject: [PATCH 09/19] handles None and empty cases in pos taggers --- pythainlp/tag/__init__.py | 21 +++++++++++++++------ pythainlp/tag/perceptron.py | 27 ++++++++++++++++----------- pythainlp/tag/unigram.py | 17 ++++++++++------- tests/__init__.py | 23 ++++++++++++++++++++--- 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index d60ee950f..7b694375a 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -20,21 +20,30 @@ def pos_tag(words, engine="unigram", corpus="orchid"): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ + if not words: + return [] + if engine == "perceptron": - from .perceptron import tag as _tag + from .perceptron import tag as tag_ elif engine == "artagger": - def _tag(text, corpus=None): + def tag_(words, corpus=None): + if not words: + return [] + from artagger import Tagger - words = Tagger().tag(" ".join(text)) + words_ = Tagger().tag(" ".join(words)) - return [(word.word, word.tag) for word in words] + return [(word.word, word.tag) for word in words_] else: # default, use "unigram" ("old") engine - from .unigram import tag as _tag + from .unigram import tag as tag_ - return _tag(words, corpus=corpus) + return tag_(words, corpus=corpus) def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): + if not sentences: + return [] + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences] diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 8d4fe1280..e5dc9e424 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -7,28 +7,33 @@ import dill from pythainlp.corpus import CORPUS_PATH +_ORCHID_DATA_FILENAME = "orchid_pt_tagger.dill" +_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.dill" -def orchid_data(): - data_filename = os.path.join(CORPUS_PATH, "orchid_pt_tagger.dill") + +def _load_tagger(filename): + data_filename = os.path.join(CORPUS_PATH, filename) with open(data_filename, "rb") as fh: model = dill.load(fh) return model -def pud_data(): - data_filename = os.path.join(CORPUS_PATH, "ud_thai_pud_pt_tagger.dill") - with open(data_filename, "rb") as fh: - model = dill.load(fh) - return model +_ORCHID_TAGGER = _load_tagger(_ORCHID_DATA_FILENAME) +_PUD_TAGGER = _load_tagger(_PUD_DATA_FILENAME) -def tag(text, corpus="pud"): +def tag(words, corpus="pud"): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] 
""" + if not words: + return [] + + words = [word.strip() for word in words if word.strip()] + if corpus == "orchid": - tagger = orchid_data() + tagger = _ORCHID_TAGGER else: # default, use "pud" as a corpus - tagger = pud_data() + tagger = _PUD_TAGGER - return tagger.tag(text) + return tagger.tag(words) diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index 21324bf64..e90c992f0 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -15,26 +15,29 @@ _THAI_POS_PUD_PATH = os.path.join(CORPUS_PATH, _THAI_POS_PUD_FILENAME) -def orchid_data(): +def _orchid_tagger(): with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f: model = json.load(f) return model -def pud_data(): +def _pud_tagger(): with open(_THAI_POS_PUD_PATH, "rb") as handle: model = dill.load(handle) return model -def tag(text, corpus): +def tag(words, corpus): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] """ + if not words: + return [] + if corpus == "orchid": - tagger = nltk.tag.UnigramTagger(model=orchid_data()) - return tagger.tag(text) + tagger = nltk.tag.UnigramTagger(model=_orchid_tagger()) + return tagger.tag(words) # default, use "pud" as a corpus - tagger = pud_data() - return tagger.tag(text) + tagger = _pud_tagger() + return tagger.tag(words) diff --git a/tests/__init__.py b/tests/__init__.py index 921e2a719..972c1cf46 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -37,7 +37,7 @@ from pythainlp.spell import correct, spell from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob from pythainlp.summarize import summarize -from pythainlp.tag import pos_tag, pos_tag_sents +from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, dict_word_tokenize, @@ -293,8 +293,19 @@ def test_summarize(self): def test_pos_tag(self): tokens = ["ผม", "รัก", "คุณ"] + + self.assertEqual(pos_tag(None), []) + self.assertEqual(pos_tag([]), []) + self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) + self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud")) + + self.assertEqual(unigram.tag(None, corpus="pud"), []) + self.assertEqual(unigram.tag([], corpus="pud"), []) + self.assertEqual(unigram.tag(None, corpus="orchid"), []) + self.assertEqual(unigram.tag([], corpus="orchid"), []) + self.assertEqual( pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], @@ -302,10 +313,16 @@ def test_pos_tag(self): self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) + self.assertEqual(perceptron.tag(None, corpus="pud"), []) + self.assertEqual(perceptron.tag([], corpus="pud"), []) + self.assertEqual(perceptron.tag(None, corpus="orchid"), []) + self.assertEqual(perceptron.tag([], corpus="orchid"), []) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) + self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) + self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) + self.assertEqual(pos_tag_sents(None), []) + self.assertEqual(pos_tag_sents([]), []) self.assertEqual( pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), [ From 5292715cf7728cf3189d0d67a80058558800ccfc Mon Sep 17 00:00:00 2001 From: 
Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:05:24 +0700 Subject: [PATCH 10/19] remove artagger tests for now --- tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 972c1cf46..a814fe761 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -318,8 +318,8 @@ def test_pos_tag(self): self.assertEqual(perceptron.tag(None, corpus="orchid"), []) self.assertEqual(perceptron.tag([], corpus="orchid"), []) - self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) - self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) self.assertEqual(pos_tag_sents(None), []) self.assertEqual(pos_tag_sents([]), []) From e7381429e409fd63950ed8e8932e827e1c03a7f5 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:25:58 +0700 Subject: [PATCH 11/19] more test cases for tokenization --- tests/__init__.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index a814fe761..6d9794776 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -40,6 +40,7 @@ from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, + deepcut, dict_word_tokenize, etcc, longest, @@ -388,6 +389,19 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) + def test_word_tokenize_deepcut(self): + self.assertEqual(deepcut.segment(None), []) + self.assertEqual(deepcut.segment(""), []) + self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + + def test_word_tokenize_longest_matching(self): + self.assertEqual(longest.segment(None), []) + self.assertEqual(longest.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), + ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], + ) + def test_word_tokenize_mm(self): self.assertEqual(multi_cut.segment(None), []) self.assertEqual(multi_cut.segment(""), []) @@ -419,14 +433,6 @@ def test_word_tokenize_newmm(self): ["จุ๋ม", "ง่วง"], ) - def test_word_tokenize_longest_matching(self): - self.assertEqual(longest.segment(None), []) - self.assertEqual(longest.segment(""), []) - self.assertEqual( - word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), - ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], - ) - def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) @@ -452,11 +458,15 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") + self.assertEqual(tcc.tcc_gen(), "") + self.assertEqual(tcc.tcc_pos(""), set()) + # ### pythainlp.transliterate def test_romanize(self): + self.assertEqual(romanize(None), "") + self.assertEqual(romanize(""), "") self.assertEqual(romanize("แมว"), "maeo") - self.assertIsNotNone(romanize("กก", engine="royin")) self.assertEqual(romanize("แมว", engine="royin"), "maeo") self.assertEqual(romanize("เดือน", engine="royin"), "duean") self.assertEqual(romanize("ดู", engine="royin"), "du") @@ -465,8 +475,7 @@ def test_romanize(self): self.assertEqual(romanize("กร", engine="royin"), "kon") self.assertEqual(romanize("กรร", engine="royin"), "kan") self.assertEqual(romanize("กรรม", engine="royin"), "kam") 
- self.assertEqual(romanize(""), "") - self.assertEqual(romanize(None), "") + self.assertIsNotNone(romanize("กก", engine="royin")) self.assertIsNotNone(romanize("หาย", engine="royin")) self.assertIsNotNone(romanize("หยาก", engine="royin")) self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) @@ -476,6 +485,7 @@ def test_romanize(self): # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) def test_transliterate(self): + self.assertEqual(transliterate(""), "") self.assertEqual(transliterate("แมว", "pyicu"), "mæw") self.assertEqual(transliterate("คน", engine="ipa"), "kʰon") self.assertIsNotNone(trans_list("คน")) From 985fcf96e6fab5ce2a851f2b764815231c76e8e9 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:42:16 +0700 Subject: [PATCH 12/19] - adjust extras_require - remove deepcut tests for now --- .travis.yml | 2 +- appveyor.yml | 2 +- setup.py | 10 +++++++--- tests/__init__.py | 9 ++++----- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8f4edb93f..3ca3d5b8b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ner,pos,tokenize,transliterate] + - pip install .[icu,ipa,ner] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 00b4e1ae2..560766dc2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ner,pos,tokenize,transliterate]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/setup.py b/setup.py index 3fa7c5c18..583a5d98a 100644 --- a/setup.py +++ b/setup.py @@ -9,21 +9,25 @@ requirements = f.read().splitlines() extras = { + "artagger": ["artagger"], + "deepcut": ["deepcut", "keras", "tensorflow"], "icu": ["pyicu"], + "ipa": ["epitran"], "ml": ["fastai==0.7.0", "keras", "numpy", "torch"], "ner": ["sklearn_crfsuite"], - "pos": ["artagger"], - "tokenize": ["deepcut", "pyicu"], - "transliterate": ["epitran", "pyicu"], + "thai2rom": ["keras", "numpy"], + "thai2vec": ["gensim", "numpy"], "full": [ "artagger", "deepcut", "epitran", "fastai==0.7.0", + "gensim", "keras", "numpy", "pyicu", "sklearn_crfsuite", + "tensorflow", "torch", ], } diff --git a/tests/__init__.py b/tests/__init__.py index 6d9794776..39c515d94 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -40,7 +40,6 @@ from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, - deepcut, dict_word_tokenize, etcc, longest, @@ -389,10 +388,10 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - def test_word_tokenize_deepcut(self): - self.assertEqual(deepcut.segment(None), []) - self.assertEqual(deepcut.segment(""), []) - self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + # def test_word_tokenize_deepcut(self): + # self.assertEqual(deepcut.segment(None), []) + # self.assertEqual(deepcut.segment(""), []) + # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) From 9496571b3de186772d12b87857ed8748033d2a9a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul 
Date: Sat, 10 Nov 2018 00:01:19 +0700 Subject: [PATCH 13/19] fix tcc_gen() test --- tests/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 39c515d94..fcd9a4a84 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -257,13 +257,14 @@ def test_soundex(self): # ### pythainlp.spell def test_spell(self): - self.assertIsNotNone(spell("เน้ร")) - self.assertEqual(spell(""), "") self.assertEqual(spell(None), "") + self.assertEqual(spell(""), "") + self.assertIsNotNone(spell("เน้ร")) + self.assertIsNotNone(spell("เกสมร์")) - self.assertIsNotNone(correct("ทดสอง")) - self.assertEqual(correct(""), "") self.assertEqual(correct(None), "") + self.assertEqual(correct(""), "") + self.assertIsNotNone(correct("ทดสอง")) self.assertIsNotNone(dictionary()) self.assertGreaterEqual(prob("มี"), 0) @@ -457,7 +458,7 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - self.assertEqual(tcc.tcc_gen(), "") + self.assertEqual(tcc.tcc_gen(""), "") self.assertEqual(tcc.tcc_pos(""), set()) # ### pythainlp.transliterate From a145a2bc20228adbe792e3d9cd74cdf170863882 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 00:09:51 +0700 Subject: [PATCH 14/19] thai2vec: load model only once --- pythainlp/word_vector/thai2vec.py | 21 +++++++++++++-------- tests/__init__.py | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index e2b4b1329..22b682fae 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -10,7 +10,7 @@ from pythainlp.tokenize import word_tokenize -def download(): +def _download(): path = get_file("thai2vec02") if not path: download_data("thai2vec02") @@ -20,8 +20,13 @@ def download(): def get_model(): """ - :return: Downloads the `gensim` model.""" - return KeyedVectors.load_word2vec_format(download(), binary=False) + Download model + :return: `gensim` model + """ + return KeyedVectors.load_word2vec_format(_download(), binary=False) + + +_MODEL = get_model() def most_similar_cosmul(positive, negative): @@ -29,11 +34,11 @@ def most_similar_cosmul(positive, negative): การใช้งาน input list """ - return get_model().most_similar_cosmul(positive=positive, negative=negative) + return _MODEL.most_similar_cosmul(positive=positive, negative=negative) def doesnt_match(listdata): - return get_model().doesnt_match(listdata) + return _MODEL.doesnt_match(listdata) def similarity(word1, word2): @@ -42,15 +47,15 @@ def similarity(word1, word2): :param str word2: second word :return: the cosine similarity between the two word vectors """ - return get_model().similarity(word1, word2) + return _MODEL.similarity(word1, word2) def sentence_vectorizer(text, dim=300, use_mean=False): words = word_tokenize(text) vec = np.zeros((1, dim)) for word in words: - if word in get_model().wv.index2word: - vec += get_model().wv.word_vec(word) + if word in _MODEL.wv.index2word: + vec += _MODEL.wv.word_vec(word) else: pass if use_mean: diff --git a/tests/__init__.py b/tests/__init__.py index fcd9a4a84..88ef4bc11 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -458,7 +458,7 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - self.assertEqual(tcc.tcc_gen(""), "") + self.assertEqual(list(tcc.tcc_gen("")), []) self.assertEqual(tcc.tcc_pos(""), set()) # ### pythainlp.transliterate From 
cd37a040aa76ddc82e256be0af5d2688c1966c52 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 11:35:18 +0700 Subject: [PATCH 15/19] thai2vec test cases + more wordnet test cases --- .travis.yml | 2 +- appveyor.yml | 2 +- pythainlp/sentiment/ulmfit_sent.py | 14 +++++++++----- pythainlp/word_vector/thai2vec.py | 2 ++ tests/__init__.py | 28 +++++++++++++++++++++++++--- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3ca3d5b8b..db0c8a6ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ipa,ner] + - pip install .[icu,ipa,ner,thai2vec] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 560766dc2..808598eae 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner,thai2vec]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 19ca3368f..19532f453 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -15,6 +15,8 @@ # from fastai.text import multiBatchRNN +__all__ = ["about", "get_sentiment"] + MODEL_NAME = "sent_model" ITOS_NAME = "itos_sent" @@ -29,24 +31,26 @@ def get_path(fname): # load model -model = torch.load(get_path(MODEL_NAME)) -model.eval() +MODEL = torch.load(get_path(MODEL_NAME)) +MODEL.eval() # load itos and stoi itos = pickle.load(open(get_path(ITOS_NAME), "rb")) stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)}) + # get sentiment; 1 for positive and 0 for negative # or score if specified return_score=True -softmax = lambda x: np.exp(x) / np.sum(np.exp(x)) +def softmax(x): + return np.exp(x) / np.sum(np.exp(x)) def get_sentiment(text, return_score=False): words = word_tokenize(text) tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu() tensor = Variable(tensor, volatile=False) - model.reset() - pred, *_ = model(tensor) + MODEL.reset() + pred, *_ = MODEL(tensor) result = pred.data.cpu().numpy().reshape(-1) if return_score: diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index 22b682fae..0f371e31e 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -43,6 +43,8 @@ def doesnt_match(listdata): def similarity(word1, word2): """ + Get cosine similarity between two words. + If a word is not in the vocabulary, KeyError will be raised. 
:param str word1: first word :param str word2: second word :return: the cosine similarity between the two word vectors diff --git a/tests/__init__.py b/tests/__init__.py index 88ef4bc11..760e442bb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -63,6 +63,7 @@ normalize, thai_to_eng, ) +from pythainlp.word_vector import thai2vec class TestUM(unittest.TestCase): @@ -111,9 +112,18 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.lemmas("นก")) self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) + self.assertIsNotNone(wordnet.lemma('cat.n.01.cat')) self.assertEqual(wordnet.morphy("dogs"), "dog") + bird = wordnet.synset('bird.n.01') + mouse = wordnet.synset('mouse.n.01') + self.assertEqual(wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)) + self.assertEqual(wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)) + + cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() + self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) + # ### pythainlp.date def test_date(self): @@ -390,9 +400,9 @@ def test_word_tokenize_icu(self): ) # def test_word_tokenize_deepcut(self): - # self.assertEqual(deepcut.segment(None), []) - # self.assertEqual(deepcut.segment(""), []) - # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + # self.assertEqual(deepcut.segment(None), []) + # self.assertEqual(deepcut.segment(""), []) + # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) @@ -519,6 +529,18 @@ def test_keyboard(self): self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ") self.assertEqual(thai_to_eng("สวัสดีครับ"), "l;ylfu8iy[") + # ### pythainlp.word_vector + + def test_thai2vec(self): + self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) + self.assertIsNotNone(thai2vec.sentence_vectorizer("")) + self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) + self.assertEqual( + thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], + "ราชินี", + ) + self.assertEqual(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม") + if __name__ == "__main__": unittest.main() From afb106694b1c9c4780a230a48a35910b209c1d7e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 12:01:13 +0700 Subject: [PATCH 16/19] workaround to make boto work on Travis CI from https://github.com/travis-ci/travis-ci/issues/7940 --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index db0c8a6ba..f04002977 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,12 @@ language: python python: - "3.6" + +# workaround to make boto work on travis +# from https://github.com/travis-ci/travis-ci/issues/7940 +before_install: + - sudo rm -f /etc/boto.cfg + # command to install dependencies, e.g. 
pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt From 823f702036d6f8ba3776e124a3f5fa1218eeb763 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 13:38:05 +0700 Subject: [PATCH 17/19] =?UTF-8?q?fix=20royin=20romanize(),=20bring=20back?= =?UTF-8?q?=20portion=20of=20old=20code=20from=205e44053=20(for=20the=20ca?= =?UTF-8?q?se=20silent=20=E0=B8=AB)=20https://github.com/PyThaiNLP/pythain?= =?UTF-8?q?lp/blob/5e44053ca95522934a7042505bde589228d74647/pythainlp/roma?= =?UTF-8?q?nization/royin.py#L124?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pythainlp/number/wordtonum.py | 6 +++--- pythainlp/transliterate/royin.py | 5 +++-- tests/__init__.py | 11 +++++++++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pythainlp/number/wordtonum.py b/pythainlp/number/wordtonum.py index 7184cf61a..871d4c784 100644 --- a/pythainlp/number/wordtonum.py +++ b/pythainlp/number/wordtonum.py @@ -40,11 +40,11 @@ def _thaiword_to_num(tokens): - len_tokens = len(tokens) - - if len_tokens == 0: + if not tokens: return None + len_tokens = len(tokens) + if len_tokens == 1: return _THAI_INT_MAP[tokens[0]] diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 415e0fce3..e868f10d0 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -145,8 +145,9 @@ def _replace_consonants(word, res): lenword = len(res) while i < lenword: if i == 0 and res[0] == "ห": - word = word.replace(res[0], _CONSONANTS[res[0]][0]) - i += 1 + word = word.replace(res[0], "") + del res[0] + lenword -= 1 elif i == 0 and res[0] != "ห": word = word.replace(res[0], _CONSONANTS[res[0]][0]) i += 1 diff --git a/tests/__init__.py b/tests/__init__.py index 760e442bb..8b59e8cb1 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -54,6 +54,7 @@ from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list +from pythainlp.transliterate.royin import romanize as romanize_royin from pythainlp.util import ( deletetone, eng_to_thai, @@ -203,6 +204,7 @@ def test_number(self): ) self.assertEqual(thaiword_to_num("ยี่สิบ"), 20) self.assertEqual(thaiword_to_num("ศูนย์"), 0) + self.assertEqual(thaiword_to_num("ศูนย์อะไรนะ"), 0) self.assertEqual(thaiword_to_num(""), None) self.assertEqual(thaiword_to_num(None), None) @@ -477,6 +479,12 @@ def test_romanize(self): self.assertEqual(romanize(None), "") self.assertEqual(romanize(""), "") self.assertEqual(romanize("แมว"), "maeo") + + self.assertEqual(romanize_royin(None), "") + self.assertEqual(romanize_royin(""), "") + self.assertEqual(romanize_royin("หาย"), "hai") + self.assertEqual(romanize_royin("หยาก"), "yak") + self.assertEqual(romanize("แมว", engine="royin"), "maeo") self.assertEqual(romanize("เดือน", engine="royin"), "duean") self.assertEqual(romanize("ดู", engine="royin"), "du") @@ -486,9 +494,8 @@ def test_romanize(self): self.assertEqual(romanize("กรร", engine="royin"), "kan") self.assertEqual(romanize("กรรม", engine="royin"), "kam") self.assertIsNotNone(romanize("กก", engine="royin")) - self.assertIsNotNone(romanize("หาย", engine="royin")) - self.assertIsNotNone(romanize("หยาก", engine="royin")) self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) + self.assertIsNotNone(romanize("ทีปกร", engine="royin")) self.assertIsNotNone(romanize("กรม", engine="royin")) self.assertIsNotNone(romanize("ธรรพ์", 
engine="royin")) self.assertIsNotNone(romanize("กฏa์", engine="royin")) From ec25189cbb16e604e8cfeae511cec8645ecf953e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 17:02:54 +0700 Subject: [PATCH 18/19] Add doc on extras_require --- CONTRIBUTING.md | 6 +++--- README.md | 29 ++++++++++++++++++++++++----- tests/__init__.py | 2 ++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ba12656d..dd52500c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,8 +23,8 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod - Write tests for your new features (please see "Tests" topic below); - Always remember that [commented code is dead code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); -- Name identifiers (variables, classes, functions, module names) with readable - names (`x` is always wrong); +- Name identifiers (variables, classes, functions, module names) with meaningful + and pronounceable names (`x` is always wrong); - When manipulating strings, use [Python's new-style formatting](http://docs.python.org/library/string.html#format-string-syntax) (`'{} = {}'.format(a, b)` instead of `'%s = %s' % (a, b)`); @@ -55,7 +55,7 @@ Happy hacking! (; ## newmm (onecut), mm, TCC, and Thai Soundex Code - Korakot Chaovavanich -## Thai2Vec & ulmfit +## Thai2Vec & ULMFiT - Charin Polpanumas ## Docs diff --git a/README.md b/README.md index ef71bf205..ddfb287cc 100644 --- a/README.md +++ b/README.md @@ -34,21 +34,40 @@ Python 2 users can still use PyThaiNLP 1.6. ## Installation -**Using pip** +PyThaiNLP uses PyPI as its main distribution channel, see https://pypi.org/project/pythainlp/ -Stable release +### Stable release + +Standard installation: ```sh $ pip install pythainlp ``` -Development release +For some advanced functionalities, like word vector, extra packages may be needed. Install them with these options during pip install: ```sh -$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +$ pip install pythainlp[extra1,extra2,...] ``` -Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. gensim and keras packages may also needed for other modules that rely on these machine learning libraries. +where ```extras``` can be + - ```artagger``` (to support artagger part-of-speech tagger) + - ```deepcut``` (to support deepcut machine-learnt tokenizer) + - ```icu``` (for ICU support in transliteration and tokenization) + - ```ipa``` (for International Phonetic Alphabet support in transliteration) + - ```ml``` (to support ULMFit models, like one for sentiment analyser) + - ```ner``` (for named-entity recognizer) + - ```thai2rom``` (for machine-learnt romanization) + - ```thai2vec``` (for Thai word vector) + - ```full``` (install everything) + +see ```extras``` and ```extras_require``` in [```setup.py```](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py) for details. 
+ +Development release: + +```sh +$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +``` ## Documentation diff --git a/tests/__init__.py b/tests/__init__.py index 8b59e8cb1..bef9532e4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -542,11 +542,13 @@ def test_thai2vec(self): self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) self.assertIsNotNone(thai2vec.sentence_vectorizer("")) self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) + self.assertIsNotNone(thai2vec.sentence_vectorizer("I think therefore I am ผ็ฎ์")) self.assertEqual( thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], "ราชินี", ) self.assertEqual(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม") + self.assertIsNotNone(thai2vec.about()) if __name__ == "__main__": From af83c4decf94b0e86e2022faed907b83e00ce61b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 17:26:14 +0700 Subject: [PATCH 19/19] update README --- README-pypi.md | 22 ++++++++-------------- README.md | 6 +++--- tests/__init__.py | 19 +++++++++++++------ 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/README-pypi.md b/README-pypi.md index 70a8a53c2..8141c642e 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -10,20 +10,14 @@ PyThaiNLP is a Python library for natural language processing (NLP) of Thai language. -PyThaiNLP features include Thai word and subword segmentations, soundex, romanization, part-of-speech taggers, and spelling corrections. - -## What's new in version 1.7 ? - -- Deprecate Python 2 support. (Python 2 compatibility code will be completely dropped in PyThaiNLP 1.8) -- Refactor pythainlp.tokenize.pyicu for readability -- Add Thai NER model to pythainlp.ner -- thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset -- Sentiment classifier based on ULMFit and various product review datasets -- Add ULMFit utility to PyThaiNLP -- Add Thai romanization model ThaiTransliterator -- Retrain POS-tagging model -- Improved word_tokenize (newmm, mm) and dict_word_tokenize -- Documentation added +PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers. + +## What's new in version 1.8 ? + +- New NorvigSpellChecker spell checker class, which can be initialized with custom dictionary. +- Terminate Python 2 support. Remove all Python 2 compatibility code. +- Remove old, obsolated, deprecated, and experimental code. +- see [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) ## Install diff --git a/README.md b/README.md index ddfb287cc..c3399a200 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ Thai Natural Language Processing in Python. PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language. -PyThaiNLP supports Python 3.4+. -Since version 1.7, PyThaiNLP deprecates its support for Python 2. The future PyThaiNLP 1.8 will completely drop all supports for Python 2. -Python 2 users can still use PyThaiNLP 1.6. +PyThaiNLP 1.8 supports Python 3.6+. Some functions may work with older version of Python 3, but it is not well-tested and will not be supported. See [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). + +Python 2 users can use PyThaiNLP 1.6, our latest released that tested with Python 2.7. **This is a document for development branch (post 1.7.x). Things will break. 
For a document for stable branch, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** diff --git a/tests/__init__.py b/tests/__init__.py index bef9532e4..12fc36236 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -113,14 +113,18 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.lemmas("นก")) self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) - self.assertIsNotNone(wordnet.lemma('cat.n.01.cat')) + self.assertIsNotNone(wordnet.lemma("cat.n.01.cat")) self.assertEqual(wordnet.morphy("dogs"), "dog") - bird = wordnet.synset('bird.n.01') - mouse = wordnet.synset('mouse.n.01') - self.assertEqual(wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)) - self.assertEqual(wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)) + bird = wordnet.synset("bird.n.01") + mouse = wordnet.synset("mouse.n.01") + self.assertEqual( + wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse) + ) + self.assertEqual( + wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse) + ) cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) @@ -542,7 +546,10 @@ def test_thai2vec(self): self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) self.assertIsNotNone(thai2vec.sentence_vectorizer("")) self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) - self.assertIsNotNone(thai2vec.sentence_vectorizer("I think therefore I am ผ็ฎ์")) + self.assertIsNotNone( + thai2vec.sentence_vectorizer("เสรีภาพในการสมาคม", use_mean=True) + ) + self.assertIsNotNone(thai2vec.sentence_vectorizer("I คิด therefore I am ผ็ฎ์")) self.assertEqual( thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], "ราชินี",