From 0780c38fd139dd29e16e739d6fc34736eac450c2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:07:36 +0700 Subject: [PATCH 01/19] More test cases --- pythainlp/tokenize/__init__.py | 2 +- tests/__init__.py | 69 ++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e81c3214d..e7ea1f984 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -99,7 +99,7 @@ def sent_tokenize(text, engine="whitespace+newline"): if engine == "whitespace": sentences = nltk.tokenize.WhitespaceTokenizer().tokenize(text) else: # default, use whitespace + newline - sentences = re.sub(r"\n+|\s+", "|", text).split("|") + sentences = re.sub(r"\n+|\s+", "|", text.strip()).split("|") return sentences diff --git a/tests/__init__.py b/tests/__init__.py index ec4a492d6..16fe1d176 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -36,7 +36,17 @@ from pythainlp.spell import correct, spell from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents -from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize +from pythainlp.tokenize import ( + FROZEN_DICT_TRIE, + dict_word_tokenize, + etcc, + multi_cut, + sent_tokenize, + subword_tokenize, + syllable_tokenize, + tcc, + word_tokenize, +) from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( @@ -285,9 +295,38 @@ def test_pos_tag(self): # ### pythainlp.tokenize - def test_syllable_tokenize(self): - self.assertEqual( - syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + def test_dict_word_tokenize(self): + self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) + self.assertIsNotNone( + dict_word_tokenize("รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="longest" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="mm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="XX" + ) + ) + + def test_etcc(self): + self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + self.assertIsNotNone( + etcc.etcc( + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์มีแขนขาหน้าหัวเราะ" + ) ) def test_word_tokenize(self): @@ -295,6 +334,9 @@ def test_word_tokenize(self): word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertEqual(word_tokenize(""), []) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="ulmfit")) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): self.assertEqual( @@ -308,6 +350,8 @@ def test_word_tokenize_mm(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานคร")) + def test_word_tokenize_newmm(self): self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"), @@ -332,12 +376,23 @@ def test_word_tokenize_longest_matching(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + def test_sent_tokenize(self): + self.assertEqual( + sent_tokenize("รักน้ำ รักปลา ", 
engine="whitespace"), ["รักน้ำ", "รักปลา"] + ) + self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) + + def test_subword_tokenize(self): + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) + + def test_syllable_tokenize(self): + self.assertEqual( + syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + ) + def test_tcc(self): self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - def test_etcc(self): - self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") - # ### pythainlp.transliterate def test_romanize(self): From 612114806cc13cc7d6a24a3a99816f980d4f10f2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:24:42 +0700 Subject: [PATCH 02/19] Add English test cases --- tests/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 16fe1d176..d894838dd 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -298,26 +298,26 @@ def test_pos_tag(self): def test_dict_word_tokenize(self): self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) self.assertIsNotNone( - dict_word_tokenize("รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE) + dict_word_tokenize("รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="newmm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="longest" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="mm" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="mm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพ", custom_dict=FROZEN_DICT_TRIE, engine="XX" + "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="XX" ) ) @@ -325,7 +325,8 @@ def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์มีแขนขาหน้าหัวเราะ" + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" ) ) @@ -350,7 +351,7 @@ def test_word_tokenize_mm(self): ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานคร")) + self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS")) def test_word_tokenize_newmm(self): self.assertEqual( From 55774991c3c97090e5cc938fcccbb0346a4c8adf Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 19:33:34 +0700 Subject: [PATCH 03/19] more test cases for spellchecker --- tests/__init__.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index d894838dd..ebca4e778 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -34,6 +34,7 @@ from pythainlp.sentiment import sentiment from pythainlp.soundex import lk82, metasound, soundex, udom83 from pythainlp.spell import correct, spell +from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob from pythainlp.summarize import summarize from pythainlp.tag import pos_tag, pos_tag_sents from pythainlp.tokenize import ( @@ -252,6 +253,14 @@ def test_spell(self): self.assertEqual(correct(""), "") self.assertEqual(correct(None), "") + self.assertIsNotNone(dictionary()) + 
self.assertGreaterEqual(prob("มี"), 0) + self.assertIsNotNone(known(["เกิด", "abc", ""])) + + checker = NorvigSpellChecker(dict_filter="") + self.assertIsNotNone(checker.dictionary()) + self.assertGreaterEqual(checker.prob("มี"), 0) + # ### pythainlp.summarize def test_summarize(self): @@ -325,8 +334,8 @@ def test_etcc(self): self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( - "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + - "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" ) ) From a7689ab1991b3fd967daab744e9d4d94d82db6fa Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 20:32:57 +0700 Subject: [PATCH 04/19] more wordnet test cases --- tests/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index ebca4e778..c1c830254 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest from collections import Counter +from nltk.corpus import wordnet as wn from pythainlp.collation import collate from pythainlp.corpus import ( @@ -97,10 +98,18 @@ def test_ttc(self): self.assertIsNotNone(ttc.word_freqs()) def test_wordnet(self): + self.assertIsNotNone(wordnet.langs()) + self.assertEqual( wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"] ) - self.assertIsNotNone(wordnet.langs()) + self.assertIsNotNone(wordnet.synsets("นก")) + self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ)) + + self.assertIsNotNone(wordnet.lemmas("นก")) + self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADJ)) + + self.assertEqual(wordnet.morphy("dogs"), "dog") # ### pythainlp.date From 7a1f4e40dd6f7f6dbf307f356f624affabb3e021 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 20:51:40 +0700 Subject: [PATCH 05/19] more romanize() (royin) test cases --- pythainlp/transliterate/royin.py | 3 +++ tests/__init__.py | 23 +++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 69a3671d9..415e0fce3 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -168,6 +168,9 @@ def _replace_consonants(word, res): def romanize(word): + if not word: + return "" + word2 = _replace_vowels(_normalize(word)) res = re.findall(_RE_CONSONANT, word2) # 2-character word, all consonants diff --git a/tests/__init__.py b/tests/__init__.py index c1c830254..cba0f573a 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -107,7 +107,7 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ)) self.assertIsNotNone(wordnet.lemmas("นก")) - self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADJ)) + self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) self.assertEqual(wordnet.morphy("dogs"), "dog") @@ -316,26 +316,26 @@ def test_pos_tag(self): def test_dict_word_tokenize(self): self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) self.assertIsNotNone( - dict_word_tokenize("รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE) + dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", 
custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="mm" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm" ) ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTS", custom_dict=FROZEN_DICT_TRIE, engine="XX" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX" ) ) @@ -422,6 +422,17 @@ def test_romanize(self): self.assertEqual(romanize("ดู", engine="royin"), "du") self.assertEqual(romanize("ดำ", engine="royin"), "dam") self.assertEqual(romanize("บัว", engine="royin"), "bua") + self.assertEqual(romanize("กร", engine="royin"), "kon") + self.assertEqual(romanize("กรร", engine="royin"), "kan") + self.assertEqual(romanize("กรรม", engine="royin"), "kam") + self.assertEqual(romanize(""), "") + self.assertEqual(romanize(None), "") + self.assertIsNotNone(romanize("หาย", engine="royin")) + self.assertIsNotNone(romanize("หยาก", engine="royin")) + self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) + self.assertIsNotNone(romanize("กรม", engine="royin")) + self.assertIsNotNone(romanize("ธรรพ์", engine="royin")) + self.assertIsNotNone(romanize("กฏa์", engine="royin")) # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) def test_transliterate(self): From 65b16ca297fc52f5c021d9d9655c46b21ea10e38 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 21:10:44 +0700 Subject: [PATCH 06/19] handles None and empty cases --- pythainlp/tokenize/__init__.py | 18 ++++++++++++++++++ pythainlp/tokenize/etcc.py | 4 ++++ pythainlp/tokenize/tcc.py | 10 ++++++++-- pythainlp/transliterate/__init__.py | 10 ++++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e7ea1f984..450fd0131 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -34,6 +34,9 @@ def word_tokenize(text, engine="newmm", whitespaces=True): >>> word_tokenize(text, engine="icu") ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด'] """ + if not text: + return [] + if engine == "newmm" or engine == "onecut": from .newmm import mmcut as segment elif engine == "longest" or engine == "longest-matching": @@ -73,6 +76,10 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): >>> dict_word_tokenize("แมวดีดีแมว", trie) ['แมว', 'ดี', 'ดี', 'แมว'] """ + + if not text: + return [] + if engine == "newmm" or engine == "onecut": from .newmm import mmcut as segment elif engine == "longest" or engine == "longest-matching": @@ -94,6 +101,10 @@ def sent_tokenize(text, engine="whitespace+newline"): :return: a list of text, split by whitespace or new line. """ + + if not text: + return [] + sentences = [] if engine == "whitespace": @@ -110,6 +121,9 @@ def subword_tokenize(text, engine="tcc"): :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. :return: a list of tokenized strings. 
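+
+    An illustrative call (the cluster output follows the tcc behaviour asserted in the test suite)::
+
+        subword_tokenize("ประเทศไทย")  # clusters: ป/ระ/เท/ศ/ไท/ย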
""" + if not text: + return "" + from .tcc import tcc return tcc(text) @@ -121,6 +135,10 @@ def syllable_tokenize(text): :return: returns list of strings of syllables """ + + if not text: + return [] + tokens = [] if text: words = word_tokenize(text) diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index a90e0b835..5e73b4586 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -27,6 +27,10 @@ def etcc(text): รับ str ส่งออก str """ + + if not text: + return "" + if re.search(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text): search = re.findall(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text) for i in search: diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index bfb5920e9..54464cd4d 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -60,6 +60,9 @@ def tcc_gen(w): def tcc_pos(text): + if not text: + return set() + p_set = set() p = 0 for w in tcc_gen(text): @@ -68,5 +71,8 @@ def tcc_pos(text): return p_set -def tcc(w, sep="/"): - return sep.join(tcc_gen(w)) \ No newline at end of file +def tcc(text, sep="/"): + if not text: + return "" + + return sep.join(tcc_gen(text)) diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index 48bd5cfd2..7ede03197 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -10,11 +10,17 @@ def romanize(text, engine="royin"): :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ + + if not text: + return "" + if engine == "thai2rom": from .thai2rom import romanize + return romanize(text) else: # use default engine "royin" from .royin import romanize + words = word_tokenize(text) romanized_words = [romanize(word) for word in words] return "".join(romanized_words) @@ -26,6 +32,10 @@ def transliterate(text, engine="ipa"): :param str engine: 'ipa' (default) or 'pyicu'. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. 
""" + + if not text: + return "" + if engine == "pyicu": from .pyicu import transliterate else: From 03dbcc05a0eefc737104093b8102cc66e69ee5c7 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 21:46:48 +0700 Subject: [PATCH 07/19] - handles None and empty cases - more test cases for tokenize --- pythainlp/tokenize/__init__.py | 14 +++++++------- pythainlp/tokenize/longest.py | 8 ++++++-- pythainlp/tokenize/multi_cut.py | 21 +++++++++++++-------- pythainlp/tokenize/newmm.py | 6 +++++- pythainlp/tokenize/pyicu.py | 3 +++ pythainlp/tokenize/tcc.py | 4 ++++ tests/__init__.py | 27 +++++++++++++++++++++++++-- 7 files changed, 63 insertions(+), 20 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 450fd0131..f44ac7410 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -38,14 +38,14 @@ def word_tokenize(text, engine="newmm", whitespaces=True): return [] if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "ulmfit": - from .newmm import mmcut + from .newmm import segment def segment(text): - return mmcut(text, trie=FROZEN_DICT_TRIE) + return segment(text, trie=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment @@ -54,7 +54,7 @@ def segment(text): elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment if not whitespaces: return [token.strip(" ") for token in segment(text) if token.strip(" ")] @@ -81,13 +81,13 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): return [] if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment return segment(text, custom_dict) @@ -189,6 +189,6 @@ def __init__(self, custom_dict=None): self.__trie_dict = Trie(thai_words()) def word_tokenize(self, text, engine="newmm"): - from .newmm import mmcut as segment + from .newmm import segment return segment(text, self.__trie_dict) diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 483685da2..33ff1fa0a 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -35,7 +35,7 @@ _UNKNOWN = False -class Tokenizer(object): +class LongestMatchTokenizer(object): def __init__(self, trie): self.__trie = trie @@ -95,6 +95,9 @@ def __longest_matching(self, text, begin_pos): return "" def __segment_text(self, text): + if not text: + return [] + begin_pos = 0 len_text = len(text) tokens = [] @@ -137,4 +140,5 @@ def segment(text, trie=None): """ตัดคำภาษาไทยด้วยวิธี longest matching""" if not trie: trie = DEFAULT_DICT_TRIE - return Tokenizer(trie).tokenize(text) + + return LongestMatchTokenizer(trie).tokenize(text) diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py index 80f621c27..d161bdf4e 100644 --- a/pythainlp/tokenize/multi_cut.py +++ b/pythainlp/tokenize/multi_cut.py @@ -40,7 +40,7 @@ def __init__(self, value, multi=None, in_dict=True): _PAT_ENG = re.compile(_RE_ENG) -def multicut(text, trie=None): +def _multicut(text, trie=None): """ 
ส่งคืน LatticeString คืนมาเป็นก้อนๆ """ @@ -95,18 +95,18 @@ def serialize(p, p2): # helper function def mmcut(text): res = [] - for w in multicut(text): + for w in _multicut(text): mm = min(w.multi, key=lambda x: x.count("/")) res.extend(mm.split("/")) return res -def combine(ww): +def _combine(ww): if ww == []: yield "" else: w = ww[0] - for tail in combine(ww[1:]): + for tail in _combine(ww[1:]): if w.unique: yield w + "|" + tail else: @@ -118,13 +118,18 @@ def segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return ww + if not text: + return [] + + return list(_multicut(text, trie=trie)) def find_all_segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return list(combine(ww)) + if not text: + return [] + + ww = list(_multicut(text, trie=trie)) + return list(_combine(ww)) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 08fda8628..17815fd9f 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -90,7 +90,11 @@ def onecut(text, trie): # ช่วยให้ไม่ต้องพิมพ์ยาวๆ -def mmcut(text, trie=None): +def segment(text, trie=None): + if not text: + return [] + if not trie: trie = DEFAULT_DICT_TRIE + return list(onecut(text, trie)) diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index aefcc9311..23b7b38e4 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -17,5 +17,8 @@ def _gen_words(text): def segment(text): + if not text: + return [] + text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text) return list(_gen_words(text)) diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index 54464cd4d..b50bdb24a 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -48,6 +48,9 @@ def tcc_gen(w): + if not w: + return '' + p = 0 while p < len(w): m = PAT_TCC.match(w[p:]) @@ -68,6 +71,7 @@ def tcc_pos(text): for w in tcc_gen(text): p += len(w) p_set.add(p) + return p_set diff --git a/tests/__init__.py b/tests/__init__.py index cba0f573a..5919f7c91 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -42,13 +42,16 @@ FROZEN_DICT_TRIE, dict_word_tokenize, etcc, + longest, multi_cut, + newmm, sent_tokenize, subword_tokenize, syllable_tokenize, tcc, word_tokenize, ) +from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list from pythainlp.util import ( @@ -325,7 +328,9 @@ def test_dict_word_tokenize(self): ) self.assertIsNotNone( dict_word_tokenize( - "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="longest" + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", + custom_dict=FROZEN_DICT_TRIE, + engine="longest", ) ) self.assertIsNotNone( @@ -340,6 +345,7 @@ def test_dict_word_tokenize(self): ) def test_etcc(self): + self.assertEqual(etcc.etcc(""), "") self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") self.assertIsNotNone( etcc.etcc( @@ -349,21 +355,26 @@ def test_etcc(self): ) def test_word_tokenize(self): + self.assertEqual(word_tokenize(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) - self.assertEqual(word_tokenize(""), []) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="ulmfit")) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): + self.assertEqual(tokenize_pyicu.segment(None), "") + 
self.assertEqual(tokenize_pyicu.segment(""), "") self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) def test_word_tokenize_mm(self): + self.assertEqual(multi_cut.segment(None), []) + self.assertEqual(multi_cut.segment(""), []) + self.assertEqual(word_tokenize("", engine="mm"), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], @@ -372,6 +383,8 @@ def test_word_tokenize_mm(self): self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS")) def test_word_tokenize_newmm(self): + self.assertEqual(newmm.segment(None), []) + self.assertEqual(newmm.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], @@ -390,26 +403,36 @@ def test_word_tokenize_newmm(self): ) def test_word_tokenize_longest_matching(self): + self.assertEqual(longest.segment(None), []) + self.assertEqual(longest.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) def test_sent_tokenize(self): + self.assertEqual(sent_tokenize(None), []) + self.assertEqual(sent_tokenize(""), []) self.assertEqual( sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), ["รักน้ำ", "รักปลา"] ) self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) def test_subword_tokenize(self): + self.assertEqual(subword_tokenize(None), "") + self.assertEqual(subword_tokenize(""), "") self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) def test_syllable_tokenize(self): + self.assertEqual(syllable_tokenize(None), []) + self.assertEqual(syllable_tokenize(""), []) self.assertEqual( syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) def test_tcc(self): + self.assertEqual(tcc.tcc(None), "") + self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") # ### pythainlp.transliterate From 7190aba154c5449d681c45f33e42a90dd595eb90 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 22:04:52 +0700 Subject: [PATCH 08/19] fix test cases --- pythainlp/tokenize/__init__.py | 4 ++-- pythainlp/tokenize/deepcut.py | 3 +++ tests/__init__.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index f44ac7410..3c97535c0 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -42,10 +42,10 @@ def word_tokenize(text, engine="newmm", whitespaces=True): elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "ulmfit": - from .newmm import segment + from .newmm import segment as segment_ def segment(text): - return segment(text, trie=FROZEN_DICT_TRIE) + return segment_(text, trie=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 395e76583..510a1b848 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -7,4 +7,7 @@ def segment(text): + if not text: + return [] + return deepcut.tokenize(text) diff --git a/tests/__init__.py b/tests/__init__.py index 5919f7c91..921e2a719 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -364,8 +364,8 @@ def test_word_tokenize(self): 
self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): - self.assertEqual(tokenize_pyicu.segment(None), "") - self.assertEqual(tokenize_pyicu.segment(""), "") + self.assertEqual(tokenize_pyicu.segment(None), []) + self.assertEqual(tokenize_pyicu.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], From 69cb3a708a1c39ff56620615c8bf9631b1d87651 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 22:52:40 +0700 Subject: [PATCH 09/19] handles None and empty cases in pos taggers --- pythainlp/tag/__init__.py | 21 +++++++++++++++------ pythainlp/tag/perceptron.py | 27 ++++++++++++++++----------- pythainlp/tag/unigram.py | 17 ++++++++++------- tests/__init__.py | 23 ++++++++++++++++++++--- 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index d60ee950f..7b694375a 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -20,21 +20,30 @@ def pos_tag(words, engine="unigram", corpus="orchid"): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ + if not words: + return [] + if engine == "perceptron": - from .perceptron import tag as _tag + from .perceptron import tag as tag_ elif engine == "artagger": - def _tag(text, corpus=None): + def tag_(words, corpus=None): + if not words: + return [] + from artagger import Tagger - words = Tagger().tag(" ".join(text)) + words_ = Tagger().tag(" ".join(words)) - return [(word.word, word.tag) for word in words] + return [(word.word, word.tag) for word in words_] else: # default, use "unigram" ("old") engine - from .unigram import tag as _tag + from .unigram import tag as tag_ - return _tag(words, corpus=corpus) + return tag_(words, corpus=corpus) def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): + if not sentences: + return [] + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences] diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 8d4fe1280..e5dc9e424 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -7,28 +7,33 @@ import dill from pythainlp.corpus import CORPUS_PATH +_ORCHID_DATA_FILENAME = "orchid_pt_tagger.dill" +_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.dill" -def orchid_data(): - data_filename = os.path.join(CORPUS_PATH, "orchid_pt_tagger.dill") + +def _load_tagger(filename): + data_filename = os.path.join(CORPUS_PATH, filename) with open(data_filename, "rb") as fh: model = dill.load(fh) return model -def pud_data(): - data_filename = os.path.join(CORPUS_PATH, "ud_thai_pud_pt_tagger.dill") - with open(data_filename, "rb") as fh: - model = dill.load(fh) - return model +_ORCHID_TAGGER = _load_tagger(_ORCHID_DATA_FILENAME) +_PUD_TAGGER = _load_tagger(_PUD_DATA_FILENAME) -def tag(text, corpus="pud"): +def tag(words, corpus="pud"): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] 
""" + if not words: + return [] + + words = [word.strip() for word in words if word.strip()] + if corpus == "orchid": - tagger = orchid_data() + tagger = _ORCHID_TAGGER else: # default, use "pud" as a corpus - tagger = pud_data() + tagger = _PUD_TAGGER - return tagger.tag(text) + return tagger.tag(words) diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index 21324bf64..e90c992f0 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -15,26 +15,29 @@ _THAI_POS_PUD_PATH = os.path.join(CORPUS_PATH, _THAI_POS_PUD_FILENAME) -def orchid_data(): +def _orchid_tagger(): with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f: model = json.load(f) return model -def pud_data(): +def _pud_tagger(): with open(_THAI_POS_PUD_PATH, "rb") as handle: model = dill.load(handle) return model -def tag(text, corpus): +def tag(words, corpus): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] """ + if not words: + return [] + if corpus == "orchid": - tagger = nltk.tag.UnigramTagger(model=orchid_data()) - return tagger.tag(text) + tagger = nltk.tag.UnigramTagger(model=_orchid_tagger()) + return tagger.tag(words) # default, use "pud" as a corpus - tagger = pud_data() - return tagger.tag(text) + tagger = _pud_tagger() + return tagger.tag(words) diff --git a/tests/__init__.py b/tests/__init__.py index 921e2a719..972c1cf46 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -37,7 +37,7 @@ from pythainlp.spell import correct, spell from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob from pythainlp.summarize import summarize -from pythainlp.tag import pos_tag, pos_tag_sents +from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, dict_word_tokenize, @@ -293,8 +293,19 @@ def test_summarize(self): def test_pos_tag(self): tokens = ["ผม", "รัก", "คุณ"] + + self.assertEqual(pos_tag(None), []) + self.assertEqual(pos_tag([]), []) + self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) + self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud")) + + self.assertEqual(unigram.tag(None, corpus="pud"), []) + self.assertEqual(unigram.tag([], corpus="pud"), []) + self.assertEqual(unigram.tag(None, corpus="orchid"), []) + self.assertEqual(unigram.tag([], corpus="orchid"), []) + self.assertEqual( pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], @@ -302,10 +313,16 @@ def test_pos_tag(self): self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) + self.assertEqual(perceptron.tag(None, corpus="pud"), []) + self.assertEqual(perceptron.tag([], corpus="pud"), []) + self.assertEqual(perceptron.tag(None, corpus="orchid"), []) + self.assertEqual(perceptron.tag([], corpus="orchid"), []) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) + self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) + self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) + self.assertEqual(pos_tag_sents(None), []) + self.assertEqual(pos_tag_sents([]), []) self.assertEqual( pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), [ From 5292715cf7728cf3189d0d67a80058558800ccfc Mon Sep 17 00:00:00 2001 From: 
Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:05:24 +0700 Subject: [PATCH 10/19] remove artagger tests for now --- tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 972c1cf46..a814fe761 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -318,8 +318,8 @@ def test_pos_tag(self): self.assertEqual(perceptron.tag(None, corpus="orchid"), []) self.assertEqual(perceptron.tag([], corpus="orchid"), []) - self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) - self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) self.assertEqual(pos_tag_sents(None), []) self.assertEqual(pos_tag_sents([]), []) From e7381429e409fd63950ed8e8932e827e1c03a7f5 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:25:58 +0700 Subject: [PATCH 11/19] more test cases for tokenization --- tests/__init__.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index a814fe761..6d9794776 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -40,6 +40,7 @@ from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, + deepcut, dict_word_tokenize, etcc, longest, @@ -388,6 +389,19 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) + def test_word_tokenize_deepcut(self): + self.assertEqual(deepcut.segment(None), []) + self.assertEqual(deepcut.segment(""), []) + self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + + def test_word_tokenize_longest_matching(self): + self.assertEqual(longest.segment(None), []) + self.assertEqual(longest.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), + ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], + ) + def test_word_tokenize_mm(self): self.assertEqual(multi_cut.segment(None), []) self.assertEqual(multi_cut.segment(""), []) @@ -419,14 +433,6 @@ def test_word_tokenize_newmm(self): ["จุ๋ม", "ง่วง"], ) - def test_word_tokenize_longest_matching(self): - self.assertEqual(longest.segment(None), []) - self.assertEqual(longest.segment(""), []) - self.assertEqual( - word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), - ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], - ) - def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), []) @@ -452,11 +458,15 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") + self.assertEqual(tcc.tcc_gen(), "") + self.assertEqual(tcc.tcc_pos(""), set()) + # ### pythainlp.transliterate def test_romanize(self): + self.assertEqual(romanize(None), "") + self.assertEqual(romanize(""), "") self.assertEqual(romanize("แมว"), "maeo") - self.assertIsNotNone(romanize("กก", engine="royin")) self.assertEqual(romanize("แมว", engine="royin"), "maeo") self.assertEqual(romanize("เดือน", engine="royin"), "duean") self.assertEqual(romanize("ดู", engine="royin"), "du") @@ -465,8 +475,7 @@ def test_romanize(self): self.assertEqual(romanize("กร", engine="royin"), "kon") self.assertEqual(romanize("กรร", engine="royin"), "kan") self.assertEqual(romanize("กรรม", engine="royin"), "kam") 
- self.assertEqual(romanize(""), "") - self.assertEqual(romanize(None), "") + self.assertIsNotNone(romanize("กก", engine="royin")) self.assertIsNotNone(romanize("หาย", engine="royin")) self.assertIsNotNone(romanize("หยาก", engine="royin")) self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) @@ -476,6 +485,7 @@ def test_romanize(self): # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) def test_transliterate(self): + self.assertEqual(transliterate(""), "") self.assertEqual(transliterate("แมว", "pyicu"), "mæw") self.assertEqual(transliterate("คน", engine="ipa"), "kʰon") self.assertIsNotNone(trans_list("คน")) From 985fcf96e6fab5ce2a851f2b764815231c76e8e9 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 9 Nov 2018 23:42:16 +0700 Subject: [PATCH 12/19] - adjust extras_require - remove deepcut tests for now --- .travis.yml | 2 +- appveyor.yml | 2 +- setup.py | 10 +++++++--- tests/__init__.py | 9 ++++----- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8f4edb93f..3ca3d5b8b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ner,pos,tokenize,transliterate] + - pip install .[icu,ipa,ner] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 00b4e1ae2..560766dc2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ner,pos,tokenize,transliterate]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/setup.py b/setup.py index 3fa7c5c18..583a5d98a 100644 --- a/setup.py +++ b/setup.py @@ -9,21 +9,25 @@ requirements = f.read().splitlines() extras = { + "artagger": ["artagger"], + "deepcut": ["deepcut", "keras", "tensorflow"], "icu": ["pyicu"], + "ipa": ["epitran"], "ml": ["fastai==0.7.0", "keras", "numpy", "torch"], "ner": ["sklearn_crfsuite"], - "pos": ["artagger"], - "tokenize": ["deepcut", "pyicu"], - "transliterate": ["epitran", "pyicu"], + "thai2rom": ["keras", "numpy"], + "thai2vec": ["gensim", "numpy"], "full": [ "artagger", "deepcut", "epitran", "fastai==0.7.0", + "gensim", "keras", "numpy", "pyicu", "sklearn_crfsuite", + "tensorflow", "torch", ], } diff --git a/tests/__init__.py b/tests/__init__.py index 6d9794776..39c515d94 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -40,7 +40,6 @@ from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram from pythainlp.tokenize import ( FROZEN_DICT_TRIE, - deepcut, dict_word_tokenize, etcc, longest, @@ -389,10 +388,10 @@ def test_word_tokenize_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) - def test_word_tokenize_deepcut(self): - self.assertEqual(deepcut.segment(None), []) - self.assertEqual(deepcut.segment(""), []) - self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + # def test_word_tokenize_deepcut(self): + # self.assertEqual(deepcut.segment(None), []) + # self.assertEqual(deepcut.segment(""), []) + # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) From 9496571b3de186772d12b87857ed8748033d2a9a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul 
Date: Sat, 10 Nov 2018 00:01:19 +0700 Subject: [PATCH 13/19] fix tcc_gen() test --- tests/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 39c515d94..fcd9a4a84 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -257,13 +257,14 @@ def test_soundex(self): # ### pythainlp.spell def test_spell(self): - self.assertIsNotNone(spell("เน้ร")) - self.assertEqual(spell(""), "") self.assertEqual(spell(None), "") + self.assertEqual(spell(""), "") + self.assertIsNotNone(spell("เน้ร")) + self.assertIsNotNone(spell("เกสมร์")) - self.assertIsNotNone(correct("ทดสอง")) - self.assertEqual(correct(""), "") self.assertEqual(correct(None), "") + self.assertEqual(correct(""), "") + self.assertIsNotNone(correct("ทดสอง")) self.assertIsNotNone(dictionary()) self.assertGreaterEqual(prob("มี"), 0) @@ -457,7 +458,7 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - self.assertEqual(tcc.tcc_gen(), "") + self.assertEqual(tcc.tcc_gen(""), "") self.assertEqual(tcc.tcc_pos(""), set()) # ### pythainlp.transliterate From a145a2bc20228adbe792e3d9cd74cdf170863882 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 00:09:51 +0700 Subject: [PATCH 14/19] thai2vec: load model only once --- pythainlp/word_vector/thai2vec.py | 21 +++++++++++++-------- tests/__init__.py | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index e2b4b1329..22b682fae 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -10,7 +10,7 @@ from pythainlp.tokenize import word_tokenize -def download(): +def _download(): path = get_file("thai2vec02") if not path: download_data("thai2vec02") @@ -20,8 +20,13 @@ def download(): def get_model(): """ - :return: Downloads the `gensim` model.""" - return KeyedVectors.load_word2vec_format(download(), binary=False) + Download model + :return: `gensim` model + """ + return KeyedVectors.load_word2vec_format(_download(), binary=False) + + +_MODEL = get_model() def most_similar_cosmul(positive, negative): @@ -29,11 +34,11 @@ def most_similar_cosmul(positive, negative): การใช้งาน input list """ - return get_model().most_similar_cosmul(positive=positive, negative=negative) + return _MODEL.most_similar_cosmul(positive=positive, negative=negative) def doesnt_match(listdata): - return get_model().doesnt_match(listdata) + return _MODEL.doesnt_match(listdata) def similarity(word1, word2): @@ -42,15 +47,15 @@ def similarity(word1, word2): :param str word2: second word :return: the cosine similarity between the two word vectors """ - return get_model().similarity(word1, word2) + return _MODEL.similarity(word1, word2) def sentence_vectorizer(text, dim=300, use_mean=False): words = word_tokenize(text) vec = np.zeros((1, dim)) for word in words: - if word in get_model().wv.index2word: - vec += get_model().wv.word_vec(word) + if word in _MODEL.wv.index2word: + vec += _MODEL.wv.word_vec(word) else: pass if use_mean: diff --git a/tests/__init__.py b/tests/__init__.py index fcd9a4a84..88ef4bc11 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -458,7 +458,7 @@ def test_tcc(self): self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - self.assertEqual(tcc.tcc_gen(""), "") + self.assertEqual(list(tcc.tcc_gen("")), []) self.assertEqual(tcc.tcc_pos(""), set()) # ### pythainlp.transliterate From 
cd37a040aa76ddc82e256be0af5d2688c1966c52 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 11:35:18 +0700 Subject: [PATCH 15/19] thai2vec test cases + more wordnet test cases --- .travis.yml | 2 +- appveyor.yml | 2 +- pythainlp/sentiment/ulmfit_sent.py | 14 +++++++++----- pythainlp/word_vector/thai2vec.py | 2 ++ tests/__init__.py | 28 +++++++++++++++++++++++++--- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3ca3d5b8b..db0c8a6ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ipa,ner] + - pip install .[icu,ipa,ner,thai2vec] - pip install coveralls os: diff --git a/appveyor.yml b/appveyor.yml index 560766dc2..808598eae 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner,thai2vec]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 19ca3368f..19532f453 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -15,6 +15,8 @@ # from fastai.text import multiBatchRNN +__all__ = ["about", "get_sentiment"] + MODEL_NAME = "sent_model" ITOS_NAME = "itos_sent" @@ -29,24 +31,26 @@ def get_path(fname): # load model -model = torch.load(get_path(MODEL_NAME)) -model.eval() +MODEL = torch.load(get_path(MODEL_NAME)) +MODEL.eval() # load itos and stoi itos = pickle.load(open(get_path(ITOS_NAME), "rb")) stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)}) + # get sentiment; 1 for positive and 0 for negative # or score if specified return_score=True -softmax = lambda x: np.exp(x) / np.sum(np.exp(x)) +def softmax(x): + return np.exp(x) / np.sum(np.exp(x)) def get_sentiment(text, return_score=False): words = word_tokenize(text) tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu() tensor = Variable(tensor, volatile=False) - model.reset() - pred, *_ = model(tensor) + MODEL.reset() + pred, *_ = MODEL(tensor) result = pred.data.cpu().numpy().reshape(-1) if return_score: diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index 22b682fae..0f371e31e 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -43,6 +43,8 @@ def doesnt_match(listdata): def similarity(word1, word2): """ + Get cosine similarity between two words. + If a word is not in the vocabulary, KeyError will be raised. 
:param str word1: first word :param str word2: second word :return: the cosine similarity between the two word vectors diff --git a/tests/__init__.py b/tests/__init__.py index 88ef4bc11..760e442bb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -63,6 +63,7 @@ normalize, thai_to_eng, ) +from pythainlp.word_vector import thai2vec class TestUM(unittest.TestCase): @@ -111,9 +112,18 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.lemmas("นก")) self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) + self.assertIsNotNone(wordnet.lemma('cat.n.01.cat')) self.assertEqual(wordnet.morphy("dogs"), "dog") + bird = wordnet.synset('bird.n.01') + mouse = wordnet.synset('mouse.n.01') + self.assertEqual(wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)) + self.assertEqual(wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)) + + cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() + self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) + # ### pythainlp.date def test_date(self): @@ -390,9 +400,9 @@ def test_word_tokenize_icu(self): ) # def test_word_tokenize_deepcut(self): - # self.assertEqual(deepcut.segment(None), []) - # self.assertEqual(deepcut.segment(""), []) - # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + # self.assertEqual(deepcut.segment(None), []) + # self.assertEqual(deepcut.segment(""), []) + # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) def test_word_tokenize_longest_matching(self): self.assertEqual(longest.segment(None), []) @@ -519,6 +529,18 @@ def test_keyboard(self): self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ") self.assertEqual(thai_to_eng("สวัสดีครับ"), "l;ylfu8iy[") + # ### pythainlp.word_vector + + def test_thai2vec(self): + self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) + self.assertIsNotNone(thai2vec.sentence_vectorizer("")) + self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) + self.assertEqual( + thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], + "ราชินี", + ) + self.assertEqual(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม") + if __name__ == "__main__": unittest.main() From afb106694b1c9c4780a230a48a35910b209c1d7e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 12:01:13 +0700 Subject: [PATCH 16/19] workaround to make boto work on Travis CI from https://github.com/travis-ci/travis-ci/issues/7940 --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.travis.yml b/.travis.yml index db0c8a6ba..f04002977 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,12 @@ language: python python: - "3.6" + +# workaround to make boto work on travis +# from https://github.com/travis-ci/travis-ci/issues/7940 +before_install: + - sudo rm -f /etc/boto.cfg + # command to install dependencies, e.g. 
pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt From 823f702036d6f8ba3776e124a3f5fa1218eeb763 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 13:38:05 +0700 Subject: [PATCH 17/19] =?UTF-8?q?fix=20royin=20romanize(),=20bring=20back?= =?UTF-8?q?=20portion=20of=20old=20code=20from=205e44053=20(for=20the=20ca?= =?UTF-8?q?se=20silent=20=E0=B8=AB)=20https://github.com/PyThaiNLP/pythain?= =?UTF-8?q?lp/blob/5e44053ca95522934a7042505bde589228d74647/pythainlp/roma?= =?UTF-8?q?nization/royin.py#L124?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pythainlp/number/wordtonum.py | 6 +++--- pythainlp/transliterate/royin.py | 5 +++-- tests/__init__.py | 11 +++++++++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pythainlp/number/wordtonum.py b/pythainlp/number/wordtonum.py index 7184cf61a..871d4c784 100644 --- a/pythainlp/number/wordtonum.py +++ b/pythainlp/number/wordtonum.py @@ -40,11 +40,11 @@ def _thaiword_to_num(tokens): - len_tokens = len(tokens) - - if len_tokens == 0: + if not tokens: return None + len_tokens = len(tokens) + if len_tokens == 1: return _THAI_INT_MAP[tokens[0]] diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 415e0fce3..e868f10d0 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -145,8 +145,9 @@ def _replace_consonants(word, res): lenword = len(res) while i < lenword: if i == 0 and res[0] == "ห": - word = word.replace(res[0], _CONSONANTS[res[0]][0]) - i += 1 + word = word.replace(res[0], "") + del res[0] + lenword -= 1 elif i == 0 and res[0] != "ห": word = word.replace(res[0], _CONSONANTS[res[0]][0]) i += 1 diff --git a/tests/__init__.py b/tests/__init__.py index 760e442bb..8b59e8cb1 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -54,6 +54,7 @@ from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list +from pythainlp.transliterate.royin import romanize as romanize_royin from pythainlp.util import ( deletetone, eng_to_thai, @@ -203,6 +204,7 @@ def test_number(self): ) self.assertEqual(thaiword_to_num("ยี่สิบ"), 20) self.assertEqual(thaiword_to_num("ศูนย์"), 0) + self.assertEqual(thaiword_to_num("ศูนย์อะไรนะ"), 0) self.assertEqual(thaiword_to_num(""), None) self.assertEqual(thaiword_to_num(None), None) @@ -477,6 +479,12 @@ def test_romanize(self): self.assertEqual(romanize(None), "") self.assertEqual(romanize(""), "") self.assertEqual(romanize("แมว"), "maeo") + + self.assertEqual(romanize_royin(None), "") + self.assertEqual(romanize_royin(""), "") + self.assertEqual(romanize_royin("หาย"), "hai") + self.assertEqual(romanize_royin("หยาก"), "yak") + self.assertEqual(romanize("แมว", engine="royin"), "maeo") self.assertEqual(romanize("เดือน", engine="royin"), "duean") self.assertEqual(romanize("ดู", engine="royin"), "du") @@ -486,9 +494,8 @@ def test_romanize(self): self.assertEqual(romanize("กรร", engine="royin"), "kan") self.assertEqual(romanize("กรรม", engine="royin"), "kam") self.assertIsNotNone(romanize("กก", engine="royin")) - self.assertIsNotNone(romanize("หาย", engine="royin")) - self.assertIsNotNone(romanize("หยาก", engine="royin")) self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) + self.assertIsNotNone(romanize("ทีปกร", engine="royin")) self.assertIsNotNone(romanize("กรม", engine="royin")) self.assertIsNotNone(romanize("ธรรพ์", 
engine="royin")) self.assertIsNotNone(romanize("กฏa์", engine="royin")) From ec25189cbb16e604e8cfeae511cec8645ecf953e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 17:02:54 +0700 Subject: [PATCH 18/19] Add doc on extras_require --- CONTRIBUTING.md | 6 +++--- README.md | 29 ++++++++++++++++++++++++----- tests/__init__.py | 2 ++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ba12656d..dd52500c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,8 +23,8 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod - Write tests for your new features (please see "Tests" topic below); - Always remember that [commented code is dead code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); -- Name identifiers (variables, classes, functions, module names) with readable - names (`x` is always wrong); +- Name identifiers (variables, classes, functions, module names) with meaningful + and pronounceable names (`x` is always wrong); - When manipulating strings, use [Python's new-style formatting](http://docs.python.org/library/string.html#format-string-syntax) (`'{} = {}'.format(a, b)` instead of `'%s = %s' % (a, b)`); @@ -55,7 +55,7 @@ Happy hacking! (; ## newmm (onecut), mm, TCC, and Thai Soundex Code - Korakot Chaovavanich -## Thai2Vec & ulmfit +## Thai2Vec & ULMFiT - Charin Polpanumas ## Docs diff --git a/README.md b/README.md index ef71bf205..ddfb287cc 100644 --- a/README.md +++ b/README.md @@ -34,21 +34,40 @@ Python 2 users can still use PyThaiNLP 1.6. ## Installation -**Using pip** +PyThaiNLP uses PyPI as its main distribution channel, see https://pypi.org/project/pythainlp/ -Stable release +### Stable release + +Standard installation: ```sh $ pip install pythainlp ``` -Development release +For some advanced functionalities, like word vector, extra packages may be needed. Install them with these options during pip install: ```sh -$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +$ pip install pythainlp[extra1,extra2,...] ``` -Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. gensim and keras packages may also needed for other modules that rely on these machine learning libraries. +where ```extras``` can be + - ```artagger``` (to support artagger part-of-speech tagger) + - ```deepcut``` (to support deepcut machine-learnt tokenizer) + - ```icu``` (for ICU support in transliteration and tokenization) + - ```ipa``` (for International Phonetic Alphabet support in transliteration) + - ```ml``` (to support ULMFit models, like one for sentiment analyser) + - ```ner``` (for named-entity recognizer) + - ```thai2rom``` (for machine-learnt romanization) + - ```thai2vec``` (for Thai word vector) + - ```full``` (install everything) + +see ```extras``` and ```extras_require``` in [```setup.py```](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py) for details. 
+ +Development release: + +```sh +$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +``` ## Documentation diff --git a/tests/__init__.py b/tests/__init__.py index 8b59e8cb1..bef9532e4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -542,11 +542,13 @@ def test_thai2vec(self): self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) self.assertIsNotNone(thai2vec.sentence_vectorizer("")) self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) + self.assertIsNotNone(thai2vec.sentence_vectorizer("I think therefore I am ผ็ฎ์")) self.assertEqual( thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], "ราชินี", ) self.assertEqual(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม") + self.assertIsNotNone(thai2vec.about()) if __name__ == "__main__": From af83c4decf94b0e86e2022faed907b83e00ce61b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 10 Nov 2018 17:26:14 +0700 Subject: [PATCH 19/19] update README --- README-pypi.md | 22 ++++++++-------------- README.md | 6 +++--- tests/__init__.py | 19 +++++++++++++------ 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/README-pypi.md b/README-pypi.md index 70a8a53c2..8141c642e 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -10,20 +10,14 @@ PyThaiNLP is a Python library for natural language processing (NLP) of Thai language. -PyThaiNLP features include Thai word and subword segmentations, soundex, romanization, part-of-speech taggers, and spelling corrections. - -## What's new in version 1.7 ? - -- Deprecate Python 2 support. (Python 2 compatibility code will be completely dropped in PyThaiNLP 1.8) -- Refactor pythainlp.tokenize.pyicu for readability -- Add Thai NER model to pythainlp.ner -- thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset -- Sentiment classifier based on ULMFit and various product review datasets -- Add ULMFit utility to PyThaiNLP -- Add Thai romanization model ThaiTransliterator -- Retrain POS-tagging model -- Improved word_tokenize (newmm, mm) and dict_word_tokenize -- Documentation added +PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers. + +## What's new in version 1.8 ? + +- New NorvigSpellChecker spell checker class, which can be initialized with custom dictionary. +- Terminate Python 2 support. Remove all Python 2 compatibility code. +- Remove old, obsolated, deprecated, and experimental code. +- see [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) ## Install diff --git a/README.md b/README.md index ddfb287cc..c3399a200 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ Thai Natural Language Processing in Python. PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language. -PyThaiNLP supports Python 3.4+. -Since version 1.7, PyThaiNLP deprecates its support for Python 2. The future PyThaiNLP 1.8 will completely drop all supports for Python 2. -Python 2 users can still use PyThaiNLP 1.6. +PyThaiNLP 1.8 supports Python 3.6+. Some functions may work with older version of Python 3, but it is not well-tested and will not be supported. See [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). + +Python 2 users can use PyThaiNLP 1.6, our latest released that tested with Python 2.7. **This is a document for development branch (post 1.7.x). Things will break. 
For a document for stable branch, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** diff --git a/tests/__init__.py b/tests/__init__.py index bef9532e4..12fc36236 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -113,14 +113,18 @@ def test_wordnet(self): self.assertIsNotNone(wordnet.lemmas("นก")) self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) - self.assertIsNotNone(wordnet.lemma('cat.n.01.cat')) + self.assertIsNotNone(wordnet.lemma("cat.n.01.cat")) self.assertEqual(wordnet.morphy("dogs"), "dog") - bird = wordnet.synset('bird.n.01') - mouse = wordnet.synset('mouse.n.01') - self.assertEqual(wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse)) - self.assertEqual(wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse)) + bird = wordnet.synset("bird.n.01") + mouse = wordnet.synset("mouse.n.01") + self.assertEqual( + wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse) + ) + self.assertEqual( + wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse) + ) cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) @@ -542,7 +546,10 @@ def test_thai2vec(self): self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) self.assertIsNotNone(thai2vec.sentence_vectorizer("")) self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) - self.assertIsNotNone(thai2vec.sentence_vectorizer("I think therefore I am ผ็ฎ์")) + self.assertIsNotNone( + thai2vec.sentence_vectorizer("เสรีภาพในการสมาคม", use_mean=True) + ) + self.assertIsNotNone(thai2vec.sentence_vectorizer("I คิด therefore I am ผ็ฎ์")) self.assertEqual( thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], "ราชินี",