From be7815cb0a6b461d959427e7c6bec554cd25123a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Nov 2024 13:47:16 +0000 Subject: [PATCH 1/5] Add testc_tools (misspell) --- tests/compact/__init__.py | 3 +- .../compact/{testc_tag.py => testc_parse.py} | 2 +- tests/compact/testc_tokenize.py | 15 ++-- .../testc_tools.py} | 2 +- tests/compact/testc_util.py | 2 +- tests/core/test_tag.py | 79 +++++++++++-------- tests/core/test_tokenize.py | 4 + 7 files changed, 67 insertions(+), 40 deletions(-) rename tests/compact/{testc_tag.py => testc_parse.py} (89%) rename tests/{extra/testx_misspell.py => compact/testc_tools.py} (97%) diff --git a/tests/compact/__init__.py b/tests/compact/__init__.py index 1199940b1..ba07ae576 100644 --- a/tests/compact/__init__.py +++ b/tests/compact/__init__.py @@ -11,8 +11,9 @@ # Names of module to be tested test_packages: list[str] = [ - "tests.compact.testc_tag", + "tests.compact.testc_parse", "tests.compact.testc_tokenize", + "tests.compact.testc_tools", "tests.compact.testc_util", ] diff --git a/tests/compact/testc_tag.py b/tests/compact/testc_parse.py similarity index 89% rename from tests/compact/testc_tag.py rename to tests/compact/testc_parse.py index 6ded75f8e..c48b766bb 100644 --- a/tests/compact/testc_tag.py +++ b/tests/compact/testc_parse.py @@ -7,7 +7,7 @@ from pythainlp.tag import chunk_parse, pos_tag -class TagTestCase(unittest.TestCase): +class ChunkParseTestCase(unittest.TestCase): def test_chunk_parse(self): tokens = ["ผม", "รัก", "คุณ"] diff --git a/tests/compact/testc_tokenize.py b/tests/compact/testc_tokenize.py index 525f6a70e..c04837550 100644 --- a/tests/compact/testc_tokenize.py +++ b/tests/compact/testc_tokenize.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 +# Tests for tokenize functions that need "compact" dependencies + import unittest from pythainlp.tokenize import ( @@ -23,8 +25,7 @@ ) -# Tests for functions that need "compact" dependencies -class TokenizeTestCaseCompact(unittest.TestCase): +class WordTokenizeICUTestCase(unittest.TestCase): def test_icu(self): self.assertEqual(pyicu.segment(None), []) self.assertEqual(pyicu.segment(""), []) @@ -33,6 +34,11 @@ def test_icu(self): ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) + def test_word_tokenize_icu(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu")) + + +class SentTokenizeCRFCutTestCase(unittest.TestCase): def test_sent_tokenize(self): # Use default engine (crfcut) self.assertEqual(sent_tokenize(None), []) @@ -67,6 +73,8 @@ def test_sent_tokenize(self): [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]], ) + +class SubwordTokenizeHanSoloTestCase(unittest.TestCase): def test_subword_tokenize(self): self.assertEqual(subword_tokenize(None, engine="han_solo"), []) self.assertEqual( @@ -80,6 +88,3 @@ def test_subword_tokenize(self): self.assertNotIn( "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo") ) - - def test_word_tokenize_icu(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu")) diff --git a/tests/extra/testx_misspell.py b/tests/compact/testc_tools.py similarity index 97% rename from tests/extra/testx_misspell.py rename to tests/compact/testc_tools.py index dbefc053b..ea8ff32f9 100644 --- a/tests/extra/testx_misspell.py +++ b/tests/compact/testc_tools.py @@ -9,7 +9,7 @@ from pythainlp.tools.misspell import misspell -def _count_difference(st1, st2): +def _count_difference(st1: str, st2: str) -> int: # this assumes len(st1) == len(st2) count = 0 diff --git a/tests/compact/testc_util.py b/tests/compact/testc_util.py index 0e753507a..072e54df0 100644 --- a/tests/compact/testc_util.py +++ b/tests/compact/testc_util.py @@ -11,7 +11,7 @@ from pythainlp.util.spell_words import spell_word -class UtilTestCaseX(unittest.TestCase): +class SpellWordTestCase(unittest.TestCase): def test_spell_word(self): self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"]) self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"]) diff --git a/tests/core/test_tag.py b/tests/core/test_tag.py index c596f3428..b98c2bc0d 100644 --- a/tests/core/test_tag.py +++ b/tests/core/test_tag.py @@ -14,13 +14,13 @@ unigram, ) +TEST_TOKENS = ["ผม", "รัก", "คุณ"] + class TagTestCase(unittest.TestCase): - # ### pythainlp.tag.pos_tag + """Test pythainlp.tag.pos_tag""" def test_pos_tag(self): - tokens = ["ผม", "รัก", "คุณ"] - self.assertEqual(pos_tag(None), []) self.assertEqual(pos_tag([]), []) self.assertEqual( @@ -40,15 +40,17 @@ def test_pos_tag(self): self.assertEqual(unigram.tag(None, corpus="tud"), []) self.assertEqual(unigram.tag([], corpus="tud"), []) self.assertIsNotNone( - pos_tag(tokens, engine="unigram", corpus="orchid") + pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid") ) self.assertIsNotNone( - pos_tag(tokens, engine="unigram", corpus="orchid_ud") + pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid_ud") + ) + self.assertIsNotNone( + pos_tag(TEST_TOKENS, engine="unigram", corpus="pud") ) - self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud")) self.assertIsNotNone( - pos_tag(tokens, engine="unigram", corpus="blackboard") + pos_tag(TEST_TOKENS, engine="unigram", corpus="blackboard") ) self.assertIsNotNone( pos_tag([""], engine="unigram", corpus="blackboard") @@ -56,9 +58,13 @@ def test_pos_tag(self): self.assertIsNotNone( pos_tag([""], engine="unigram", corpus="blackboard_ud") ) - self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tdtb")) + self.assertIsNotNone( + pos_tag(TEST_TOKENS, engine="unigram", corpus="tdtb") + ) self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tdtb")) - self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud")) + self.assertIsNotNone( + pos_tag(TEST_TOKENS, engine="unigram", corpus="tud") + ) self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud")) self.assertEqual( pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"), @@ -72,6 +78,25 @@ def test_pos_tag(self): pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN" ) + self.assertEqual(pos_tag_sents(None), []) + self.assertEqual(pos_tag_sents([]), []) + self.assertEqual( + pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), + [ + [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")], + [("แมว", "NCMN"), ("วิ่ง", "VACT")], + ], + ) + + +class PerceptronTaggerTestCase(unittest.TestCase): + """Test pythainlp.tag.PerceptronTagger + + :param unittest: _description_ + :type unittest: _type_ + """ + + def test_perceptron_tagger(self): self.assertEqual(perceptron.tag(None, corpus="orchid"), []) self.assertEqual(perceptron.tag([], corpus="orchid"), []) self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), []) @@ -82,44 +107,34 @@ def test_pos_tag(self): self.assertEqual(perceptron.tag([], corpus="blackboard"), []) self.assertEqual(perceptron.tag(None, corpus="tud"), []) self.assertEqual(perceptron.tag([], corpus="tud"), []) + self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="orchid") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="orchid_ud") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid_ud") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="pud") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="pud") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="blackboard") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="blackboard_ud") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard_ud") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="tdtb") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="tdtb") + pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb") ) self.assertIsNotNone( - pos_tag(tokens, engine="perceptron", corpus="tud") - ) - - self.assertEqual(pos_tag_sents(None), []) - self.assertEqual(pos_tag_sents([]), []) - self.assertEqual( - pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), - [ - [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")], - [("แมว", "NCMN"), ("วิ่ง", "VACT")], - ], + pos_tag(TEST_TOKENS, engine="perceptron", corpus="tud") ) - # ### pythainlp.tag.PerceptronTagger - - def test_perceptron_tagger(self): + def test_perceptron_tagger_custom(self): + """Test pythainlp.tag.PerceptronTagger""" tagger = PerceptronTagger() # train data, with "กิน" > 20 instances to trigger conditions # in _make_tagdict() @@ -182,7 +197,9 @@ def test_perceptron_tagger(self): with self.assertRaises(IOError): tagger.load("ptagger_notexistX4AcOcX.pkl") # file does not exist - # ### pythainlp.tag.locations + +class TagLocationsTestCase(unittest.TestCase): + """Test pythainlp.tag.locations""" def test_ner_locations(self): self.assertEqual( diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index 687145087..393b02741 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -551,6 +551,10 @@ def test_tcc_p(self): self.assertEqual(list(tcc_p.tcc("")), []) self.assertEqual(tcc_p.tcc_pos(""), set()) + +class DetokenizeTestCase(unittest.TestCase): + """Detokenize and regrouping test cases""" + def test_word_detokenize(self): self.assertIsInstance(word_detokenize(["ผม", "5"]), str) self.assertEqual( From 798a2cf47ed7b89b8a614ff7c3b600038172d624 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Nov 2024 14:23:46 +0000 Subject: [PATCH 2/5] Break down testx_tokenize --- pythainlp/util/strftime.py | 9 +- tests/extra/testx_spell.py | 2 +- tests/extra/testx_tokenize.py | 321 +++++++++++++++++++--------------- 3 files changed, 186 insertions(+), 146 deletions(-) diff --git a/pythainlp/util/strftime.py b/pythainlp/util/strftime.py index ee11274db..0a70da0b1 100644 --- a/pythainlp/util/strftime.py +++ b/pythainlp/util/strftime.py @@ -36,9 +36,9 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str: try: str_ = dt_obj.strftime(f"%{fmt_char}") if not str_ or str_ == "%{}".format(fmt_char): - # normalize outputs for unsupported directives - # in different platforms - # "%Q" may result "%Q", "Q", or "", make it "Q" + # Normalize outputs for unsupported directives + # in different platforms: + # "%Q" may result "", "%Q", or "Q", make it all "Q" str_ = fmt_char except ValueError as err: # Unsupported directives may raise ValueError on Windows, @@ -46,7 +46,8 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str: warnings.warn( ( f"String format directive unknown/not support: %{fmt_char}\n" - f"The system raises this ValueError: {err}" + f"The system raises this ValueError: {err}\n" + f"Continue working without the directive." ), UserWarning, ) diff --git a/tests/extra/testx_spell.py b/tests/extra/testx_spell.py index 9c587de12..97d27f6df 100644 --- a/tests/extra/testx_spell.py +++ b/tests/extra/testx_spell.py @@ -12,7 +12,7 @@ symspellpy, ) -from .test_spell import SENT_TOKS +from ..core.test_spell import SENT_TOKS class SpellTestCaseX(unittest.TestCase): diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py index 25e178118..00472b786 100644 --- a/tests/extra/testx_tokenize.py +++ b/tests/extra/testx_tokenize.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 +# Tests for tokenize functions that need extra dependencies + import unittest from pythainlp.tokenize import ( @@ -20,7 +22,7 @@ ) from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize -from ..test_tokenize import ( +from ..core.test_tokenize import ( SENT_1, SENT_2, SENT_3, @@ -29,49 +31,77 @@ ) -# Tests for functions that need more dependencies -class TokenizeTestCaseX(unittest.TestCase): - def test_subword_tokenize(self): - self.assertEqual(subword_tokenize(None, engine="ssg"), []) - self.assertEqual( - subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] - ) - self.assertIn("ดาว", subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) - self.assertNotIn("า", subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) +class ClauseTokenizeTestCase(unittest.TestCase): + def test_clause_tokenize(self): + self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"])) + self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list) - self.assertEqual(subword_tokenize(None, engine="tltk"), []) - self.assertEqual(subword_tokenize("", engine="tltk"), []) - self.assertIsInstance( - subword_tokenize("สวัสดิีดาวอังคาร", engine="tltk"), list - ) - self.assertNotIn("า", subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")) - self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list) - self.assertEqual(subword_tokenize(None, engine="phayathai"), []) - self.assertEqual(subword_tokenize("", engine="phayathai"), []) - self.assertIsInstance( - subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list - ) - self.assertNotIn( - "า", subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai") - ) - self.assertIsInstance( - subword_tokenize("โควิด19", engine="phayathai"), list - ) +class DetokenizeTestCase(unittest.TestCase): + def test_numeric_data_format(self): + engines = ["attacut", "deepcut", "sefr_cut"] - self.assertEqual(subword_tokenize(None, engine="wangchanberta"), []) - self.assertEqual(subword_tokenize("", engine="wangchanberta"), []) - self.assertIsInstance( - subword_tokenize("สวัสดิีดาวอังคาร", engine="wangchanberta"), list + for engine in engines: + self.assertIn( + "127.0.0.1", + word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine), + ) + + tokens = word_tokenize( + "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine + ) + self.assertTrue( + any(value in tokens for value in ["12:12pm", "12:12"]), + msg=f"{engine}: {tokens}", + ) + self.assertIn("11.11", tokens) + + self.assertIn( + "1,234,567.89", + word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine), + ) + + tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine) + self.assertIn("2.5:1", tokens) + self.assertIn("5:2", tokens) + + # try turning off `join_broken_num` + engine = "attacut" + self.assertNotIn( + "127.0.0.1", + word_tokenize( + "ไอพีของคุณคือ 127.0.0.1 ครับ", + engine=engine, + join_broken_num=False, + ), ) self.assertNotIn( - "า", subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta") + "1,234,567.89", + word_tokenize( + "รางวัลมูลค่า 1,234,567.89 บาท", + engine=engine, + join_broken_num=False, + ), ) - self.assertIsInstance( - subword_tokenize("โควิด19", engine="wangchanberta"), list + + +class ParagraphTokenizeTestCase(unittest.TestCase): + def test_paragraph_tokenize(self): + sent = ( + "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" + + "จากผลงานวิจัยที่เคยทำมาในอดีต" + + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" ) + self.assertIsNotNone(paragraph_tokenize(sent)) + with self.assertRaises(ValueError): + paragraph_tokenize( + sent, engine="ai2+2thai" + ) # engine does not exist + - def test_sent_tokenize(self): +class SentTokenizeTLTKTestCase(unittest.TestCase): + def test_sent_tokenize_tltk(self): self.assertIsNotNone( sent_tokenize( SENT_1, @@ -91,6 +121,9 @@ def test_sent_tokenize(self): ), ) + +class SentTokenizeThaiSumTestCase(unittest.TestCase): + def test_sent_tokenize_thaisum(self): self.assertIsNotNone( sent_tokenize( SENT_1, @@ -114,12 +147,17 @@ def test_sent_tokenize(self): [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]], ) + +class SentTokenizeWTPTestCase(unittest.TestCase): + def test_sent_tokenize_wtp(self): self.assertIsNotNone( sent_tokenize( SENT_3, engine="wtp", ), ) + + def test_sent_tokenize_wtp_tiny(self): self.assertIsNotNone( sent_tokenize( SENT_3, @@ -139,72 +177,86 @@ def test_sent_tokenize(self): # ), # ) - def test_word_tokenize_attacut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="attacut")) - def test_word_tokenize_deepcut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="deepcut")) - - def test_word_tokenize_nercut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="nercut")) +class SubwordTokenizePhayathaiTestCase(unittest.TestCase): + def test_subword_tokenize_phayathai(self): + self.assertEqual(subword_tokenize(None, engine="phayathai"), []) + self.assertEqual(subword_tokenize("", engine="phayathai"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list + ) + self.assertNotIn( + "า", subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai") + ) + self.assertIsInstance( + subword_tokenize("โควิด19", engine="phayathai"), list + ) - def test_word_tokenize_nlpo3(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3")) - def test_word_tokenize_oskut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="oskut")) +class SubwordTokenizeSSGTestCase(unittest.TestCase): + def test_subword_tokenize_ssg(self): + self.assertEqual(ssg.segment(None), []) + self.assertEqual(ssg.segment(""), []) + self.assertEqual(subword_tokenize(None, engine="ssg"), []) + self.assertEqual( + subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] + ) + self.assertIn("ดาว", subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) + self.assertNotIn("า", subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) - def test_word_tokenize_sefr_cut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="sefr_cut")) - def test_word_tokenize_tltk(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk")) +class SubwordTokenizeTLTKTestCase(unittest.TestCase): + def test_subword_tokenize_tltk(self): + self.assertEqual(subword_tokenize(None, engine="tltk"), []) + self.assertEqual(subword_tokenize("", engine="tltk"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="tltk"), list + ) + self.assertNotIn("า", subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")) + self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list) - def test_numeric_data_format(self): - engines = ["attacut", "deepcut", "sefr_cut"] - for engine in engines: - self.assertIn( - "127.0.0.1", - word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine), - ) +class SubwordTokenizeWangchanbertaTestCase(unittest.TestCase): + def test_subword_tokenize_wangchanberta(self): + self.assertEqual(subword_tokenize(None, engine="wangchanberta"), []) + self.assertEqual(subword_tokenize("", engine="wangchanberta"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="wangchanberta"), list + ) + self.assertNotIn( + "า", subword_tokenize("สวัสดีดาวอังคาร", engine="wangchanberta") + ) + self.assertIsInstance( + subword_tokenize("โควิด19", engine="wangchanberta"), list + ) - tokens = word_tokenize( - "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine - ) - self.assertTrue( - any(value in tokens for value in ["12:12pm", "12:12"]), - msg=f"{engine}: {tokens}", - ) - self.assertIn("11.11", tokens) - self.assertIn( - "1,234,567.89", - word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine), - ) +class SyllableTokenizeTLTKTestCase(unittest.TestCase): + def test_tltk(self): + self.assertEqual(tltk.segment(None), []) + self.assertEqual(tltk.segment(""), []) + self.assertEqual( + tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), + [ + "ฉัน", + "รัก", + "ภา", + "ษา", + "ไทย", + "เพราะ", + "ฉัน", + "เป็น", + "คน", + "ไทย", + ], + ) + self.assertEqual(tltk.syllable_tokenize(None), []) + self.assertEqual(tltk.syllable_tokenize(""), []) - tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine) - self.assertIn("2.5:1", tokens) - self.assertIn("5:2", tokens) - # try turning off `join_broken_num` - engine = "attacut" - self.assertNotIn( - "127.0.0.1", - word_tokenize( - "ไอพีของคุณคือ 127.0.0.1 ครับ", - engine=engine, - join_broken_num=False, - ), - ) - self.assertNotIn( - "1,234,567.89", - word_tokenize( - "รางวัลมูลค่า 1,234,567.89 บาท", - engine=engine, - join_broken_num=False, - ), - ) +class WordTokenizeAttacutTestCase(unittest.TestCase): + def test_word_tokenize_attacut(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="attacut")) def test_attacut(self): self.assertEqual(attacut.segment(None), []) @@ -221,6 +273,11 @@ def test_attacut(self): attacut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", model="attacut-c") ) + +class WordTokenizeDeepcutTestCase(unittest.TestCase): + def test_word_tokenize_deepcut(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="deepcut")) + def test_deepcut(self): self.assertEqual(deepcut.segment(None), []) self.assertEqual(deepcut.segment(""), []) @@ -233,15 +290,10 @@ def test_deepcut(self): ) ) - def test_oskut(self): - self.assertEqual(oskut.segment(None), []) - self.assertEqual(oskut.segment(""), []) - self.assertIsNotNone( - oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), - ) - self.assertIsNotNone( - oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="scads"), - ) + +class WordTokenizeNERCutTestCase(unittest.TestCase): + def test_word_tokenize_nercut(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="nercut")) def test_nercut(self): self.assertEqual(nercut.segment(None), []) @@ -253,6 +305,31 @@ def test_nercut(self): self.assertIsNotNone(nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ")) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) + +class WordTokenizeNlpO3TestCase(unittest.TestCase): + def test_word_tokenize_nlpo3(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3")) + + +class WordTokenizeOSKutTestCase(unittest.TestCase): + def test_word_tokenize_oskut(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="oskut")) + + def test_oskut(self): + self.assertEqual(oskut.segment(None), []) + self.assertEqual(oskut.segment(""), []) + self.assertIsNotNone( + oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), + ) + self.assertIsNotNone( + oskut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="scads"), + ) + + +class WordTokenizeSEFRCutTestCase(unittest.TestCase): + def test_word_tokenize_sefr_cut(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="sefr_cut")) + def test_sefr_cut(self): self.assertEqual(sefr_cut.segment(None), []) self.assertEqual(sefr_cut.segment(""), []) @@ -263,45 +340,7 @@ def test_sefr_cut(self): sefr_cut.segment("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="tnhc"), ) - def test_ssg(self): - self.assertEqual(ssg.segment(None), []) - self.assertEqual(ssg.segment(""), []) - self.assertIn("ดาว", subword_tokenize("สวัสดีดาวอังคาร", engine="ssg")) - def test_tltk(self): - self.assertEqual(tltk.segment(None), []) - self.assertEqual(tltk.segment(""), []) - self.assertEqual( - tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), - [ - "ฉัน", - "รัก", - "ภา", - "ษา", - "ไทย", - "เพราะ", - "ฉัน", - "เป็น", - "คน", - "ไทย", - ], - ) - self.assertEqual(tltk.syllable_tokenize(None), []) - self.assertEqual(tltk.syllable_tokenize(""), []) - - def test_paragraph_tokenize(self): - sent = ( - "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต" - + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" - + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" - ) - self.assertIsNotNone(paragraph_tokenize(sent)) - with self.assertRaises(ValueError): - paragraph_tokenize( - sent, engine="ai2+2thai" - ) # engine does not exist - - def test_clause_tokenize(self): - self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"])) - self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list) +class WordTokenizeTLTKTestCase(unittest.TestCase): + def test_word_tokenize_tltk(self): + self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk")) From a3a470c09ea5d50029835e35b0ff7bdf5128f4ac Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Nov 2024 14:25:19 +0000 Subject: [PATCH 3/5] Only submit a report from the latest Python version on ubuntu-latest --- .github/workflows/unittest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 4b544cb80..7dd54504c 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -100,9 +100,9 @@ jobs: # Test cases loaded is defined in __init__.py in the tests directory. # See also tests/README.md - name: Coverage report - if: matrix.python-version == env.PYTHON_VERSION_LATEST + if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COVERALLS_SERVICE_NAME: github run: coveralls - # Only submit a report from the latest Python version + # Only submit a report from the latest Python version on ubuntu-latest. From 64c48a3114d30901bf4b92fd6b9ac24b5d254630 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Nov 2024 14:26:35 +0000 Subject: [PATCH 4/5] Update testx_tokenize.py --- tests/extra/testx_tokenize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py index 00472b786..0e9f05737 100644 --- a/tests/extra/testx_tokenize.py +++ b/tests/extra/testx_tokenize.py @@ -89,9 +89,9 @@ class ParagraphTokenizeTestCase(unittest.TestCase): def test_paragraph_tokenize(self): sent = ( "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา" - + "จากผลงานวิจัยที่เคยทำมาในอดีต" - + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" - + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" + "จากผลงานวิจัยที่เคยทำมาในอดีต" + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด" + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้" ) self.assertIsNotNone(paragraph_tokenize(sent)) with self.assertRaises(ValueError): From f1300d8e1a9c61753f06749cb75d3d16fdd7b42b Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 3 Nov 2024 14:30:20 +0000 Subject: [PATCH 5/5] Add SPDX-License-Identifier --- .github/workflows/pypi-publish.yml | 3 +++ .github/workflows/pypi-test.yml | 3 +++ .github/workflows/unittest.yml | 3 +++ 3 files changed, 9 insertions(+) diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 30d12808c..f1df50adf 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: CC0-1.0 + name: Upload package to PyPI on: diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index f83e2865d..8881fb4c3 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: CC0-1.0 + name: PyPI Unit test on: diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 7dd54504c..37b7691db 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: CC0-1.0 + name: Unit test on: