Add testc_tools (misspell) #977

Merged: 5 commits, Nov 3, 2024
3 changes: 3 additions & 0 deletions .github/workflows/pypi-publish.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: Upload package to PyPI

on:
3 changes: 3 additions & 0 deletions .github/workflows/pypi-test.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: PyPI Unit test

on:
7 changes: 5 additions & 2 deletions .github/workflows/unittest.yml
@@ -1,3 +1,6 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: CC0-1.0

name: Unit test

on:
@@ -100,9 +103,9 @@ jobs:
# Test cases loaded is defined in __init__.py in the tests directory.
# See also tests/README.md
- name: Coverage report
if: matrix.python-version == env.PYTHON_VERSION_LATEST
if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_SERVICE_NAME: github
run: coveralls
# Only submit a report from the latest Python version
# Only submit a report from the latest Python version on ubuntu-latest.
9 changes: 5 additions & 4 deletions pythainlp/util/strftime.py
@@ -36,17 +36,18 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
try:
str_ = dt_obj.strftime(f"%{fmt_char}")
if not str_ or str_ == "%{}".format(fmt_char):
# normalize outputs for unsupported directives
# in different platforms
# "%Q" may result "%Q", "Q", or "", make it "Q"
# Normalize outputs for unsupported directives
# in different platforms:
# "%Q" may result "", "%Q", or "Q", make it all "Q"
str_ = fmt_char
except ValueError as err:
# Unsupported directives may raise ValueError on Windows,
# in that case just use the fmt_char
warnings.warn(
(
f"String format directive unknown/not support: %{fmt_char}\n"
f"The system raises this ValueError: {err}"
f"The system raises this ValueError: {err}\n"
f"Continue working without the directive."
),
UserWarning,
)
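
The comment rewrite above documents why _std_strftime normalizes per-directive output. A minimal standalone sketch of the same cross-platform fallback; the helper name _demo_strftime_char is hypothetical, and the real code lives in pythainlp/util/strftime.py:

import warnings
from datetime import datetime

def _demo_strftime_char(dt_obj: datetime, fmt_char: str) -> str:
    # Sketch: format a single directive, normalizing unsupported ones.
    try:
        str_ = dt_obj.strftime(f"%{fmt_char}")
        if not str_ or str_ == f"%{fmt_char}":
            # Platforms disagree: an unsupported "%Q" may come back
            # as "", "%Q", or "Q". Normalize all three to "Q".
            str_ = fmt_char
    except ValueError as err:
        # Windows may raise ValueError instead; warn and fall back
        # to the bare directive character, as the diff describes.
        warnings.warn(
            f"Unknown format directive: %{fmt_char} ({err})",
            UserWarning,
        )
        str_ = fmt_char
    return str_

print(_demo_strftime_char(datetime(2024, 11, 3), "Q"))  # "Q" on any platform
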
3 changes: 2 additions & 1 deletion tests/compact/__init__.py
@@ -11,8 +11,9 @@

# Names of module to be tested
test_packages: list[str] = [
"tests.compact.testc_tag",
"tests.compact.testc_parse",
"tests.compact.testc_tokenize",
"tests.compact.testc_tools",
"tests.compact.testc_util",
]

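
test_packages is a plain list of module names kept in sorted order; this PR slots tests.compact.testc_tools in and reflects the testc_tag to testc_parse rename. A sketch of how such a list can drive test loading, assuming the suite uses unittest's load_tests protocol (the actual loader code in tests/compact/__init__.py is not shown here):

import unittest

test_packages: list[str] = [
    "tests.compact.testc_parse",
    "tests.compact.testc_tokenize",
    "tests.compact.testc_tools",
    "tests.compact.testc_util",
]

def load_tests(loader, standard_tests, pattern):
    # Collect every module named in test_packages into one suite.
    suite = unittest.TestSuite()
    for name in test_packages:
        suite.addTests(loader.loadTestsFromName(name))
    return suite
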
tests/compact/{testc_tag.py → testc_parse.py}
@@ -7,7 +7,7 @@
from pythainlp.tag import chunk_parse, pos_tag


class TagTestCase(unittest.TestCase):
class ChunkParseTestCase(unittest.TestCase):
def test_chunk_parse(self):
tokens = ["ผม", "รัก", "คุณ"]

15 changes: 10 additions & 5 deletions tests/compact/testc_tokenize.py
@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

# Tests for tokenize functions that need "compact" dependencies

import unittest

from pythainlp.tokenize import (
@@ -23,8 +25,7 @@
)


# Tests for functions that need "compact" dependencies
class TokenizeTestCaseCompact(unittest.TestCase):
class WordTokenizeICUTestCase(unittest.TestCase):
def test_icu(self):
self.assertEqual(pyicu.segment(None), [])
self.assertEqual(pyicu.segment(""), [])
@@ -33,6 +34,11 @@ def test_icu(self):
["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
)

def test_word_tokenize_icu(self):
self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))


class SentTokenizeCRFCutTestCase(unittest.TestCase):
def test_sent_tokenize(self):
# Use default engine (crfcut)
self.assertEqual(sent_tokenize(None), [])
@@ -67,6 +73,8 @@ def test_sent_tokenize(self):
[["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
)


class SubwordTokenizeHanSoloTestCase(unittest.TestCase):
def test_subword_tokenize(self):
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
self.assertEqual(
Expand All @@ -80,6 +88,3 @@ def test_subword_tokenize(self):
self.assertNotIn(
"า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
)

def test_word_tokenize_icu(self):
self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
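
Splitting TokenizeTestCaseCompact into per-engine classes (WordTokenizeICUTestCase, SentTokenizeCRFCutTestCase, SubwordTokenizeHanSoloTestCase) also lets one engine's tests run in isolation. A usage sketch, not part of the PR:

import unittest

# Run only the ICU word-tokenizer tests from the reorganized module.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.compact.testc_tokenize.WordTokenizeICUTestCase"
)
unittest.TextTestRunner(verbosity=2).run(suite)
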
tests/compact/testc_tools.py
@@ -9,7 +9,7 @@
from pythainlp.tools.misspell import misspell


def _count_difference(st1, st2):
def _count_difference(st1: str, st2: str) -> int:
# this assumes len(st1) == len(st2)

count = 0
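
The hunk above only adds type hints to _count_difference; its body is truncated here after count = 0. A plausible completion, offered as an assumption since the rest of the function is not shown: the name and the equal-length note suggest a Hamming-style count of differing positions.

def _count_difference(st1: str, st2: str) -> int:
    # Assumed completion: count positions where the two strings
    # disagree, taking len(st1) == len(st2) for granted as the
    # original comment does.
    count = 0
    for ch1, ch2 in zip(st1, st2):
        if ch1 != ch2:
            count += 1
    return count

assert _count_difference("misspell", "mispsell") == 2  # "sp" swapped
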
2 changes: 1 addition & 1 deletion tests/compact/testc_util.py
@@ -11,7 +11,7 @@
from pythainlp.util.spell_words import spell_word


class UtilTestCaseX(unittest.TestCase):
class SpellWordTestCase(unittest.TestCase):
def test_spell_word(self):
self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])
79 changes: 48 additions & 31 deletions tests/core/test_tag.py
@@ -14,13 +14,13 @@
unigram,
)

TEST_TOKENS = ["ผม", "รัก", "คุณ"]


class TagTestCase(unittest.TestCase):
# ### pythainlp.tag.pos_tag
"""Test pythainlp.tag.pos_tag"""

def test_pos_tag(self):
tokens = ["ผม", "รัก", "คุณ"]

self.assertEqual(pos_tag(None), [])
self.assertEqual(pos_tag([]), [])
self.assertEqual(
@@ -40,25 +40,31 @@ def test_pos_tag(self):
self.assertEqual(unigram.tag(None, corpus="tud"), [])
self.assertEqual(unigram.tag([], corpus="tud"), [])
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid")
pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid")
)
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="orchid_ud")
pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid_ud")
)
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="pud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="blackboard")
pos_tag(TEST_TOKENS, engine="unigram", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard_ud")
)
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tdtb"))
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="tdtb")
)
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tdtb"))
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
self.assertIsNotNone(
pos_tag(TEST_TOKENS, engine="unigram", corpus="tud")
)
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
self.assertEqual(
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
@@ -72,6 +78,25 @@ def test_pos_tag(self):
pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN"
)

self.assertEqual(pos_tag_sents(None), [])
self.assertEqual(pos_tag_sents([]), [])
self.assertEqual(
pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
[
[("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
)


class PerceptronTaggerTestCase(unittest.TestCase):
"""Test pythainlp.tag.PerceptronTagger

:param unittest: _description_
:type unittest: _type_
"""

def test_perceptron_tagger(self):
self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
self.assertEqual(perceptron.tag([], corpus="orchid"), [])
self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), [])
@@ -82,44 +107,34 @@ def test_pos_tag(self):
self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
self.assertEqual(perceptron.tag(None, corpus="tud"), [])
self.assertEqual(perceptron.tag([], corpus="tud"), [])

self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="pud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="pud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tud")
)

self.assertEqual(pos_tag_sents(None), [])
self.assertEqual(pos_tag_sents([]), [])
self.assertEqual(
pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
[
[("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
pos_tag(TEST_TOKENS, engine="perceptron", corpus="tud")
)

# ### pythainlp.tag.PerceptronTagger

def test_perceptron_tagger(self):
def test_perceptron_tagger_custom(self):
"""Test pythainlp.tag.PerceptronTagger"""
tagger = PerceptronTagger()
# train data, with "กิน" > 20 instances to trigger conditions
# in _make_tagdict()
@@ -182,7 +197,9 @@ def test_perceptron_tagger(self):
with self.assertRaises(IOError):
tagger.load("ptagger_notexistX4AcOcX.pkl") # file does not exist

# ### pythainlp.tag.locations

class TagLocationsTestCase(unittest.TestCase):
"""Test pythainlp.tag.locations"""

def test_ner_locations(self):
self.assertEqual(
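
The custom-tagger test now lives in its own test_perceptron_tagger_custom method. Based only on the calls visible in this diff (the no-argument constructor, and load(), which raises IOError for a missing file), a minimal usage sketch:

from pythainlp.tag import PerceptronTagger

tagger = PerceptronTagger()
try:
    tagger.load("ptagger_notexistX4AcOcX.pkl")  # file does not exist
except IOError:
    print("load() raises IOError when the model file is missing")
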
4 changes: 4 additions & 0 deletions tests/core/test_tokenize.py
@@ -551,6 +551,10 @@ def test_tcc_p(self):
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())


class DetokenizeTestCase(unittest.TestCase):
"""Detokenize and regrouping test cases"""

def test_word_detokenize(self):
self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
self.assertEqual(
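
The new DetokenizeTestCase groups detokenization and regrouping checks. From the assertions above, word_detokenize joins a token list back into one string; a usage sketch, assuming the function is importable from pythainlp.tokenize as the tests imply:

from pythainlp.tokenize import word_detokenize

result = word_detokenize(["ผม", "5"])
print(type(result))  # <class 'str'>, matching the assertIsInstance check
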
2 changes: 1 addition & 1 deletion tests/extra/testx_spell.py
@@ -12,7 +12,7 @@
symspellpy,
)

from .test_spell import SENT_TOKS
from ..core.test_spell import SENT_TOKS


class SpellTestCaseX(unittest.TestCase):
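
The one-line fix retargets the import: SENT_TOKS comes from tests/core/test_spell.py, not from a sibling module in tests/extra. A sketch of the layout the new relative import implies, with directory names taken from the paths in this PR:

# tests/
#   core/
#     test_spell.py    # defines SENT_TOKS
#   extra/
#     testx_spell.py   # needs SENT_TOKS
#
# Inside tests/extra/testx_spell.py, two leading dots climb from
# tests.extra up to tests, then descend into core:
from ..core.test_spell import SENT_TOKS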