Commit cf3e625

Merge pull request #977 from bact/add-more-testc
Add testc_tools (misspell)
2 parents a0ede66 + f1300d8 · commit cf3e625

13 files changed: +264 -188 lines changed

.github/workflows/pypi-publish.yml (+3)

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: CC0-1.0
+
 name: Upload package to PyPI

 on:

.github/workflows/pypi-test.yml (+3)

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: CC0-1.0
+
 name: PyPI Unit test

 on:

.github/workflows/unittest.yml (+5 -2)

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: CC0-1.0
+
 name: Unit test

 on:
@@ -100,9 +103,9 @@ jobs:
       # Test cases loaded is defined in __init__.py in the tests directory.
       # See also tests/README.md
       - name: Coverage report
-        if: matrix.python-version == env.PYTHON_VERSION_LATEST
+        if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COVERALLS_SERVICE_NAME: github
        run: coveralls
-        # Only submit a report from the latest Python version
+        # Only submit a report from the latest Python version on ubuntu-latest.

pythainlp/util/strftime.py (+5 -4)

@@ -36,17 +36,18 @@ def _std_strftime(dt_obj: datetime, fmt_char: str) -> str:
     try:
         str_ = dt_obj.strftime(f"%{fmt_char}")
         if not str_ or str_ == "%{}".format(fmt_char):
-            # normalize outputs for unsupported directives
-            # in different platforms
-            # "%Q" may result "%Q", "Q", or "", make it "Q"
+            # Normalize outputs for unsupported directives
+            # in different platforms:
+            # "%Q" may result "", "%Q", or "Q", make it all "Q"
             str_ = fmt_char
     except ValueError as err:
         # Unsupported directives may raise ValueError on Windows,
         # in that case just use the fmt_char
         warnings.warn(
             (
                 f"String format directive unknown/not support: %{fmt_char}\n"
-                f"The system raises this ValueError: {err}"
+                f"The system raises this ValueError: {err}\n"
+                f"Continue working without the directive."
             ),
             UserWarning,
         )
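Note: the comments above describe a real cross-platform quirk of `strftime`. Below is a minimal standalone sketch of the same normalization path, reconstructed from this hunk; the real `_std_strftime` in `pythainlp/util/strftime.py` has more surrounding context.

```python
import warnings
from datetime import datetime


def std_strftime_sketch(dt_obj: datetime, fmt_char: str) -> str:
    str_ = ""
    try:
        str_ = dt_obj.strftime(f"%{fmt_char}")
        if not str_ or str_ == "%{}".format(fmt_char):
            # "%Q" may come back as "", "%Q", or "Q" depending on platform;
            # normalize all three cases to the bare character "Q"
            str_ = fmt_char
    except ValueError as err:
        # Windows may raise ValueError for unsupported directives;
        # warn and fall back to the bare character, as the diff's comment says
        warnings.warn(
            (
                f"String format directive unknown/not support: %{fmt_char}\n"
                f"The system raises this ValueError: {err}\n"
                f"Continue working without the directive."
            ),
            UserWarning,
        )
        str_ = fmt_char
    return str_


print(std_strftime_sketch(datetime(2024, 1, 1), "Q"))  # "Q" on every platform
```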

tests/compact/__init__.py (+2 -1)

@@ -11,8 +11,9 @@

 # Names of module to be tested
 test_packages: list[str] = [
-    "tests.compact.testc_tag",
+    "tests.compact.testc_parse",
     "tests.compact.testc_tokenize",
+    "tests.compact.testc_tools",
     "tests.compact.testc_util",
 ]
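The workflow comment quoted earlier ("Test cases loaded is defined in __init__.py in the tests directory") is why this one-line edit matters: the module list drives which tests the compact suite runs. A hypothetical sketch of such a loader, using unittest's standard `load_tests` protocol; PyThaiNLP's actual loader may differ.

```python
import unittest

# Names of module to be tested, as in the diff above
test_packages: list[str] = [
    "tests.compact.testc_parse",
    "tests.compact.testc_tokenize",
    "tests.compact.testc_tools",
    "tests.compact.testc_util",
]


def load_tests(loader, standard_tests, pattern):
    # Build the suite from the module list, so renaming a test module
    # (testc_tag -> testc_parse) is a one-line change above
    suite = unittest.TestSuite()
    for name in test_packages:
        suite.addTests(loader.loadTestsFromName(name))
    return suite
```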

tests/compact/testc_tag.py renamed to tests/compact/testc_parse.py (+1 -1)

@@ -7,7 +7,7 @@
 from pythainlp.tag import chunk_parse, pos_tag


-class TagTestCase(unittest.TestCase):
+class ChunkParseTestCase(unittest.TestCase):
     def test_chunk_parse(self):
         tokens = ["ผม", "รัก", "คุณ"]
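The new class name matches what the file actually exercises: chunk_parse, not tagging in general. A hedged sketch of how the two imported functions compose; the exact chunk labels depend on the bundled model, so treat the output comment as illustrative only.

```python
from pythainlp.tag import chunk_parse, pos_tag

tokens = ["ผม", "รัก", "คุณ"]
word_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")
# chunk_parse consumes (word, pos) pairs and returns IOB-style chunk tags,
# e.g. something like ["B-NP", "B-VP", "I-VP"] (model-dependent)
print(chunk_parse(word_pos))
```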

tests/compact/testc_tokenize.py (+10 -5)

@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0

+# Tests for tokenize functions that need "compact" dependencies
+
 import unittest

 from pythainlp.tokenize import (
@@ -23,8 +25,7 @@
 )


-# Tests for functions that need "compact" dependencies
-class TokenizeTestCaseCompact(unittest.TestCase):
+class WordTokenizeICUTestCase(unittest.TestCase):
     def test_icu(self):
         self.assertEqual(pyicu.segment(None), [])
         self.assertEqual(pyicu.segment(""), [])
@@ -33,6 +34,11 @@ def test_icu(self):
             ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
         )

+    def test_word_tokenize_icu(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
+
+
+class SentTokenizeCRFCutTestCase(unittest.TestCase):
     def test_sent_tokenize(self):
         # Use default engine (crfcut)
         self.assertEqual(sent_tokenize(None), [])
@@ -67,6 +73,8 @@ def test_sent_tokenize(self):
             [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
         )

+
+class SubwordTokenizeHanSoloTestCase(unittest.TestCase):
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
         self.assertEqual(
@@ -80,6 +88,3 @@ def test_subword_tokenize(self):
         self.assertNotIn(
             "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
         )
-
-    def test_word_tokenize_icu(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
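Splitting the catch-all TokenizeTestCaseCompact into per-engine classes also makes it possible to run one engine's tests in isolation. A sketch, assuming the module and class names from the diff above:

```python
import unittest

# Load and run only the ICU word-tokenizer tests
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.compact.testc_tokenize.WordTokenizeICUTestCase"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```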

tests/extra/testx_misspell.py renamed to tests/compact/testc_tools.py (+1 -1)

@@ -9,7 +9,7 @@
 from pythainlp.tools.misspell import misspell


-def _count_difference(st1, st2):
+def _count_difference(st1: str, st2: str) -> int:
     # this assumes len(st1) == len(st2)

     count = 0
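The helper's body is truncated in this view. Judging from the "assumes len(st1) == len(st2)" comment, a plausible completion is a per-character (Hamming-style) difference count; this is a reconstruction, not the verbatim repository code.

```python
def _count_difference(st1: str, st2: str) -> int:
    # this assumes len(st1) == len(st2)
    count = 0
    for ch1, ch2 in zip(st1, st2):
        if ch1 != ch2:
            count += 1
    return count


# One substituted character yields a difference of 1
assert _count_difference("แมว", "แมน") == 1
```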

tests/compact/testc_util.py (+1 -1)

@@ -11,7 +11,7 @@
 from pythainlp.util.spell_words import spell_word


-class UtilTestCaseX(unittest.TestCase):
+class SpellWordTestCase(unittest.TestCase):
     def test_spell_word(self):
         self.assertEqual(spell_word("เสือ"), ["สอ", "เอือ", "เสือ"])
         self.assertEqual(spell_word("เสื้อ"), ["สอ", "เอือ", "ไม้โท", "เสื้อ"])

tests/core/test_tag.py (+48 -31)

@@ -14,13 +14,13 @@
     unigram,
 )

+TEST_TOKENS = ["ผม", "รัก", "คุณ"]
+

 class TagTestCase(unittest.TestCase):
-    # ### pythainlp.tag.pos_tag
+    """Test pythainlp.tag.pos_tag"""

     def test_pos_tag(self):
-        tokens = ["ผม", "รัก", "คุณ"]
-
         self.assertEqual(pos_tag(None), [])
         self.assertEqual(pos_tag([]), [])
         self.assertEqual(
@@ -40,25 +40,31 @@ def test_pos_tag(self):
         self.assertEqual(unigram.tag(None, corpus="tud"), [])
         self.assertEqual(unigram.tag([], corpus="tud"), [])
         self.assertIsNotNone(
-            pos_tag(tokens, engine="unigram", corpus="orchid")
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="unigram", corpus="orchid_ud")
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="orchid_ud")
+        )
+        self.assertIsNotNone(
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="pud")
         )
-        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
         self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
         self.assertIsNotNone(
-            pos_tag(tokens, engine="unigram", corpus="blackboard")
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="blackboard")
         )
         self.assertIsNotNone(
             pos_tag([""], engine="unigram", corpus="blackboard")
         )
         self.assertIsNotNone(
             pos_tag([""], engine="unigram", corpus="blackboard_ud")
         )
-        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tdtb"))
+        self.assertIsNotNone(
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="tdtb")
+        )
         self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tdtb"))
-        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
+        self.assertIsNotNone(
+            pos_tag(TEST_TOKENS, engine="unigram", corpus="tud")
+        )
         self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
         self.assertEqual(
             pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
@@ -72,6 +78,25 @@ def test_pos_tag(self):
             pos_tag(["ความ", "พอเพียง"], corpus="orchid_ud")[0][1], "NOUN"
         )

+        self.assertEqual(pos_tag_sents(None), [])
+        self.assertEqual(pos_tag_sents([]), [])
+        self.assertEqual(
+            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
+            [
+                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
+                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
+            ],
+        )
+
+
+class PerceptronTaggerTestCase(unittest.TestCase):
+    """Test pythainlp.tag.PerceptronTagger
+
+    :param unittest: _description_
+    :type unittest: _type_
+    """
+
+    def test_perceptron_tagger(self):
         self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
         self.assertEqual(perceptron.tag([], corpus="orchid"), [])
         self.assertEqual(perceptron.tag(None, corpus="orchid_ud"), [])
@@ -82,44 +107,34 @@
         self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
         self.assertEqual(perceptron.tag(None, corpus="tud"), [])
         self.assertEqual(perceptron.tag([], corpus="tud"), [])
+
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="orchid")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="orchid_ud")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="pud")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="pud")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="blackboard")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="blackboard_ud")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="tdtb")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="tdtb")
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="tdtb")
         )
         self.assertIsNotNone(
-            pos_tag(tokens, engine="perceptron", corpus="tud")
-        )
-
-        self.assertEqual(pos_tag_sents(None), [])
-        self.assertEqual(pos_tag_sents([]), [])
-        self.assertEqual(
-            pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
-            [
-                [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
-                [("แมว", "NCMN"), ("วิ่ง", "VACT")],
-            ],
+            pos_tag(TEST_TOKENS, engine="perceptron", corpus="tud")
         )

-    # ### pythainlp.tag.PerceptronTagger
-
-    def test_perceptron_tagger(self):
+    def test_perceptron_tagger_custom(self):
+        """Test pythainlp.tag.PerceptronTagger"""
         tagger = PerceptronTagger()
         # train data, with "กิน" > 20 instances to trigger conditions
         # in _make_tagdict()
@@ -182,7 +197,9 @@ def test_perceptron_tagger(self):
         with self.assertRaises(IOError):
             tagger.load("ptagger_notexistX4AcOcX.pkl")  # file does not exist

-    # ### pythainlp.tag.locations
+
+class TagLocationsTestCase(unittest.TestCase):
+    """Test pythainlp.tag.locations"""

     def test_ner_locations(self):
         self.assertEqual(
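The pos_tag_sents assertions that moved into test_pos_tag double as a usage example. Running the same call directly, with the expected output copied from the assertion above:

```python
from pythainlp.tag import pos_tag_sents

print(pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]))
# [[('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')],
#  [('แมว', 'NCMN'), ('วิ่ง', 'VACT')]]
```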

tests/core/test_tokenize.py (+4)

@@ -551,6 +551,10 @@ def test_tcc_p(self):
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())

+
+class DetokenizeTestCase(unittest.TestCase):
+    """Detokenize and regrouping test cases"""
+
     def test_word_detokenize(self):
         self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
         self.assertEqual(
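For context on the regrouped tests: word_detokenize joins tokens back into running text. A hedged example based on the first assertion above; the return being a plain string is what the test checks, while the exact spacing is decided by the engine.

```python
from pythainlp.tokenize import word_detokenize

text = word_detokenize(["ผม", "5"])
print(type(text).__name__, repr(text))  # "str"; spacing is engine-determined
```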

tests/extra/testx_spell.py (+1 -1)

@@ -12,7 +12,7 @@
     symspellpy,
 )

-from .test_spell import SENT_TOKS
+from ..core.test_spell import SENT_TOKS


 class SpellTestCaseX(unittest.TestCase):
